In [1]:
from sema import VOC_TopicLabeler, VOC_DataModule
from transformers import AutoTokenizer, AutoModelForMaskedLM, DebertaV2Config, DebertaV2ForSequenceClassification, AutoConfig
import lightning as L

import os 
import numpy as np
import pandas as pd 
import pickle 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import f1_score
import re
from konlpy.tag import Kkma
from tqdm import tqdm
from collections import Counter
kkma = Kkma()

def findall_vec(key,voc):
  try:
    return re.findall(key, voc)[0]
  except:
    return ''

def findall_vec2(df):
  return findall_vec(df['keyword'],df['VOC'])

def filter_etc(df):
  filt0 = (df['VOC'].str.strip().str.len() < 4).astype(int)
  filt1 = df['VOC'].apply(lambda x : bool(re.match(r'^[_\W]+$', str(x).replace(' ','')))).astype(int)
  filt2 = df['VOC'].apply(lambda x : bool(re.match(r'[\d/-]+$', str(x).replace(' ','')))).astype(int)
  filt3 = (df.VOC.str.replace(' ','').str.split('').apply(set).str.len() == 2)
  voc_tok = df.VOC.progress_apply(lambda x : Counter(kkma.morphs(x)))
  filt4 = voc_tok.isin(voc_etc).astype(int)
  return filt0+filt1+filt2+filt3+filt4
  #return filt1,filt2,filt3,filt4

def kkmaCounter(txt):
    try:
        return Counter(kkma.morphs(txt))
    except: 
        return txt

# Function to remove non-English and non-Korean characters
def remove_non_english_korean(string):
    # Regular expression to match English and Korean characters
    pattern = re.compile(r'[^a-zA-Z0-9\uac00-\ud7a3\s]', flags=re.UNICODE)
    return pattern.sub('', string)
    
def strip_e(st):
    RE_EMOJI = re.compile('[\U00010000-\U0010ffff]', flags=re.UNICODE)
    return RE_EMOJI.sub(r'', st)

ModuleNotFoundError: No module named 'tensorflow'

In [None]:
N_EPOCHS = 35
BATCH_SIZE = 12
MAX_LEN = 256
LR = 2e-05
opt_thresh=0.5

input_file = pd.Series(os.listdir('./data/input'))
input_file = input_file[input_file.str.contains('.xlsx')]

output_list = pd.Series(os.listdir('./data/output'))
running_file = input_file[~input_file.str[:-5].isin(output_list.str.split('_output').str[0])]

config = AutoConfig.from_pretrained('team-lucid/deberta-v3-xlarge-korean', output_hidden_states=True)
tokenizer = AutoTokenizer.from_pretrained('team-lucid/deberta-v3-xlarge-korean')

with open('data2.pkl', 'rb') as f:
    mlb = pickle.load(f)

LABEL_COLUMNS = mlb.classes_[:]
voc_etc = pd.read_pickle('voc_etc.pkl')
keyword = pd.read_pickle('keyword_doc.pkl')

new_model = VOC_TopicLabeler.load_from_checkpoint(checkpoint_path="team-lucid/deberta-v3-xlarge-korean" + "_20ep_full_mar17_dropna.ckpt", n_classes=18, model = 'team-lucid/deberta-v3-xlarge-korean').cuda()
new_model.eval()


In [None]:
mlb.classes_.shape

In [None]:
running_file

In [5]:
for file in running_file:
    #Reading
    print('Reading : ' + file)
    voc_testset = pd.read_excel("data/input/"+file, dtype=str)
    voc = pd.concat([voc_testset.VOC1,voc_testset.VOC2]).sort_index().values
    voc_testset = pd.concat([voc_testset]*2).sort_index().iloc[:,1:-2]
    voc_testset['VOC'] = voc
    voc_testset = voc_testset.dropna(subset='VOC')
    voc_testset.reset_index(inplace=True)
    voc_testset['label'] = pd.DataFrame(np.zeros((18,voc_testset.shape[0])).T).astype(int).apply(list, axis=1)
    
    #Filtering
    print('Filtering : ' + file)
    voc_testset = voc_testset[voc_testset['VOC'] != 'nan']
    voc_testset['VOC'] = voc_testset['VOC'].apply(remove_non_english_korean)
    voc_testset['VOC'] = voc_testset['VOC'].apply(strip_e)
    voc_testset['VOC'] = voc_testset['VOC'].replace(r'\s+', ' ', regex=True)
    
    filt0 = (voc_testset['VOC'].str.strip().str.len() < 4).astype(int)
    filt1 = voc_testset['VOC'].apply(lambda x : bool(re.match(r'^[_\W]+$', str(x).replace(' ','')))).astype(int)
    filt2 = voc_testset['VOC'].apply(lambda x : bool(re.match(r'[\d/-]+$', str(x).replace(' ','')))).astype(int)
    filt3 = (voc_testset.VOC.str.replace(' ','').str.split('').apply(set).str.len() == 2)
    voc_testset = voc_testset[(filt0 + filt1 + filt2 + filt3) == 0]
    # voc_testset['VOC'].progress_apply(lambda x : Counter(kkma.morphs(x)))
    voc_tok = voc_testset['VOC'].progress_apply(lambda x : Counter(kkma.morphs(x)))
    filt4 = voc_tok.isin(voc_etc).astype(int)
    voc_testset = voc_testset[~filt4.astype(bool)].reset_index()
    
    #Setup
    print('Setting : ' + file)
    data_module = VOC_DataModule(voc_testset, voc_testset, tokenizer, batch_size=BATCH_SIZE, max_token_len=MAX_LEN)
    data_module.setup()
    trainer = L.Trainer(max_epochs=N_EPOCHS)
    
    #Inference
    print('Inferencing : ' + file)
    testing_predict = trainer.predict(new_model, datamodule=data_module)
    sema_df_final = np.vstack(testing_predict)
    pred_label = (sema_df_final>opt_thresh).astype(int)
    voc_testset['pred'] = pd.Series(mlb.inverse_transform(pred_label)).apply(list)
    voc_testset = voc_testset.explode('pred',ignore_index=True)
    
    del voc_testset['label']
    
    #키워드
    print('Extracting Keywords : ' + file)
    voc_testset['topic'] = voc_testset.pred.str.split('_').str[0]
    voc_testset['sentiment'] = voc_testset.pred.str.split('_').str[1]
    voc_testset.topic.fillna('기타',inplace=True)
    voc_testset['keyword'] = keyword.loc[voc_testset.topic].values
    voc_testset['keyword'] = voc_testset.apply(findall_vec2, axis=1)
    
    # #save
    print('Saving output File : ' + file)
    voc_testset.to_excel('data/output/' + file[:-5] +'_output.xlsx')
    

Reading : 세종충남대학교병원_VOC_20250616.xlsx
Filtering : 세종충남대학교병원_VOC_20250616.xlsx


100%|██████████| 2087/2087 [02:12<00:00, 15.74it/s]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


Setting : 세종충남대학교병원_VOC_20250616.xlsx
Inferencing : 세종충남대학교병원_VOC_20250616.xlsx


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
SLURM auto-requeueing enabled. Setting signal handlers.


Predicting: |          | 0/? [00:00<?, ?it/s]

Extracting Keywords : 세종충남대학교병원_VOC_20250616.xlsx
Saving output File : 세종충남대학교병원_VOC_20250616.xlsx


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  voc_testset.topic.fillna('기타',inplace=True)


Reading : 충남대학교병원_VOC_input_20250618.xlsx
Filtering : 충남대학교병원_VOC_input_20250618.xlsx


100%|██████████| 3388/3388 [02:53<00:00, 19.52it/s]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Setting : 충남대학교병원_VOC_input_20250618.xlsx
Inferencing : 충남대학교병원_VOC_input_20250618.xlsx


SLURM auto-requeueing enabled. Setting signal handlers.


Predicting: |          | 0/? [00:00<?, ?it/s]

Extracting Keywords : 충남대학교병원_VOC_input_20250618.xlsx
Saving output File : 충남대학교병원_VOC_input_20250618.xlsx


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  voc_testset.topic.fillna('기타',inplace=True)


In [None]:
voc_testset.iloc[:,2:].to_csv('test.csv')

In [7]:
!docker run -d -p 3000:8080 --add-host=host.docker.internal:host-gateway --name open-webui ghcr.io/open-webui/open-webui:main


/bin/bash: line 1: docker: command not found


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [8]:
import docker

In [9]:
!docker

/bin/bash: line 1: docker: command not found


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
