In [ ]:
# SEMA Model Inference Setup
from sema import VOC_TopicLabeler, VOC_DataModule
from transformers import AutoTokenizer, AutoConfig
import lightning as L
import os, pickle, re
import numpy as np
import pandas as pd 
from konlpy.tag import Kkma
from tqdm import tqdm
from collections import Counter

# Shared Kkma analyzer instance (konlpy); its `morphs` method is used further
# down to tokenize each VOC text before the boilerplate filter.
kkma = Kkma()

def findall_vec(key, voc):
    """Return the first match of the regex ``key`` in ``voc``, or ``''``.

    Args:
        key: Regular-expression pattern (string or compiled pattern).
        voc: Text to search.

    Returns:
        The first element of ``re.findall(key, voc)``. If the pattern has
        capture groups this is whatever ``re.findall`` yields for them.
        The empty string is returned when nothing matches, when ``key`` is
        not a valid pattern, or when either argument is not a string
        (for example a NaN cell).
    """
    try:
        return re.findall(key, voc)[0]
    except (IndexError, TypeError, re.error):
        # IndexError: no match; TypeError: non-string input; re.error: bad
        # pattern. Other exceptions (e.g. KeyboardInterrupt) now propagate
        # instead of being silently swallowed by a bare ``except``.
        return ''

def findall_vec2(df):
    """Row-wise adapter for ``findall_vec``.

    Reads the regex from ``df['keyword']`` and the text from ``df['VOC']``
    and returns the result of ``findall_vec`` for that pair. Intended for
    use with ``DataFrame.apply(..., axis=1)``.
    """
    pattern, text = df['keyword'], df['VOC']
    return findall_vec(pattern, text)

def remove_non_english_korean(string):
    """Drop every character that is not kept by the whitelist below.

    Kept characters: ASCII letters, ASCII digits, precomposed Hangul
    syllables (U+AC00-U+D7A3) and whitespace. Everything else, including
    punctuation and underscores, is removed.
    """
    return re.sub(r'[^a-zA-Z0-9\uac00-\ud7a3\s]', '', string, flags=re.UNICODE)
    
def strip_e(st):
    """Remove all code points in U+10000-U+10FFFF (e.g. most emoji) from ``st``.

    Characters inside the Basic Multilingual Plane are left untouched.
    """
    return re.sub('[\U00010000-\U0010ffff]', '', st, flags=re.UNICODE)

In [ ]:
# Model Configuration and Loading
BATCH_SIZE = 12   # batch size handed to VOC_DataModule
MAX_LEN = 256     # max_token_len handed to VOC_DataModule
opt_thresh = 0.5  # model outputs above this value count as a predicted label

# Load files and model components
# `dtype=str` keeps the `.str` accessor usable even when a directory is empty.
input_file = pd.Series(os.listdir('./data/input'), dtype=str)
# Match the extension literally and only at the end of the name:
# `str.contains('.xlsx')` treated '.' as a regex wildcard and matched anywhere,
# while the code below assumes a 5-character '.xlsx' suffix.
input_file = input_file[input_file.str.endswith('.xlsx')]

# An input is pending when no output name starts with "<input stem>_output".
output_list = pd.Series(os.listdir('./data/output'), dtype=str)
running_file = input_file[~input_file.str[:-5].isin(output_list.str.split('_output').str[0])]

config = AutoConfig.from_pretrained('team-lucid/deberta-v3-xlarge-korean', output_hidden_states=True)
tokenizer = AutoTokenizer.from_pretrained('team-lucid/deberta-v3-xlarge-korean')

# NOTE: unpickling executes arbitrary code; only load pickles from a trusted source.
with open('data/data2.pkl', 'rb') as f:
    mlb = pickle.load(f)

# `mlb` is used later via `classes_` and `inverse_transform`
# (presumably a fitted sklearn MultiLabelBinarizer - confirm).
LABEL_COLUMNS = mlb.classes_
voc_etc = pd.read_pickle('data/voc_etc.pkl')
keyword = pd.read_pickle('data/keyword_doc.pkl')

# NOTE(review): n_classes is hard-coded; it presumably has to equal
# len(mlb.classes_) for `mlb.inverse_transform` to accept the predictions.
new_model = VOC_TopicLabeler.load_from_checkpoint(
    checkpoint_path="model/deberta-v3-xlarge-korean_20ep_full_mar17_dropna.ckpt",
    n_classes=18,
    model='team-lucid/deberta-v3-xlarge-korean'
).cuda()
new_model.eval()

In [None]:
# Notebook inspection cell: display the shape of the label-class array held by `mlb`.
mlb.classes_.shape

In [None]:
# Notebook inspection cell: display the input files selected for processing.
running_file

In [ ]:
# Process Files
# Register `progress_apply` on pandas objects. Nothing else in this file does
# so, and without it the `progress_apply` call below raises AttributeError.
# Calling it again is harmless.
tqdm.pandas()

for file in running_file:
    print(f'Processing: {file}')

    # Read and prepare data
    voc_testset = pd.read_excel(f"data/input/{file}", dtype=str)
    # Stack VOC1 and VOC2 and sort by the original row index so both answers
    # of a respondent sit next to each other. The frame is duplicated and
    # sorted the same way so every answer lines up with its row metadata.
    voc = pd.concat([voc_testset.VOC1, voc_testset.VOC2]).sort_index().values
    # Drops the first column and the last two columns
    # (presumably an id column and VOC1/VOC2 - confirm against the input layout).
    voc_testset = pd.concat([voc_testset]*2).sort_index().iloc[:,1:-2]
    voc_testset['VOC'] = voc
    voc_testset = voc_testset.dropna(subset='VOC').reset_index()
    # Dummy all-zero label vector (18 entries) per row; it is deleted again
    # after prediction.
    voc_testset['label'] = [[0] * 18 for _ in range(len(voc_testset))]

    # Clean and filter data
    # `.copy()` so the column assignments below write to an independent frame
    # instead of a slice of the previous one.
    voc_testset = voc_testset[voc_testset['VOC'] != 'nan'].copy()
    voc_testset['VOC'] = voc_testset['VOC'].apply(remove_non_english_korean).apply(strip_e)
    voc_testset['VOC'] = voc_testset['VOC'].replace(r'\s+', ' ', regex=True)

    # Apply filters: a row is dropped when any of the checks is true.
    text = voc_testset['VOC']
    compact = text.apply(lambda x: str(x).replace(' ', ''))
    too_short = text.str.strip().str.len() < 4
    only_symbols = compact.apply(lambda s: bool(re.match(r'^[_\W]+$', s))).astype(bool)
    only_digits = compact.apply(lambda s: bool(re.match(r'[\d/-]+$', s))).astype(bool)
    # Text made of one single repeated character.
    single_char = compact.apply(lambda s: len(set(s)) == 1).astype(bool)
    voc_testset = voc_testset[~(too_short | only_symbols | only_digits | single_char)]

    # Drop rows whose morpheme multiset appears in `voc_etc`.
    voc_tok = voc_testset['VOC'].progress_apply(lambda x: Counter(kkma.morphs(x)))
    is_listed = voc_tok.isin(voc_etc)
    # The earlier reset already created an `index` column, so this one adds
    # `level_0`; kept as-is so the output columns do not change.
    voc_testset = voc_testset[~is_listed].reset_index()

    if voc_testset.empty:
        # Nothing left to classify: skip instead of failing in np.vstack on
        # an empty prediction list. No output file is written for this input.
        print(f'Skipped (no usable VOC rows): {file}')
        continue

    # Run inference
    data_module = VOC_DataModule(voc_testset, voc_testset, tokenizer, batch_size=BATCH_SIZE, max_token_len=MAX_LEN)
    data_module.setup()
    trainer = L.Trainer(max_epochs=35)

    testing_predict = trainer.predict(new_model, datamodule=data_module)
    sema_df_final = np.vstack(testing_predict)
    pred_label = (sema_df_final > opt_thresh).astype(int)
    # One output row per predicted label; rows with no label above the
    # threshold get a missing `pred` after explode.
    voc_testset['pred'] = pd.Series(mlb.inverse_transform(pred_label)).apply(list)
    voc_testset = voc_testset.explode('pred', ignore_index=True)

    del voc_testset['label']

    # Extract keywords and topics
    # `pred` is split on '_' into topic (first part) and sentiment (second part).
    pred_parts = voc_testset.pred.str.split('_')
    voc_testset['topic'] = pred_parts.str[0]
    voc_testset['sentiment'] = pred_parts.str[1]
    # Rows without a prediction fall back to the topic '기타' ("etc.").
    voc_testset['topic'] = voc_testset['topic'].fillna('기타')
    # Look up each topic's keyword pattern, then replace it with its first
    # match inside the VOC text ('' when there is none).
    voc_testset['keyword'] = keyword.loc[voc_testset.topic].values
    voc_testset['keyword'] = voc_testset.apply(findall_vec2, axis=1)

    # Save output
    voc_testset.to_excel(f'data/output/{file[:-5]}_output.xlsx')
    print(f'Completed: {file}')