In [None]:
import pandas as pd
from konlpy.tag import Okt
import pickle
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import numpy as np
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
import numpy as np
from sklearn.tree import DecisionTreeClassifier

tqdm.pandas()

In [None]:
def tokenizer(text):
    # divide Korean text into morpheme units
    okt = Okt()

    return okt.morphs(text)

In [None]:
def load_data(path):
    df = pd.read_csv(path)
    # get only label column equals 0 or 1
    df = df[df.label != 2]

    text_list = df["sentence"].tolist()
    label_list = df["label"].tolist()

    return text_list, label_list

In [None]:
def split(text_list, label_list):
    text_train, text_test, label_train, label_test = train_test_split(text_list, label_list, test_size=0.2)

    return text_train, text_test, label_train, label_test

In [None]:
def learn(X_train, X_test, y_train, y_test, model):
    # extract text feature with TfidfVectorizer
    tfidf = TfidfVectorizer(lowercase=False, tokenizer=tokenizer)
    # set linear support vector classifier 
    svm = SVC(kernel="linear", probability=True)
    # set an object containing overall processes
    pipe = Pipeline([("vect", tfidf), ("clf", svm)])
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)

    # get performance metrics
    print(f"Test acc: \n {accuracy_score(y_test, y_pred):.2f}")
    print(f"CLS report: \n {classification_report(y_test, y_pred)}")

    # save model
    with open(model, "wb") as f:
        pickle.dump(pipe, f)

    print("Model Saved")

In [None]:
def learn_dt(X_train, X_test, y_train, y_test, model):
    # extract text feature with TfidfVectorizer
    tfidf = TfidfVectorizer(lowercase=False, tokenizer=tokenizer)
    # set an object containing overall processes
    pipe = Pipeline([("vect", tfidf), ("clf", DecisionTreeClassifier())])
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)

    # get performance metrics
    print(f"Test acc: \n {accuracy_score(y_test, y_pred):.2f}")
    print(f"CLS report: \n {classification_report(y_test, y_pred)}")

    # save model
    with open(model, "wb") as f:
        pickle.dump(pipe, f)

    print("Model Saved")

In [None]:
def test(model):
    # get saved model
    with open(model, "rb") as f:
        pipe = pickle.load(f)

    while True:
        text = input("Please write report: ")
        if text == "end":
            break

        str = [text]
        # get probability for each class
        r1 = np.max(pipe.predict_proba(str) * 100)
        # get prediction
        r2 = pipe.predict(str)[0]

        print("Acc: ", r1)
        print(r2)

In [None]:
def tonumpy(text_list, label_list):
    # convert list to array
    X_imb, y_imb = np.array(text_list), np.array(label_list)
    X_imb, y_imb = X_imb.reshape((-1, 1)), y_imb.reshape((-1, 1))

    return X_imb, y_imb

In [None]:
def tolist(X_samp):
    # convert array to list
    a = []
    for i in tqdm(X_samp):
        a.append(i[0])

    return a

In [26]:
"""    
"kfold_cv" function 
This is a type of k-fold cross-validation. A single k-fold cross-validation is used with both a validation and test set. 
"""
def kfold_cv(x, y):
    # list2np
    x, y = tonumpy(x, y)
    # Vectorized data
    tfidf = TfidfVectorizer(lowercase=False, tokenizer=tokenizer)

    # K-Fold Cros Validation
    rand = np.random.choice(range(len(y)), len(y), replace=False)
    k1 = (rand[0:int(len(y)/5)])
    k2 = (rand[int(len(y)/5):2*int(len(y)/5)])
    k3 = (rand[2*int(len(y)/5):3*int(len(y)/5)+1])
    k4 = (rand[3*int(len(y)/5)+1:4*int(len(y)/5)+2])
    k5 = (rand[4*int(len(y)/5)+2:5*int(len(y)/5)+2])
    # fold
    folds = [k1, k2, k3, k4, k5]
    for k_idx, knum in enumerate(folds):
        x_test, y_test = x[knum], y[knum]
        x_train, y_train = np.delete(x, np.s_[knum], axis=0), np.delete(y, np.s_[knum], axis=0)
        # set an object containing overall processes
        pipe = Pipeline([("vect", tfidf), ("clf", DecisionTreeClassifier())])
        # np2list
        x_train_lst = tolist(x_train)
        y_train_lst = tolist(y_train)
        x_test_lst  = tolist(x_test)
        # Fit & Predict
        pipe.fit(x_train_lst, y_train_lst)
        y_pred = pipe.predict(x_test_lst)
        # np2list
        y_pred_lst = list(y_pred)
        y_test_lst = list(y_test)
        # Check correction 
        correction = 0
        for i in range(len(y_test_lst)):
            if y_pred_lst[i] == y_test_lst[i]:
                correction+=1  
        print('K-fold{}, Validation Score: {}% '.format(k_idx+1,correction/len(y_test)*100))

In [27]:
def under_sampling(text_list, label_list):
    X_imb, y_imb = tonumpy(text_list, label_list)
    # sampling method that reduces the number of data in a class with a lot of data
    X_samp, y_samp = RandomUnderSampler(random_state=0).fit_resample(X_imb, y_imb)

    return tolist(X_samp), y_samp.tolist()

In [28]:
def over_sampling(text_list, label_list):
    X_imb, y_imb = tonumpy(text_list, label_list)
    # sampling method that increases the number of data of a class with less data
    X_samp, y_samp = RandomOverSampler(random_state=0).fit_resample(X_imb, y_imb)

    return tolist(X_samp), y_samp.tolist()

In [29]:
def train(model, mode: str):
    # return text: list and label: list
    text, label = load_data(path)

    # select under or over sampling
    if mode == "under_sampling":
        text, label = under_sampling(text, label)
    if mode == "over_sampling":
        text, label = over_sampling(text, label)

    # split dataset train and test set
    text_train, text_test, label_train, label_test = split(text, label)
    # learn(text_train, text_test, label_train, label_test, model)
    kfold_cv(text_train, label_train)

In [30]:
def label_csv(model: str, unlabeled_path: str, save_path: str):
    # get unlabeled dataset
    df = pd.read_csv(unlabeled_path)
    # remove missing values
    df = df[df["sentence"].notna()]

    text_list = df["fixed"].tolist()

    # load model pipeline
    with open(model, "rb") as f:
        pipe = pickle.load(f)

    p_label = []
    p_proba = []

    for text in tqdm(text_list):
        str = [text]
        p_proba.append(np.max(pipe.predict_proba(str) * 100))
        p_label.append(pipe.predict(str)[0])

    df["label"] = p_label
    df["probability"] = p_proba

    df.to_csv(save_path, index=False, encoding="utf-8-sig")
    print("Saved pseudo labeled .csv file")

In [31]:
df = pd.read_csv("database/new_split_labeling_1123.csv", index_col = 0)
df.drop(["reviewID"], axis=1, inplace=True)

df[-5:]

In [32]:
# training step with over sampling
# path = "database/new_split_labeling.csv"
# train(model="over.pkl", mode="over_sampling")

In [33]:
# training step with under sampling
path = "database/new_split_labeling_1123.csv"
train(model="under.pkl", mode="under_sampling")

100%|██████████| 15150/15150 [00:00<00:00, 302039.65it/s]
100%|██████████| 9696/9696 [00:00<00:00, 297970.97it/s]
100%|██████████| 9696/9696 [00:00<00:00, 950053.07it/s]
100%|██████████| 2424/2424 [00:00<00:00, 294866.38it/s]


K-fold1, Validation Score: 82.75577557755776% 


100%|██████████| 9696/9696 [00:00<00:00, 216700.53it/s]
100%|██████████| 9696/9696 [00:00<00:00, 1022347.76it/s]
100%|██████████| 2424/2424 [00:00<00:00, 322505.72it/s]


K-fold2, Validation Score: 83.16831683168317% 


100%|██████████| 9695/9695 [00:00<00:00, 326879.24it/s]
100%|██████████| 9695/9695 [00:00<00:00, 859063.64it/s]
100%|██████████| 2425/2425 [00:00<00:00, 190614.45it/s]


K-fold3, Validation Score: 81.15463917525774% 


100%|██████████| 9695/9695 [00:00<00:00, 316336.39it/s]
100%|██████████| 9695/9695 [00:00<00:00, 967724.35it/s]
100%|██████████| 2425/2425 [00:00<00:00, 324036.68it/s]


K-fold4, Validation Score: 83.25773195876288% 


100%|██████████| 9698/9698 [00:00<00:00, 311200.23it/s]
100%|██████████| 9698/9698 [00:00<00:00, 973165.23it/s]
100%|██████████| 2422/2422 [00:00<00:00, 322710.51it/s]


K-fold5, Validation Score: 81.87448389760529% 


In [None]:
# write overall file with label
# label_csv(model="../model/over.pkl", unlabeled_path="../database/tokenized_review_unlabel.csv", save_path="../database/tokenized_review_label.csv")