In [1]:
import pandas as pd
from konlpy.tag import Okt
import pickle
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import numpy as np
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
import numpy as np

tqdm.pandas()

In [2]:
def tokenizer(text):
    # divide Korean text into morpheme units
    okt = Okt()

    return okt.morphs(text)

In [26]:
def load_data(path):
    df = pd.read_csv(path)
    # get only label column equals 0 or 1
    df = df[df.label != 2]
    df = df[df.label != 11]

    text_list = df["sentence"].tolist()
    label_list = df["label"].tolist()

    return text_list, label_list

In [27]:
def split(text_list, label_list):
    text_train, text_test, label_train, label_test = train_test_split(text_list, label_list, test_size=0.2)

    return text_train, text_test, label_train, label_test

In [28]:
def learn(X_train, X_test, y_train, y_test, model):
    # extract text feature with TfidfVectorizer
    tfidf = TfidfVectorizer(lowercase=False, tokenizer=tokenizer)
    # set linear support vector classifier
    svm = SVC(kernel="linear", probability=True)
    # set an object containing overall processes
    pipe = Pipeline([("vect", tfidf), ("clf", svm)])

    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)

    # get performance metrics
    print(f"Test acc: \n {accuracy_score(y_test, y_pred):.2f}")
    print(f"CLS report: \n {classification_report(y_test, y_pred)}")

    # save model
    with open(model, "wb") as f:
        pickle.dump(pipe, f)

    print("Model Saved")

In [29]:
def test(model):
    # get saved model
    with open(model, "rb") as f:
        pipe = pickle.load(f)

    while True:
        text = input("Please write report: ")
        if text == "end":
            break

        str = [text]
        # get probability for each class
        r1 = np.max(pipe.predict_proba(str) * 100)
        # get prediction
        r2 = pipe.predict(str)[0]

        print("Acc: ", r1)
        print(r2)

In [30]:
def tonumpy(text_list, label_list):
    # convert list to array
    X_imb, y_imb = np.array(text_list), np.array(label_list)
    X_imb, y_imb = X_imb.reshape((-1, 1)), y_imb.reshape((-1, 1))

    return X_imb, y_imb

In [31]:
def tolist(X_samp):
    # convert array to list
    a = []
    for i in tqdm(X_samp):
        a.append(i[0])

    return a

In [32]:
def under_sampling(text_list, label_list):
    X_imb, y_imb = tonumpy(text_list, label_list)
    # sampling method that reduces the number of data in a class with a lot of data
    X_samp, y_samp = RandomUnderSampler(random_state=0).fit_resample(X_imb, y_imb)

    return tolist(X_samp), y_samp.tolist()

In [33]:
def over_sampling(text_list, label_list):
    X_imb, y_imb = tonumpy(text_list, label_list)
    # sampling method that increases the number of data of a class with less data
    X_samp, y_samp = RandomOverSampler(random_state=0).fit_resample(X_imb, y_imb)

    return tolist(X_samp), y_samp.tolist()

In [34]:
def train(path, model, mode: str):
    # return text: list and label: list
    text, label = load_data(path)

    # select under or over sampling
    if mode == "under_sampling":
        text, label = under_sampling(text, label)
    if mode == "over_sampling":
        text, label = over_sampling(text, label)

    # split dataset train and test set
    text_train, text_test, label_train, label_test = split(text, label)
    learn(text_train, text_test, label_train, label_test, model)

In [49]:
def label_csv(model: str, unlabeled_path: str, save_path: str):
    # get unlabeled dataset
    df = pd.read_csv(unlabeled_path)
    # remove missing values
    df = df[df["sentence"].notna()]
    df = df[df["fixed"].notna()]

    # load model pipeline
    with open(model, "rb") as f:
        pipe = pickle.load(f)

    p_label = []
    p_proba = []

    text_list = df["fixed"].tolist()
    for text in tqdm(text_list):
        if text is not None:
            str = [text]
        p_proba.append(np.max(pipe.predict_proba(str) * 100))
        p_label.append(pipe.predict(str)[0])

    df["label"] = p_label
    df["probability"] = p_proba

    df.to_csv(save_path, index=False, encoding="utf-8")
    print("Saved pseudo labeled .csv file")

In [37]:
df = pd.read_csv("../database/review_3_handlabeled.csv", index_col=None)
df[:5]

Unnamed: 0,label,sentence
0,0,창문 구조 자체가 방음이 잘 되지 않고 찬바람이 들어오는 것 같아요.
1,0,지금 새벽 네시가 다 되어 가는데 밖에 경찰차 소리가 엄청나게 크게 들리네요?.
2,0,난방도 기능은 존재하는데 되는지를 모르겠네요.
3,0,별로 춥지 않아서 컴플레인은 걸지 않았지만 겨울철 되면 문제가 커질 것 같고요.
4,0,그리고 화장실 문이 심하게 덜 렁거리고 콘센트가 거의 없어서 매우 불편했습니다.


In [38]:
# training step with over sampling
train(path="../database/review_3_handlabeled.csv", model="over.pkl", mode="over_sampling")

100%|██████████| 64674/64674 [00:00<00:00, 878031.02it/s]


Test acc: 
 0.96
CLS report: 
               precision    recall  f1-score   support

           0       0.94      0.99      0.96      6491
           1       0.98      0.94      0.96      6444

    accuracy                           0.96     12935
   macro avg       0.96      0.96      0.96     12935
weighted avg       0.96      0.96      0.96     12935

Model Saved


In [39]:
# training step with under sampling
train(path="../database/review_3_handlabeled.csv", model="under.pkl", mode="under_sampling")

100%|██████████| 15150/15150 [00:00<00:00, 802836.49it/s]


Test acc: 
 0.93
CLS report: 
               precision    recall  f1-score   support

           0       0.91      0.96      0.93      1502
           1       0.96      0.90      0.93      1528

    accuracy                           0.93      3030
   macro avg       0.93      0.93      0.93      3030
weighted avg       0.93      0.93      0.93      3030

Model Saved


In [50]:
# write overall file with label
label_csv(model="../model/over.pkl", unlabeled_path="../database/review_3_naver.csv", save_path="../database/review_pseudolabeled.csv")

100%|██████████| 154031/154031 [19:45<00:00, 129.90it/s]


Saved pseudo labeled .csv file
