In [None]:
import pandas as pd
from konlpy.tag import Okt
import pickle
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import numpy as np
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler

# Register tqdm's progress-bar variants (e.g. `progress_apply`) on pandas objects.
tqdm.pandas()

In [None]:
# Shared Okt analyzer, created lazily on the first tokenizer() call and then reused.
_OKT = None


def tokenizer(text):
    """Split ``text`` into morphemes with KoNLPy's Okt analyzer.

    The analyzer is constructed once, on the first call, and reused afterwards
    rather than being re-instantiated for every document that the vectorizer
    passes in.

    Args:
        text: The string to tokenize.

    Returns:
        The result of ``Okt.morphs(text)``.
    """
    global _OKT
    if _OKT is None:
        _OKT = Okt()
    return _OKT.morphs(text)

In [None]:
def load_data(path):
    """Load the sentences and labels used for training from a CSV file.

    Rows whose ``label`` equals 2 are excluded, as are rows with a missing
    ``sentence`` or ``label``: a missing sentence is read by pandas as a float
    NaN, which cannot be tokenized, and a missing label cannot be fitted.

    Args:
        path: Location of a CSV file containing ``sentence`` and ``label`` columns.

    Returns:
        A ``(text_list, label_list)`` tuple of equally long Python lists.
    """
    df = pd.read_csv(path)
    df = df[df['label'] != 2].dropna(subset=['sentence', 'label'])

    return df['sentence'].tolist(), df['label'].tolist()

In [None]:
def split(text_list, label_list, test_size=0.2, random_state=42, stratify=False):
    """Split texts and labels into train and test partitions.

    Args:
        text_list: Sequence of input texts.
        label_list: Sequence of labels aligned with ``text_list``.
        test_size: Value forwarded to ``train_test_split`` (default 0.2).
        random_state: Seed forwarded to ``train_test_split`` (default 42).
        stratify: When true, the split is stratified on ``label_list`` so both
            partitions keep the same class proportions. Defaults to False,
            which reproduces the previous unstratified split.

    Returns:
        A ``(text_train, text_test, label_train, label_test)`` tuple.
    """
    text_train, text_test, label_train, label_test = train_test_split(
        text_list,
        label_list,
        test_size=test_size,
        random_state=random_state,
        stratify=label_list if stratify else None,
    )

    return text_train, text_test, label_train, label_test

In [None]:
def learn(X_train, X_test, y_train, y_test, model):
    """Fit a TF-IDF + linear SVM pipeline, print test metrics and pickle it.

    The SVM is wrapped in ``CalibratedClassifierCV`` because ``LinearSVC`` on
    its own does not provide ``predict_proba``, which the functions that load
    the saved pipeline (``test`` and ``label_csv``) call.

    Args:
        X_train: Training texts.
        X_test: Held-out texts used only for the printed metrics.
        y_train: Labels aligned with ``X_train``.
        y_test: Labels aligned with ``X_test``.
        model: File path the fitted pipeline is pickled to.

    Note:
        The pickled vectorizer refers to the module-level ``tokenizer``
        function by name, so that function must be defined before the file is
        unpickled.
    """
    # Imported locally: the notebook's import cell does not pull in sklearn.calibration.
    from sklearn.calibration import CalibratedClassifierCV

    # Build the vocabulary from the data -> vectorize.
    # token_pattern=None: a custom tokenizer is supplied, so the default regex
    # would be ignored anyway; passing None avoids sklearn's warning about it.
    tfidf = TfidfVectorizer(lowercase=False, tokenizer=tokenizer, token_pattern=None)
    svm = CalibratedClassifierCV(LinearSVC())
    pipe = Pipeline([('vect', tfidf), ('clf', svm)])

    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)

    print(accuracy_score(y_test, y_pred))
    print(classification_report(y_test, y_pred))

    with open(model, 'wb') as fp:
        pickle.dump(pipe, fp)

    print('Model Saved')

In [None]:
def test(model):
    """Interactively classify reviews typed by the user with a saved pipeline.

    Loads the pickled pipeline from ``model`` and keeps prompting for text.
    For each entry it prints the largest ``predict_proba`` value scaled by 100,
    followed by the predicted label. Typing ``end`` stops the loop.

    Args:
        model: Path of the pickled pipeline to load.
    """
    # NOTE: pickle.load executes code embedded in the file; only load trusted models.
    with open(model, 'rb') as fp:
        pipe = pickle.load(fp)

    # iter(callable, sentinel) keeps calling input() until it returns 'end'.
    for text in iter(lambda: input('리뷰를 작성해주세요: '), 'end'):
        batch = [text]
        confidence = np.max(pipe.predict_proba(batch) * 100)
        predicted = pipe.predict(batch)[0]

        print('acc =', confidence)
        print(predicted)

In [None]:
def tonumpy(text_list, label_list):
    """Convert texts and labels into single-column NumPy arrays.

    Args:
        text_list: Sequence of texts.
        label_list: Sequence of labels.

    Returns:
        A ``(X, y)`` tuple where each array has been reshaped to ``(-1, 1)``.
    """
    text_column = np.array(text_list).reshape((-1, 1))
    label_column = np.array(label_list).reshape((-1, 1))

    return text_column, label_column

In [None]:
def toList(X_samp):
    """Collect the first element of every row of ``X_samp`` into a list.

    Iteration is wrapped in ``tqdm`` so progress is displayed.

    Args:
        X_samp: Iterable of indexable rows.

    Returns:
        A list holding ``row[0]`` for each row, in order.
    """
    return [row[0] for row in tqdm(X_samp)]

In [None]:
def under_sampling(text_list, label_list):
    """Resample the data with ``RandomUnderSampler(random_state=0)``.

    Args:
        text_list: Sequence of texts.
        label_list: Sequence of labels aligned with ``text_list``.

    Returns:
        A ``(texts, labels)`` tuple of lists built from the resampled arrays.
    """
    features, targets = tonumpy(text_list, label_list)
    sampler = RandomUnderSampler(random_state=0)
    resampled_texts, resampled_labels = sampler.fit_resample(features, targets)

    texts = toList(resampled_texts)
    labels = resampled_labels.tolist()
    return texts, labels

In [None]:
def over_sampling(text_list, label_list):
    """Resample the data with ``RandomOverSampler(random_state=0)``.

    Args:
        text_list: Sequence of texts.
        label_list: Sequence of labels aligned with ``text_list``.

    Returns:
        A ``(texts, labels)`` tuple of lists built from the resampled arrays.
    """
    features, targets = tonumpy(text_list, label_list)
    sampler = RandomOverSampler(random_state=0)
    resampled_texts, resampled_labels = sampler.fit_resample(features, targets)

    texts = toList(resampled_texts)
    labels = resampled_labels.tolist()
    return texts, labels

In [None]:
def train(model, mode, data_path=None):
    """Load the data, optionally rebalance the training part, then fit and save.

    The data is split into train/test partitions first and resampling is
    applied to the training partition only. Resampling before the split would
    let over-sampled duplicates of a row land in both partitions and would
    change the class balance of the data the metrics are computed on.

    Args:
        model: File path the fitted pipeline is saved to (passed to ``learn``).
        mode: ``-1`` under-samples the training data, ``1`` over-samples it;
            any other value leaves it untouched.
        data_path: CSV file to load. When omitted, the notebook-level global
            ``path`` is used, as before.
    """
    if data_path is None:
        # Backward-compatible fallback: `path` must be defined at notebook level.
        data_path = path

    texts, labels = load_data(data_path)
    text_train, text_test, label_train, label_test = split(texts, labels)

    if mode == -1:
        text_train, label_train = under_sampling(text_train, label_train)
    elif mode == 1:
        text_train, label_train = over_sampling(text_train, label_train)

    learn(text_train, text_test, label_train, label_test, model)

In [None]:
def label_csv(model, c_path, out_path='spell_check_label.csv'):
    """Predict a label and a confidence for every row of a CSV and save the result.

    The CSV at ``c_path`` is read (previously the notebook-level global
    ``path`` was read and ``c_path`` was ignored). Rows with a missing
    ``sentence`` or a missing ``fixed`` value are dropped; the ``fixed`` text is
    what the pipeline receives, so a missing value there cannot be tokenized.

    Args:
        model: Path of the pickled pipeline; it must support ``predict_proba``.
        c_path: CSV file to label; needs ``sentence`` and ``fixed`` columns.
        out_path: Destination CSV. Defaults to ``'spell_check_label.csv'``.

    Side effects:
        Writes the input rows plus ``label`` and ``probability`` columns (the
        largest ``predict_proba`` value times 100) to ``out_path`` and prints a
        confirmation.
    """
    df = pd.read_csv(c_path)
    # .copy() so the new columns are added to an independent frame, not a slice.
    df = df[df['sentence'].notna() & df['fixed'].notna()].copy()

    text_list = df['fixed'].tolist()

    # NOTE: pickle.load executes code embedded in the file; only load trusted models.
    with open(model, 'rb') as fp:
        pipe = pickle.load(fp)

    p_label = []
    p_proba = []

    for text in tqdm(text_list):
        sample = [text]
        p_proba.append(np.max(pipe.predict_proba(sample) * 100))
        p_label.append(pipe.predict(sample)[0])

    df['label'] = p_label
    df['probability'] = p_proba

    df.to_csv(out_path)
    print('csv saved')

In [None]:
# Load the labelled CSV from Drive into `df`.
# NOTE(review): label_csv() writes 'spell_check_label.csv' relative to the working
# directory, not to this Drive path, and this cell sits before the cell that calls
# label_csv() — confirm the file already exists when running top to bottom.
df = pd.read_csv("/content/drive/MyDrive/spell_check_label.csv")

In [None]:
# `path` is the notebook-level global that train() reads to locate the CSV.
path = "/content/drive/MyDrive/spell_check.csv"
# Mode 0: neither under- nor over-sampling is applied; the pipeline is saved as "none_ori".
train("none_ori", 0)

0.9399974946761869
              precision    recall  f1-score   support

           0       0.85      0.83      0.84      1485
           1       0.96      0.97      0.96      6498

    accuracy                           0.94      7983
   macro avg       0.90      0.90      0.90      7983
weighted avg       0.94      0.94      0.94      7983

Model Saved


In [None]:
# NOTE(review): this trains on "spell_check_unlabel.csv"; load_data() needs 'sentence'
# and 'label' columns in that file — confirm this is the intended training set.
path = "/content/drive/MyDrive/spell_check_unlabel.csv"
# Mode 1: over-sampling; the pipeline is saved as "oversamp.dat".
train("oversamp.dat", 1)

# Label the file
# NOTE(review): the model loaded below (".../models/over.dat") is not the file saved by
# the train() call above ("oversamp.dat") — confirm which model is meant to be used.
c_path = "/content/drive/MyDrive/spell_check_unlabel.csv"
label_csv("/content/drive/MyDrive/models/over.dat", c_path)

csv saved
