In [118]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import joblib
import warnings
warnings.filterwarnings(action='ignore')

In [119]:
# Path of the tab-separated input file, relative to the working directory.
posts_tsv = '../search_item_tb.tsv'

# Read the UTF-8 encoded TSV into a DataFrame.
data = pd.read_csv(
    posts_tsv,
    sep='\t',
    encoding='utf-8',
)

In [120]:
# Input text column and target label column.
X = data['content']
y = data['advertisement']

# Hold out 30% of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=100,
)

# Sanity check on the size of each split.
print(X_train.shape, X_test.shape)

(496,) (213,)


In [121]:
import re


# Strip noise characters from the raw text
def clean_str(text):
    """Return ``text`` reduced to ASCII letters, digits and Hangul syllables.

    Every other character is deleted, including punctuation, symbols,
    whitespace and newlines, so neighbouring words end up joined together.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string; empty if nothing in ``text`` is kept.
    """
    # A single negated character class covers everything that must go:
    # punctuation, special symbols and line breaks are all outside the
    # allowed set, so no separate passes are needed for them.
    # The raw string keeps the pattern free of Python-level escape handling.
    return re.sub(r'[^A-Za-z0-9가-힣]', '', text)

# Train on the tail of each document only (last 200 characters by default)
def text_slice(text, length=200):
    """Return the last ``length`` characters of ``text``.

    Args:
        text: The string to truncate.
        length: Maximum number of trailing characters to keep. Defaults to 200.

    Returns:
        ``text`` unchanged when it is no longer than ``length``, otherwise
        its final ``length`` characters.

    Raises:
        ValueError: If ``length`` is negative.
    """
    if length < 0:
        raise ValueError("length must be non-negative")
    # text[-0:] is the whole string, so a zero length needs its own branch.
    if length == 0:
        return ''
    return text[-length:]


# preprocessing - cleaning, then truncation
def _prepare_texts(texts):
    """Stringify each entry, clean it with clean_str, then cut it with text_slice.

    clean_str already returns a str, so no second str conversion is needed
    before slicing.
    NOTE(review): str() turns a missing value (NaN) into the literal text
    'nan' — confirm that this is the intended handling of empty content.
    """
    return texts.apply(str).apply(clean_str).apply(text_slice)


X_train = _prepare_texts(X_train)
X_test = _prepare_texts(X_test)

## CountVectorizer + LogisticRegression

In [122]:
from konlpy.tag import Mecab


# One tagger instance shared by every tokenizer call.
mecab = Mecab()


def mecab_tokenizer(text):
    """Tokenize ``text`` by returning the result of ``mecab.morphs`` for it."""
    morphemes = mecab.morphs(text)
    return morphemes

# Token counts built with the Mecab tokenizer instead of the default one.
count_vector = CountVectorizer(tokenizer=mecab_tokenizer)

# Learn the vocabulary from the training split and encode it in one call;
# the test split is only transformed with that already-learned vocabulary.
X_train_count_vector = count_vector.fit_transform(X_train)
X_test_count_vector = count_vector.transform(X_test)

In [123]:
# Logistic regression with C=4, trained on the vectorised training split.
logistic_regression = LogisticRegression(C=4)
logistic_regression.fit(X=X_train_count_vector, y=y_train)

In [124]:
# Evaluate the fitted model on the held-out test split.
prediction = logistic_regression.predict(X_test_count_vector)
accuracy = accuracy_score(y_test, prediction)
# Label corrected from the misspelled "Lostistic".
print("Logistic Regression")
print("test set 정확도 :", accuracy)

Lostistic Regression
test set 정확도 : 0.7511737089201878


In [125]:
# 5-fold cross-validation of the classifier on the vectorised training split.
scores = cross_val_score(
    estimator=logistic_regression,
    X=X_train_count_vector,
    y=y_train,
    cv=5,
)
# Report the mean of the per-fold scores.
print("cross-validation 평균 정확도 :", scores.mean())

cross-validation 평균 정확도 : 0.6975353535353535


In [126]:
# Candidate values of C to search over.
params = {
    'C': [0.001, 0.01, 0.1, 1, 2, 4, 8, 16, 32, 64, 128],
}

# Exhaustive search over the grid, scored by accuracy with 5-fold cross-validation.
gscv_logistic_regression = GridSearchCV(
    estimator=logistic_regression,
    param_grid=params,
    scoring='accuracy',
    cv=5,
)
gscv_logistic_regression.fit(X_train_count_vector, y_train)

# The second line prints the whole best estimator object, not only its parameter dict.
print("Grid Search CV 정확도 :", gscv_logistic_regression.best_score_)
print("Grid Search CV 최적 매개변수 :", gscv_logistic_regression.best_estimator_)

Grid Search CV 정확도 : 0.6975353535353535
Grid Search CV 최적 매개변수 : LogisticRegression(C=4)


In [127]:
# Save the model: the classifier fitted on the training split (not the grid-search
# object) and the fitted vectorizer are written next to the notebook.
# NOTE(review): count_vector was built with tokenizer=mecab_tokenizer; pickle stores
# functions by reference, so whoever loads count_vectorizer.pkl presumably needs a
# mecab_tokenizer importable under the same module path — verify before deploying.
joblib.dump(value=logistic_regression, filename='./logistic_regression.pkl')
joblib.dump(value=count_vector, filename='./count_vectorizer.pkl')

['./count_vectorizer.pkl']