In [45]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

In [46]:
posts_csv = './posts.csv'

data = pd.read_csv(posts_csv)

In [47]:
X = data['text']
y = data['advertisement']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=100)

print(X_train.shape, X_test.shape)

(232,) (59,)


## CountVectorizer + LogisticRegression

In [48]:
count_vector = CountVectorizer()
count_vector.fit(X_train)

X_train_count_vector = count_vector.transform(X_train)
X_test_count_vector = count_vector.transform(X_test)

In [49]:
logistic_regression = LogisticRegression(C=1)
logistic_regression.fit(X_train_count_vector, y_train)

In [50]:
prediction = logistic_regression.predict(X_test_count_vector)
accuracy = accuracy_score(y_test, prediction)
print("Lostistic Regression")
print("test set 정확도 :", accuracy)

Lostistic Regression
test set 정확도 : 0.7796610169491526


In [51]:
scores = cross_val_score(logistic_regression, X_train_count_vector, y_train, cv=5)
print("cross-validation 평균 정확도 :", np.mean(scores))
print("cross-validation 최대 정확도 :", np.max(scores))

cross-validation 평균 정확도 : 0.7024051803885291
cross-validation 최대 정확도 : 0.7659574468085106


In [52]:
params = {
    "C" : [0.001, 0.01, 0.1, 1, 2, 4, 8, 16, 32, 64, 128]
}

gscv_logistic_regression = GridSearchCV (estimator=logistic_regression, param_grid=params, scoring='accuracy', cv=5)
gscv_logistic_regression.fit(X_train_count_vector, y_train)
print("Grid Search CV 정확도 :", gscv_logistic_regression.best_score_)
print("Grid Search CV 최적 매개변수 :", gscv_logistic_regression.best_estimator_)

Grid Search CV 정확도 : 0.7066604995374653
Grid Search CV 최적 매개변수 : LogisticRegression(C=2)


## CountVector + SVM

In [53]:
svc = SVC(kernel='sigmoid')
svc.fit(X_train_count_vector, y_train)

In [54]:
prediction = svc.predict(X_test_count_vector)
accuracy = accuracy_score(y_test, prediction)
print("Support Vector Machine 정확도 :", accuracy)

Support Vector Machine 정확도 : 0.7457627118644068


In [55]:
scores = cross_val_score(svc, X_train_count_vector, y_train, cv=5)
print("cross-validation 평균 정확도 :", np.mean(scores))
print("cross-validation 최대 정확도 :", np.max(scores))

cross-validation 평균 정확도 : 0.5986123959296947
cross-validation 최대 정확도 : 0.723404255319149


## TF-IDF Vectorizer + SVM

In [56]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(sublinear_tf=True, ngram_range=(1,2))
tfidf_vectorizer.fit(X_train)

X_train_tfidf_vector = tfidf_vectorizer.transform(X_train)
X_test_tfidf_vector = tfidf_vectorizer.transform(X_test)

In [57]:
tfidf_svc = SVC(kernel='sigmoid')
tfidf_svc.fit(X_train_tfidf_vector, y_train)

In [58]:
prediction = tfidf_svc.predict(X_test_tfidf_vector)
accuracy = accuracy_score(y_test, prediction)
print("TF-IDF Support Vector Machine 정확도 :", accuracy)

TF-IDF Support Vector Machine 정확도 : 0.6440677966101694


In [59]:
scores = cross_val_score(tfidf_svc, X_train_count_vector, y_train, cv=5)
print("cross-validation 평균 정확도 :", np.mean(scores))
print("cross-validation 최대 정확도 :", np.max(scores))

cross-validation 평균 정확도 : 0.5986123959296947
cross-validation 최대 정확도 : 0.723404255319149


## Korean-Tokenizer + SVM

In [60]:
from konlpy.tag import Hannanum

hannanum = Hannanum()

def ko_tokenizer(text):
    return hannanum.morphs(text)

ko_count_vector = CountVectorizer(tokenizer=ko_tokenizer, max_df=20)
ko_count_vector.fit(X_train)



In [61]:
X_train_ko_vector = ko_count_vector.transform(X_train)
X_test_ko_vector = ko_count_vector.transform(X_test)

In [62]:
ko_svc = SVC(kernel='sigmoid')
ko_svc.fit(X_train_ko_vector, y_train)

In [63]:
prediction = ko_svc.predict(X_test_ko_vector)
accuracy = accuracy_score(y_test, prediction)
print("Ko-Tokenizer Support Vector Machine 정확도 :", accuracy)

Ko-Tokenizer Support Vector Machine 정확도 : 0.6949152542372882


In [64]:
scores = cross_val_score(ko_svc, X_train_ko_vector, y_train, cv=5)
print("cross-validation 평균 정확도 :", np.mean(scores))
print("cross-validation 최대 정확도 :", np.max(scores))

cross-validation 평균 정확도 : 0.6076780758556891
cross-validation 최대 정확도 : 0.6521739130434783


In [65]:
from sklearn.feature_extraction.text import TfidfVectorizer

ko_tfidf_vectorizer = TfidfVectorizer(sublinear_tf=True, ngram_range=(1,2), tokenizer=ko_tokenizer, max_df=20)
ko_tfidf_vectorizer.fit(X_train)

X_train_ko_tfidf_vector = tfidf_vectorizer.transform(X_train)
X_test_ko_tfidf_vector = tfidf_vectorizer.transform(X_test)



In [66]:
ko_svc.fit(X_train_ko_tfidf_vector, y_train)
prediction = ko_svc.predict(X_test_ko_tfidf_vector)
accuracy = accuracy_score(y_test, prediction)
print("TF-IDF Ko-Tokenizer Support Vector Machine 정확도 :", accuracy)

TF-IDF Ko-Tokenizer Support Vector Machine 정확도 : 0.6440677966101694


In [67]:
scores = cross_val_score(ko_svc, X_train_ko_tfidf_vector, y_train, cv=5)
print("cross-validation 평균 정확도 :", np.mean(scores))
print("cross-validation 최대 정확도 :", np.max(scores))

cross-validation 평균 정확도 : 0.6506012950971323
cross-validation 최대 정확도 : 0.6808510638297872


In [68]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier

In [69]:
decision_tree_classifier = DecisionTreeClassifier(max_depth=10, criterion='entropy')
ada_boost_classifier = AdaBoostClassifier(estimator=decision_tree_classifier, n_estimators=50)
ada_boost_classifier.fit(X_train_count_vector, y_train)

In [70]:
prediction = ada_boost_classifier.predict(X_test_count_vector)
accuracy = accuracy_score(y_test, prediction)
print("Decision Tree 기반의 Ada Boosting 정확도 :", accuracy)

Decision Tree 기반의 Ada Boosting 정확도 : 0.6779661016949152


In [71]:
# import matplotlib.pyplot as plt
# from sklearn.tree import plot_tree
# import koreanize_matplotlib
# 
# %config InlineBackend.figure_format = 'retina'
# 
# # 트리 시각화
# n_estimator = len(ada_boost_classifier.estimators_)
# 
# feature_names_list = count_vector.get_feature_names_out().tolist()
# 
# for i in range(n_estimator):
#     fig = plt.figure(figsize=(10, 15), facecolor='white')
#     ax = fig.add_subplot(111)
#     plot_tree(ada_boost_classifier.estimators_[i],
#               feature_names=feature_names_list,
#               class_names=[str(cls) for cls in ada_boost_classifier.classes_],
#               ax=ax,
#               filled=True,
#               rounded=True)
#     ax.set_title(f'{i+1} th tree')
#     
#     plt.savefig(f'graph{i+1}.pdf', dpi=1200, format='pdf', bbox_inches='tight')
#     plt.show()
