In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

In [2]:
# Path of the posts CSV, relative to the notebook's working directory.
posts_csv = './posts.csv'

# Read the whole file into a DataFrame; later cells use its 'text' and
# 'advertisement' columns.
data = pd.read_csv(filepath_or_buffer=posts_csv)

In [3]:
# Model input is the 'text' column; the target is the 'advertisement' column.
X, y = data['text'], data['advertisement']

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=100,
)

# Quick sanity check of the resulting split sizes.
print(X_train.shape, X_test.shape)

(232,) (59,)


## CountVectorizer + LogisticRegression

In [4]:
# Build the vocabulary from the training posts only and encode them in the
# same pass: fit_transform is equivalent to fit() followed by transform(),
# but processes the training corpus once instead of twice.
count_vector = CountVectorizer()
X_train_count_vector = count_vector.fit_transform(X_train)

# Encode the test posts with the already-fitted vectorizer; it is not refit
# on the test split.
X_test_count_vector = count_vector.transform(X_test)

In [5]:
# Logistic-regression classifier with C=2, trained on the count-vectorized
# training posts.
logistic_regression = LogisticRegression(C=2)
logistic_regression.fit(X=X_train_count_vector, y=y_train)

In [6]:
# Score the fitted logistic-regression model on the held-out test split.
prediction = logistic_regression.predict(X_test_count_vector)
accuracy = accuracy_score(y_test, prediction)

# Heading spelling corrected to "Logistic Regression".
print("Logistic Regression")
print("test set 정확도 :", accuracy)

Lostistic Regression
test set 정확도 : 0.7966101694915254


In [7]:
# 5-fold cross-validation of the logistic-regression model on the training
# split; report the mean and the best fold score.
scores = cross_val_score(
    estimator=logistic_regression,
    X=X_train_count_vector,
    y=y_train,
    cv=5,
)
print("cross-validation 평균 정확도 :", scores.mean())
print("cross-validation 최대 정확도 :", scores.max())

cross-validation 평균 정확도 : 0.7066604995374653
cross-validation 최대 정확도 : 0.7872340425531915


In [8]:
# Candidate values of the C hyperparameter to search over.
params = {"C": [0.001, 0.01, 0.1, 1, 2, 4, 8, 16, 32, 64, 128]}

# Exhaustive search over the grid, scoring each candidate by 5-fold
# cross-validated accuracy on the training split.
gscv_logistic_regression = GridSearchCV(
    estimator=logistic_regression,
    param_grid=params,
    scoring='accuracy',
    cv=5,
)
gscv_logistic_regression.fit(X_train_count_vector, y_train)

# Report the best cross-validated score and the estimator that achieved it.
print("Grid Search CV 정확도 :", gscv_logistic_regression.best_score_)
print("Grid Search CV 최적 매개변수 :", gscv_logistic_regression.best_estimator_)

Grid Search CV 정확도 : 0.7066604995374653
Grid Search CV 최적 매개변수 : LogisticRegression(C=2)


## CountVectorizer + SVM

In [9]:
# Support vector classifier with a sigmoid kernel, trained on the same
# count-vectorized training posts as the logistic-regression model.
svc = SVC(kernel='sigmoid')
svc.fit(X=X_train_count_vector, y=y_train)

In [10]:
# Score the fitted SVC on the held-out test split.
prediction = svc.predict(X_test_count_vector)
accuracy = accuracy_score(y_true=y_test, y_pred=prediction)
print("Support Vector Machine 정확도 :", accuracy)

Support Vector Machine 정확도 : 0.7457627118644068


In [11]:
# 5-fold cross-validation of the SVC on the training split; report the mean
# and the best fold score.
scores = cross_val_score(
    estimator=svc,
    X=X_train_count_vector,
    y=y_train,
    cv=5,
)
print("cross-validation 평균 정확도 :", scores.mean())
print("cross-validation 최대 정확도 :", scores.max())

cross-validation 평균 정확도 : 0.5986123959296947
cross-validation 최대 정확도 : 0.723404255319149


## CountVectorizer + DecisionTree-based AdaBoostClassifier

In [12]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier

In [13]:
# Base learner: a decision tree limited to depth 10 that splits on entropy.
decision_tree_classifier = DecisionTreeClassifier(max_depth=10, criterion='entropy')

# AdaBoost ensemble of up to 50 boosted copies of the base tree.
# random_state is pinned so the seeds handed to the trees in each boosting
# round are the same on every run; without it the fitted ensemble (and the
# reported accuracy) can differ between kernel restarts.
ada_boost_classifier = AdaBoostClassifier(
    estimator=decision_tree_classifier,
    n_estimators=50,
    random_state=100,
)
ada_boost_classifier.fit(X_train_count_vector, y_train)

In [14]:
# Score the fitted AdaBoost ensemble on the held-out test split.
prediction = ada_boost_classifier.predict(X_test_count_vector)
accuracy = accuracy_score(y_true=y_test, y_pred=prediction)
print("Decision Tree 기반의 Ada Boosting 정확도 :", accuracy)

Decision Tree 기반의 Ada Boosting 정확도 : 0.6610169491525424


In [15]:
# import matplotlib.pyplot as plt
# from sklearn.tree import plot_tree
# import koreanize_matplotlib
# 
# %config InlineBackend.figure_format = 'retina'
# 
# # 트리 시각화
# n_estimator = len(ada_boost_classifier.estimators_)
# 
# feature_names_list = count_vector.get_feature_names_out().tolist()
# 
# for i in range(n_estimator):
#     fig = plt.figure(figsize=(10, 15), facecolor='white')
#     ax = fig.add_subplot(111)
#     plot_tree(ada_boost_classifier.estimators_[i],
#               feature_names=feature_names_list,
#               class_names=[str(cls) for cls in ada_boost_classifier.classes_],
#               ax=ax,
#               filled=True,
#               rounded=True)
#     ax.set_title(f'{i+1} th tree')
#     
#     plt.savefig(f'graph{i+1}.pdf', dpi=1200, format='pdf', bbox_inches='tight')
#     plt.show()
