In [4]:
import pandas as pd
import numpy as np

In [5]:
# Parse the review file: each line is split on tabs, and only lines that yield
# exactly two fields are kept. The first field is kept as the text and the
# second is converted to an int label.
docs = []
with open('Korean_movie_reviews_2016.txt', encoding='utf-8') as f:
    for line in f:
        fields = line.strip().split('\t')
        if len(fields) == 2:
            docs.append((fields[0], int(fields[1])))

# Unzip the (text, label) pairs into two parallel tuples.
texts, labels = zip(*docs)

In [6]:
from sklearn.model_selection import train_test_split

# Split texts and labels into training and test sets: test_size=0.1 reserves
# 10% of the samples for testing, and random_state=0 fixes the shuffle seed.
split_parts = train_test_split(texts, labels, test_size=0.1, random_state=0)
train_texts, test_texts, train_labels, test_labels = split_parts

In [7]:
from sklearn.feature_extraction.text import CountVectorizer

# Vector representation of the texts using the CountVectorizer class.
tf_vectorizer = CountVectorizer()

# The vectorizer is fitted on the training texts only; the test texts are then
# transformed with that already-fitted vectorizer.
tf_train_features = tf_vectorizer.fit_transform(train_texts)
tf_test_features = tf_vectorizer.transform(test_texts)

In [8]:
from sklearn.ensemble import GradientBoostingClassifier

In [9]:
# Gradient-boosting classifier: 300 boosting stages of depth-2 trees with a
# learning rate of 1.0; random_state=2 fixes the estimator's seed.
gb_clf = GradientBoostingClassifier(
    n_estimators=300,
    max_depth=2,
    learning_rate=1.0,
    random_state=2,
)

In [10]:
# Train on the count-based training matrix. The fitted estimator returned by
# fit() is the cell's last expression, so its repr is shown as the output.
gb_clf.fit(X=tf_train_features, y=train_labels)

GradientBoostingClassifier(learning_rate=1.0, max_depth=2, n_estimators=300,
                           random_state=2)

In [11]:
# Predict labels for the test documents from their count-based features.
pred_labels_tf = gb_clf.predict(X=tf_test_features)

In [12]:
from sklearn.metrics import classification_report

In [13]:
# Text report comparing the test labels with the predictions made from the
# count-based features.
report_tf = classification_report(y_true=test_labels, y_pred=pred_labels_tf)
print(report_tf)

              precision    recall  f1-score   support

           0       0.88      0.79      0.83      7766
           1       0.83      0.90      0.86      8773

    accuracy                           0.85     16539
   macro avg       0.85      0.84      0.85     16539
weighted avg       0.85      0.85      0.85     16539



In [15]:
from sklearn.metrics import f1_score

# Macro-averaged F1 of the count-based predictions on the test labels. The
# bare expression on the last line makes the score the cell's output.
f1_score(
    y_true=test_labels,
    y_pred=pred_labels_tf,
    average='macro',
)

0.846026773404942

n_estimators 값 변경해 보기

In [20]:
# Sweep the number of boosting stages while max_depth, learning_rate and the
# seed stay fixed, printing the macro-averaged F1 for each setting.
# NOTE(review): every score here is computed on the test split, so choosing a
# value from this sweep tunes the model on test data.
n_estimators_values = [10, 50, 100, 300, 500, 700, 1000]
for value in n_estimators_values:
    # fit() returns the estimator itself, so construction and training chain.
    gbr = GradientBoostingClassifier(
        n_estimators=value,
        max_depth=2,
        learning_rate=1.0,
        random_state=2,
    ).fit(tf_train_features, train_labels)
    y_preds = gbr.predict(tf_test_features)
    f1 = f1_score(test_labels, y_preds, average='macro')
    print('Number of estimators:', value, ', Score:', f1)

Number of estimators: 10 , Score: 0.6784392545754462
Number of estimators: 50 , Score: 0.7820051717989631
Number of estimators: 100 , Score: 0.811061913512709
Number of estimators: 300 , Score: 0.846026773404942
Number of estimators: 500 , Score: 0.8589618955476586
Number of estimators: 700 , Score: 0.8657011556835132
Number of estimators: 1000 , Score: 0.8688241194847548


In [19]:
# Sweep the learning rate with 500 depth-2 boosting stages and a fixed seed,
# printing the macro-averaged F1 on the test split for each setting.
learning_rate_values = [0.01, 0.1, 0.3, 0.5, 1.0]
for value in learning_rate_values:
    # fit() returns the estimator itself, so construction and training chain.
    gbr = GradientBoostingClassifier(
        n_estimators=500,
        max_depth=2,
        learning_rate=value,
        random_state=2,
    ).fit(tf_train_features, train_labels)
    y_preds = gbr.predict(tf_test_features)
    f1 = f1_score(test_labels, y_preds, average='macro')
    print('Learning rate:', value, ', Score:', f1)

Learning rate: 0.01 , Score: 0.6906814272563049
Learning rate: 0.1 , Score: 0.8010863578544098
Learning rate: 0.3 , Score: 0.8390737037497261
Learning rate: 0.5 , Score: 0.853282905191383
Learning rate: 1.0 , Score: 0.8589618955476586


In [21]:
# Sweep the subsample ratio (the fraction of training samples used to fit each
# individual tree) with the other hyperparameters fixed, printing the
# macro-averaged F1 on the test split for each setting.
subsample_values = [0.1, 0.3, 0.5, 1]
for value in subsample_values:
    # fit() returns the estimator itself, so construction and training chain.
    gbr = GradientBoostingClassifier(
        n_estimators=500,
        max_depth=2,
        learning_rate=1.0,
        subsample=value,
        random_state=2,
    ).fit(tf_train_features, train_labels)
    y_preds = gbr.predict(tf_test_features)
    f1 = f1_score(test_labels, y_preds, average='macro')
    print('Subsample_ratio:', value, ', Score:', f1)

Subsample_ratio: 0.1 , Score: 0.8190527603070623
Subsample_ratio: 0.3 , Score: 0.8423571326394537
Subsample_ratio: 0.5 , Score: 0.8550180840632975
Subsample_ratio: 1 , Score: 0.8589618955476586


## TF-IDF 기반 DTM 사용해 보기

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Vector representation of the texts using the TfidfVectorizer class.
tfidf_vectorizer = TfidfVectorizer()

# The vectorizer is fitted on the training texts only; the test texts are then
# transformed with that already-fitted vectorizer.
tfidf_train_features = tfidf_vectorizer.fit_transform(train_texts)
tfidf_test_features = tfidf_vectorizer.transform(test_texts)

In [21]:
# DecisionTreeClassifier is not imported by any visible cell of this notebook,
# so import it here; without this the cell raises NameError on a fresh kernel.
from sklearn.tree import DecisionTreeClassifier

# Decision tree with default hyperparameters, to be trained on the TF-IDF
# features. NOTE(review): no random_state is set, so results may vary slightly
# between runs; pass random_state=... if exact reproducibility is needed.
dt_clf_tfidf = DecisionTreeClassifier()

In [22]:
# Train the decision tree on the TF-IDF training matrix. The fitted estimator
# returned by fit() is the cell's last expression, so its repr is the output.
dt_clf_tfidf.fit(X=tfidf_train_features, y=train_labels)

DecisionTreeClassifier()

In [23]:
# Predict labels for the test documents from their TF-IDF features.
pred_labels_tfidf = dt_clf_tfidf.predict(X=tfidf_test_features)

In [24]:
# Text report comparing the test labels with the predictions made from the
# TF-IDF features.
report_tfidf = classification_report(y_true=test_labels, y_pred=pred_labels_tfidf)
print(report_tfidf)

              precision    recall  f1-score   support

           0       0.80      0.80      0.80      7766
           1       0.82      0.83      0.83      8773

    accuracy                           0.81     16539
   macro avg       0.81      0.81      0.81     16539
weighted avg       0.81      0.81      0.81     16539

