In [1]:
import pandas as pd
import numpy as np

In [2]:
# Each line of the corpus is expected to be "<review text>\t<integer label>".
# Lines that do not split into exactly two tab-separated fields are skipped.
with open('Korean_movie_reviews_2016.txt', encoding='utf-8') as f:
    docs = [
        (fields[0], int(fields[1]))
        for fields in (line.strip().split('\t') for line in f)
        if len(fields) == 2
    ]

# Unzip the (text, label) pairs into two parallel tuples.
texts, labels = zip(*docs)

In [4]:
# Corpus size and class balance. Summing the labels counts the positive reviews,
# which relies on the labels being 0/1 with 1 = positive.
num_reviews = len(labels)
num_positive = sum(labels)
print('전체 영화평의 수: {}'.format(num_reviews))
print('긍정 영화평의 수: {}'.format(num_positive))
print('긍정 영화평의 비율: {0:.2f}'.format(num_positive / num_reviews))

전체 영화평의 수: 165384
긍정 영화평의 수: 86806
긍정 영화평의 비율: 0.52


In [5]:
# Hold out 10% of the reviews as a test set; the fixed random_state keeps the
# shuffle (and therefore the split) reproducible across runs.
from sklearn.model_selection import train_test_split

train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts,
    labels,
    test_size=0.1,
    random_state=0,
)

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

In [7]:
# Term-count (bag-of-words) features built with CountVectorizer's default settings.
tf_vectorizer = CountVectorizer() 
# Learn the vocabulary from the training texts and encode them in the same call.
tf_train_features = tf_vectorizer.fit_transform(train_texts) 
# Encode the test texts with the already-fitted vectorizer (transform only, no
# refit), so the test set does not influence the learned vocabulary.
tf_test_features = tf_vectorizer.transform(test_texts)

In [8]:
from sklearn.naive_bayes import MultinomialNB

In [19]:
# Unfitted multinomial Naive Bayes classifier with default settings; it is passed
# to GridSearchCV as the base estimator in a later cell.
mnb_clf = MultinomialNB()

In [20]:
from sklearn.model_selection import GridSearchCV
# Candidate values for MultinomialNB's `alpha` (smoothing) hyperparameter that the
# grid search will compare.
MNB_params = dict(alpha=[0.1, 0.5, 1, 2, 5])

In [38]:
# Exhaustive search over MNB_params, evaluating every candidate with 5-fold
# cross-validation.
grid_search = GridSearchCV(
    estimator=mnb_clf,
    param_grid=MNB_params,
    cv=5,
)

In [39]:
# Run the cross-validated search on the term-count training features.
# The call returns the fitted search object, which is what the cell displays.
grid_search.fit(tf_train_features, train_labels)

GridSearchCV(cv=5, estimator=MultinomialNB(),
             param_grid={'alpha': [0.1, 0.5, 1, 2, 5]})

In [40]:
# Mean cross-validated score for each candidate in MNB_params (one entry per alpha).
# NOTE(review): no `scoring` was passed, so this is the estimator's default
# score — presumably accuracy; confirm.
grid_search.cv_results_['mean_test_score']

array([0.88075515, 0.88375827, 0.8841345 , 0.88388592, 0.88162854])

In [41]:
# Hyperparameter setting that achieved the best mean cross-validated score.
grid_search.best_params_

{'alpha': 1}

In [42]:
# Predict on the held-out test features. GridSearchCV.predict delegates to the
# best estimator refitted on the full training set (refit is left at its default).
pred_labels = grid_search.predict(tf_test_features)

In [43]:
from sklearn.metrics import classification_report

In [44]:
# Per-class precision, recall and F1 on the held-out test set for the
# term-count model.
print(classification_report(test_labels, pred_labels))

              precision    recall  f1-score   support

           0       0.87      0.88      0.88      7766
           1       0.90      0.89      0.89      8773

    accuracy                           0.89     16539
   macro avg       0.89      0.89      0.89     16539
weighted avg       0.89      0.89      0.89     16539



TF-IDF 정보를 이용하는 경우

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF weighted features built with TfidfVectorizer's default settings.
tfidf_vectorizer = TfidfVectorizer() 
# Learn the vocabulary and IDF weights from the training texts and encode them.
tfidf_train_features = tfidf_vectorizer.fit_transform(train_texts) 
# Encode the test texts with the already-fitted vectorizer (transform only, no refit).
tfidf_test_features = tfidf_vectorizer.transform(test_texts)

In [15]:
# Multinomial Naive Bayes classifier for the TF-IDF features, with alpha fixed by hand.
# NOTE(review): the grid search in this notebook was run on the term-count
# features only, so alpha=1.0 has not been tuned for TF-IDF — confirm this is intended.
mnb_clf1 = MultinomialNB(alpha=1.0)

In [16]:
# Train the classifier on the TF-IDF training features.
# The call returns the fitted estimator, which is what the cell displays.
mnb_clf1.fit(tfidf_train_features, train_labels)

MultinomialNB()

In [17]:
# Predict labels for the held-out test set from its TF-IDF features.
pred_labels1 = mnb_clf1.predict(tfidf_test_features)

In [18]:
# Per-class precision, recall and F1 on the held-out test set for the TF-IDF
# model, for comparison with the term-count model's report.
print(classification_report(test_labels, pred_labels1))

              precision    recall  f1-score   support

           0       0.88      0.88      0.88      7766
           1       0.89      0.89      0.89      8773

    accuracy                           0.88     16539
   macro avg       0.88      0.88      0.88     16539
weighted avg       0.88      0.88      0.88     16539

