In [1]:
import numpy as np
import scipy as sp

# 나이브 베이즈 분류 모형을 이용한 감성 분석 _ 네이버 영화 감상평

In [2]:
%%time
# Fetch the train and test rating files from the e9t/nsmc GitHub repository.
# wget's -nc (no-clobber) flag skips the download when the file already
# exists locally, so re-running this cell does not fetch the data again.
!wget -nc https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt
!wget -nc https://raw.githubusercontent.com/e9t/nsmc/master/ratings_test.txt

File 'ratings_train.txt' already there; not retrieving.

File 'ratings_test.txt' already there; not retrieving.

CPU times: user 6.14 ms, sys: 13.2 ms, total: 19.3 ms
Wall time: 256 ms


In [3]:
# Read the training file as UTF-8 text and split every line into its
# tab-separated fields. The built-in open() decodes the file directly, so the
# legacy codecs.open() wrapper is not needed.
with open("ratings_train.txt", encoding='utf-8') as f:
    # The first line is skipped before splitting (presumably the column
    # header — confirm against the file).
    data = [line.split('\t') for line in f.read().splitlines()[1:]]

In [4]:
# Keep only the fields used for training (X: review text, y: rating label).
# Each row of `data` is a list of tab-separated fields; field 1 is taken as the
# text and field 2 as the label. The fields are picked row by row so the whole
# data set is not transposed twice just to select two columns.
X = tuple(row[1] for row in data)
y = np.array([row[2] for row in data], dtype=int)

In [5]:
# Build two Naive Bayes text classifiers that differ only in the vectorizer
# (raw term counts vs. TF-IDF weights) so their performance can be compared.

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# model1: term counts -> multinomial Naive Bayes
model1 = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('mb', MultinomialNB()),
])

# model2: TF-IDF weights -> multinomial Naive Bayes
model2 = Pipeline(steps=[
    ('vect', TfidfVectorizer()),
    ('mb', MultinomialNB()),
])

In [6]:
%time
model1.fit(X,y)

CPU times: user 3 µs, sys: 1 µs, total: 4 µs
Wall time: 7.15 µs


Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)), ('mb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [7]:
model2.fit(X,y)

Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
  ...True,
        vocabulary=None)), ('mb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [8]:
# Model evaluation: read the test file as UTF-8 text and split every line into
# its tab-separated fields. The built-in open() decodes the file directly, so
# the legacy codecs.open() wrapper is not needed.
with open("ratings_test.txt", encoding='utf-8') as f:
    # The first line is skipped before splitting (presumably the column
    # header — confirm against the file).
    data_test = [line.split('\t') for line in f.read().splitlines()[1:]]

In [9]:
# Keep only the fields used for evaluation (X_test: review text, y_test: rating
# label). Field 1 of each row is taken as the text and field 2 as the label.
# The fields are picked row by row so the whole data set is not transposed
# twice just to select two columns.
X_test = tuple(row[1] for row in data_test)
y_test = np.array([row[2] for row in data_test], dtype=int)

In [10]:
# Classification report on the test split — CountVectorizer model
model1_report = classification_report(y_test, model1.predict(X_test))
print(model1_report)

             precision    recall  f1-score   support

          0       0.81      0.84      0.83     24827
          1       0.84      0.81      0.82     25173

avg / total       0.83      0.83      0.83     50000



In [11]:
# Classification report on the test split — TfidfVectorizer model
model2_report = classification_report(y_test, model2.predict(X_test))
print(model2_report)

             precision    recall  f1-score   support

          0       0.81      0.84      0.83     24827
          1       0.84      0.81      0.83     25173

avg / total       0.83      0.83      0.83     50000



In [12]:
# Use a morphological analyzer as the tokenizer and compare performance
from konlpy.tag import Twitter
pos_tagger = Twitter()

def tokenize_pos(doc):
    """Tokenize `doc` with the module-level KoNLPy tagger.

    Calls `pos_tagger.pos` with `norm=True, stem=True` and joins each item it
    returns with '/', producing one string token per item.
    """
    tagged = pos_tagger.pos(doc, norm=True, stem=True)
    return list(map('/'.join, tagged))

In [18]:
# model3: term counts over the tokens produced by tokenize_pos
model3 = Pipeline(steps=[
    ('vect', CountVectorizer(tokenizer=tokenize_pos)),
    ('mb', MultinomialNB()),
])

# model4: TF-IDF over the tokens produced by tokenize_pos.
# NOTE: unlike model3 this also enables bigrams (ngram_range=(1, 2)), so the
# two models differ in more than just the weighting scheme.
model4 = Pipeline(steps=[
    ('vect', TfidfVectorizer(tokenizer=tokenize_pos, ngram_range=(1, 2))),
    ('mb', MultinomialNB()),
])

In [14]:
%%time
model3.fit(X,y)

CPU times: user 2min 47s, sys: 1.7 s, total: 2min 49s
Wall time: 2min 39s


Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function tokenize_pos at 0x118929158>, vocabulary=None)), ('mb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [15]:
# Classification report on the test split — CountVectorizer + tokenize_pos
model3_report = classification_report(y_test, model3.predict(X_test))
print(model3_report)

             precision    recall  f1-score   support

          0       0.83      0.85      0.84     24827
          1       0.85      0.83      0.84     25173

avg / total       0.84      0.84      0.84     50000



In [19]:
%%time
model4.fit(X,y)

CPU times: user 2min 44s, sys: 1.55 s, total: 2min 46s
Wall time: 2min 43s


Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,
  ...True,
        vocabulary=None)), ('mb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [20]:
# Classification report on the test split — TfidfVectorizer + tokenize_pos, bigrams
model4_report = classification_report(y_test, model4.predict(X_test))
print(model4_report)

             precision    recall  f1-score   support

          0       0.86      0.87      0.87     24827
          1       0.87      0.86      0.87     25173

avg / total       0.87      0.87      0.87     50000

