In [34]:
%matplotlib inline
%config InlineBackend.figure_formats = {'png', 'retina'}

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
import codecs

from konlpy.utils import pprint
from konlpy.tag import *
from konlpy.tag import Twitter, Komoran

### 1. Read Data

In [35]:
def read_data(filename):
    """Load a UTF-8, tab-separated text file as a list of rows.

    The first line of the file is treated as a header and dropped.

    Parameters
    ----------
    filename : str
        Path of the file to read.

    Returns
    -------
    list of list of str
        One inner list per remaining line, holding that line's
        tab-separated fields.
    """
    with codecs.open(filename, mode='r', encoding='utf-8') as handle:
        lines = handle.read().splitlines()

    rows = []
    # lines[0] is the header row, so start from the second line.
    for line in lines[1:]:
        rows.append(line.split('\t'))
    return rows

train_data = read_data('./data/ratings_train.txt')
test_data = read_data('./data/ratings_test.txt')

# Each row has three tab-separated fields. zip(*rows) transposes the rows into
# three column tuples; only the second (text) and third (label) are used below.

# Training columns: texts stay a tuple of strings, labels become an int array.
t1, t2, t3 = zip(*train_data)
X_train, y_train = t2, np.array(t3, dtype=int)

# Test columns, handled the same way as the training columns.
t1, t2, t3 = zip(*test_data)
X_test, y_test = t2, np.array(t3, dtype=int)

# Label encoding: 0 = negative, 1 = positive.

### 2. Set Training Data, Test Data

In [36]:
# train_test_split lives in sklearn.model_selection since scikit-learn 0.18;
# sklearn.cross_validation was removed in 0.20. Fall back to the old module
# only so that very old installations keep working.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Draw a small working sample (10000 training / 100 evaluation documents) from
# the training file. This rebinds X_train, y_train, X_test and y_test, so the
# X_test / y_test loaded from ratings_test.txt are no longer used afterwards,
# and the loading cell must be re-run before running this cell again.
# random_state pins the shuffle so the reported scores are reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_train, y_train, train_size=10000, test_size=100, random_state=0)

### 3. Make Model

In [37]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

In [38]:
# Single shared KoNLPy Twitter tagger, built once here and reused by
# tokenize_pos for every document.
pos_tagger = Twitter()

def tokenize_pos(doc):
    """Tokenize a document into 'token/tag' strings.

    Runs the module-level ``pos_tagger`` on ``doc`` with ``norm=True`` and
    ``stem=True`` and joins every element of each returned item with '/'.

    Parameters
    ----------
    doc : str
        Text to tokenize.

    Returns
    -------
    list of str
        One '/'-joined string per item returned by ``pos_tagger.pos``.
    """
    tagged = pos_tagger.pos(doc, norm=True, stem=True)
    return ['/'.join(pair) for pair in tagged]

def _naive_bayes_on(vectorizer):
    """Chain the given vectorizer ('vect') with a fresh MultinomialNB ('clf')."""
    return Pipeline([
        ('vect', vectorizer),
        ('clf', MultinomialNB()),
    ])

# clf_1: raw term counts with the vectorizer's default tokenization.
clf_1 = _naive_bayes_on(CountVectorizer())

# clf_2: TF-IDF weights with the vectorizer's default tokenization.
clf_2 = _naive_bayes_on(TfidfVectorizer())

# clf_3: term counts over tokenize_pos output, using unigrams and bigrams.
clf_3 = _naive_bayes_on(
    CountVectorizer(tokenizer=tokenize_pos, ngram_range=(1,2)))

In [39]:
# Fit each pipeline on the same training sample. model1..model3 hold whatever
# Pipeline.fit returns (presumably the fitted pipeline itself, per the
# scikit-learn convention -- confirm), in the same order as clf_1..clf_3.
model1 = clf_1.fit(X_train, y_train)
model2 = clf_2.fit(X_train, y_train)
model3 = clf_3.fit(X_train, y_train)

### 4. Report Result

In [40]:
# Evaluate every fitted model on the same held-out sample, printing one
# classification report per model in the order model1, model2, model3.
for fitted_model in (model1, model2, model3):
    predictions = fitted_model.predict(X_test)
    print(classification_report(y_test, predictions))

             precision    recall  f1-score   support

          0       0.73      0.73      0.73        55
          1       0.67      0.67      0.67        45

avg / total       0.70      0.70      0.70       100

             precision    recall  f1-score   support

          0       0.74      0.71      0.72        55
          1       0.66      0.69      0.67        45

avg / total       0.70      0.70      0.70       100

             precision    recall  f1-score   support

          0       0.83      0.82      0.83        55
          1       0.78      0.80      0.79        45

avg / total       0.81      0.81      0.81       100

