In [1]:
import re
import os
import time
from concurrent.futures import ThreadPoolExecutor

import pandas as pd
import numpy as np
import konlpy.tag
import nltk

from konlpy.tag import Komoran
from konlpy.tag import Twitter

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans

In [2]:
# Load the comment table; the path is resolved relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='merged_comments.csv')

In [3]:
# Pull two columns out of the frame by position and freeze each as a tuple.
# NOTE(review): presumably column 3 is the rating and column 4 the comment
# text — confirm against the CSV header.
scores, texts = (tuple(df.iloc[:, position]) for position in (3, 4))

In [4]:
# Build a binary sentiment dataset from the rated comments:
#   score < 4        -> negative, label 0
#   score > 8        -> positive, label 1
#   4 <= score <= 8  -> ambiguous, dropped
# Rows with a missing score are dropped as well. Every comparison against NaN
# is False, so without the explicit check such rows would slip past the range
# test below and be labelled negative.
filtered_texts = []
filtered_labels = []

for text, score in zip(texts, scores):
    if pd.isna(score) or 4 <= score <= 8:
        continue

    filtered_texts.append(text)
    filtered_labels.append(1 if score > 8 else 0)

In [5]:
# Hold out 30% of the filtered comments for evaluation; the fixed seed keeps
# the split repeatable across runs.
from sklearn.model_selection import train_test_split

train_texts, test_texts, train_labels, test_labels = train_test_split(
    filtered_texts,
    filtered_labels,
    test_size=0.3,
    random_state=0,
)

In [6]:
def tfidf_extractor(corpus, ngram_range=(1, 1), min_df=1):
    """Fit a TF-IDF vectorizer on ``corpus`` and return it with the fitted matrix.

    Parameters
    ----------
    corpus : iterable of str
        Raw documents used to learn the vocabulary and the IDF weights.
    ngram_range : tuple of (int, int), default (1, 1)
        Forwarded to ``TfidfVectorizer``; the default keeps the original
        unigram-only behaviour.
    min_df : int or float, default 1
        Forwarded to ``TfidfVectorizer``; the default keeps every term.

    Returns
    -------
    vectorizer : TfidfVectorizer
        The fitted vectorizer, to be reused for transforming held-out text.
    features
        The document-term matrix returned by ``fit_transform`` for ``corpus``.
    """
    vectorizer = TfidfVectorizer(min_df=min_df,
                                 norm='l2',
                                 smooth_idf=True,
                                 use_idf=True,
                                 ngram_range=ngram_range)
    features = vectorizer.fit_transform(corpus)
    return vectorizer, features

In [7]:
# TF-IDF representation: the vectorizer is fitted on the training texts only,
# then the same fitted vectorizer projects the test texts.
tfidf_vectorizer, train_tfidf_features = tfidf_extractor(corpus=train_texts)
test_tfidf_features = tfidf_vectorizer.transform(test_texts)

In [8]:
# Display the shapes of the train and test TF-IDF matrices.
tuple(matrix.shape for matrix in (train_tfidf_features, test_tfidf_features))

((396912, 574866), (170106, 574866))

In [9]:
# Display how many labels ended up on each side of the split.
tuple(map(len, (train_labels, test_labels)))

(396912, 170106)

In [15]:
%%time
# L1-regularised logistic regression. The solver is named explicitly because
# scikit-learn's default solver (lbfgs since 0.22) rejects penalty='l1';
# liblinear supports it and was the default in earlier releases.
lr = LogisticRegression(C=0.1, penalty='l1', solver='liblinear', random_state=0)
lr.fit(train_tfidf_features, train_labels)  # train on the TF-IDF features
lr_pred_labels = lr.predict(test_tfidf_features)
print('Misclassified samples: {} out of {}'.format((lr_pred_labels != test_labels).sum(),len(test_labels)))
print('Accuracy: %.2f' % accuracy_score(test_labels, lr_pred_labels))



Misclassified samples: 24484 out of 170106
Accuracy: 0.86
CPU times: user 1.57 s, sys: 36.1 ms, total: 1.61 s
Wall time: 1.6 s


In [16]:
%%time
# Multinomial naive Bayes on the same TF-IDF features; fit() returns the
# estimator itself, so prediction is chained directly onto it.
MNB = MultinomialNB()
nb_pred_labels = MNB.fit(train_tfidf_features, train_labels).predict(test_tfidf_features)
print(
    'Misclassified samples: {} out of {}'.format(
        (nb_pred_labels != test_labels).sum(),
        len(test_labels),
    )
)
print('Accuracy: %.2f' % accuracy_score(test_labels, nb_pred_labels))

Misclassified samples: 24123 out of 170106
Accuracy: 0.86
CPU times: user 167 ms, sys: 3.94 ms, total: 171 ms
Wall time: 169 ms


In [17]:
%%time
# Linear classifier trained with stochastic gradient descent (library defaults).
# The seed is fixed, matching the split and the logistic regression above, so
# the reported error count is reproducible; without it the result can differ
# from run to run.
sgd = SGDClassifier(random_state=0)
sgd.fit(train_tfidf_features, train_labels)
sgd_pred_labels = sgd.predict(test_tfidf_features)
print('Misclassified samples: {} out of {}'.format((sgd_pred_labels != test_labels).sum(),len(test_labels)))
print('Accuracy: %.2f' % accuracy_score(test_labels, sgd_pred_labels))



Misclassified samples: 29723 out of 170106
Accuracy: 0.83
CPU times: user 1.7 s, sys: 11 ms, total: 1.71 s
Wall time: 549 ms


In [None]:
# Disabled experiment: the code below is wrapped in a bare string literal, so
# none of it is executed.
# NOTE(review): despite the variable name `knn`, the snippet instantiates
# KMeans (imported from sklearn.cluster), not KNeighborsClassifier. KMeans is a
# clustering estimator that presumably ignores the labels passed to fit(), and
# its cluster ids need not line up with the 0/1 sentiment labels, so the
# "accuracy" this snippet prints would not be meaningful — confirm before
# re-enabling.
'''
%%time
knn=KMeans(n_clusters=2)
knn.fit(train_tfidf_features, train_labels) 
pred_labels = knn.predict(test_tfidf_features)
print('Misclassified samples: {} out of {}'.format((pred_labels != test_labels).sum(),len(test_labels)))
print('Accuracy: %.2f' % accuracy_score(test_labels, pred_labels))
'''

In [None]:
# Disabled experiment: random forest with 100 trees, wrapped in a bare string
# literal so none of it is executed.
# Original author's note: "2h 43m accuracy 0.86" — presumably the run time and
# test accuracy observed for this cell; verify before relying on it.
'''
%%time
from sklearn.ensemble import RandomForestClassifier
RFT = RandomForestClassifier(n_estimators =100)
RFT.fit(train_tfidf_features, train_labels) 
pred_labels = RFT.predict(test_tfidf_features)
print('Misclassified samples: {} out of {}'.format((pred_labels != test_labels).sum(),len(test_labels)))
print('Accuracy: %.2f' % accuracy_score(test_labels, pred_labels))
'''

In [None]:
# Disabled experiment: SVC with default settings, wrapped in a bare string
# literal so none of it is executed. No result is recorded for this one.
'''
%%time
from sklearn.svm import SVC
sinvecma = SVC()
sinvecma.fit(train_tfidf_features, train_labels) 
pred_labels = sinvecma.predict(test_tfidf_features)
print('Misclassified samples: {} out of {}'.format((pred_labels != test_labels).sum(),len(test_labels)))
print('Accuracy: %.2f' % accuracy_score(test_labels, pred_labels))
'''

In [18]:
# One row per test comment: the text, the naive Bayes prediction and the true
# label. Building the frame from a column mapping avoids creating a 3-row frame
# with one column per sample only to transpose it, and keeps the two label
# columns numeric instead of object dtype. The positional labels 0, 1, 2 are
# kept so that code expecting the transposed layout still works.
ddf = pd.DataFrame({0: test_texts, 1: nb_pred_labels, 2: test_labels})

In [26]:
# Give the three columns readable names: comment text, predicted sentiment,
# true sentiment.
ddf.columns = [
    'text',
    'predicted_senti',
    'true_senti',
]

In [28]:
# First ten test comments whose predicted sentiment differs from the true one.
ddf.loc[ddf['predicted_senti'].ne(ddf['true_senti'])].head(10)

Unnamed: 0,text,predicted_senti,true_senti
6,'믿고안보는 손예진..',1,0
10,'조조영화 끝나기도전에 평점 올라오는건 뭐지',1,0
15,'몇몇 개그로 웃긴 장면 빼고는 그다지 재미가 없음.',1,0
16,'똥구멍은 음식의 영양분을 흡수 후 남은 찌거기와 가스가 배출되는 곳. 주용도에 맞...,1,0
20,'평점조작하는건 쫌 아니자나 아무리그래도',0,1
24,'또또 이상하게 흥행하네 요즘 한국영화가 얼마나 노잼이면 이런 영화가 빨리다니',1,0
26,'10자쓰기도아깝다10자',1,0
27,'네이버네티즌 평점은 못믿겠어요 굉장히 비합리적 평론가 관람객 평점만 남기길',1,0
28,'이 영화 볼빠에 비행기 왕복으로 타는게 이득입니다',1,0
34,'기대이하네요. 한국에선 염력이란 소재가 참신한데 풀어내는 방식과 스토리가 너무 진...,1,0


In [29]:
# First ten test comments whose predicted sentiment agrees with the true one.
ddf.loc[ddf['predicted_senti'].eq(ddf['true_senti'])].head(10)

Unnamed: 0,text,predicted_senti,true_senti
0,"'안 지루했고 아프리카, 여자, 흑인을 주요소로 신선하게 이야기를 풀어나가는게 도전...",1,1
1,'엑스포스 다 죽을때 웃겨 죽는줄 ㅎㅎ',1,1
2,'3탄 나오겠네... 이거 3탄 안나오면 소비자 기만 수준임. 판을 아주 다 깔아 ...,1,1
3,"'안도 사쿠라의 연기를 보면 ""어쩜 저렇게 연기를 잘할까""라는 감탄이 아니라 ""정말...",1,1
4,'7살아들이 너무 재미있게 봤어요',1,1
5,'미소는 담배와 위스키한잔으로 겨우 버티는것이다. 집 없어도 내 인생이 있다며 자족...,1,1
7,'이 영화보고 7년 사귀고 헤어진 여자친구 다시 만날 수 있었습니다. 현실적이라 좋...,1,1
8,'감동적이고 너무 재미있었어요눈물도 났음ㅋ',1,1
9,'장 꼬여서 방구 존내 나옴.',1,1
11,'난 넘 재밌게 봤는데 3편 꼭 나왔으면',1,1
