In [2]:
import re
import os
import time
from concurrent.futures import ThreadPoolExecutor

import pandas as pd
import numpy as np
import konlpy.tag
import nltk

from konlpy.tag import Komoran
from konlpy.tag import Twitter

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans

In [4]:
# Load the all-genre merged comment table. `df` is rebound further down by the
# genre-specific load, so this frame only lives until then.
merged_comments_path = 'merged_df/merged_comments.csv'
df = pd.read_csv(merged_comments_path)

In [6]:
# Table that carries the 'Genre' column inspected in the next cell.
movie_table_path = 'MvDf.csv'
MvDf = pd.read_csv(movie_table_path)

In [7]:
# Distinct values of the Genre column (rows without a genre show up as nan).
set(MvDf.loc[:, 'Genre'])

{'SF',
 nan,
 '가족',
 '공연실황',
 '공포',
 '다큐멘터리',
 '드라마',
 '멜로/애정/로맨스',
 '모험',
 '무협',
 '뮤지컬',
 '미스터리',
 '범죄',
 '블랙코미디',
 '서스펜스',
 '스릴러',
 '애니메이션',
 '액션',
 '에로',
 '전쟁',
 '코미디',
 '판타지'}

In [8]:
# Genre to analyse; it is used as the file-name prefix of the comment CSV loaded below.
Genre = "액션"

In [31]:
# Load the comment table of the selected genre. The file name is the genre
# immediately followed by 'merged_comments.csv', inside the merged_df directory.
genre_comments_path = 'merged_df/{}merged_comments.csv'.format(Genre)
df = pd.read_csv(genre_comments_path, encoding='UTF8', engine='python')

In [33]:
# Columns are addressed by position: index 3 feeds `scores`, index 4 feeds `texts`.
# NOTE(review): presumably the rating and the comment body — confirm against the CSV header.
scores, texts = (tuple(df.iloc[:, position]) for position in (3, 4))

In [34]:
filtered_texts = []
filtered_labels = []

# Turn the rating of each comment into a binary sentiment label:
#   score < 4        -> negative, label 0
#   4 <= score <= 8  -> ambiguous, comment is dropped
#   score > 8        -> positive, label 1
for text, score in zip(texts, scores):
    # A missing score fails the range check below and would otherwise fall
    # through to the `else 0` branch, i.e. be silently labelled negative.
    if pd.isna(score) or 4 <= score <= 8:
        continue

    filtered_texts.append(text)
    filtered_labels.append(1 if score > 8 else 0)

In [35]:
# Hold out 30% of the labelled comments for evaluation; the fixed seed makes
# the split repeatable.
from sklearn.model_selection import train_test_split

split_parts = train_test_split(
    filtered_texts,
    filtered_labels,
    test_size=0.3,
    random_state=0,
)
train_texts, test_texts, train_labels, test_labels = split_parts

In [36]:
def tfidf_extractor(corpus, ngram_range=(1, 1), min_df=1):
    """Fit a TF-IDF vectorizer on ``corpus`` and return it with the resulting matrix.

    Parameters
    ----------
    corpus : iterable of str
        Raw documents used to learn the vocabulary and the IDF weights.
    ngram_range : tuple of (int, int), default (1, 1)
        Smallest and largest n-gram size, forwarded to ``TfidfVectorizer``.
    min_df : int or float, default 1
        Minimum document frequency for a term to be kept, forwarded to
        ``TfidfVectorizer``.

    Returns
    -------
    vectorizer : TfidfVectorizer
        The fitted vectorizer; call ``vectorizer.transform`` on unseen
        documents so they share the same vocabulary and weights.
    features : document-term matrix
        TF-IDF weights for ``corpus``, one row per document, rows L2-normalised.
    """
    vectorizer = TfidfVectorizer(min_df=min_df,
                                 norm='l2',
                                 smooth_idf=True,
                                 use_idf=True,
                                 ngram_range=ngram_range)
    features = vectorizer.fit_transform(corpus)
    return vectorizer, features

In [37]:
# Using the TF-IDF matrix as features: the vocabulary and IDF weights are learned
# from the training split only and then applied unchanged to the test split.
tfidf_vectorizer, train_tfidf_features = tfidf_extractor(corpus=train_texts)
test_tfidf_features = tfidf_vectorizer.transform(test_texts)

In [38]:
# (rows, columns) of the train and test feature matrices; the column counts must match.
tuple(matrix.shape for matrix in (train_tfidf_features, test_tfidf_features))

((115838, 202611), (49645, 202611))

In [39]:
# Number of labels per split; should equal the row counts of the feature matrices.
tuple(map(len, (train_labels, test_labels)))

(115838, 49645)

In [40]:
# Class balance: negative (0) then positive (1) counts, train split on the first
# output line and test split on the second.
class_counts = [
    labels.count(sentiment)
    for labels in (train_labels, test_labels)
    for sentiment in (0, 1)
]
print('{} {} \n {} {}'.format(*class_counts))

19307 96531 
 8491 41154


In [41]:
%%time
lr = LogisticRegression(C=0.1, penalty='l1', random_state=0) # Lasso regression
lr.fit(train_tfidf_features, train_labels) # 학습
lr_pred_labels = lr.predict(test_tfidf_features)
print('Misclassified samples: {} out of {}'.format((lr_pred_labels != test_labels).sum(),len(test_labels)))
print('Accuracy: %.2f' % accuracy_score(test_labels, lr_pred_labels))



Misclassified samples: 6961 out of 49645
Accuracy: 0.86
Wall time: 773 ms


In [42]:
%%time
MNB=MultinomialNB()
MNB.fit(train_tfidf_features, train_labels) 
nb_pred_labels = MNB.predict(test_tfidf_features)
print('Misclassified samples: {} out of {}'.format((nb_pred_labels != test_labels).sum(),len(test_labels)))
print('Accuracy: %.2f' % accuracy_score(test_labels, nb_pred_labels))

Misclassified samples: 7246 out of 49645
Accuracy: 0.85
Wall time: 135 ms


In [43]:
%%time
sgd=SGDClassifier()
sgd.fit(train_tfidf_features, train_labels) 
sgd_pred_labels = sgd.predict(test_tfidf_features)
print('Misclassified samples: {} out of {}'.format((sgd_pred_labels != test_labels).sum(),len(test_labels)))
print('Accuracy: %.2f' % accuracy_score(test_labels, sgd_pred_labels))



Misclassified samples: 6895 out of 49645
Accuracy: 0.86
Wall time: 323 ms


In [44]:
# Disabled experiment, kept as a bare string so nothing in it is executed; the
# cell merely echoes the text.
# NOTE(review): the variable is called `knn` but the estimator is KMeans, an
# unsupervised clusterer. Its cluster ids are presumably not aligned with the
# 0/1 sentiment labels, so the accuracy computed here may be meaningless —
# confirm (or switch to the imported KNeighborsClassifier) before re-enabling.
'''
%%time
knn=KMeans(n_clusters=2)
knn.fit(train_tfidf_features, train_labels) 
pred_labels = knn.predict(test_tfidf_features)
print('Misclassified samples: {} out of {}'.format((pred_labels != test_labels).sum(),len(test_labels)))
print('Accuracy: %.2f' % accuracy_score(test_labels, pred_labels))
'''

"\n%%time\nknn=KMeans(n_clusters=2)\nknn.fit(train_tfidf_features, train_labels) \npred_labels = knn.predict(test_tfidf_features)\nprint('Misclassified samples: {} out of {}'.format((pred_labels != test_labels).sum(),len(test_labels)))\nprint('Accuracy: %.2f' % accuracy_score(test_labels, pred_labels))\n"

In [45]:
# Disabled experiment: random forest with 100 trees on the TF-IDF features.
# Author's note from an earlier run: 2h 43m, accuracy 0.86.
# Kept as a bare string so nothing in it is executed; the cell merely echoes the text.
'''
%%time
from sklearn.ensemble import RandomForestClassifier
RFT = RandomForestClassifier(n_estimators =100)
RFT.fit(train_tfidf_features, train_labels) 
pred_labels = RFT.predict(test_tfidf_features)
print('Misclassified samples: {} out of {}'.format((pred_labels != test_labels).sum(),len(test_labels)))
print('Accuracy: %.2f' % accuracy_score(test_labels, pred_labels))
'''

"\n%%time\nfrom sklearn.ensemble import RandomForestClassifier\nRFT = RandomForestClassifier(n_estimators =100)\nRFT.fit(train_tfidf_features, train_labels) \npred_labels = RFT.predict(test_tfidf_features)\nprint('Misclassified samples: {} out of {}'.format((pred_labels != test_labels).sum(),len(test_labels)))\nprint('Accuracy: %.2f' % accuracy_score(test_labels, pred_labels))\n"

In [46]:
# Disabled experiment: support vector classifier with library-default settings
# on the TF-IDF features. Kept as a bare string so nothing in it is executed;
# the cell merely echoes the text.
'''
%%time
from sklearn.svm import SVC
sinvecma = SVC()
sinvecma.fit(train_tfidf_features, train_labels) 
pred_labels = sinvecma.predict(test_tfidf_features)
print('Misclassified samples: {} out of {}'.format((pred_labels != test_labels).sum(),len(test_labels)))
print('Accuracy: %.2f' % accuracy_score(test_labels, pred_labels))
'''

"\n%%time\nfrom sklearn.svm import SVC\nsinvecma = SVC()\nsinvecma.fit(train_tfidf_features, train_labels) \npred_labels = sinvecma.predict(test_tfidf_features)\nprint('Misclassified samples: {} out of {}'.format((pred_labels != test_labels).sum(),len(test_labels)))\nprint('Accuracy: %.2f' % accuracy_score(test_labels, pred_labels))\n"

In [47]:
# Side-by-side table of every test comment, the naive Bayes prediction and the
# true label. The frame is built column-wise so the two label columns keep an
# integer dtype; stacking the three sequences as rows and transposing would
# force every column to the generic object dtype.
ddf = pd.DataFrame({
    'text': test_texts,
    'predicted_senti': nb_pred_labels,
    'true_senti': test_labels,
})

In [48]:
# Name the three columns: comment text, predicted sentiment, true sentiment.
senti_columns = ['text', 'predicted_senti', 'true_senti']
ddf.columns = senti_columns

In [49]:
# First ten test comments whose predicted sentiment differs from the true one.
is_wrong = ddf['predicted_senti'].ne(ddf['true_senti'])
ddf.loc[is_wrong].head(10)

Unnamed: 0,text,predicted_senti,true_senti
7,"'장동건 얼굴 1점, 현빈 얼굴 1점씩 2점.'",1,0
17,'진짜 개연성도 내용도 없다',1,0
21,'나 원래 마블 믿고봣는대OO이건 쫌 아냐 케릭터들도 다 논리 모순 오지고 영화가 ...,1,0
39,'흠ㅜㅜ....;;;',1,0
41,'마블영화의 오점오락영화도아닌것이 아주 지루함',1,0
44,'이 영화보다 게임영상 보는게 더 재미있는거 실화냐jQuery224007125229...,1,0
46,"'이거저거 다 담아보려다가 망한 영화, 임팩트도 없고 늘어지고 같은 내용을 반복하다...",1,0
55,'전혀 와닿지 읺는 억지스런 내용은 물론 집중하기 힘든 어색한 연기들의 연속.. 선...,1,0
63,'영화보는동안 한번도 재미있는 순간이 없다...',1,0
78,'해리포터 수준 아이들 영화좋아하면 강추 어른이보기엠 최악',1,0


In [50]:
# First ten test comments whose predicted sentiment matches the true one.
is_right = ddf['predicted_senti'].eq(ddf['true_senti'])
ddf.loc[is_right].head(10)

Unnamed: 0,text,predicted_senti,true_senti
0,'초반부에 ost나올때 소름돋음..액션 스토리 유머 빠지는게 하나도 없어요!! 스케...,1,1
1,'완전 재미있었습니다!!',1,1
2,'저는 정말 재밌게 봤어요!!',1,1
3,'재밌음 2시간넘는 시간 지루할틈이없어요 ㅋㅋ',1,1
4,"'최고의 액션, 노 스턴트, 시원하고 짜릿하다. 월타 gv로 봤고 강추합니다'",1,1
5,'역시 미션임파서벌 최고',1,1
6,'와..진짜 시간 가는줄 모르고봄..첨엔 먼데이 이해안갔는데 먼데이 죽을때 이해했음...,1,1
8,'정말정말 재밌어요!!!굿입니다용ㅎㅎ',1,1
9,'난 다 좋았다...캐릭터들도..조인성도.',1,1
10,'역쉬 마블!!!기대를져버리지않네요~~재미지게 잘봤습니다',1,1


In [51]:
# Collect the distinct values of the second column of re_test.csv as inputs for
# the classifier. drop_duplicates() keeps first-seen order, so the list is the
# same on every run (a set has no defined order).
tw_test = pd.read_csv('re_test.csv')
tw_lst = tw_test.iloc[:, 1].drop_duplicates().tolist()

In [57]:
# Ad-hoc input: one hand-written comment. This rebinds tw_lst, so the list read
# from re_test.csv is not what gets classified in the cells below.
tw_lst = ['시간 아깝다']

In [58]:
# Vectorise the new sentences with the vectorizer fitted on the training split,
# so they share its vocabulary and IDF weights.
tw_lst_features = tfidf_vectorizer.transform(raw_documents=tw_lst)

In [59]:
# Predict the sentiment of the new sentences.
# NOTE(review): the result name says "nb", but the predictions come from `lr`
# (logistic regression), not from `MNB` — confirm which model was intended.
tw_nb_pred_labels = lr.predict(X=tw_lst_features)

In [60]:
# Pair each input sentence (column 0) with its predicted label (column 1).
# Built column-wise so the label column keeps an integer dtype; stacking the two
# sequences as rows and transposing would force both columns to object dtype.
tdf = pd.DataFrame({0: tw_lst, 1: tw_nb_pred_labels})

In [61]:
# Show the sentence / predicted-label table (column 0: text, column 1: label).
tdf

Unnamed: 0,0,1
0,시간 아깝다,0
