In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
def tf_extractor(corpus):
    """Build a term-frequency (raw count) document-term matrix.

    Args:
        corpus: the documents to vectorize (passed straight to
            ``CountVectorizer.fit_transform``).

    Returns:
        A ``(vectorizer, features)`` tuple: the CountVectorizer fitted on
        ``corpus`` and the count matrix it produced for ``corpus``.
    """
    # Unigrams only; keep every term that appears in at least one document.
    count_vectorizer = CountVectorizer(ngram_range=(1, 1), min_df=1)
    count_matrix = count_vectorizer.fit_transform(corpus)
    return count_vectorizer, count_matrix

In [3]:
def tfidf_extractor(corpus):
    """Build a tf-idf weighted document-term matrix.

    Args:
        corpus: the documents to vectorize (passed straight to
            ``TfidfVectorizer.fit_transform``).

    Returns:
        A ``(vectorizer, features)`` tuple: the TfidfVectorizer fitted on
        ``corpus`` and the tf-idf matrix it produced for ``corpus``.
    """
    # Unigrams only, no minimum document frequency cut-off, smoothed idf
    # weighting and L2-normalised document vectors.
    options = {
        'min_df': 1,
        'norm': 'l2',
        'smooth_idf': True,
        'use_idf': True,
        'ngram_range': (1, 1),
    }
    tfidf_vectorizer = TfidfVectorizer(**options)
    tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)
    return tfidf_vectorizer, tfidf_matrix

In [4]:
# Read the review file: each line is split on a double tab, and only lines
# that yield exactly three fields are kept. From those, the second field is
# taken as the review text and the third is parsed as an integer score.
with open('2016_filtered_review.txt', encoding='utf-8') as f:
    docs = [
        (fields[1], int(fields[2]))
        for fields in (line.strip().split('\t\t') for line in f)
        if len(fields) == 3
    ]

# Separate the (text, score) pairs into two parallel sequences.
texts, scores = zip(*docs)

In [5]:
# Assign a sentiment label to each review based on its score.
#   score < 4          -> negative, label -1
#   score > 8          -> positive, label  1
#   4 <= score <= 8    -> skipped (not used for training or testing)
# NOTE(review): the boundary scores 4 and 8 are excluded by this condition;
# confirm that is intended, since an earlier version of this comment
# described the ranges as 1~4 and 8~10.
filtered_texts, filtered_labels = [], []

for text, score in zip(texts, scores):
    if score < 4 or score > 8:
        filtered_texts.append(text)
        filtered_labels.append(-1 if score < 4 else 1)

In [6]:
# Sequential 70/30 split: the first 70% of the filtered reviews are used as
# training data and everything after them as test data.
num_reviews = len(filtered_texts)
num_train = int(num_reviews * 0.7)

train_texts = filtered_texts[:num_train]
train_labels = filtered_labels[:num_train]
# The test slices start exactly at num_train so that the review at index
# num_train is not dropped: train and test together cover every review.
test_texts = filtered_texts[num_train:]
test_labels = filtered_labels[num_train:]

In [7]:
# Alternative to slicing by position: let scikit-learn produce the 70/30 split.
# Running this cell rebinds train_texts / test_texts / train_labels /
# test_labels, replacing any split assigned to those names earlier.
from sklearn.model_selection import train_test_split

train_texts, test_texts, train_labels, test_labels = train_test_split(
    filtered_texts,
    filtered_labels,
    test_size=0.3,
    random_state=0,  # fixed seed so the split is reproducible
)

In [7]:
# Fit the count vectorizer on the training documents (input: a list of docs)
# and reuse the fitted vectorizer to transform the test documents.
tf_vectorizer, train_tf_features = tf_extractor(train_texts)
test_tf_features = tf_vectorizer.transform(test_texts)

# vocabulary_ maps each term to its column index in the feature matrix (not
# to a frequency). Ordering the terms by that index gives a list in which
# vocablist[i] is the term for feature column i.
vocablist = sorted(tf_vectorizer.vocabulary_, key=tf_vectorizer.vocabulary_.get)

In [9]:
# Logistic regression on the term-frequency (count) matrix.
# C is the inverse of the regularization strength: the smaller C is, the
# stronger the penalty. A stronger L1 penalty reduces the number of features
# that much more, while a stronger L2 penalty (used here) pushes the weights
# closer to 0.
lr = LogisticRegression(penalty='l2', C=0.1, random_state=0)
lr.fit(train_tf_features, train_labels)  # train
pred_labels = lr.predict(test_tf_features)

# Element-wise comparison of predictions with the true labels.
num_misclassified = (pred_labels != test_labels).sum()
print('Misclassified samples: {} out of {}'.format(num_misclassified, len(test_labels)))
print('Accuracy: %.2f' % accuracy_score(test_labels, pred_labels))



Misclassified samples: 10766 out of 164522
Accuracy: 0.93


In [15]:
# Logistic regression on the tf-idf matrix, with an L1 (lasso-style) penalty.
tfidf_vectorizer, train_tfidf_features = tfidf_extractor(train_texts)
test_tfidf_features = tfidf_vectorizer.transform(test_texts)

# The solver is set explicitly because the L1 penalty is not supported by
# every solver: scikit-learn's newer default ('lbfgs') rejects penalty='l1',
# whereas 'liblinear' (the older default) supports it.
# NOTE: this rebinds `lr` and `pred_labels`, so any later cell that reads
# `lr` sees this tf-idf model when the notebook is run top to bottom.
lr = LogisticRegression(C=0.1, penalty='l1', solver='liblinear', random_state=0)
lr.fit(train_tfidf_features, train_labels)  # train
pred_labels = lr.predict(test_tfidf_features)
print('Misclassified samples: {} out of {}'.format((pred_labels != test_labels).sum(),len(test_labels)))
print('Accuracy: %.2f' % accuracy_score(test_labels, pred_labels))

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


Misclassified samples: 12071 out of 164522
Accuracy: 0.93


In [10]:
# Rank the features by the weight the fitted model `lr` learned for them.
# NOTE(review): `vocablist` is built from tf_vectorizer, while `lr` is
# whichever model was fitted most recently; the index -> term lookup below is
# only valid if that model's feature columns follow the same order — confirm.
coefficients = lr.coef_.tolist()

# Each feature column has its own coefficient (weight). Pair every weight in
# the first row with its column index and sort from largest to smallest.
sorted_coefficients = sorted(
    enumerate(coefficients[0]),
    key=lambda pair: pair[1],
    reverse=True,
)

print(sorted_coefficients[:5])

# First the 50 largest weights (most positive terms), then the 50 smallest
# (most negative terms, ending with the most negative one).
for ranked in (sorted_coefficients[:50], sorted_coefficients[-50:]):
    for idx, coef in ranked:
        print('{0:} ({1:.3f})'.format(vocablist[idx], coef))

[(8296, 2.724390737323108), (50360, 2.5960653247932326), (50295, 2.5225153393315187), (49708, 2.5213259573042963), (49727, 2.370122339054413)]
꿀잼 (2.724)
재밌었 (2.596)
재밌게 (2.523)
재미있게 (2.521)
재미있었 (2.370)
재밌어 (2.360)
재미있어 (2.052)
여운 (2.024)
재밌네 (1.985)
재미있네 (1.944)
존잼 (1.884)
강추 (1.866)
재밌고 (1.839)
최고 (1.750)
재미있 (1.740)
재밌 (1.735)
재밋어 (1.725)
테러 (1.701)
최고다 (1.688)
재밌던 (1.679)
지루하지 (1.670)
즐겁 (1.667)
굿굿 (1.665)
재밋 (1.657)
낮아 (1.655)
개꿀잼 (1.631)
지루할 (1.608)
흥미진진 (1.585)
대박 (1.582)
수작 (1.575)
재밋음 (1.539)
울었 (1.522)
재미있고 (1.504)
빠져 (1.501)
졸잼 (1.477)
감탄 (1.475)
가는 (1.474)
가슴 (1.470)
심장 (1.462)
유쾌 (1.423)
재미나 (1.412)
사랑해 (1.388)
탄탄 (1.379)
재밌는 (1.376)
만점 (1.351)
충분히 (1.347)
즐거운 (1.330)
만족 (1.322)
케미 (1.309)
슬펐 (1.305)
팔이 (-1.505)
삼류 (-1.507)
억지로 (-1.532)
알바 (-1.552)
클레멘타인 (-1.567)
자다 (-1.593)
미화 (-1.599)
망했 (-1.601)
짜증 (-1.621)
졸다 (-1.632)
신파극 (-1.643)
아깝 (-1.644)
나가고 (-1.646)
하품 (-1.667)
왜곡 (-1.669)
별루 (-1.670)
아까웠 (-1.680)
망쳐 (-1.683)
없고 (-1.735)
불륜 (-1.743)
거품 (-1.771)
지루해 (-1.773)
실망했 