In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
def tf_extractor(corpus):
    """Build a raw term-count document-term matrix from ``corpus``.

    Args:
        corpus: Iterable of document strings used to fit the vectorizer.

    Returns:
        A ``(vectorizer, features)`` tuple: the fitted ``CountVectorizer``
        and the matrix produced by its ``fit_transform`` call.
    """
    # Unigrams only; min_df=1 keeps every term that appears in at least one document.
    count_vectorizer = CountVectorizer(ngram_range=(1, 1), min_df=1)
    term_counts = count_vectorizer.fit_transform(corpus)
    return count_vectorizer, term_counts

In [3]:
def tfidf_extractor(corpus):
    """Build a TF-IDF weighted document-term matrix from ``corpus``.

    Args:
        corpus: Iterable of document strings used to fit the vectorizer.

    Returns:
        A ``(vectorizer, features)`` tuple: the fitted ``TfidfVectorizer``
        and the matrix produced by its ``fit_transform`` call.
    """
    # Unigram features, smoothed IDF weighting, rows normalised to unit L2 length.
    tfidf_options = dict(
        min_df=1,
        norm='l2',
        smooth_idf=True,
        use_idf=True,
        ngram_range=(1, 1),
    )
    tfidf_vectorizer = TfidfVectorizer(**tfidf_options)
    weighted_features = tfidf_vectorizer.fit_transform(corpus)
    return tfidf_vectorizer, weighted_features

In [4]:
# Each row is split on a double tab. Only rows that yield exactly three fields
# are kept; the second field is stored as the text and the third is parsed as
# an integer score.
docs = []
with open('2016_filtered_review.txt', encoding='utf-8') as f:
    for line in f:
        fields = line.strip().split('\t\t')
        if len(fields) != 3:
            continue  # skip rows that do not split into exactly three fields
        docs.append((fields[1], int(fields[2])))

# Unzip the (text, score) pairs into two parallel tuples.
texts, scores = zip(*docs)

In [7]:
import pandas as pd

# Load the merged comments CSV and display it as the cell output.
# The previous path literal used backslashes in a normal string; a backslash
# followed by an upper-case U starts a unicode escape, so the cell failed with
# a SyntaxError before pandas was ever called. It was also missing the
# separator after 'ahn92' and the CSV file name. Forward slashes avoid the
# escape problem and are accepted on Windows.
# NOTE(review): the file name is taken from the recorded FileNotFoundError
# below this cell; this absolute, machine-specific path should be confirmed
# (or replaced by a configurable data directory).
pd.read_csv('C:/Users/ahn92/Dropbox/Sources/ODS/Final_project/merged_comments.csv')

FileNotFoundError: File b'C:/Users/ahn92/Dropbox/Sources/ODS/Final_project/merged_comments.csv' does not exist

In [5]:
filtered_texts = []
filtered_labels = []

# Label each review by its rating and discard the middle of the scale.
# As implemented:
#   score below 4        -> negative, label -1
#   score above 8        -> positive, label  1
#   4 through 8 inclusive -> dropped
for text, score in zip(texts, scores):
    if score < 4:
        label = -1
    elif score > 8:
        label = 1
    else:
        continue  # mid-range rating: not used for training or testing

    filtered_texts.append(text)
    filtered_labels.append(label)

In [6]:
num_reviews = len(filtered_texts)  # 788189 in the original author's recorded run

# Use the first 70% of the reviews as training data and the remaining 30% as
# test data (a plain positional split, no shuffling).
num_train = int(num_reviews * 0.7)  # 551732 in the original author's recorded run
train_texts = filtered_texts[:num_train]
train_labels = filtered_labels[:num_train]
# The test slice must start exactly at num_train. Starting at num_train + 1
# silently dropped the review at index num_train from both splits.
test_texts = filtered_texts[num_train:]
test_labels = filtered_labels[num_train:]

In [7]:
# Alternative to the manual slicing: let scikit-learn split the data.
# test_size=0.3 holds out 30% of the reviews for testing and random_state=0
# makes the split repeatable.
from sklearn.model_selection import train_test_split

split_parts = train_test_split(
    filtered_texts,
    filtered_labels,
    test_size=0.3,
    random_state=0,
)
train_texts, test_texts, train_labels, test_labels = split_parts

In [8]:
# Fit the term-count vectorizer on the training documents (input: a list of
# document strings), then encode the test documents with the same fitted vocabulary.
tf_vectorizer, train_tf_features = tf_extractor(train_texts)
test_tf_features = tf_vectorizer.transform(test_texts)

# vocabulary_ maps each term to its column index in the feature matrix (it does
# not hold frequencies). Ordering the terms by that index makes vocablist[j]
# the term that belongs to feature column j.
term_to_column = tf_vectorizer.vocabulary_
vocablist = sorted(term_to_column, key=term_to_column.get)

In [9]:
# Case 1: classifier trained on the term-count (tf) matrix.
# penalty='l2' is ridge-style regularisation (lasso would be 'l1').
# C is the inverse of the regularisation strength: the smaller C is, the
# stronger the penalty. A stronger L1 penalty removes more features, while a
# stronger L2 penalty pushes the weights closer to zero.
lr = LogisticRegression(penalty='l2', C=0.1, random_state=0)
lr.fit(train_tf_features, train_labels)  # training
pred_labels = lr.predict(test_tf_features)

num_misclassified = (pred_labels != test_labels).sum()
num_test = len(test_labels)
print('Misclassified samples: {} out of {}'.format(num_misclassified, num_test))

accuracy = accuracy_score(test_labels, pred_labels)
print('Accuracy: %.2f' % accuracy)



Misclassified samples: 12191 out of 164523
Accuracy: 0.93


In [10]:
# Case 2: classifier trained on the tf-idf matrix.
tfidf_vectorizer, train_tfidf_features = tfidf_extractor(train_texts)
test_tfidf_features = tfidf_vectorizer.transform(test_texts)

# L1 (lasso-style) regularised logistic regression.
# The solver is pinned to 'liblinear' because the default solver in
# scikit-learn >= 0.22 ('lbfgs') does not support the L1 penalty and raises a
# ValueError. 'liblinear' was the default in earlier releases, so this keeps
# the behaviour the cell was written for.
lr = LogisticRegression(C=0.1, penalty='l1', solver='liblinear', random_state=0)
lr.fit(train_tfidf_features, train_labels)  # training
pred_labels = lr.predict(test_tfidf_features)
print('Misclassified samples: {} out of {}'.format((pred_labels != test_labels).sum(),len(test_labels)))
print('Accuracy: %.2f' % accuracy_score(test_labels, pred_labels))



Misclassified samples: 14075 out of 164523
Accuracy: 0.91


In [11]:
# Pull the fitted model's coefficients out as nested Python lists.
coefficients = lr.coef_.tolist()

# Every feature column has one coefficient (weight). Pair each weight with its
# column index and rank the pairs from the largest weight to the smallest.
sorted_coefficients = sorted(
    enumerate(coefficients[0]),
    key=lambda pair: pair[1],
    reverse=True,
)

print(sorted_coefficients[:5])

# NOTE(review): vocablist is built from tf_vectorizer, while lr appears to have
# been last fit on the tf-idf features; this lookup presumably relies on both
# vectorizers producing the same vocabulary - confirm.
# First the 50 largest weights (strongest positive words), then the 50
# smallest weights (strongest negative words).
for ranked_slice in (sorted_coefficients[:50], sorted_coefficients[-50:]):
    for column, weight in ranked_slice:
        print('{0:} ({1:.3f})'.format(vocablist[column], weight))

[(50619, 8.697089720864168), (50561, 8.646307679099666), (8402, 7.592908058014507), (49991, 7.10802362991817), (58598, 6.985686877006735)]
재밌었 (8.697)
재밌게 (8.646)
꿀잼 (7.593)
재미있게 (7.108)
최고 (6.986)
재미있었 (6.688)
재밌어 (6.500)
여운 (6.410)
가슴 (5.960)
강추 (5.309)
대박 (4.949)
재밌 (4.743)
가는 (4.706)
감동 (4.420)
재밌고 (4.351)
재밌네 (4.212)
재미있어 (4.129)
감사합 (4.118)
역시 (4.083)
지루하지 (4.032)
존잼 (3.973)
눈물 (3.959)
지루할 (3.897)
마음 (3.882)
울었 (3.872)
재미있 (3.815)
유쾌 (3.814)
재밋 (3.699)
빠져 (3.674)
즐겁 (3.669)
심장 (3.652)
탄탄 (3.591)
재미있네 (3.589)
매력 (3.510)
굿굿 (3.466)
흥미진진 (3.408)
재밌는 (3.390)
충분히 (3.375)
최고다 (3.358)
아이 (3.350)
괜찮 (3.278)
테러 (3.171)
시키 (3.099)
약간 (3.087)
사랑 (3.023)
있었 (3.002)
재미있고 (2.946)
감사 (2.897)
평론가 (2.892)
전문가 (2.851)
유치 (-3.728)
감독 (-3.744)
졸다 (-3.757)
실망했 (-3.771)
만들 (-3.788)
거품 (-3.789)
클레멘타인 (-3.794)
졸려 (-3.811)
신파극 (-3.830)
지루해 (-3.871)
하품 (-3.893)
왜곡 (-4.126)
억지로 (-4.151)
그닥 (-4.178)
졸았 (-4.199)
어이 (-4.275)
발연기 (-4.282)
삼류 (-4.332)
아깝 (-4.377)
불륜 (-4.458)
아까운 (-4.484)
불면증 (-4.750)
이하 (-4.765