In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
def tf_extractor(corpus):  
    # returns a frequency-based DTM
    vectorizer = CountVectorizer(min_df=1, ngram_range=(1,1)) 
    features = vectorizer.fit_transform(corpus) # transform texts to a frequency matrix
    return vectorizer, features  

In [3]:
def tfidf_extractor(corpus):
    # returns a tf-idf based DTM
    vectorizer = TfidfVectorizer(min_df=1, 
                                 norm='l2',
                                 smooth_idf=True,
                                 use_idf=True,
                                 ngram_range=(1,1))
    features = vectorizer.fit_transform(corpus)
    return vectorizer, features

In [4]:
with open('2016_filtered_review.txt', encoding='utf-8') as f:
    docs = [doc.strip().split('\t\t') for doc in f]
    docs = [(doc[1], int(doc[2])) for doc in docs if len(doc) == 3]
    # To read the second and third column info from each row
    texts, scores = zip(*docs)
    # 둘을 분리해서 별도의 list 변수로 저장

FileNotFoundError: [Errno 2] No such file or directory: '2016_filtered_review.txt'

In [5]:
#import pandas as pd
df = pd.read_csv('merged_comments.csv')

In [12]:
scores = tuple(df.iloc[:,3])
texts = tuple(df.iloc[:,4])

In [13]:
filtered_texts = []
filtered_labels = []

for text, score in zip(texts, scores):
    if 4 <= score <= 8:
        continue
        
    # 평점 기준으로 문서에 label을 부여
    # 1 ~ 4 -> 부정, -1
    # 8 ~ 10 -> 긍정, 1
    filtered_texts.append(text)
    filtered_labels.append(1 if score > 8 else -1)

In [14]:
num_reviews = len(filtered_texts) #788189

num_train = int(num_reviews*0.7) #551732
# 전체 리뷰 중에서 70%를 training data로 사용하고, 나머지 30%를 test data로 사용
train_texts = filtered_texts[:num_train]
train_labels = filtered_labels[:num_train]
test_texts = filtered_texts[num_train+1:]
test_labels = filtered_labels[num_train+1:]

In [16]:
# You can also use the following method
from sklearn.model_selection import train_test_split
train_texts, test_texts, train_labels, test_labels = train_test_split(filtered_texts, filtered_labels, test_size=0.3, random_state=0)

In [17]:
tf_vectorizer, train_tf_features = tf_extractor(train_texts)
# input의 형태 = list of docs
test_tf_features = tf_vectorizer.transform(test_texts)
vocablist = [word for word, _ in sorted(tf_vectorizer.vocabulary_.items(), key=lambda x:x[1])]
# tf_vectorizer.vocabulary_.items() returns a list of (word, frequency)
# We sort words based on their frequencies and save the words

In [18]:
# tf matrix를 사용한 경우
lr = LogisticRegression(C=0.1, penalty='l2', random_state=0) # Lasso regression
# C = Inverse of regularization strength, 즉 C 값이 작을수록 penalty를 많이 준다는 것입니다.
# penalty를 많이 준다는 뜻은 L1 같은 경우는 feature의 수를 그만큼 많이 줄인다는 뜻이고
# L2인 경우는 weight 값을 더 0에 가깝게 한다는 뜻입니다.
lr.fit(train_tf_features, train_labels) # 학습
pred_labels = lr.predict(test_tf_features)
print('Misclassified samples: {} out of {}'.format((pred_labels != test_labels).sum(),len(test_labels)))
print('Accuracy: %.2f' % accuracy_score(test_labels, pred_labels))



Misclassified samples: 19173 out of 170106
Accuracy: 0.89


In [19]:
# tfidf matrix를 사용한 경우
tfidf_vectorizer, train_tfidf_features = tfidf_extractor(train_texts)
test_tfidf_features = tfidf_vectorizer.transform(test_texts)
lr = LogisticRegression(C=0.1, penalty='l1', random_state=0) # Lasso regression
lr.fit(train_tfidf_features, train_labels) # 학습
pred_labels = lr.predict(test_tfidf_features)
print('Misclassified samples: {} out of {}'.format((pred_labels != test_labels).sum(),len(test_labels)))
print('Accuracy: %.2f' % accuracy_score(test_labels, pred_labels))

Misclassified samples: 24484 out of 170106
Accuracy: 0.86


In [20]:
# Get coefficients of the model
coefficients = lr.coef_.tolist()

sorted_coefficients = sorted(enumerate(coefficients[0]), key=lambda x:x[1], reverse=True)
# 학습에 사용된 각 단어마다의 coefficient (즉 weight) 값이 존재
# coefficient값이 큰 순으로 정렬 'reverse=True'

print(sorted_coefficients[:5])
# print top 50 positive words
for word, coef in sorted_coefficients[:50]:
    print('{0:} ({1:.3f})'.format(vocablist[word], coef))
# print top 50 negative words
for word, coef in sorted_coefficients[-50:]:
    print('{0:} ({1:.3f})'.format(vocablist[word], coef))

[(504279, 9.659400584263386), (440515, 8.851325781716714), (467113, 8.7756171900682), (352347, 8.07702074475683), (437736, 7.884602329433185)]
최고 (9.659)
재밌었어요 (8.851)
좋았어요 (8.776)
여운이 (8.077)
재밌게 (7.885)
재밌어요 (7.775)
꿀잼 (7.541)
재미있게 (7.511)
좋았습니다 (7.149)
눈물이 (7.026)
재밌었음 (6.832)
재밌고 (6.356)
좋고 (6.290)
재미있었어요 (6.211)
최고다 (6.084)
좋았고 (5.742)
좋았다 (5.712)
최고의 (5.666)
역시 (5.642)
존잼 (5.604)
재밌는데 (5.537)
마음이 (5.503)
봤어요 (5.427)
재밌다 (5.367)
좋았음 (5.341)
좋아요 (5.338)
대박 (5.315)
재밌음 (5.241)
강추 (5.150)
재미있어요 (5.121)
최고입니다 (5.024)
믿고보는 (4.943)
재밌었다 (4.823)
지루하지 (4.821)
모르고 (4.712)
개꿀잼 (4.676)
주지훈 (4.623)
재밌었습니다 (4.603)
오랜만에 (4.571)
감동 (4.447)
아름다운 (4.444)
아깝지 (4.431)
인생영화 (4.370)
소름 (4.352)
재밌습니다 (4.315)
있는 (4.289)
재미있고 (4.251)
감사합니다 (4.196)
가슴이 (4.176)
간만에 (4.091)
보다가 (-4.947)
ㅡㅡ (-5.008)
영화냐 (-5.055)
돈주고 (-5.072)
지루함 (-5.080)
고구마 (-5.108)
억지 (-5.182)
졸작 (-5.212)
개연성도 (-5.267)
이걸 (-5.291)
무슨 (-5.420)
내돈 (-5.448)
알바가 (-5.480)
재미없어서 (-5.481)
2점 (-5.521)
0점 (-5.616)
거르는 (-5.668)
알바 (-5.671)
알바들 (-5.6