In [85]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import style
style.use('ggplot')
%matplotlib inline

In [86]:
## 주피터노트북 cell 너비 조정

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:75% !important; }</style>"))

# 영화 댓글을 이용한 감성 분석

> ## 데이터 다운로드

- windows

In [None]:
# !bitsadmin /transfer get https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt %cd%\ratings_train.txt
# !bitsadmin /transfer get https://raw.githubusercontent.com/e9t/nsmc/master/ratings_test.txt %cd%\ratings_test.txt    

-  mac / linux

In [None]:
!wget https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt
!wget https://raw.githubusercontent.com/e9t/nsmc/master/ratings_test.txt

-------

In [1]:
import codecs
import numpy as np
with codecs.open("ratings_train.txt", encoding='utf-8') as f:
    train = [line.split('\t') for line in f.read().splitlines()]
    train = train[1:]   # header 제외
f.close()

In [2]:
with codecs.open("ratings_test.txt", encoding='utf-8') as f:
    test = [line.split('\t') for line in f.read().splitlines()]
    test = test[1:]   # header 제외
f.close()

# 나이브베이즈 모델을 이용한 감성 분석 (Sentiment Analysis)
- 네이버 영화 리뷰
    - 평점 3점 이상이면 긍정 / 3점 미만이면 부정
- 사전확률 계산
- likelihood 계산
- 모델 실행

In [3]:
train[:3]

[['9976970', '아 더빙.. 진짜 짜증나네요 목소리', '0'],
 ['3819312', '흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나', '1'],
 ['10265843', '너무재밓었다그래서보는것을추천한다', '0']]

In [4]:
test[:3]

[['6270596', '굳 ㅋ', '1'],
 ['9274899', 'GDNTOPCLASSINTHECLUB', '0'],
 ['8544678', '뭐야 이 평점들은.... 나쁘진 않지만 10점 짜리는 더더욱 아니잖아', '0']]

## 1. 사전확률 계산
- ## $ P(y = C_k) $

In [5]:
def count_target(training_set):
    from collections import defaultdict, Counter
    counts = defaultdict(int)
    for i, row in enumerate(training_set):
        sentiment = row[2]
        if sentiment == '1':
            counts[sentiment] += 1
        else:
            counts[sentiment] += 1
    return counts

In [6]:
count_target(train)

defaultdict(int, {'0': 75173, '1': 74827})

In [7]:
def prob_target(training_set):
    pos_prob = count_target(test)['1']/sum(count_target(test).values())
    neg_prob = count_target(test)['0']/sum(count_target(test).values())
    return pos_prob, neg_prob

In [8]:
prob_target(test)

(0.50346, 0.49654)

## 2. likelihood 계산
- ## $P(x|y=C_k)$
- Laplace Smoothing
    - ### $\frac {N_i + \alpha}{N+\alpha K}$
    - 매우 작은 값을 추가하여 값이 0이 되지 않도록 한다. 
    - 여러값의 곱을 취할 경우 하나만 0이되면 전체가 0이 되는 문제가 있기 때문이다.
        - ex. '메가박스'라는 단어가 긍정을 표현한 리뷰에만 포함된 경우
            - P(메가박스 | C_neg) = 0 이 되므로 P(C_neg | 메가박스) 도 0이 되어버린다.
            - 라플라스 스무딩 적용 시, P(메가박스 | C_neg) = k / n+k 가 된다.

### (1) 단어별 카운트

In [9]:
def count_words(training_set):
    from collections import defaultdict
    counts = defaultdict(lambda: [0, 0])
    for i, row in enumerate(training_set):
        review_words = row[1].split()
        sentiment = row[2]
        for word in review_words:
            counts[word][0 if sentiment == '1' else 1] += 1 ## word count | C_pos, word count | C_neg
    return counts

In [10]:
{'정우성' : [2, 3]}

{'정우성': [2, 3]}

### (2) 단어별 확률계산

In [11]:
def word_probabilities(counts, target, k=1):
    """laplace smoothing 적용"""
    from collections import defaultdict
    probabilities = defaultdict(dict)
    total_pos = target['1']
    total_neg = target['0']
    for w, (positive, negative) in counts.items():
        probabilities[w] = ((positive + k) / (total_pos + k),  ## P(word | C_pos)
                            (negative + k) / (total_neg + k))  ## P(word | C_neg)
        
    return probabilities

In [12]:
word_probabilities(count_words(train), count_target(train))['정우성']

(0.0002271876837547442, 0.00017293213078990076)

## 3. 사후확률 계산
- ## $P(y=C_k|x)$
- 언더플로우 방지를 위한 log 연산
    - 확률 계산 시 확률을 계속 곱해주게되면 매우 낮은 값들니 나와 언더플로우 현상이 발생할 수 있다.
    - 이를 방지하기 위해 $P(x∣y=C_{ k })$를 $log(P(x∣y=C_{ k }))$ 로 바꿔서 연산한다. 
    - ## $log(P(x∣y=C_{ k })) =\sum _{i=1}^{P}log({P(x_j|y=C_k)})$

In [13]:
def sentiment_probability(word_probs, prob_target, review):
    import math
    review_words = review.split()
    log_prob_if_neg = log_prob_if_pos = 0.0
    pos_prob = prob_target[0]
    neg_prob = prob_target[1]
    
    for word in review_words:
        # 긍정 확률
        if word in word_probs:
            log_prob_if_pos += math.log(word_probs[word][0])
            log_prob_if_neg += math.log(word_probs[word][1])
        else:
            pass
#             log_prob_if_pos += math.log((0 + 1)/self.total_pos + 1)
#             log_prob_if_neg += math.log((0 + 1)/self.total_neg + 1)
    prob_if_pos = log_prob_if_pos + math.log(pos_prob) ## log(P(x|C_pos)P(C_pos))
    prob_if_neg = log_prob_if_neg + math.log(neg_prob) ## log(P(x|C_neg)P(C_neg))
    return prob_if_pos , prob_if_neg

In [14]:
word_probs = word_probabilities(count_words(train), count_target(train))

In [15]:
target_prob = prob_target(train)

In [16]:
test[2]

['8544678', '뭐야 이 평점들은.... 나쁘진 않지만 10점 짜리는 더더욱 아니잖아', '0']

In [17]:
pos_proba, neg_proba = sentiment_probability(word_probs, target_prob, test[2][1])
print('긍정 리뷰 확률 : ', pos_proba)
print('부정 리뷰 확률 : ', neg_proba)

긍정 리뷰 확률 :  -62.30355173446977
부정 리뷰 확률 :  -59.931758953659916


## 3. 모델링

In [21]:
class NaiveBayesClassifier:
    def __init__(self, k = 1):
        self.k = k
        
    def count_target(self, training_set):
        from collections import defaultdict
        counts = defaultdict(int)
        for i, row in enumerate(training_set):
            sentiment = row[2]
            if sentiment == '1':
                counts[sentiment] += 1
            else:
                counts[sentiment] += 1
        return counts
    
    def prob_target(self, training_set):
        pos_prob = self.count_target(training_set)['1']/sum(self.count_target(training_set).values())
        neg_prob = self.count_target(training_set)['0']/sum(self.count_target(training_set).values())
        return pos_prob, neg_prob

    def count_words(self, training_set):
        from collections import defaultdict
        counts = defaultdict(lambda: [0, 0])
        for i, row in enumerate(training_set):
            review_words = row[1].split()
            sentiment = row[2]
            for word in review_words:
                counts[word][0 if sentiment == '1' else 1] += 1      
        return counts

    def word_probabilities(self, counts, target, k=1):
        from collections import defaultdict
        """laplace smoothing 적용"""
        probabilities = defaultdict(dict)
        self.total_pos = target['1']
        self.total_neg = target['0']
        for w, (positive, negative) in counts.items():
            probabilities[w] = ((positive + k) / (self.total_pos + k), 
                                (negative + k) / (self.total_neg + k))

        return probabilities

    def sentiment_probability(self, word_prob, review):
        import math
        review_words = review.split()
        log_prob_if_neg = log_prob_if_pos = 0.0
        pos_prob = self.pos_prob
        neg_prob = self.neg_prob
        
        for word in review_words:
            # 긍정 확률
            if word in word_prob:
                log_prob_if_pos += math.log(word_prob[word][0])
                log_prob_if_neg += math.log(word_prob[word][1])
            else:
                log_prob_if_pos += math.log((0 + self.k)/self.total_pos + self.k)
                log_prob_if_neg += math.log((0 + self.k)/self.total_neg + self.k)
                
        prob_if_pos = log_prob_if_pos + math.log(pos_prob)
        prob_if_neg = log_prob_if_neg + math.log(neg_prob)
        return prob_if_pos , prob_if_neg
    
    def train(self, training_set):
        self.pos_prob, self.neg_prob = self.prob_target(training_set)
        word_counts = self.count_words(training_set)
        target_counts = self.count_target(training_set)
        self.word_prob = self.word_probabilities(word_counts, target_counts, k = self.k)
    
    def predict(self, review):
        prob_if_pos , prob_if_neg = self.sentiment_probability(self.word_prob, review)
        if prob_if_pos > prob_if_neg:
            return '1'
        else:
            return '0'

## 4. Training

In [24]:
classifier = NaiveBayesClassifier()

In [25]:
classifier.train(train)

## 5. 결과

In [26]:
true_target = []
predicted_target = []
for i, row in enumerate(test):
    true_target.append(row[2])
    predicted_target.append(classifier.predict(row[1]))

In [27]:
from sklearn.metrics import classification_report

print(classification_report(true_target, predicted_target))

             precision    recall  f1-score   support

          0       0.79      0.83      0.81     24827
          1       0.82      0.78      0.80     25173

avg / total       0.81      0.80      0.80     50000



## Scikit-learn

In [28]:
X = list(zip(*train))[1]
y = np.array(list(zip(*train))[2], dtype=int)

In [29]:
X_test = list(zip(*test))[1]
y_test = np.array(list(zip(*test))[2], dtype=int)

In [31]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
## http://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html

model = Pipeline([
            ('vect', CountVectorizer()), 
            ('mb', MultinomialNB()),
        ])

In [32]:
model.fit(X, y)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)), ('mb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [33]:
print(classification_report(y_test, model.predict(X_test)))

             precision    recall  f1-score   support

          0       0.81      0.84      0.83     24827
          1       0.84      0.81      0.82     25173

avg / total       0.83      0.83      0.83     50000



In [34]:
## Pipeline 없이 사용

vectorizer = CountVectorizer().fit(X)
x_vec = vectorizer.transform(X)
model = MultinomialNB().fit(x_vec, y)

In [30]:
print(classification_report(y_test, model.predict(vectorizer.transform(X_test))))

             precision    recall  f1-score   support

          0       0.81      0.84      0.83     24827
          1       0.84      0.81      0.82     25173

avg / total       0.83      0.83      0.83     50000



----------

# 텍스트에서 노이즈 제거 후 모델 성능 평가

In [31]:
from konlpy.tag import Twitter
t = Twitter()

In [32]:
t.tagset

{'Adjective': '형용사',
 'Adverb': '부사',
 'Alpha': '알파벳',
 'Conjunction': '접속사',
 'Determiner': '관형사',
 'Eomi': '어미',
 'Exclamation': '감탄사',
 'Foreign': '외국어, 한자 및 기타기호',
 'Hashtag': '트위터 해쉬태그',
 'Josa': '조사',
 'KoreanParticle': '(ex: ㅋㅋ)',
 'Noun': '명사',
 'Number': '숫자',
 'PreEomi': '선어말어미',
 'Punctuation': '구두점',
 'ScreenName': '트위터 아이디',
 'Suffix': '접미사',
 'Unknown': '미등록어',
 'Verb': '동사'}

- 텍스트 전처리
    - 조사 등 의미없는 형태소는 제외

In [34]:
tag_set = ['Adjective', 'Adverb', 'Noun', 'Verb']

In [82]:
import copy
from tqdm import tqdm
train_proc = copy.deepcopy(train)
test_proc = copy.deepcopy(test)

for _, row in tqdm(enumerate(train_proc)):
    row[1] = ' '.join([pos[0] for pos in t.pos(row[1]) if pos[1] in tag_set])
    
for _, row in tqdm(enumerate(test_proc)):
    row[1] = ' '.join([pos[0] for pos in t.pos(row[1]) if pos[1] in tag_set])

150000it [05:20, 467.42it/s]
50000it [02:25, 344.46it/s]


In [83]:
train_proc[1]

['3819312', '흠 포스터 보고 초딩 영화 줄 오버 연기 가볍 않구', '1']

In [84]:
train[1]

['3819312', '흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나', '1']

- 모델 학습

In [77]:
classifier_1 = NaiveBayesClassifier()
classifier_1.train(train_proc)

In [78]:
true_target = []
predicted_target = []
for i, row in enumerate(test_proc):
    true_target.append(row[2])
    predicted_target.append(classifier_1.predict(row[1]))

In [79]:
from sklearn.metrics import classification_report
print(classification_report(true_target, predicted_target))

             precision    recall  f1-score   support

          0       0.81      0.87      0.84     24827
          1       0.86      0.80      0.83     25173

avg / total       0.84      0.84      0.84     50000



### 텍스트 데이터 분석시, 전처리(노이즈 제거)가 매우 중요하다.