In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
# Install a global "ignore" filter: from here on no warning of any category is shown.
# NOTE(review): presumably done to keep cell outputs compact, but it also hides
# deprecation/convergence notices from the libraries used below — consider narrowing it.
warnings.filterwarnings('ignore')

# 감성 분석(Sentiment Analysis)

- 문서의 주관적인 감성/의견/감정/기분 등을 파악하기 위한 방법
- 소셜미디어, 여론조사, 온라인 리뷰, 피드백 등 다양한 분야에서 활용
- 문서 내 텍스트가 나타내는 여러 가지 주관적인 단어와 문맥을 기반으로 감성(Sentiment) 수치를 계산하는 방법을 이용
- 감성 지수는 긍정 감성 지수와 부정 감성 지수로 구성되며, 이들 지수를 합산해 긍정 감성 또는 부정 감성을 결정함

**감성 분석 방식**
- 지도학습 방식의 감성 분석
    - 학습 데이터와 타깃 레이블 값을 기반으로 감성 분석 학습을 수행한 뒤 이를 기반으로 다른 데이터의 감성 분석을 예측하는 방법
    - 일반적인 텍스트 기반의 분류와 거의 동일함   
- 비지도학습 방식의 감성 분석
    - Lexicon과 같은 감성 어휘 사전을 이용하여 문서의 부정적, 긍정적 감성 여부를 판단
    - Lexicon은 감성 분석을 위한 용어와 문맥에 대한 다양한 정보를 가지고 있음

### 실습. IMDB 영화리뷰 분석

- 영화평의 텍스트를 분석해 감성 분석 결과가 긍정 또는 부정인지를 예측하는 모델


- 사용 데이터 : IMDB 영화 사이트의 영화평
    - 출처 : https://www.kaggle.com/c/word2vec-nlp-tutorial/data
    - 캐글 사이트 로그인 후 Download All 선택하여 다운로드

#### 데이터 준비

In [4]:
# Tab-separated file whose first row holds the column names.
# quoting=3 is csv.QUOTE_NONE: double quotes are kept as ordinary characters
# instead of being interpreted as field delimiters.
review_df = pd.read_csv(
    'data/IMDB/labeledTrainData.tsv',
    sep='\t',
    header=0,
    quoting=3,
)
review_df.head()

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."
3,"""3630_4""",0,"""It must be assumed that those who praised thi..."
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ..."


#### 로드된 데이터의 피처

- id : 각 데이터의 id
- sentiment : 영화평(review)의 Sentiment 결과 값(Target Label). 1은 긍정적 평가, 0은 부정적 평가
- review : 영화평 텍스트

In [7]:
# Show the raw text of the review stored under row label 0.
review_df.loc[0, 'review']

'"With all this stuff going down at the moment with MJ i\'ve started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ\'s feeling towards the press and also the obvious message of drugs are bad m\'kay.<br /><br />Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.<br /><br />The actual feature film bit when it finally

#### 데이터 정제 및 전처리

In [8]:
import re

# Replace each HTML line-break tag with a single space and store the result in a new
# 'clean' column, leaving the original 'review' column untouched.
# regex=False is passed explicitly: the default of `regex` differs between pandas
# releases, and the tag must always be matched as a literal string.
review_df['clean'] = review_df['review'].str.replace('<br />', ' ', regex=False)

# Inspect the first cleaned review.
review_df.loc[0, 'clean']

'"With all this stuff going down at the moment with MJ i\'ve started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ\'s feeling towards the press and also the obvious message of drugs are bad m\'kay.  Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.  The actual feature film bit when it finally starts is only on f

In [9]:
# Replace every character that is not an ASCII letter (digits, punctuation, quotes, ...)
# with a space. The vectorized .str accessor is used instead of a per-row Python lambda;
# unlike a direct regex call on each element, it passes missing values through.
review_df['clean'] = review_df['clean'].str.replace('[^a-zA-Z]', ' ', regex=True)

# Inspect the first review after letter-only filtering.
review_df.loc[0, 'clean']

' With all this stuff going down at the moment with MJ i ve started listening to his music  watching the odd documentary here and there  watched The Wiz and watched Moonwalker again  Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent  Moonwalker is part biography  part feature film which i remember going to see at the cinema when it was originally released  Some of it has subtle messages about MJ s feeling towards the press and also the obvious message of drugs are bad m kay   Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring  Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him   The actual feature film bit when it finally starts is only on for 

#### 학습 데이터와 테스트 데이터로 분리

In [12]:
from sklearn.model_selection import train_test_split

# Features: a one-column DataFrame holding the cleaned review text.
x = review_df[['clean']]
# Target: the sentiment label column.
y = review_df['sentiment']

# Hold out 30% of the rows for evaluation; the fixed random_state makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=156,
)

#### 피처벡터화 후 분류 모델 예측

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, roc_auc_score
from sklearn.pipeline import Pipeline

#### Count 벡터화 적용 후 분류모델 예측
- Pipeline 객체 사용
- 분류모델 : 로지스틱회귀

In [16]:
# Count-based pipeline: unigram + bigram counts with English stop words removed,
# followed by logistic regression with C=10.
pipe = Pipeline(steps=[
    ('CNT', CountVectorizer(stop_words='english', ngram_range=(1, 2))),
    ('LR', LogisticRegression(C=10)),
])
pipe.fit(x_train['clean'], y_train)

# Class predictions, plus the second predict_proba column (used for ROC AUC).
pred = pipe.predict(x_test['clean'])
pred_proba = pipe.predict_proba(x_test['clean'])[:, 1]

# Summary metrics on the held-out split.
acc = accuracy_score(y_test, pred)
roc_auc = roc_auc_score(y_test, pred_proba)

print(f'confusion matrix\n{confusion_matrix(y_test, pred)}')
print(classification_report(y_test, pred, digits=4))
print(f'Accuracy: {acc:.4f}, ROC_AUC: {roc_auc:.4f}')

confusion matrix
[[3250  430]
 [ 425 3395]]
              precision    recall  f1-score   support

           0     0.8844    0.8832    0.8838      3680
           1     0.8876    0.8887    0.8882      3820

    accuracy                         0.8860      7500
   macro avg     0.8860    0.8859    0.8860      7500
weighted avg     0.8860    0.8860    0.8860      7500

Accuracy: 0.8860, ROC_AUC: 0.9503


#### TF-IDF 벡터화를 적용해 분류모델 예측

In [17]:
# TF-IDF pipeline: unigram + bigram TF-IDF weights with English stop words removed,
# followed by logistic regression with C=10.
pipe = Pipeline(steps=[
    ('TFIDF', TfidfVectorizer(stop_words='english', ngram_range=(1, 2))),
    ('LR', LogisticRegression(C=10)),
])
pipe.fit(x_train['clean'], y_train)

# Class predictions, plus the second predict_proba column (used for ROC AUC).
pred = pipe.predict(x_test['clean'])
pred_proba = pipe.predict_proba(x_test['clean'])[:, 1]

# Summary metrics on the held-out split.
acc = accuracy_score(y_test, pred)
roc_auc = roc_auc_score(y_test, pred_proba)

print(f'confusion matrix\n{confusion_matrix(y_test, pred)}')
print(classification_report(y_test, pred, digits=4))
print(f'Accuracy: {acc:.4f}, ROC_AUC: {roc_auc:.4f}')

confusion matrix
[[3257  423]
 [ 375 3445]]
              precision    recall  f1-score   support

           0     0.8968    0.8851    0.8909      3680
           1     0.8906    0.9018    0.8962      3820

    accuracy                         0.8936      7500
   macro avg     0.8937    0.8934    0.8935      7500
weighted avg     0.8936    0.8936    0.8936      7500

Accuracy: 0.8936, ROC_AUC: 0.9598


-----