# 네이버 영화평 감성분석

In [1]:
!pip install konlpy > /dev/null

In [50]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

import re
from konlpy.tag import Okt

In [5]:
from google.colab import files

# Open the Colab upload dialog and keep the first key of the returned mapping
# (the uploaded file's name).
# NOTE(review): `filename` is overwritten by a later upload cell before it is read
# anywhere in the visible notebook — confirm this upload is still needed.
uploaded = files.upload()
filename = list(uploaded)[0]

Saving word2vec-nlp-tutorial.zip to word2vec-nlp-tutorial.zip


In [30]:
# NSMC movie-review ratings hosted on GitHub (columns: id, document, label).
train_url = "https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt"
test_url = "https://raw.githubusercontent.com/e9t/nsmc/master/ratings_test.txt"

# Both files are tab-separated, so load them with identical parser settings.
train_df, test_df = (pd.read_csv(url, sep='\t') for url in (train_url, test_url))

train_df.shape, test_df.shape


((150000, 3), (50000, 3))

In [31]:
# Peek at the first two reviews to confirm the column layout.
train_df.head(n=2)

Unnamed: 0,id,document,label
0,9976970,아 더빙.. 진짜 짜증나네요 목소리,0
1,3819312,흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나,1


## 데이터 전처리

- 트레인 데이터셋

In [32]:
# Check for duplicates: count distinct (non-missing) values in every column.
train_df.nunique(dropna=True)

id          150000
document    146182
label            2
dtype: int64

In [33]:
# Only `document` contains repeated values, so deduplicate on that column,
# keeping the first occurrence of each review text.
train_df.drop_duplicates(subset='document', keep='first', inplace=True)
train_df.nunique(dropna=True)

id          146183
document    146182
label            2
dtype: int64

In [34]:
# Count missing values per column.
train_df.isna().sum()

id          0
document    1
label       0
dtype: int64

In [35]:
# `document` holds a single missing value; drop that row, then re-check the counts.
train_df.dropna(subset=['document'], inplace=True)
train_df.isna().sum()

id          0
document    0
label       0
dtype: int64

- 테스트 데이터셋

In [36]:
# Same cleanup for the test split: deduplicate on the review text (first
# occurrence kept), then discard rows whose text is missing.
test_df.drop_duplicates(subset='document', keep='first', inplace=True)
test_df.dropna(subset=['document'], inplace=True)

In [37]:
# (rows, columns) of the train and test splits after cleanup.
tuple(frame.shape for frame in (train_df, test_df))

((146182, 3), (49157, 3))

## 텍스트 전처리

- 트레인 및 테스트 데이터셋

In [38]:
# Remove every character that is not Hangul (jamo or full syllable) or a space.
non_hangul = re.compile(r'[^ㄱ-ㅎㅏ-ㅣ가-힣 ]')
for frame in (train_df, test_df):
    # astype(str) guarantees string input before the substitution is applied.
    frame['document'] = frame['document'].astype(str).map(lambda text: non_hangul.sub('', text))

In [39]:
# Spot-check the cleaned text of the row labelled 1.
train_df.loc[1, 'document']

'흠포스터보고 초딩영화줄오버연기조차 가볍지 않구나'

## 한글 처리 (불용어 제거 및 형태소 분석)

In [40]:
from google.colab import files

# Upload the Korean stopword list through the Colab file picker; the first key of
# the returned mapping is kept in `filename` for the cell that reads the file.
uploaded = files.upload()
filename = list(uploaded)[0]

Saving 한국어불용어100.txt to 한국어불용어100.txt


In [45]:
# Parse the uploaded stopword list: whitespace-separated rows with no header line,
# given the column names word / pos / prob.
# The separator is a raw string so that `\s` reaches pandas as a regex token rather
# than being an invalid string escape (which newer Python versions warn about).
df = pd.read_csv(filename, sep=r'\s+', header=None)
df.columns = ['word', 'pos', 'prob']
# Persist a headered CSV copy (without the index column) for reuse.
df.to_csv('korean_stop_words.csv', index=False)

In [47]:
# Reload the headered stopword table and preview its first five rows.
stw = pd.read_csv('korean_stop_words.csv')
stw.head(n=5)

Unnamed: 0,word,pos,prob
0,이,VCP,0.01828
1,있,VA,0.011699
2,하,VV,0.009774
3,것,NNB,0.009733
4,들,XSN,0.006898


In [49]:
# Extract the stopword column as an array, then show its size and contents.
stopwords = stw['word'].values
print(len(stopwords), stopwords, sep='\n')

100
['이' '있' '하' '것' '들' '그' '되' '수' '이' '보' '않' '없' '나' '사람' '주' '아니' '등' '같'
 '우리' '때' '년' '가' '한' '지' '대하' '오' '말' '일' '그렇' '위하' '때문' '그것' '두' '말하'
 '알' '그러나' '받' '못하' '일' '그런' '또' '문제' '더' '사회' '많' '그리고' '좋' '크' '따르' '중'
 '나오' '가지' '씨' '시키' '만들' '지금' '생각하' '그러' '속' '하나' '집' '살' '모르' '적' '월' '데'
 '자신' '안' '어떤' '내' '내' '경우' '명' '생각' '시간' '그녀' '다시' '이런' '앞' '보이' '번' '나'
 '다른' '어떻' '여자' '개' '전' '들' '사실' '이렇' '점' '싶' '말' '정도' '좀' '원' '잘' '통하'
 '소리' '놓']


In [51]:
from tqdm.notebook import tqdm
okt = Okt()

# `stopwords` is a NumPy array, so `word not in stopwords` scans the whole array
# for every token. A set built once gives constant-time lookups with the same result.
stopword_set = set(stopwords)

# Tokenise each training review into stemmed morphemes, drop stopwords, and
# re-join with spaces so the vectoriser can split on whitespace later.
str_train = []
for sentence in tqdm(train_df.document):
    morphs = okt.morphs(sentence.strip(), stem=True)
    str_train.append(' '.join(word for word in morphs if word not in stopword_set))

  0%|          | 0/146182 [00:00<?, ?it/s]

In [53]:
# Build the stopword set here as well so this cell does not scan the NumPy
# `stopwords` array once per token; lookup results are unchanged.
stopword_set = set(stopwords)

# Tokenise each test review exactly as the training reviews: stemmed morphemes,
# stopwords removed, tokens re-joined with single spaces.
str_test = []
for sentence in tqdm(test_df.document):
    morphs = okt.morphs(sentence.strip(), stem=True)
    str_test.append(' '.join(word for word in morphs if word not in stopword_set))

  0%|          | 0/49157 [00:00<?, ?it/s]

In [58]:
# Label arrays taken from the same frames the tokenised documents were built from.
y_train = train_df['label'].values
y_test = test_df['label'].values

# Lengths should pair up: train docs/labels, then test docs/labels.
tuple(map(len, (str_train, y_train, str_test, y_test)))

(146182, 146182, 49157, 49157)

## Feature 변환

- CountVectorizer

In [59]:
# Bag-of-words features: learn the vocabulary from the training text only, then
# project the test text onto that same vocabulary.
# NOTE(review): CountVectorizer's default token pattern keeps only tokens of two or
# more word characters, so single-character morphemes are discarded — confirm intended.
cvect = CountVectorizer()
# fit_transform learns the vocabulary and builds the matrix in one pass instead of
# tokenising the training corpus twice (fit, then transform).
X_train = cvect.fit_transform(str_train)
X_test = cvect.transform(str_test)

X_train.shape, X_test.shape

((146182, 42092), (49157, 42092))