In [36]:
%cd /content/drive/MyDrive/multi/0428

/content/drive/MyDrive/multi/0428


In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install konlpy > /dev/null

### 네이버 영화평 감성 분석

In [5]:
import numpy as np
import pandas as pd
import konlpy

In [6]:
# 네이버 영화 리뷰 검색

train = pd.read_csv('https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt', sep='\t')
test = pd.read_csv('https://raw.githubusercontent.com/e9t/nsmc/master/ratings_test.txt', sep='\t')

In [7]:
print(train.shape, test.shape)
train.head()

(150000, 3) (50000, 3)


Unnamed: 0,id,document,label
0,9976970,아 더빙.. 진짜 짜증나네요 목소리,0
1,3819312,흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나,1
2,10265843,너무재밓었다그래서보는것을추천한다,0
3,9045019,교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정,0
4,6483659,사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 ...,1


### 1. 데이터 전처리
- train data

In [8]:
#Null 데이터가 있는 지 확인
train.isna().sum()

id          0
document    5
label       0
dtype: int64

In [10]:
train.dropna(how='any', inplace=True)
train.shape

(149995, 3)

In [11]:
# 중복 여부 확인
train.document.nunique()

146182

In [14]:
# 중복 제거
train.drop_duplicates(subset=['document'], inplace=True)
train.shape

(146182, 3)

In [16]:
train.label.value_counts()

0    73342
1    72840
Name: label, dtype: int64

- test data

In [17]:
test.isna().sum()

id          0
document    3
label       0
dtype: int64

In [18]:
test.dropna(how='any', inplace=True)

In [19]:
test.document.nunique()

49157

In [21]:
test.drop_duplicates(subset=['document'], inplace=True)
test.shape

(49157, 3)

In [22]:
test.label.value_counts()

1    24711
0    24446
Name: label, dtype: int64

### 2. 텍스트 전처리

In [24]:
train.document = train.document.str.replace('[^ㄱ-ㅎㅏ-ㅣ가-힣]', ' ').str.strip()
train.head()

  """Entry point for launching an IPython kernel.


Unnamed: 0,id,document,label
0,9976970,아 더빙 진짜 짜증나네요 목소리,0
1,3819312,흠 포스터보고 초딩영화줄 오버연기조차 가볍지 않구나,1
2,10265843,너무재밓었다그래서보는것을추천한다,0
3,9045019,교도소 이야기구먼 솔직히 재미는 없다 평점 조정,0
4,6483659,사이몬페그의 익살스런 연기가 돋보였던 영화 스파이더맨에서 늙어보이기만 했던 커스틴 ...,1


In [28]:
train.document.replace('',np.NAN, inplace=True)
train.document.isna().sum()

789

In [29]:
train.dropna(how='any', inplace=True)
train.shape

(145393, 3)

In [30]:
test.document = test.document.str.replace('[^ㄱ-ㅎㅏ-ㅣ가-힣]', ' ').str.strip()
test.head()

  """Entry point for launching an IPython kernel.


Unnamed: 0,id,document,label
0,6270596,굳 ㅋ,1
1,9274899,,0
2,8544678,뭐야 이 평점들은 나쁘진 않지만 점 짜리는 더더욱 아니잖아,0
3,6825595,지루하지는 않은데 완전 막장임 돈주고 보기에는,0
4,6723715,만 아니었어도 별 다섯 개 줬을텐데 왜 로 나와서 제 심기를 불편하게 하죠,0


In [31]:
test.document.replace('',np.NAN, inplace=True)
test.document.isna().sum()

305

In [32]:
test.dropna(how='any', inplace=True)
test.shape

(48852, 3)

- 전처리가 끝난 데이터 저장

In [37]:
train.to_csv('naver_movie_train_전처리완료.tsv', sep='\t', index=False)
test.to_csv('naver_movie_test_전처리완료.tsv', sep='\t', index=False)

### 3. 한글 처리

In [38]:
from konlpy.tag import Okt

okt = Okt()

text = "교도소 이야기구먼 솔직히 재미는 없다 평점 조정"
okt.morphs(text)

['교도소', '이야기', '구먼', '솔직히', '재미', '는', '없다', '평점', '조정']

In [39]:
okt.morphs(text, stem=True)

['교도소', '이야기', '구먼', '솔직하다', '재미', '는', '없다', '평점', '조정']

In [41]:
stopwords = ['의','가','이','은','들','는','좀','잘','걍','과','도','를','으로','자','에','와','한','하다','을','ㅋㅋ','ㅠㅠ','ㅎㅎ']
' '.join([word for word in okt.morphs(text, stem=True) if word not in stopwords])

'교도소 이야기 구먼 솔직하다 재미 없다 평점 조정'

In [None]:
from tqdm.notebook import tqdm

X_train = []
for sentence in tqdm(train.document):
    morphs = okt.morphs(sentence, stem=True)
    tmp_str = ' '.join([word for word in morphs if word not in stopwords])
    X_train.append(tmp_str)

In [45]:
len(X_train)

145393

In [46]:
X_test = []
for sentence in tqdm(test.document):
    morphs = okt.morphs(sentence, stem=True)
    tmp_str = ' '.join([word for word in morphs if word not in stopwords])
    X_test.append(tmp_str)

  0%|          | 0/48852 [00:00<?, ?it/s]

In [47]:
len(X_test)

48852

### 4. feature 변환 + 모델 학습/평가

In [50]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

In [51]:
cvect = CountVectorizer()
lrc = LogisticRegression(random_state=2022)
pipeline = Pipeline([('CVECT', cvect), ('LR', lrc)])

In [58]:
y_train = train.label.values
y_test = test.label.values

In [59]:
%time pipeline.fit(X_train, y_train)

CPU times: user 6.13 s, sys: 4.14 s, total: 10.3 s
Wall time: 6.04 s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Pipeline(steps=[('CVECT', CountVectorizer()),
                ('LR', LogisticRegression(random_state=2022))])

In [63]:
rev1 = '모든 국민이 봤으면 하는 영화입니다.'
rev2 = '생각보다 지루하고 별로였네요...... 보면서 좀 졸았습니다'

In [64]:
import re
rev1 = re.sub('[^가-힣]', ' ', rev1).strip()
rev2 = re.sub('[^가-힣]', ' ', rev2).strip()

In [65]:
rev1, rev2

('모든 국민이 봤으면 하는 영화입니다', '생각보다 지루하고 별로였네요       보면서 좀 졸았습니다')

In [67]:
morphs = okt.morphs(rev1, stem=True)
rev1 = ' '.join([word for word in morphs if word not in stopwords])

morphs = okt.morphs(rev2, stem=True)
rev2 = ' '.join([word for word in morphs if word not in stopwords])

In [68]:
pipeline.predict([rev1, rev2])

array([1, 0])

### 6. 최적 파라미터 찾기

In [70]:
from sklearn.model_selection import GridSearchCV

params = {
    'CVECT__ngram_range': [(1,1), (1,2)],
    'LR__C': [1,5]
}

In [72]:
grid_pipe = GridSearchCV(pipeline, params, scoring='accuracy', cv=3, n_jobs=-1)
grid_pipe.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('CVECT', CountVectorizer()),
                                       ('LR',
                                        LogisticRegression(random_state=2022))]),
             n_jobs=-1,
             param_grid={'CVECT__ngram_range': [(1, 1), (1, 2)],
                         'LR__C': [1, 5]},
             scoring='accuracy')

In [73]:
grid_pipe.best_params_

{'CVECT__ngram_range': (1, 2), 'LR__C': 1}

In [74]:
best_pipe = grid_pipe.best_estimator_
best_pipe.score(X_test, y_test)

0.8479898468844673