#### 1. 데이터 탐색

In [120]:
import csv
import os

import numpy as np
import pandas as pd

In [121]:
# Load the tab-separated labeled training reviews using pandas' default quote handling.
df = pd.read_csv(
    'data/labeledTrainData.tsv',
    sep='\t',
)

In [122]:
# Preview the first rows of the frame loaded above.
df.head()

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


In [123]:
# Reload with quoting disabled so double quotes inside the reviews are kept as
# literal characters instead of being interpreted as field quoting.
# csv.QUOTE_NONE is the named constant for the bare value 3.
df = pd.read_csv('data/labeledTrainData.tsv', sep='\t', quoting=csv.QUOTE_NONE)
df.head()

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."
3,"""3630_4""",0,"""It must be assumed that those who praised thi..."
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ..."


In [124]:
# Summarize the columns: dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   id         25000 non-null  object
 1   sentiment  25000 non-null  int64 
 2   review     25000 non-null  object
dtypes: int64(1), object(2)
memory usage: 586.1+ KB


In [125]:
# Inspect the first 1000 characters of the first review's raw text.
df['review'].iloc[0][:1000]

'"With all this stuff going down at the moment with MJ i\'ve started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ\'s feeling towards the press and also the obvious message of drugs are bad m\'kay.<br /><br />Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.<br /><br />The actual feature film bit when it finally

#### 2. 텍스트 전처리

In [126]:
# Replace the HTML line-break tags with a single space.
# regex=False pins this to a literal match, so the result does not depend on
# which default for `regex` the installed pandas version uses.
df['review'] = df['review'].str.replace('<br />', ' ', regex=False)

In [127]:
# Remove punctuation and digits: every character that is not an ASCII letter
# is replaced with a blank.
df['review'] = df['review'].str.replace('[^A-Za-z]', ' ', regex=True)

In [128]:
# Check the first 200 characters of the first review after cleaning.
df['review'].iloc[0][:200]

' With all this stuff going down at the moment with MJ i ve started listening to his music  watching the odd documentary here and there  watched The Wiz and watched Moonwalker again  Maybe i just want '

#### 3. 데이터셋 분리

In [129]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the reviews for testing, stratified on the sentiment label
# and seeded so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df.review.values,
    df.sentiment,
    test_size=0.2,
    stratify=df.sentiment,
    random_state=2023,
)
# Show how many training samples fall into each class.
np.unique(y_train, return_counts=True)

(array([0, 1], dtype=int64), array([10000, 10000], dtype=int64))

#### 4. Text Encoding

In [130]:
from sklearn.feature_extraction.text import CountVectorizer

# Count-based text encoder configured with the built-in 'english' stop-word list.
cvect = CountVectorizer(
    stop_words='english',
)

In [131]:
# Anti-pattern, shown on purpose -- do NOT do this: calling fit_transform
# separately on the train and test splits refits the vectorizer each time, so
# the two matrices come out with different numbers of columns.
(
    cvect.fit_transform(X_train).shape,
    cvect.fit_transform(X_test).shape,
)

((20000, 66602), (5000, 37763))

In [132]:
# Correct usage: fit the vectorizer on the training split only, then encode
# both splits with that same fitted vectorizer so their columns line up.
X_train_cv = cvect.fit(X_train).transform(X_train)
X_test_cv = cvect.transform(X_test)
(X_train_cv.shape, X_test_cv.shape)

((20000, 66602), (5000, 66602))

#### 5. 학습 및 평가

In [133]:
from sklearn.linear_model import LogisticRegression

# Seeded logistic-regression classifier with the iteration cap set to 500.
lrc = LogisticRegression(
    max_iter=500,
    random_state=2023,
)

In [134]:
# Long-running step: the %time line magic reports how long the fit takes.
%time lrc.fit(X_train_cv, y_train)

CPU times: total: 5.27 s
Wall time: 5.33 s


In [135]:
# Score the fitted unigram model on the held-out test split.
lrc.score(X_test_cv,y_test)

0.8786

#### 6. Bigram

In [136]:
# Second encoder with ngram_range=(1, 2): same stop-word setting, fitted on the
# training split only and then used to encode both splits.
cvect2 = CountVectorizer(stop_words='english', ngram_range=(1, 2)).fit(X_train)
X_train_cv2 = cvect2.transform(X_train)
X_test_cv2 = cvect2.transform(X_test)
(X_train_cv2.shape, X_test_cv2.shape)

((20000, 1455899), (5000, 1455899))

In [137]:
# Train a second classifier with the same settings on the (1,2)-gram features;
# %time reports how long the fit takes.
lrc2 = LogisticRegression(random_state=2023, max_iter=500)
%time lrc2.fit(X_train_cv2,y_train)

CPU times: total: 53.8 s
Wall time: 46.2 s


In [138]:
# Score the (1,2)-gram model on the held-out test split.
lrc2.score(X_test_cv2,y_test)

0.8896

#### 7. 모델 save / load

In [139]:
import joblib

In [140]:
# Make sure the output directory exists first: the dump calls below write
# straight to these paths and fail if `model/` is missing (e.g. a fresh checkout).
os.makedirs('model', exist_ok=True)
# Persist the fitted vectorizer together with its classifier; both are loaded
# again for inference further down.
joblib.dump(cvect2, 'model/imdb_cvect_2.pkl')
joblib.dump(lrc2, 'model/imdb_lrc2.pkl')


['model/imdb_lrc2.pkl']

In [141]:
# Restore the persisted vectorizer and classifier from the files written above.
new_cvect, new_lrc = (
    joblib.load('model/imdb_cvect_2.pkl'),
    joblib.load('model/imdb_lrc2.pkl'),
)

In [142]:
# Two hand-written sample reviews for a quick inference check: the first is
# worded as praise, the second as a complaint.
review = ['''
This isn't just a beautifully crafted gangster film.
Or an outstanding family portrait, for that matter.An amazing period piece.
A character study. A lesson in filmmaking and an inspiration to generations of actors, directors, screenwriters and producers.
For me, this is more: this is the definitive film.
10 stars out of 10.
''',
'''
I follow recommendations on this site highly. 
I rented this movie and wanted my money back. 
Ever been to one of those parties with distant relatives where you don't know anyone there and just sit in the corner waiting for it to end? 
If so, you've seen 90% of this movie. Throw in a few good scenes that happen so far apart, you forget the last one by the time you see the next one.
Might be worth watching once just to say you have, but you'll probably never watch it again. 
Definitely not "best movie ever material."
''']

In [143]:
import re

# Apply the same letters-only cleanup that was used on the training reviews.
# Build a list rather than a lazy `map` iterator: an iterator is exhausted
# after one pass, so re-running the vectorization cell would see no documents.
review = [re.sub('[^A-Za-z]', ' ', text) for text in review]

In [144]:
# Encode the cleaned sample reviews with the reloaded vectorizer.
review_cv = new_cvect.transform(review)

In [145]:
# One row per sample review; the column count should match the training matrix.
review_cv.shape

(2, 1455899)

In [146]:
# Predict a sentiment label for each sample review (1 = positive, otherwise negative).
# A scalar check such as `predict(...) == 1` only suits a single review; with
# several reviews, predict returns one label per row.
# NOTE(review): the recorded output labels both samples as 1 even though the
# second review is worded negatively -- worth a closer look.
new_lrc.predict(review_cv)

array([1, 1], dtype=int64)