# IMDB 영화평 감성분석
- Pipeline
- TfidfVectorizer + LogisticRegression

In [1]:
import numpy as np
import pandas as pd


In [2]:
# Load the labeled review data (tab-separated file) and preview the first 3 rows.
df = pd.read_csv(
    'data/labeledTrainData.tsv',
    sep='\t',
)
df.head(3)

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...


- 텍스트 전처리

In [4]:
# Replace each HTML line-break tag (<br />) in the review text with a single space.
df['review'] = df['review'].str.replace('<br />', ' ')

In [5]:
# Replace every character that is not an ASCII letter with a space, then trim
# leading/trailing whitespace. `.str` is the pandas accessor that applies string
# methods element-wise to a Series of text.
# regex=True must be explicit: without it pandas emits a warning on older
# versions and, from pandas 2.0 on, treats the pattern as a literal string,
# which would leave the reviews uncleaned.
df.review = df.review.str.replace('[^A-Za-z]', ' ', regex=True).str.strip()

  df.review = df.review.str.replace('[^A-Za-z]',' ').str.strip()


- Train/Test dataset으로 분리

In [6]:
from sklearn.model_selection import train_test_split
# Hold-out split of review text (features) and sentiment (labels).
# The split is stratified on the label and uses a fixed seed for repeatability;
# no test_size is passed, so the library default applies.
X_train, X_test, y_train, y_test = train_test_split(
    df['review'],
    df['sentiment'],
    stratify=df['sentiment'],
    random_state=2022,
)

- Pipeline: TfidfVectorizer + LogisticRegression

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

In [10]:
# Two-step text classifier: TF-IDF features over unigrams and bigrams
# (English stop words removed) feeding a logistic regression.
pipeline = Pipeline(
    steps=[
        ('tvect', TfidfVectorizer(ngram_range=(1, 2), stop_words='english')),
        ('lr', LogisticRegression(random_state=2022)),
    ]
)

In [11]:
# Same pipeline, built from separately named step objects so that `tvect` and
# `lr` stay available as standalone variables. This rebinds `pipeline`.
tvect = TfidfVectorizer(ngram_range=(1, 2), stop_words='english')
lr = LogisticRegression(random_state=2022)
pipeline = Pipeline(steps=[('tvect', tvect), ('lr', lr)])

In [12]:
# Train: fit the vectorizer and classifier on the training split (timed with %time).
%time pipeline.fit(X_train, y_train)

CPU times: user 57 s, sys: 9.98 s, total: 1min 7s
Wall time: 16.1 s


Pipeline(steps=[('tvect',
                 TfidfVectorizer(ngram_range=(1, 2), stop_words='english')),
                ('lr', LogisticRegression(random_state=2022))])

In [13]:
# Evaluate: score the fitted pipeline on the held-out test split.
pipeline.score(X_test, y_test)

0.87472

- 최적 하이퍼 파라미터 찾기

In [17]:
from sklearn.model_selection import GridSearchCV
# Search grid for GridSearchCV. Keys use the `<step name>__<parameter>` form so
# each value list is routed to the matching pipeline step ('tvect' or 'lr').
# The max_df candidates are integers, i.e. absolute document counts.
params = dict(
    tvect__max_df=[100, 500],
    lr__C=[1, 10],
)

In [18]:
grid_pipe = GridSearchCV(
    pipeline, param_grid= params, scoring='accuracy', cv=3, n_jobs=-1
)
%time grid_pipe.fit(X_train, y_train)

CPU times: user 1min 11s, sys: 18.3 s, total: 1min 29s
Wall time: 54.6 s


GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('tvect',
                                        TfidfVectorizer(ngram_range=(1, 2),
                                                        stop_words='english')),
                                       ('lr',
                                        LogisticRegression(random_state=2022))]),
             n_jobs=-1,
             param_grid={'lr__C': [1, 10], 'tvect__max_df': [100, 500]},
             scoring='accuracy')

In [19]:
# Show the hyperparameter combination selected by the grid search.
grid_pipe.best_params_

{'lr__C': 10, 'tvect__max_df': 500}

In [20]:
# Evaluate the best estimator found by the grid search on the held-out test split.
grid_pipe.best_estimator_.score(X_test, y_test)

0.87552

- 모델 저장

In [21]:
import os

import joblib

# Persist the best pipeline (vectorizer + classifier) to disk.
# The output directory is created first so the dump does not fail on a fresh
# checkout where 'model/' is missing.
os.makedirs('model', exist_ok=True)
joblib.dump(grid_pipe.best_estimator_, 'model/imdb_pipe.pkl')

['model/imdb_pipe.pkl']

In [22]:
# Reload the saved pipeline from disk.
# NOTE: joblib.load unpickles the file, so only load files from a trusted source.
best_pipe = joblib.load('model/imdb_pipe.pkl')

In [None]:
# Placeholder for a raw review text; as written the string holds only a newline.
# Presumably a review is meant to be pasted between the triple quotes - TODO confirm.
review = '''
'''

In [None]:
# Text preprocessing for a single review, mirroring the steps applied to the
# training data: first turn <br /> tags into spaces, then replace every
# non-letter character with a space and trim the ends. Without the first step
# the tag would leave a stray "br" token that the training text never had.
import re
clean_review = re.sub('[^A-Za-z]', ' ', review.replace('<br />', ' ')).strip()

In [None]:
# Display the pipeline that was reloaded from disk.
best_pipe