In [3]:
# Switch into the project folder on the mounted Drive. The path lives under
# /content/drive, so on a fresh kernel the drive.mount(...) cell must run first.
%cd /content/drive/MyDrive/multi/0428

/content/drive/MyDrive/multi/0428


In [1]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import numpy as np
import pandas as pd

In [5]:
# Load the tab-separated training set and preview the first rows.
df = pd.read_csv(
    './train/trainData.tsv',
    sep='\t',
    quoting=3,  # 3 == csv.QUOTE_NONE: double quotes stay in the text as plain characters
)
df.head()

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."
3,"""3630_4""",0,"""It must be assumed that those who praised thi..."
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ..."


- 텍스트 전처리

In [6]:
# Turn HTML line-break tags into a single space, then trim the ends.
# The tag is matched as a literal string, so no regex engine is needed.
df['review'] = df['review'].str.replace('<br />', ' ', regex=False).str.strip()

In [7]:
# Replace every character that is not an ASCII letter with a space, then trim.
# regex=True is passed explicitly: without it pandas 1.x emits a FutureWarning, and
# pandas 2.x treats the pattern as a literal string, so nothing would be cleaned.
df.review = df.review.str.replace('[^a-zA-Z]', ' ', regex=True).str.strip()
# Preview the first 1000 characters of the first cleaned review.
df.review[0][:1000]

  """Entry point for launching an IPython kernel.


'With all this stuff going down at the moment with MJ i ve started listening to his music  watching the odd documentary here and there  watched The Wiz and watched Moonwalker again  Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent  Moonwalker is part biography  part feature film which i remember going to see at the cinema when it was originally released  Some of it has subtle messages about MJ s feeling towards the press and also the obvious message of drugs are bad m kay   Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring  Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him   The actual feature film bit when it finally starts is only on for  

In [9]:
from sklearn.model_selection import train_test_split

# Hold-out split with the library's default proportions, stratified on the label so
# both parts keep the same class balance; the fixed seed makes it repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df.review,
    df.sentiment,
    random_state=2022,
    stratify=df.sentiment,
)

#### Pipeline
- TfidfVectorizer + Logistic Regression

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

In [11]:
# TF-IDF features over unigrams and bigrams, using the built-in English stop-word list.
tvect = TfidfVectorizer(stop_words='english', ngram_range=(1, 2))
# Logistic regression classifier with a fixed seed.
lrc = LogisticRegression(random_state=2022)

# The step names ('TVECT', 'LR') are the prefixes used to address step parameters.
pipeline = Pipeline(steps=[('TVECT', tvect), ('LR', lrc)])

In [14]:
# List the vectorizer's full configuration, defaults included.
tvect.get_params()

{'analyzer': 'word',
 'binary': False,
 'decode_error': 'strict',
 'dtype': numpy.float64,
 'encoding': 'utf-8',
 'input': 'content',
 'lowercase': True,
 'max_df': 1.0,
 'max_features': None,
 'min_df': 1,
 'ngram_range': (1, 2),
 'norm': 'l2',
 'preprocessor': None,
 'smooth_idf': True,
 'stop_words': 'english',
 'strip_accents': None,
 'sublinear_tf': False,
 'token_pattern': '(?u)\\b\\w\\w+\\b',
 'tokenizer': None,
 'use_idf': True,
 'vocabulary': None}

In [12]:
# Train: fit the vectorizer and the classifier on the training split.
# %time reports how long the fit takes.
%time pipeline.fit(X_train, y_train)

CPU times: user 36.6 s, sys: 14.4 s, total: 51 s
Wall time: 36.5 s


Pipeline(steps=[('TVECT',
                 TfidfVectorizer(ngram_range=(1, 2), stop_words='english')),
                ('LR', LogisticRegression(random_state=2022))])

In [13]:
# Score the baseline pipeline on the held-out test split.
pipeline.score(X_test, y_test)

0.87472

- 최적 파라미터 찾기

In [15]:
from sklearn.model_selection import GridSearchCV

# Search grid. Keys follow the '<pipeline step name>__<parameter>' convention.
# The max_df values are integers, so they are absolute document counts (terms found
# in more documents than this are dropped), not fractions of the corpus.
params = {
    'TVECT__max_df': [100, 500],
    'LR__C': [1,10]
}

In [18]:
# Cross-validated search (5 folds, accuracy) over every combination in `params`.
# n_jobs=-1 runs the candidate fits in parallel on all available cores.
grid_pipe = GridSearchCV(
    estimator=pipeline,
    param_grid=params,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)
grid_pipe.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('TVECT',
                                        TfidfVectorizer(ngram_range=(1, 2),
                                                        stop_words='english')),
                                       ('LR',
                                        LogisticRegression(random_state=2022))]),
             n_jobs=-1,
             param_grid={'LR__C': [1, 10], 'TVECT__max_df': [100, 500]},
             scoring='accuracy')

In [19]:
# Show the winning parameter combination, then score the best pipeline found by
# the search on the held-out test split.
print(grid_pipe.best_params_)
best_pipe = grid_pipe.best_estimator_
best_pipe.score(X_test, y_test)

{'LR__C': 10, 'TVECT__max_df': 500}


0.87552

- 모델 저장하고 불러오기

In [20]:
import joblib
# Persist the whole fitted pipeline (vectorizer + classifier) to a single file
# in the current working directory.
joblib.dump(best_pipe, 'imdb_tvect_lr.pkl')

['imdb_tvect_lr.pkl']

In [22]:
# List the working directory to confirm the model file was written.
!ls -l

total 152412
-rw------- 1 root root    12949 Apr 28 01:20 01_BagofWord.ipynb
-rw------- 1 root root    28909 Apr 28 02:49 11_IMDB.ipynb
-rw------- 1 root root    16364 Apr 28 04:33 12_IMDB2.ipynb
-rw------- 1 root root 39184702 Apr 28 02:40 imdb_cvect12.pkl
-rw------- 1 root root 44370573 Apr 28 02:40 imdb_nb2.pkl
-rw------- 1 root root 72446571 Apr 28 04:33 imdb_tvect_lr.pkl
drwx------ 2 root root     4096 Apr 28 01:28 test
drwx------ 3 root root     4096 Apr 28 01:28 train


In [23]:
# Reload the saved pipeline from disk. joblib files are pickle-based and can run
# arbitrary code when loaded, so only load files from a trusted source.
best_pipe = joblib.load('imdb_tvect_lr.pkl')

In [28]:
# Sample review text used to try out the reloaded pipeline.
review = '''
Spider-Man: No Way Home is a phenomenal conclusion to the trilogy and Holland's best outing as Spidey yet.
Starts off fun, safe and familiar and then becomes extremely emotional, satisfying and full of great callbacks.
A love letter to all things Spider-Man.
Tom Holland gives an incredible lead performance once again, reliably charming and likeable but with a lot more emotional heft this time around.
Zendaya and Jacob Batalon both have perfect chemistry with Holland and are also incredible in their own ways.
Benedict Cumberbatch reaffirms why he's such a good Doctor Strange with his excellent dry wit and gravitas.
All the returning villains give strong performances but Willem Dafoe is definitely the standout performance with an unsettling and terrifying presence.
Jon Watts' direction is superb, the action sequences are thrillingly acrobatic once again but refreshingly impactful this time around.
There's a few homages to the styles of the previous iterations and some gorgeous imagery.
It's also perfectly paced with none of its roughly 2hr 30 minute runtime feeling dull or overly long due to a strong momentum established early on.
The CG is extremely impressive with only a few weak spots.
The music by Michael Giacchino is fantastic, a nice balance of themes from the previous iterations and use of Holland's iconic motifs which are slightly altered in ways that work as well as a few new additions that are beautifully epic.
'''

In [29]:
import re

# Replace every non-letter character with a space and trim, so the text has the
# same letters-only form as the reviews the model was trained on.
review = re.compile('[^a-zA-Z]').sub(' ', review).strip()

In [30]:
# The pipeline takes a collection of documents, so the single review is wrapped in a list.
best_pipe.predict([review])

array([1])