## IMDB 영화 리뷰 감성 분석 - Conv1D

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras.datasets import imdb
import warnings

# Suppress every warning for the rest of the session so library deprecation
# notices do not clutter cell output. Note this also hides warnings that may
# matter while debugging.
warnings.filterwarnings(action='ignore')

### Conv1D로 IMDB 리뷰 감성 분석
- 단어 빈도수 : 10,000 (총 88,584)
- 문장의 단어수 : 500 (2,494)
- test data 중 40% (10,000개)는 검증용으로

In [2]:
import tensorflow as tf

# One shared seed for the NumPy and TensorFlow global RNGs; `seed` is reused
# later as the random_state of the test/validation split.
seed = 2022
tf.random.set_seed(seed)
np.random.seed(seed)

In [5]:
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Embedding, Dense, Dropout
from tensorflow.keras.layers import Conv1D, MaxPooling1D, GlobalMaxPooling1D
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

In [6]:
# Load the IMDB reviews as integer-encoded word sequences, keeping only the
# 10,000 most frequent words (num_words) in the vocabulary.
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=10_000)

# Display the shapes of the raw (still variable-length) splits.
tuple(arr.shape for arr in (X_train, X_test, y_train))

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz


((25000,), (25000,), (25000,))

In [7]:
# Pad/truncate every review to the same length so the splits become dense
# 2-D arrays of shape (n_samples, max_len).
max_len = 500
X_train, X_test = (
    pad_sequences(seqs, maxlen=max_len) for seqs in (X_train, X_test)
)
X_train.shape, X_test.shape

((25000, 500), (25000, 500))

In [8]:
from sklearn.model_selection import train_test_split

# Carve 40% of the original test set off as a validation set, stratified on
# the label so both parts keep the same class balance.
# NOTE(review): this overwrites X_test / y_test, so re-running the cell
# without re-running the cells above splits the already-reduced test set again.
X_test, X_valid, y_test, y_valid = train_test_split(
    X_test,
    y_test,
    test_size=0.4,
    stratify=y_test,
    random_state=seed,
)
tuple(part.shape for part in (X_test, X_valid, y_test, y_valid))

((15000, 500), (10000, 500), (15000,), (10000,))

Case 1) Conv1D, MaxPooling1D, Dropout, GlobalMaxPooling1D


> 
- embedding dim : 100
- Conv1D filters : 64


In [16]:
# Case 1: embedding -> dropout -> one Conv1D/MaxPooling1D stage -> global max
# pooling -> sigmoid output for the binary sentiment label.
model1 = Sequential([
    Embedding(input_dim=10000, output_dim=100, input_length=max_len),
    Dropout(rate=0.5),
    Conv1D(filters=64, kernel_size=5, activation='relu'),
    MaxPooling1D(pool_size=5),
    GlobalMaxPooling1D(),
    Dense(units=1, activation='sigmoid'),
])
model1.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 500, 100)          1000000   
                                                                 
 dropout_3 (Dropout)         (None, 500, 100)          0         
                                                                 
 conv1d_3 (Conv1D)           (None, 496, 64)           32064     
                                                                 
 max_pooling1d_3 (MaxPooling  (None, 99, 64)           0         
 1D)                                                             
                                                                 
 global_max_pooling1d_1 (Glo  (None, 64)               0         
 balMaxPooling1D)                                                
                                                                 
 dense_3 (Dense)             (None, 1)                

In [17]:
# Pass compile() arguments by keyword: the third positional parameter is
# `metrics` in tf.keras 2.x but `loss_weights` in Keras 3, so the positional
# form silently changes meaning between versions.
model1.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Keep only the best checkpoint (the training log reports it is tracked on
# val_loss) and stop after 5 epochs without improvement.
model_path = 'best-conv1d.h5'
mc = ModelCheckpoint(model_path, save_best_only=True, verbose=1)
es = EarlyStopping(patience=5)

In [18]:
# Train for at most 30 epochs; the checkpoint and early-stopping callbacks
# decide which weights are saved and when training ends.
hist1 = model1.fit(
    X_train, y_train,
    epochs=30,
    batch_size=128,
    # Keras documents validation_data as a tuple (x_val, y_val); handling of
    # a list has varied between releases, so use the documented form.
    validation_data=(X_valid, y_valid),
    callbacks=[mc, es],
)

Epoch 1/30
Epoch 00001: val_loss improved from inf to 0.43095, saving model to best-conv1d.h5
Epoch 2/30
Epoch 00002: val_loss improved from 0.43095 to 0.32052, saving model to best-conv1d.h5
Epoch 3/30
Epoch 00003: val_loss improved from 0.32052 to 0.31007, saving model to best-conv1d.h5
Epoch 4/30
Epoch 00004: val_loss improved from 0.31007 to 0.30777, saving model to best-conv1d.h5
Epoch 5/30
Epoch 00005: val_loss did not improve from 0.30777
Epoch 6/30
Epoch 00006: val_loss did not improve from 0.30777
Epoch 7/30
Epoch 00007: val_loss did not improve from 0.30777
Epoch 8/30
Epoch 00008: val_loss did not improve from 0.30777
Epoch 9/30
Epoch 00009: val_loss did not improve from 0.30777


In [19]:
# Reload the checkpoint saved at the lowest validation loss and score it on
# the held-out test split; the cell displays [loss, accuracy].
best_model = load_model(model_path)
best_model.evaluate(x=X_test, y=y_test)



[0.3158935308456421, 0.8650000095367432]

- Case 2) Conv1D + LSTM

In [20]:
from tensorflow.keras.layers import LSTM

In [21]:
# Case 2: same embedding/Conv1D front end as case 1, but the pooled feature
# sequence is summarised by an LSTM instead of global max pooling.
model2 = Sequential([
    Embedding(input_dim=10000, output_dim=100, input_length=max_len),
    Dropout(rate=0.5),
    Conv1D(filters=64, kernel_size=5, activation='relu'),
    MaxPooling1D(pool_size=5),
    LSTM(units=100),
    Dense(units=1, activation='sigmoid'),
])
model2.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, 500, 100)          1000000   
                                                                 
 dropout_4 (Dropout)         (None, 500, 100)          0         
                                                                 
 conv1d_4 (Conv1D)           (None, 496, 64)           32064     
                                                                 
 max_pooling1d_4 (MaxPooling  (None, 99, 64)           0         
 1D)                                                             
                                                                 
 lstm_2 (LSTM)               (None, 100)               66000     
                                                                 
 dense_4 (Dense)             (None, 1)                 101       
                                                      

In [22]:
# Pass compile() arguments by keyword: the third positional parameter is
# `metrics` in tf.keras 2.x but `loss_weights` in Keras 3, so the positional
# form silently changes meaning between versions.
model2.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Keep only the best checkpoint (the training log reports it is tracked on
# val_loss) and stop after 5 epochs without improvement.
model_path = 'best-conv1d-lstm.h5'
mc = ModelCheckpoint(model_path, save_best_only=True, verbose=1)
es = EarlyStopping(patience=5)

In [23]:
# Train for at most 30 epochs; the checkpoint and early-stopping callbacks
# decide which weights are saved and when training ends.
hist2 = model2.fit(
    X_train, y_train,
    epochs=30,
    batch_size=128,
    # Keras documents validation_data as a tuple (x_val, y_val); handling of
    # a list has varied between releases, so use the documented form.
    validation_data=(X_valid, y_valid),
    callbacks=[mc, es],
)

Epoch 1/30
Epoch 00001: val_loss improved from inf to 0.28424, saving model to best-conv1d-lstm.h5
Epoch 2/30
Epoch 00002: val_loss did not improve from 0.28424
Epoch 3/30
Epoch 00003: val_loss did not improve from 0.28424
Epoch 4/30
Epoch 00004: val_loss did not improve from 0.28424
Epoch 5/30
Epoch 00005: val_loss did not improve from 0.28424
Epoch 6/30
Epoch 00006: val_loss did not improve from 0.28424


In [24]:
# Reload the Conv1D + LSTM checkpoint saved at the lowest validation loss and
# score it on the held-out test split; the cell displays [loss, accuracy].
best_model = load_model(model_path)
best_model.evaluate(x=X_test, y=y_test)



[0.2829704284667969, 0.8838000297546387]

- Case 3) Conv1D + Dense

In [26]:
# Case 3: Conv1D front end with a pool size of 4, global max pooling, then a
# 100-unit fully connected layer ahead of the sigmoid output.
model3 = Sequential([
    Embedding(input_dim=10000, output_dim=100, input_length=max_len),
    Dropout(rate=0.5),
    Conv1D(filters=64, kernel_size=5, activation='relu'),
    MaxPooling1D(pool_size=4),
    GlobalMaxPooling1D(),
    Dense(units=100, activation='relu'),
    Dense(units=1, activation='sigmoid'),
])
model3.summary()

Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_6 (Embedding)     (None, 500, 100)          1000000   
                                                                 
 dropout_6 (Dropout)         (None, 500, 100)          0         
                                                                 
 conv1d_6 (Conv1D)           (None, 496, 64)           32064     
                                                                 
 max_pooling1d_6 (MaxPooling  (None, 124, 64)          0         
 1D)                                                             
                                                                 
 global_max_pooling1d_3 (Glo  (None, 64)               0         
 balMaxPooling1D)                                                
                                                                 
 dense_7 (Dense)             (None, 100)              

In [27]:
# Pass compile() arguments by keyword: the third positional parameter is
# `metrics` in tf.keras 2.x but `loss_weights` in Keras 3, so the positional
# form silently changes meaning between versions.
model3.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Keep only the best checkpoint (the training log reports it is tracked on
# val_loss) and stop after 5 epochs without improvement.
model_path = 'best-imdb-conv1d-fcn.h5'
mc = ModelCheckpoint(model_path, save_best_only=True, verbose=1)
es = EarlyStopping(patience=5)

In [28]:
# Train for at most 30 epochs; the checkpoint and early-stopping callbacks
# decide which weights are saved and when training ends.
hist3 = model3.fit(
    X_train, y_train,
    epochs=30,
    batch_size=128,
    # Keras documents validation_data as a tuple (x_val, y_val); handling of
    # a list has varied between releases, so use the documented form.
    validation_data=(X_valid, y_valid),
    callbacks=[mc, es],
)

Epoch 1/30
Epoch 00001: val_loss improved from inf to 0.35774, saving model to best-imdb-conv1d-fcn.h5
Epoch 2/30
Epoch 00002: val_loss improved from 0.35774 to 0.27100, saving model to best-imdb-conv1d-fcn.h5
Epoch 3/30
Epoch 00003: val_loss improved from 0.27100 to 0.26787, saving model to best-imdb-conv1d-fcn.h5
Epoch 4/30
Epoch 00004: val_loss did not improve from 0.26787
Epoch 5/30
Epoch 00005: val_loss did not improve from 0.26787
Epoch 6/30
Epoch 00006: val_loss did not improve from 0.26787
Epoch 7/30
Epoch 00007: val_loss did not improve from 0.26787
Epoch 8/30
Epoch 00008: val_loss did not improve from 0.26787


In [29]:
# Reload the Conv1D + Dense checkpoint saved at the lowest validation loss and
# score it on the held-out test split; the cell displays [loss, accuracy].
best_model = load_model(model_path)
best_model.evaluate(x=X_test, y=y_test)



[0.27016571164131165, 0.8908666372299194]