## TensorFlow + RNN 을 활용한 영화 리뷰 감성 분석


#### (1) 전처리 된 데이터 로드

In [1]:
# Mount Google Drive at /content/drive so files stored there can be opened
# with ordinary file paths (prompts for an authorization code).
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
import pickle

# Load the preprocessed artifacts from Drive: vocabulary, embedding matrix,
# and the train/test sentences with their labels.
# NOTE: pickle.load can execute arbitrary code; only load files you created.
with open("/content/drive/My Drive/Colab Notebooks/preprocessed_data.pkl", "rb") as pkl_file:
  saved_data = pickle.load(pkl_file)

# Vocabulary lookup and the embedding matrix used to initialise the model.
word2idx, embedding_matrix = saved_data["word2idx"], saved_data["embedding_matrix"]

# Sentences and labels, one pair per split.
train_sents, train_labels = saved_data["train_sents"], saved_data["train_labels"]
test_sents, test_labels = saved_data["test_sents"], saved_data["test_labels"]

#### (2) 인풋 데이터 형태 만들기

In [3]:
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.initializers import Constant
from sklearn.model_selection import train_test_split
import numpy as np

# Fix TensorFlow's graph-level random seed (TF 1.x API) for reproducible runs.
tf.set_random_seed(1109)

In [0]:
# Randomly hold out 10% of the training data as a validation set.
# random_state is fixed so the same split is produced on every run.
train_sents, val_sents, train_labels, val_labels = train_test_split(
    train_sents, train_labels, test_size=0.1, random_state=1109)

In [5]:
# Number of sentences left in the training split after the hold-out.
len(train_sents)

134995

In [6]:
# Number of sentences in the validation split.
len(val_sents)

15000

In [7]:
# Number of sentences in the test split loaded from the pickle.
len(test_sents)

49997

In [0]:
# Convert train_sents / val_sents into numpy arrays of word indices via word2idx.
# Every split is padded to max_length, the longest sentence in the training split.

def _encode_sents(sents, word2idx, max_length):
  """Return an int32 array of shape (len(sents), max_length) of word indices.

  Words missing from word2idx are mapped to the index of "<UNK>".
  Sentences shorter than max_length are right-padded with 0. Sentences longer
  than max_length are truncated: max_length is derived from the training split
  only, so without truncation a longer sentence from another split would index
  past the end of its row and raise IndexError.
  """
  unk_idx = word2idx.get("<UNK>")  # index used for out-of-vocabulary words
  encoded = np.zeros((len(sents), max_length), dtype="int32")
  for row, sent in zip(encoded, sents):
    # `row` is a view into `encoded`, so writes land in the output array.
    for i, word in enumerate(sent[:max_length]):
      idx = word2idx.get(word)
      row[i] = idx if idx is not None else unk_idx
  return encoded


max_length = max(len(sent) for sent in train_sents)

# The same helper can be reused for any other split (e.g. test_sents).
train_inputs = _encode_sents(train_sents, word2idx, max_length)
val_inputs = _encode_sents(val_sents, word2idx, max_length)
train_targets = np.array(train_labels, dtype="int32")
val_targets = np.array(val_labels, dtype="int32")

In [9]:
# (number of training sentences, max_length)
train_inputs.shape

(134995, 116)

In [10]:
# (number of validation sentences, max_length)
val_inputs.shape

(15000, 116)

In [11]:
# First encoded training sentence: word indices followed by zero padding.
train_inputs[0]

array([ 423,  256, 4860,   17,  312,  989,   37,  157,   47, 1107,   97,
         58,   59, 1966,   37,    5,   13,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0], dtype=int32)

In [12]:
# Training labels as an int32 array.
train_targets

array([1, 1, 1, ..., 0, 0, 1], dtype=int32)

#### (3) 텐서플로우 모델 구축

#### -- (i) 하이퍼파라미터 설정

In [0]:
# Training schedule.
epochs = 30
batch_size = 512
learning_rate = 0.01
# Model dimensions.
vocab_size = len(word2idx)  # one embedding row per vocabulary entry
embedding_size = 100  # presumably the width of embedding_matrix -- TODO confirm
hidden_size = 64  # units per LSTM direction
max_length = train_inputs.shape[1]  # padded sequence length

#### -- (ii) 모델 정의

In [14]:
# Frozen embedding lookup initialised from embedding_matrix, followed by two
# stacked bidirectional LSTMs and a single sigmoid output unit.
embedding_layer = layers.Embedding(
    input_dim=vocab_size,
    output_dim=embedding_size,
    embeddings_initializer=Constant(embedding_matrix),
    trainable=False,
)

# In a stacked RNN, every layer except the last must set return_sequences=True
# so the next recurrent layer receives one vector per time step.
lower_lstm = layers.Bidirectional(
    layers.CuDNNLSTM(units=hidden_size, return_sequences=True,
                     input_shape=(max_length, embedding_size)))
upper_lstm = layers.Bidirectional(layers.CuDNNLSTM(units=hidden_size))
classifier = layers.Dense(1, activation='sigmoid')

model = tf.keras.Sequential([embedding_layer, lower_lstm, upper_lstm, classifier])

model.summary()

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 100)         6125100   
_________________________________________________________________
bidirectional (Bidirectional (None, None, 128)         84992     
_________________________________________________________________
bidirectional_1 (Bidirection (None, 128)               99328     
_________________________________________________________________
dense (Dense)                (None, 1)                 129       

#### -- (iii) 모델 학습 및 저장

In [15]:
# Overall accuracy is higher than in the earlier experiments, but the model
# is overfitting in this configuration.
adam = tf.keras.optimizers.Adam(learning_rate=learning_rate)
model.compile(
    optimizer=adam,
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

# Train with the held-out split as validation data; the returned History
# object is left as the cell's last expression so the notebook displays it.
model.fit(
    train_inputs,
    train_targets,
    epochs=epochs,
    batch_size=batch_size,
    validation_data=(val_inputs, val_targets),
)

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Train on 134995 samples, validate on 15000 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x7f3abe993fd0>