<font color="darkred">

# Example 2-1: Classifying movie review

In [1]:
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.utils import plot_model
from keras import backend as K
from keras.callbacks import EarlyStopping, TensorBoard
from keras import optimizers

import numpy as np
import matplotlib.pyplot as plt
from IPython.display import Image

Using TensorFlow backend.


### The IMDB dataset


- A set of 50,000 highly-polarized reviews from the Internet Movie Database
    - 25,000 reviews for training and 25,000 reviews for testing
    - 50% negative(label=0) and 50% positive(label=1) reviews



In [2]:
np.load__defaults__=(None, True, True, 'ASCII')

### Loading data

In [3]:
from keras.datasets import imdb

(train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words=10000)


- `num_words=10000`: keep the top 10,000 most frequently occurring words in the training data


In [4]:
len(train_data)

25000

In [5]:
train_data[1]

[1,
 194,
 1153,
 194,
 8255,
 78,
 228,
 5,
 6,
 1463,
 4369,
 5012,
 134,
 26,
 4,
 715,
 8,
 118,
 1634,
 14,
 394,
 20,
 13,
 119,
 954,
 189,
 102,
 5,
 207,
 110,
 3103,
 21,
 14,
 69,
 188,
 8,
 30,
 23,
 7,
 4,
 249,
 126,
 93,
 4,
 114,
 9,
 2300,
 1523,
 5,
 647,
 4,
 116,
 9,
 35,
 8163,
 4,
 229,
 9,
 340,
 1322,
 4,
 118,
 9,
 4,
 130,
 4901,
 19,
 4,
 1002,
 5,
 89,
 29,
 952,
 46,
 37,
 4,
 455,
 9,
 45,
 43,
 38,
 1543,
 1905,
 398,
 4,
 1649,
 26,
 6853,
 5,
 163,
 11,
 3215,
 2,
 4,
 1153,
 9,
 194,
 775,
 7,
 8255,
 2,
 349,
 2637,
 148,
 605,
 2,
 8003,
 15,
 123,
 125,
 68,
 2,
 6853,
 15,
 349,
 165,
 4362,
 98,
 5,
 4,
 228,
 9,
 43,
 2,
 1157,
 15,
 299,
 120,
 5,
 120,
 174,
 11,
 220,
 175,
 136,
 50,
 9,
 4373,
 228,
 8255,
 5,
 2,
 656,
 245,
 2350,
 5,
 4,
 9837,
 131,
 152,
 491,
 18,
 2,
 32,
 7464,
 1212,
 14,
 9,
 6,
 371,
 78,
 22,
 625,
 64,
 1382,
 9,
 8,
 168,
 145,
 23,
 4,
 1690,
 15,
 16,
 4,
 1355,
 5,
 28,
 6,
 52,
 154,
 462,
 33,
 89,
 78,
 2

- 각 리뷰를 구성하는 단어 인덱스 리스트 

In [6]:
train_labels[0]

1

- 각 리뷰를 긍정=1 또는 부정=0 으로 나타내는 리스트 

In [7]:
len(train_data)

25000

In [8]:
max([max(sequence) for sequence in train_data])

9999

- 가장 자주 등장하는 단어 10000개로 제한했기 때문에 단어 인덱스는 9999가 최대값 

In [9]:
# word_index is a dictionary mapping words to an integer index
word_index = imdb.get_word_index()
# We reverse it, mapping integer indices to words
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])

- `word_index`: 단어를 key로, 정수 인덱스를 value로 매칭하는 딕셔너리 
- `reverse_word_index`: 정수 인덱스를 key로, 단어를 value로 매칭하는 딕셔너리 

In [10]:
word_index['the']

1

In [11]:
reverse_word_index[1]

'the'

In [12]:
reverse_word_index.get()

TypeError: get expected at least 1 arguments, got 0

In [13]:
train_data[0]

[1,
 14,
 22,
 16,
 43,
 530,
 973,
 1622,
 1385,
 65,
 458,
 4468,
 66,
 3941,
 4,
 173,
 36,
 256,
 5,
 25,
 100,
 43,
 838,
 112,
 50,
 670,
 2,
 9,
 35,
 480,
 284,
 5,
 150,
 4,
 172,
 112,
 167,
 2,
 336,
 385,
 39,
 4,
 172,
 4536,
 1111,
 17,
 546,
 38,
 13,
 447,
 4,
 192,
 50,
 16,
 6,
 147,
 2025,
 19,
 14,
 22,
 4,
 1920,
 4613,
 469,
 4,
 22,
 71,
 87,
 12,
 16,
 43,
 530,
 38,
 76,
 15,
 13,
 1247,
 4,
 22,
 17,
 515,
 17,
 12,
 16,
 626,
 18,
 2,
 5,
 62,
 386,
 12,
 8,
 316,
 8,
 106,
 5,
 4,
 2223,
 5244,
 16,
 480,
 66,
 3785,
 33,
 4,
 130,
 12,
 16,
 38,
 619,
 5,
 25,
 124,
 51,
 36,
 135,
 48,
 25,
 1415,
 33,
 6,
 22,
 12,
 215,
 28,
 77,
 52,
 5,
 14,
 407,
 16,
 82,
 2,
 8,
 4,
 107,
 117,
 5952,
 15,
 256,
 4,
 2,
 7,
 3766,
 5,
 723,
 36,
 71,
 43,
 530,
 476,
 26,
 400,
 317,
 46,
 7,
 4,
 2,
 1029,
 13,
 104,
 88,
 4,
 381,
 15,
 297,
 98,
 32,
 2071,
 56,
 26,
 141,
 6,
 194,
 7486,
 18,
 4,
 226,
 22,
 21,
 134,
 476,
 26,
 480,
 5,
 144,
 30,
 5535,
 18,

In [14]:
# We decode the review; note that our indices were offset by 3
# because 0, 1 and 2 are reserved indices for "padding", "start of sequence", and "unknown".
decoded_review = ' '.join([reverse_word_index.get(i-3 , '?') for i in train_data[0]])

- 0번째 리뷰를 문장으로 복원 
- 0="padding", 1="start of sequence", 2="unknown"을 위한 인덱스로 사용하고 있으므로 3을 빼서 단어로 복원 
- 참고: https://keras.io/datasets/

In [15]:
decoded_review

"? this film was just brilliant casting location scenery story direction everyone's really suited the part they played and you could just imagine being there robert ? is an amazing actor and now the same being director ? father came from the same scottish island as myself so i loved the fact there was a real connection with this film the witty remarks throughout the film were great it was just brilliant so much that i bought the film as soon as it was released for ? and would recommend it to everyone to watch and the fly fishing was amazing really cried at the end it was so sad and you know what they say if you cry at a film it must have been good and this definitely was also ? to the two little boy's that played the ? of norman and paul they were just brilliant children are often left out of the ? list i think because the stars that play them all grown up are such a big profile for the whole film but these children are amazing and should be praised for what they have done don't you th

### Preparing the data
- 한 문장을 $10000\times 1$ vector로 변환
    - 문장이 포함하고 있는 단어 자리는 1, 포함하지 않은 단어 자리는 0으로 채워진 벡터로 변환
    

In [16]:
def vectorize_sequences(sequences, dimension=10000):
    # Create an all-zero matrix of shape (len(sequences), dimension)
    results = np.zeros((len(sequences), dimension))
    for i, sequence in enumerate(sequences):
        results[i, sequence] = 1.  # set specific indices of results[i] to 1s
    return results

In [17]:
# Our vectorized training data
x_train = vectorize_sequences(train_data)
# Our vectorized test data
x_test = vectorize_sequences(test_data)

In [18]:
x_train[0]

array([0., 1., 1., ..., 0., 0., 0.])

In [19]:
x_train.shape

(25000, 10000)

In [20]:
# Our vectorized labels
y_train = np.asarray(train_labels).astype('float32')
y_test = np.asarray(test_labels).astype('float32')

In [21]:
y_train

array([1., 0., 0., ..., 0., 1., 0.], dtype=float32)

In [22]:
np.shape(x_train), np.shape(y_train)

((25000, 10000), (25000,))

### Building our network




![3-layer network](https://s3.amazonaws.com/book.keras.io/img/ch3/3_layer_network.png)

In [23]:
K.clear_session() 

model = Sequential()
model.add(Dense(16, activation='relu', input_shape=(10000,)))
model.add(Dense(16, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

Instructions for updating:
If using Keras pass *_constraint arguments to layers.


- Input은 10000개 단어에 대한 one-hot vector로 표현된 단어들 
- 16개 hidden unit이 있는 layer 2개로 구성 
- 각 layer에서 activation funtion $\sigma(\cdot)$을 ReLU 함수를 사용 

<img src="figures/relu.PNG" width=350>
- Output은 긍정의 확률을 나타내는 하나의 숫자. Sigmoid 함수 사용 
<img src="figures/sigmoid.PNG" width=350>


In [24]:
model.compile(optimizer=optimizers.adam(lr=0.001),
              loss='binary_crossentropy',
              metrics=['accuracy'])

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where



- Loss function:`binary_crossentropy` for binary classification 
- Optimizer:  `rmsprop` 

### Validating our approach

Create a "validation set" by setting apart 10,000 samples from the original training data

In [25]:
x_val = x_train[:10000]
partial_x_train = x_train[10000:]

y_val = y_train[:10000]
partial_y_train = y_train[10000:]

In [26]:
history = model.fit(partial_x_train,
                    partial_y_train,
                    epochs=20,
                    batch_size=512,
                    validation_data=(x_val, y_val))


Train on 15000 samples, validate on 10000 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


- `batch_size=512`: 512개의 샘플씩 미니 배치를 만들어 gradient를 업데이트 
- `epochs=20`: 모든 샘플에 대해 20번 반복 
- `validation_data=(x_val, y_val)`: 주어진 validation set에 대해 검증 데이터 전달 

In [27]:
history_dict = history.history
history_dict.keys()

dict_keys(['val_loss', 'val_accuracy', 'loss', 'accuracy'])

- `model.fit`은 history 객체를 반환. 훈련동안의 모든 정보를 담고 있는 딕셔너리 

In [28]:

acc = history.history['acc']
val_acc = history.history['val_acc']
loss = history.history['loss']
val_loss = history.history['val_loss']

epochs = range(1, len(acc) + 1)

# "bo" is for "blue dot"
plt.plot(epochs, loss, 'bo', label='Training loss')
# b is for "solid blue line"
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

plt.show()

KeyError: 'acc'

- 학습이 진행되면서 training loss는 점차 감소 
- 학습이 진행되면서 validation loss는 감소하다가 4번째 epoch 이후 증가
    - training data에 대해서는 잘 작동하지만 처음 보는 데이터에는 잘 작동하지 않음: overfitting 

In [29]:

plt.clf()   # clear figure
acc_values = history_dict['acc']
val_acc_values = history_dict['val_acc']

plt.plot(epochs, acc, 'bo', label='Training acc')
plt.plot(epochs, val_acc, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

plt.show()

KeyError: 'acc'

<Figure size 432x288 with 0 Axes>

## Using a trained network to generate predictions on new data


- Overfitting이 발생하기 직전까지 4번의 epoch만 적합시킨 모형으로 test set을 예측하는데 사용 

In [30]:
model = Sequential()
model.add(Dense(16, activation='relu', input_shape=(10000,)))
model.add(Dense(16, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer=optimizers.adam(lr=0.001),
              loss='binary_crossentropy',
              metrics=['accuracy'])
model.fit(x_train, y_train, epochs=4, batch_size=512)
results = model.evaluate(x_test, y_test)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [31]:
results

[0.32488986546039583, 0.8766000270843506]

In [32]:
predictions = model.predict(x_test)

- `x_test`에 대해 모형을 통해 예측한 값을 저장 

In [33]:
predictions[0]

array([0.11593497], dtype=float32)

<font color="darkgreen">

__NOTE__
- 위의 예제에서는 20번의 epoch를 먼저 학습시켜 overfitting 여부를 체크한 후 다시 한번 모형을 적절한 epoch 만큼 학습시키는 과정을 진행했음
- 하지만 학습시간이 오래걸리는 문제에서는 이 방식이 비효율적임
- tensorboard를 활용한다면 학습하면서 동시에 overfitting을 체크하며 model tuning 과정을 보다 효율적으로 진행할 수 있음


References
- [Deep Learning with Python, François Chollet,](https://www.manning.com/books/deep-learning-with-python)