In [1]:
import tensorflow as tf
from tensorflow.keras.datasets import imdb
import numpy as np


In [2]:
# https://www.tensorflow.org/api_docs/python/tf/keras/datasets/imdb/load_data
NUM_WORDS = 20000
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=NUM_WORDS)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz


In [3]:
whos

Variable    Type       Data/Info
--------------------------------
NUM_WORDS   int        20000
imdb        module     <module 'keras.api._v2.ke<...>tasets/imdb/__init__.py'>
np          module     <module 'numpy' from '/us<...>kages/numpy/__init__.py'>
tf          module     <module 'tensorflow' from<...>/tensorflow/__init__.py'>
x_test      ndarray    25000: 25000 elems, type `object`, 200000 bytes (195.3125 kb)
x_train     ndarray    25000: 25000 elems, type `object`, 200000 bytes (195.3125 kb)
y_test      ndarray    25000: 25000 elems, type `int64`, 200000 bytes (195.3125 kb)
y_train     ndarray    25000: 25000 elems, type `int64`, 200000 bytes (195.3125 kb)


In [4]:
# x_train แต่ละอันคือ list
# ข้างใน list คือ index ของแต่ละคำใน review
print(type(x_train[0]))

# แต่ละ list ความยาวไม่เท่ากัน
print(len(x_train[0]))
print(len(x_train[1]))

<class 'list'>
218
189


In [5]:
# find the longest review
# ไม่ได้เอาความยาว max สุดไปใช้งานแค่แสดงผลเฉยๆ

max_length = 0
for i in range(x_train.shape[0]):
    if len(x_train[i]) > max_length:
        max_length = len(x_train[i])

print(max_length)

2494


In [6]:
# แปลงให้แต่ละอันความยาวเท่ากัน

MAX_LENGTH = 300

x_train_2D = np.zeros([25000,MAX_LENGTH],dtype=np.int32)
for i in range(x_train.shape[0]):
    if len(x_train[i]) >= MAX_LENGTH:
        x_train_2D[i,:] = x_train[i][0:MAX_LENGTH] # truncate
    else:
        x_train_2D[i,0:len(x_train[i])] = x_train[i]
        x_train_2D[i,len(x_train[i]):] = NUM_WORDS + 1 # padding

x_test_2D = np.zeros([25000,MAX_LENGTH],dtype=np.int32)
for i in range(x_test.shape[0]):
    if len(x_test[i]) >= MAX_LENGTH:
        x_test_2D[i,:] = x_test[i][0:MAX_LENGTH]
    else:
        x_test_2D[i,0:len(x_test[i])] = x_test[i]
        x_test_2D[i,len(x_test[i]):] = NUM_WORDS + 1

In [7]:
# y คือ 1 = positive, 0 = negative review
print(y_train)

[1 0 0 ... 0 1 0]


In [8]:
whos

Variable     Type       Data/Info
---------------------------------
MAX_LENGTH   int        300
NUM_WORDS    int        20000
i            int        24999
imdb         module     <module 'keras.api._v2.ke<...>tasets/imdb/__init__.py'>
max_length   int        2494
np           module     <module 'numpy' from '/us<...>kages/numpy/__init__.py'>
tf           module     <module 'tensorflow' from<...>/tensorflow/__init__.py'>
x_test       ndarray    25000: 25000 elems, type `object`, 200000 bytes (195.3125 kb)
x_test_2D    ndarray    25000x300: 7500000 elems, type `int32`, 30000000 bytes (28.6102294921875 Mb)
x_train      ndarray    25000: 25000 elems, type `object`, 200000 bytes (195.3125 kb)
x_train_2D   ndarray    25000x300: 7500000 elems, type `int32`, 30000000 bytes (28.6102294921875 Mb)
y_test       ndarray    25000: 25000 elems, type `int64`, 200000 bytes (195.3125 kb)
y_train      ndarray    25000: 25000 elems, type `int64`, 200000 bytes (195.3125 kb)


In [9]:
from tensorflow.keras import Model
from tensorflow.keras.layers import Dense, Flatten, Dropout, LSTM, Embedding, Input
from tensorflow.keras.losses import SparseCategoricalCrossentropy

In [10]:
input = Input((MAX_LENGTH,))
x = Embedding(NUM_WORDS+2, 400)(input)
x = LSTM(100, return_sequences=True)(x)
x = LSTM(100, return_sequences=False)(x)
x = Dense(128, activation='relu')(x)
x = Dropout(0.2)(x)
out = Dense(2, activation='softmax')(x)

In [11]:
model = Model(input, out)
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 300)]             0         
                                                                 
 embedding (Embedding)       (None, 300, 400)          8000800   
                                                                 
 lstm (LSTM)                 (None, 300, 100)          200400    
                                                                 
 lstm_1 (LSTM)               (None, 100)               80400     
                                                                 
 dense (Dense)               (None, 128)               12928     
                                                                 
 dropout (Dropout)           (None, 128)               0         
                                                                 
 dense_1 (Dense)             (None, 2)                 258   

In [12]:
model.compile(
    optimizer='rmsprop',
    loss=SparseCategoricalCrossentropy(),
    metrics=['accuracy'])

In [13]:
model.fit(x_train_2D, y_train, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f84bf1f63d0>

In [14]:
model.evaluate(x_test_2D,  y_test, verbose=2)

782/782 - 8s - loss: 0.2842 - accuracy: 0.8792 - 8s/epoch - 11ms/step


[0.28416651487350464, 0.8792399764060974]