![Nuclio logo](https://nuclio.school/wp-content/uploads/2018/12/nucleoDS-newBlack.png)

In [1]:
from tensorflow import keras as ks
from tensorflow.keras.datasets import imdb
from matplotlib import pyplot as plt
import numpy as np

In [2]:
# Vocabulary cap: passed as `num_words` when loading IMDB and as the
# Embedding layer's input dimension further down.
vocabulary_size = 10_000

In [3]:
# Load the IMDB reviews (already encoded as lists of word ids) together with
# their labels, restricting the vocabulary to `vocabulary_size` entries.
(
    (x_train_original, y_train_original),
    (x_test_original, y_test_original),
) = imdb.load_data(num_words=vocabulary_size)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz


In [4]:
# Number of reviews in each of the two original splits.
print('Size train: ', np.size(x_train_original))
print('Size test: ', np.size(x_test_original))

Size train:  25000
Size test:  25000


In [5]:
# Merge the original train/test splits into a single pool of reviews (`data`)
# and labels (`tags`); the notebook re-splits them later.
data = np.concatenate([x_train_original, x_test_original])
tags = np.concatenate([y_train_original, y_test_original])

In [6]:
# Distinct label values present in the pooled dataset.
print(np.unique(tags))
# Flatten every review into one long id array and count the distinct word ids.
print("Unique words:", np.unique(np.hstack(data)).size)

[0 1]
Unique words: 9998


In [7]:
# Token count of every review, before any padding is applied.
length = list(map(len, data))
print("Mean length: ", np.mean(length))
print("Std dev; ", np.std(length))

Mean length:  234.75892
Std dev;  172.91149458735703


In [8]:
# Peek at one example: its label and its encoded (word-id) form.
sample_idx = 0
print("tag:", tags[sample_idx])
print("Encoded sentence:", data[sample_idx])

tag: 1
Encoded sentence: [1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 2, 8, 4, 107, 117, 5952, 15, 256, 4, 2, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 2, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 5345, 19, 178, 32]


In [9]:
# word -> id mapping shipped with the dataset.
index = imdb.get_word_index()
# Invert it so ids can be turned back into words.
reverse_index = {word_id: word for word, word_id in index.items()}
# Encoded ids are shifted by 3 relative to the word index, hence `token - 3`
# (presumably load_data's default `index_from` offset; see the Keras docs).
# Ids with no entry in the reversed index are rendered as a newline.
decoded = " ".join(reverse_index.get(token - 3, "\n") for token in data[0])
print(decoded)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json

 this film was just brilliant casting location scenery story direction everyone's really suited the part they played and you could just imagine being there robert 
 is an amazing actor and now the same being director 
 father came from the same scottish island as myself so i loved the fact there was a real connection with this film the witty remarks throughout the film were great it was just brilliant so much that i bought the film as soon as it was released for 
 and would recommend it to everyone to watch and the fly fishing was amazing really cried at the end it was so sad and you know what they say if you cry at a film it must have been good and this definitely was also 
 to the two little boy's that played the 
 of norman and paul they were just brilliant children are often left out of the 
 list i think because the stars that play them all grown up are such a big profile for th

In [10]:
from tensorflow.keras.preprocessing import sequence

In [11]:
# Bring every review to the same length so the reviews form a rectangular matrix.
# NOTE(review): `maxlen` is set to the vocabulary size, so each review becomes
# 10000 tokens long although the mean raw length printed earlier is ~235.
# This looks like a mix-up between vocabulary size and sequence length and makes
# the LSTM below very slow. A much smaller dedicated max length would be the
# natural fix, but the Embedding layer's `input_length` must be changed together
# with it. TODO confirm and fix both in one go.
data = sequence.pad_sequences(data, maxlen=vocabulary_size)
# Labels as float32 for the binary cross-entropy loss used at compile time.
tags = np.asarray(tags, dtype="float32")

In [12]:
# Same statistics as before, now on the padded matrix: every row should have
# the same length, so the standard deviation is expected to be 0.
length = list(map(len, data))
print("Mean length: ", np.mean(length))
print("Std dev; ", np.std(length))

Mean length:  10000.0
Std dev;  0.0


In [13]:
# First review after padding (row 0 of the padded matrix).
print(data[0, :])

[  0   0   0 ...  19 178  32]


In [14]:
# Hold out the last `validation_size` samples of the pooled data for validation;
# everything before them is used for training.
validation_size = 10000
x_train, x_val = data[:-validation_size], data[-validation_size:]
y_train, y_val = tags[:-validation_size], tags[-validation_size:]

In [15]:
# Number of training samples left after the validation hold-out.
x_train.shape[0]

40000

In [16]:
# Length of each input sequence, read from the padded training matrix instead of
# being tied to `vocabulary_size`. The two are unrelated quantities: vocabulary
# size is how many distinct word ids exist, sequence length is how many tokens
# each review has after padding.
sequence_length = x_train.shape[1]

# Binary sentiment classifier: embedding -> LSTM -> small dense head.
model = ks.Sequential()

# Map each word id (one of `vocabulary_size` ids) to a 32-dimensional vector.
model.add(ks.layers.Embedding(input_dim=vocabulary_size, output_dim=32, input_length=sequence_length))
# Summarise the whole sequence into a single 100-dimensional vector.
model.add(ks.layers.LSTM(100))
# Fully connected layer with He-uniform initialisation, paired with ReLU.
model.add(ks.layers.Dense(32, activation='relu', kernel_initializer="he_uniform"))
# Randomly drop half of the dense activations during training to regularise.
model.add(ks.layers.Dropout(0.5))
# Single sigmoid unit: probability-like score for the positive class (tag 1).
model.add(ks.layers.Dense(1, activation='sigmoid'))

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 10000, 32)         320000    
                                                                 
 lstm (LSTM)                 (None, 100)               53200     
                                                                 
 dense (Dense)               (None, 32)                3232      
                                                                 
 dropout (Dropout)           (None, 32)                0         
                                                                 
 dense_1 (Dense)             (None, 1)                 33        
                                                                 
Total params: 376,465
Trainable params: 376,465
Non-trainable params: 0
_________________________________________________________________


In [17]:
from tensorflow.keras.callbacks import EarlyStopping

# Stop training once validation loss has not improved for 3 consecutive epochs.
# `restore_best_weights=True` rolls the model back to the epoch with the lowest
# val_loss; without it the model keeps the weights from the final epoch, which
# is up to `patience` epochs past the best one.
callback = EarlyStopping(monitor="val_loss", patience=3, restore_best_weights=True)

In [18]:
# Binary classification setup: Adam optimiser, binary cross-entropy loss,
# accuracy reported during training and validation.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [19]:
# Train for at most 20 epochs in mini-batches of 128, evaluating on the held-out
# validation set after each epoch; the early-stopping callback may end the run
# sooner. The returned History object is kept in `history`.
history = model.fit(
    x_train,
    y_train,
    validation_data=(x_val, y_val),
    epochs=20,
    batch_size=128,
    callbacks=[callback],
)

Epoch 1/20
Epoch 2/20
 67/313 [=====>........................] - ETA: 5:36 - loss: 0.2565 - accuracy: 0.9057

KeyboardInterrupt: ignored