In [7]:
# %tensorflow_version 2.x
from keras.datasets import imdb 
from keras.preprocessing import sequence 
import tensorflow as tf
import os 
import numpy as np 

VOCAB_SIZE = 88584

MAXLEN = 250
BATCH_SIZE = 64

(train_data,train_labels),(test_data,test_labels) = imdb.load_data(num_words=VOCAB_SIZE)

Downloading data from https://s3.amazonaws.com/text-datasets/imdb.npz


In [10]:
for i in range(9):
    print(len(train_data[i]))

218
189
141
550
147
43
123
562
233


In [15]:
train_data = sequence.pad_sequences(train_data,MAXLEN)
test_data = sequence.pad_sequences(test_data,MAXLEN)
train_data[1]

array([    0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     1,   194,
        1153,   194,  8255,    78,   228,     5,     6,  1463,  4369,
        5012,   134,    26,     4,   715,     8,   118,  1634,    14,
         394,    20,    13,   119,   954,   189,   102,     5,   207,
         110,  3103,    21,    14,    69,   188,     8,    30,    23,
           7,     4,   249,   126,    93,     4,   114,     9,  2300,
        1523,     5,   647,     4,   116,     9,    35,  8163,     4,
         229,     9,   340,  1322,     4,   118,     9,     4,   130,
        4901,    19,

In [16]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(VOCAB_SIZE,32),
    tf.keras.layers.LSTM(32),
    tf.keras.layers.Dense(1,activation='sigmoid')
])

In [18]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 32)          2834688   
_________________________________________________________________
lstm_1 (LSTM)                (None, 32)                8320      
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 33        
Total params: 2,843,041
Trainable params: 2,843,041
Non-trainable params: 0
_________________________________________________________________


In [19]:
model.compile(loss='binary_crossentropy',optimizer='rmsprop',metrics=['acc'])

history = model.fit(train_data,train_labels,epochs=10,validation_split=0.2)

Train on 20000 samples, validate on 5000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [20]:
result = model.evaluate(test_data,test_labels)
print(result)



[0.45678569146513937, 0.85812]


In [21]:
word_index = imdb.get_word_index()

def encode_text(text):
    tokens = keras.preprocessing.text.text_to_word_sequence(text)
    tokens = [word_index[word] if word in word_index else 0 for word in tokens]
    return sequence.pad_sequences([tokens],MAXLEN)[0]

text = 'that movie was just amazing, so amazing'
encode = encode_text(text)
print(encode)

Downloading data from https://s3.amazonaws.com/text-datasets/imdb_word_index.json
[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0  

In [24]:
reverse_word_index = {value:key for (key,value) in word_index.items()}

def decode_integers(integers):
    PAD = 0
    text = ""
    for num in integers:
        if num != PAD:
            text += reverse_word_index[num] + " "
    return text[:-1]

print(decode_integers(encode))


that movie was just amazing so amazing


In [34]:
def predict(text):
    encoded_text = encode_text(text)
    pred = np.zeros((1,250))
    pred[0] = encoded_text
    result = model.predict(pred)
    print(result[0])
    
    
positive_review = "That movie was so awesome! I really loved it and would watch it again because it was amazingly great"
predict(positive_review)

negative_review = "that movie sucked.I hated it and wouldn't watch it again was once of the worst things I've ever watched "
predict(negative_review)

[0.7597076]
[0.24080671]
