In [1]:
from keras.datasets import imdb
from keras.preprocessing import sequence
import tensorflow as tf
import os
import numpy as np

In [2]:
VOCAB_SIZE = 88548

MAXLEN = 250
BARCH_SIZE = 64

(train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words=VOCAB_SIZE)     # loads as numpy ndarray

  x_train, y_train = np.array(xs[:idx]), np.array(labels[:idx])
  x_test, y_test = np.array(xs[idx:]), np.array(labels[idx:])


In [3]:
# train_data[0]

In [4]:
# More preprocessing
# our loaded reviews have different length and we can't pass different data lengths into neural networks, they all must have the same length (like matrixes), therefore we must make them the same length
train_data = sequence.pad_sequences(train_data, MAXLEN)     # keras function for making all data the same given length
test_data = sequence.pad_sequences(test_data, MAXLEN)

In [5]:
# Creating the model
# word embedding layer as the first layer and add LSTM layer after to feed into a dense node to get predicted sentiment
# 32 stands for the output dimesion of the vectors generated by the embedding layer
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(VOCAB_SIZE, 32),      # vector outputs (words) ~ tensors ~ will have 32 dimensions
    tf.keras.layers.LSTM(32),                       # telling LSTM layer it will have 32 dimensions for every single work ~ tensor ~
    tf.keras.layers.Dense(1, activation='sigmoid')
])

In [6]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 32)          2833536   
_________________________________________________________________
lstm (LSTM)                  (None, 32)                8320      
_________________________________________________________________
dense (Dense)                (None, 1)                 33        
Total params: 2,841,889
Trainable params: 2,841,889
Non-trainable params: 0
_________________________________________________________________


In [7]:
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 8048389431234371804
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 2912380519
locality {
  bus_id: 1
  links {
  }
}
incarnation: 15718227859746146900
physical_device_desc: "device: 0, name: NVIDIA GeForce GTX 1050, pci bus id: 0000:01:00.0, compute capability: 6.1"
]


In [8]:
# Training
model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['acc'])

history = model.fit(train_data, train_labels, epochs=5, validation_split=0.2)    # use 20% of validation data to evaluate the model

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [9]:
results = model.evaluate(test_data, test_labels)
print(results)

[0.340749055147171, 0.8707600235939026]


In [10]:
# Making predictions
word_index = imdb.get_word_index()

def encode_text(text):
    tokens = tf.keras.preprocessing.text.text_to_word_sequence(text)
    tokens = [word_index[word] if word in word_index else 0 for word in tokens]
    return sequence.pad_sequences([tokens], MAXLEN)[0]

text = "that movie was just amazing, so amazing"
encoded = encode_text(text)
print(encoded)

[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0  12  17  13  4

In [11]:
print(word_index['amazing'])

477


In [12]:
# decode function
reverse_word_index = {value: key for key, value in word_index.items()}      # reverse a dict from str: int to int: str

def decode_integers(integers):
    PAD = 0
    text = ""
    for num in integers:
        if num != PAD:
            text += reverse_word_index[num] + " "

    return text[:-1]    # omit last element

print(decode_integers(encoded))

that movie was just amazing so amazing


In [25]:
# make a prediction
def predict(text):
    encoded_text = encode_text(text)
    pred = np.zeros((1, 250))        # create a blank numpy array full of zeroes in shape 1, 250 (1 array with 250 elements (zeroes))
    pred[0] = encoded_text
    result = model.predict(pred)
    print(result[0])

positive_review = "That movie was so awesome! I really loved it and would watch it again because it was amazingly great"
predict(positive_review)

negative_review = "that movie really sucked. I hated it and wouldn't watch it again. Was one of the worst things I've ever watched"
predict(negative_review)

[0.8710701]
[0.3082042]
