In [1]:
import tensorflow as tf
import numpy as np
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import Dense, GRU, Embedding
from tensorflow.python.keras.optimizers import Adam
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

In [2]:
#importing IMDB dataset
import imdb

In [3]:
x_train, y_train = imdb.load_data(train=True)
x_test, y_test = imdb.load_data(train=False)

In [4]:
total_words=[]
for i in x_test+x_train:
    total_words.append(i)
num_words=list(set(total_words))

In [7]:
print(len(num_words))  #Total Words

49582


In [8]:
x_train[1]  #2nd review of the training set

'Homelessness (or Houselessness as George Carlin stated) has been an issue for years but never a plan to help those on the street that were once considered human who did everything from going to school, work, or vote for the matter. Most people think of the homeless as just a lost cause while worrying about things such as racism, the war on Iraq, pressuring kids to succeed, technology, the elections, inflation, or worrying if they\'ll be next to end up on the streets.<br /><br />But what if you were given a bet to live on the streets for a month without the luxuries you once had from a home, the entertainment sets, a bathroom, pictures on the wall, a computer, and everything you once treasure to see what it\'s like to be homeless? That is Goddard Bolt\'s lesson.<br /><br />Mel Brooks (who directs) who stars as Bolt plays a rich man who has everything in the world until deciding to make a bet with a sissy rival (Jeffery Tambor) to see if he can live in the streets for thirty days withou

In [9]:
y_train[1] #Its 1.0 that means positive sentiment

1.0

In [10]:
#Keras has a tokenizer function that converts words into string tokens
#num_words is the parameter used to describe the number of most frequent words
#num_words=None means it tokenizes the entire vocabulary
tokenizer=Tokenizer(num_words=None)
tokenizer.fit_on_texts(x_train+x_test) #Fitting the tokenizer for entire dataset


In [11]:
tokenizer.word_index  #This describes the token given to each word in the vocabulary which is 49582 as calculated in the 7th cell

{'the': 1,
 'and': 2,
 'a': 3,
 'of': 4,
 'to': 5,
 'is': 6,
 'br': 7,
 'in': 8,
 'it': 9,
 'i': 10,
 'this': 11,
 'that': 12,
 'was': 13,
 'as': 14,
 'for': 15,
 'with': 16,
 'movie': 17,
 'but': 18,
 'film': 19,
 'on': 20,
 'not': 21,
 'you': 22,
 'are': 23,
 'his': 24,
 'have': 25,
 'be': 26,
 'one': 27,
 'he': 28,
 'all': 29,
 'at': 30,
 'by': 31,
 'an': 32,
 'they': 33,
 'so': 34,
 'who': 35,
 'from': 36,
 'like': 37,
 'or': 38,
 'just': 39,
 'her': 40,
 'out': 41,
 'about': 42,
 'if': 43,
 "it's": 44,
 'has': 45,
 'there': 46,
 'some': 47,
 'what': 48,
 'good': 49,
 'when': 50,
 'more': 51,
 'very': 52,
 'up': 53,
 'no': 54,
 'time': 55,
 'my': 56,
 'even': 57,
 'would': 58,
 'she': 59,
 'which': 60,
 'only': 61,
 'really': 62,
 'see': 63,
 'story': 64,
 'their': 65,
 'had': 66,
 'can': 67,
 'me': 68,
 'well': 69,
 'were': 70,
 'than': 71,
 'much': 72,
 'we': 73,
 'bad': 74,
 'been': 75,
 'get': 76,
 'do': 77,
 'great': 78,
 'other': 79,
 'will': 80,
 'also': 81,
 'into': 82,
 'p

In [15]:
#Now we shall convert our training texts and test texts to token sequence 
x_train_token=tokenizer.texts_to_sequences(x_train)
x_test_token=tokenizer.texts_to_sequences(x_test)

In [20]:
#Next step is padding and truncating
#Since we need the input of the neural network to be of same dimension
#Padding is done for shorter sequences
#Truncating is done for longer sequences
max_tokens=600
x_train_pad = pad_sequences(x_train_token, maxlen=max_tokens,padding='pre', truncating='pre')
x_test_pad = pad_sequences(x_test_token, maxlen=max_tokens,padding='pre', truncating='pre')

# Neural Network Architecture

In [19]:
model=Sequential()
model.add(Embedding(input_dim=len(num_words),output_dim=100,input_length=max_tokens,name='layer_embedding'))
model.add(GRU(units=16, return_sequences=True))
model.add(GRU(units=16, return_sequences=True))
model.add(GRU(units=16, return_sequences=True))
model.add(GRU(units=16))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer=Adam(1e-3),metrics=['accuracy'])

In [21]:
#Now lets train the model
model.fit(x_train_pad,y_train,validation_split=0.05,epochs=10,batch_size=100)
#Performance
model.evaluate(x_test_pad, y_test)

Train on 23750 samples, validate on 1250 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


[0.65924870053350926, 0.85143999999999997]

In [22]:
test="Very awesome movie I would like to watch it again"
test=[test]
test_token=tokenizer.texts_to_sequences(test)
tokens_pad = pad_sequences(test_token, maxlen=max_tokens,padding='pre', truncating='pre')
model.predict(tokens_pad)


array([[ 0.99718189]], dtype=float32)

In [23]:
#This means that this review is 99% positive