In [1]:
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
%matplotlib inline
from scipy.spatial.distance import cdist

  from ._conv import register_converters as _register_converters


In [2]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, GRU
from keras.optimizers import Adam
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [3]:
import imdb

# Downloading Dataset

In [4]:
# Fetch and unpack the IMDB review data through the local `imdb` helper module.
# Judging by the message it prints, the helper skips the work when the data
# is already present on disk.
imdb.maybe_download_and_extract()

Data has apparently already been downloaded and unpacked.


In [5]:
# Load the review texts and their labels. With no arguments the helper is
# assumed to return the training split; the test split is requested
# explicitly with `train=False` in a later cell.
x_train_text, y_train = imdb.load_data()


In [6]:
# Inspect one raw training review (index 1). Note the text still contains
# HTML line-break tags such as `<br />`.
x_train_text[1]

"Panned by critics at the time but loved by the fans, this film has now become a classic. Mixing supposedly 'surreal' footage shot at John Lennon's home among other places with live footage of Marc Bolan & T.Rex at their very best, this film is not just a must for everyone who's liked Marc Bolan but gives a fascinating insight into the era.<br /><br />These were the times when Marc was hobnobbing with the likes of Ringo Starr of the Beatles [who directed it] and you can even find a brief spot from one Reg Dwight [Elton John to you] bashing the ivories in an amazing [and never officially released] version of Tutti Frutti and rocking and ballad versions of Children Of The Revolution.<br /><br />There's also wonderful scenes featuring Chelita Secunda [said to have 'created glam rock' with her use of glitter etc], Mickey Finn and even the actor from Catweazle!!<br /><br />The best scene for me is in the garden when Marc leaves the dining table, sits down cross-legged in front of a string s

In [7]:
# Label paired with training review 1 (a float; presumably 1.0 marks a
# positive review — confirm against the `imdb` helper).
y_train[1]

1.0

In [8]:
# Load the second split: `train=False` asks the helper for the test reviews
# and their labels.
x_test_text, y_test = imdb.load_data(train=False)

In [9]:
# Inspect the first raw review of the test split.
x_test_text[0]

'Just saw Baby Blue Marine again after 30 years. I still find it a pleasant and romantic film which catches a time which has been lost forever. The innocence and purity of a time now long gone, is truthfully captured in this small film. The acting is above average and Richard Gere\'s brief appearance as a shell-shocked Raider Marine war hero, holds a keen interest for any film buff or Gere fan. Jan-Micheal Vincent is in his prime and looks and acts like the "All-American" boy. The late Bruno Kirby (who was billed as B.Kirby, Jr.) has a meaty role as "Pop", a peace-loving, Marine Corp reject, dreaming of getting back home to his wife. If you\'re looking for sex, drugs, or rock and roll, this movie is not for you. If you\'re looking for action and adventure, the same applies. However, if you want to recapture a time in America of innocence, honor, romance, and love, then Baby Blue Marine is a movie for you.'

In [10]:
# Label paired with test review 0.
y_test[0]

1.0

In [11]:
# Report how many reviews each split contains.
for label, texts in (("Train-set size: ", x_train_text),
                     ("Test-set size:  ", x_test_text)):
    print(label, len(texts))

Train-set size:  25000
Test-set size:   25000


In [12]:
# Combine both splits into one corpus; it is used to fit the tokenizer.
# Assumes both are plain Python lists so that `+` concatenates — TODO confirm
# against imdb.load_data.
# NOTE(review): this lets test-set text influence the vocabulary. Fit on the
# training texts only if strict train/test separation matters.
data_text = x_train_text + x_test_text

In [13]:
# Vocabulary cap: passed to the Tokenizer and reused as the Embedding
# layer's `input_dim`.
num_words = 10000

In [14]:
# Keras text tokenizer; `num_words` limits which words are kept (by
# frequency) when texts are later converted to integer sequences.
tokenizer = Tokenizer(num_words=num_words)

In [15]:
# Build the tokenizer's word index from the combined train + test corpus.
tokenizer.fit_on_texts(data_text)

In [16]:
# Show the learned word -> integer index mapping (lower index = more frequent).
# The entry 'br' presumably comes from the `<br />` tags left in the raw text.
# NOTE(review): this displays the entire dictionary, which is very long.
tokenizer.word_index

{'the': 1,
 'and': 2,
 'a': 3,
 'of': 4,
 'to': 5,
 'is': 6,
 'br': 7,
 'in': 8,
 'it': 9,
 'i': 10,
 'this': 11,
 'that': 12,
 'was': 13,
 'as': 14,
 'for': 15,
 'with': 16,
 'movie': 17,
 'but': 18,
 'film': 19,
 'on': 20,
 'not': 21,
 'you': 22,
 'are': 23,
 'his': 24,
 'have': 25,
 'be': 26,
 'one': 27,
 'he': 28,
 'all': 29,
 'at': 30,
 'by': 31,
 'an': 32,
 'they': 33,
 'so': 34,
 'who': 35,
 'from': 36,
 'like': 37,
 'or': 38,
 'just': 39,
 'her': 40,
 'out': 41,
 'about': 42,
 'if': 43,
 "it's": 44,
 'has': 45,
 'there': 46,
 'some': 47,
 'what': 48,
 'good': 49,
 'when': 50,
 'more': 51,
 'very': 52,
 'up': 53,
 'no': 54,
 'time': 55,
 'my': 56,
 'even': 57,
 'would': 58,
 'she': 59,
 'which': 60,
 'only': 61,
 'really': 62,
 'see': 63,
 'story': 64,
 'their': 65,
 'had': 66,
 'can': 67,
 'me': 68,
 'well': 69,
 'were': 70,
 'than': 71,
 'much': 72,
 'we': 73,
 'bad': 74,
 'been': 75,
 'get': 76,
 'do': 77,
 'great': 78,
 'other': 79,
 'will': 80,
 'also': 81,
 'into': 82,
 'p

In [17]:
# Convert every training review into a list of integer word indices.
x_train_tokens = tokenizer.texts_to_sequences(x_train_text)

In [18]:
# Raw text of training review 0, for comparison with its token ids in the
# next cell.
x_train_text[0]

'I must admit, I was one of the skeptics who prematurely judged this show before relatively any information was disseminated about it. I determined that it was going to be a cheap spin-off guided by Ronald D. Moore wielding the retcon-wand.<br /><br />I was wrong!<br /><br />The pilot leaves an excellent impression upon the viewers. The accessibility is marvelous! Of course, seasoned BSG veterans will find themselves immersed in the plot, which is focused on the development of the Cylons before the first War. (58 years before the events of the BSG pilot). The pilot also allows for newcomers, clearly presenting its plot and ideas in the first part of the episode.<br /><br />Don\'t be mistaken: "Caprica" is not BSG. We are presented with an immersive, cerebral drama dotted by provocative, daring, and controversial ideas. <br /><br />The casting maintains BSG\'s standards; Stoltz and Morales are simply astounding. Morales\' portrayal of Joseph Adama, inspired by Olmos\' portrayal of Willi

In [19]:
# The same review as integer token ids; wrapped in an array only for a more
# compact display.
np.array(x_train_tokens[0])

array([  10,  206,  986,   10,   13,   27,    4,    1,   35, 6418,   11,
        119,  159, 2406,   99, 1523,   13,   42,    9,   10, 2917,   12,
          9,   13,  166,    5,   26,    3,  691, 3138,  122, 9719,   31,
       5567, 1059, 2032, 7418,    1,    7,    7,   10,   13,  356,    7,
          7,    1, 1890,  885,   32,  320, 1380,  686,    1,  818,    1,
          6, 3118,    4,  265, 6813, 9720, 6686,   80,  165,  529,    8,
          1,  111,   60,    6, 2583,   20,    1,  979,    4,    1,  159,
          1,   86,  295,  153,  159,    1,  706,    4,    1, 9720, 1890,
          1, 1890,   81, 2217,   15,  700, 5370,   92,  111,    2, 1016,
          8,    1,   86,  173,    4,    1,  398,    7,    7,   89,   26,
       4148,    6,   21, 9720,   73,   23, 1381,   16,   32, 6856,  449,
         31, 5744, 3668,    2, 2924, 1016,    7,    7,    1, 1053, 7479,
       1609,    2,   23,  330, 5769, 1112,    4, 2446, 1603,   31, 1112,
          4, 1012,  411,    3,  393, 3303,    4, 36

In [20]:
# Convert every test review into a list of integer word indices, using the
# same fitted tokenizer as for the training set.
x_test_tokens = tokenizer.texts_to_sequences(x_test_text)

In [21]:
# Number of tokens in every review across both splits, as a NumPy array so
# summary statistics can be computed on it.
num_tokens = np.array(list(map(len, x_train_tokens + x_test_tokens)))

In [22]:
# Average review length in tokens.
num_tokens.mean()


221.27716

In [23]:
# Length of the longest review in tokens.
num_tokens.max()

2209

In [24]:
# Fixed sequence length fed to the model. The mean review is ~221 tokens and
# the longest is 2209, so reviews longer than this are truncated and shorter
# ones are padded.
max_tokens = 500

In [25]:
# 'pre' => operate on the start of each sequence. The same value is passed
# as both `padding` and `truncating` to pad_sequences.
pad = 'pre'

In [26]:
# Bring every training sequence to exactly `max_tokens` ids: shorter ones
# are zero-padded and longer ones cut, both on the side selected by `pad`.
x_train_pad = pad_sequences(
    x_train_tokens,
    maxlen=max_tokens,
    padding=pad,
    truncating=pad,
)

In [27]:
# Apply the same fixed-length padding/truncation to the test sequences.
x_test_pad = pad_sequences(
    x_test_tokens,
    maxlen=max_tokens,
    padding=pad,
    truncating=pad,
)

In [28]:
# Sanity check: expect (number of training reviews, max_tokens).
x_train_pad.shape

(25000, 500)

In [29]:
# Sanity check: expect (number of test reviews, max_tokens).
x_test_pad.shape


(25000, 500)

In [30]:
# Inspect one padded sequence; with 'pre' padding the zeros appear at the
# front, before the review's own token ids.
x_train_pad[1]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,   

# Building the Model

In [31]:
# Start with an empty layer stack; layers are appended one at a time in the
# following cells.
model = Sequential()

In [32]:
# Length of each word vector produced by the Embedding layer (its `output_dim`).
embedding_size = 100

In [33]:
# Embedding layer: maps each of the `num_words` possible token ids to a
# trainable `embedding_size`-dimensional vector, for inputs of `max_tokens` ids.
embedding_layer = Embedding(
    input_dim=num_words,
    output_dim=embedding_size,
    input_length=max_tokens,
    name='layer_embedding',
)
model.add(embedding_layer)

In [34]:
# First recurrent layer: 16 GRU units. `return_sequences=True` emits an
# output for every time step so the next GRU layer receives a full sequence.
model.add(GRU(units=16, return_sequences=True))

In [35]:
# Second recurrent layer: 8 GRU units, again returning the full sequence
# for the layer that follows.
model.add(GRU(units=8, return_sequences=True))

In [36]:
# Final recurrent layer: 4 GRU units. Without `return_sequences` it yields a
# single 4-value vector per review instead of one per time step.
model.add(GRU(units=4))

In [37]:
# Output layer: one sigmoid unit giving a value between 0 and 1 per review,
# to be compared against the 0/1 sentiment label.
model.add(Dense(1, activation='sigmoid'))

In [38]:
# Adam optimizer with a learning rate of 1e-3.
optimizer = Adam(lr=1e-3)

In [39]:
# Configure training: binary cross-entropy suits the single sigmoid output,
# and accuracy is reported alongside the loss.
# Reuse the `optimizer` object created in the previous cell instead of
# building a second Adam instance inline, so the learning rate is defined
# in exactly one place.
model.compile(loss='binary_crossentropy',
              optimizer=optimizer,
              metrics=['accuracy'])

In [40]:
# Print the layer-by-layer output shapes and parameter counts.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
layer_embedding (Embedding)  (None, 500, 100)          1000000   
_________________________________________________________________
gru_1 (GRU)                  (None, 500, 16)           5616      
_________________________________________________________________
gru_2 (GRU)                  (None, 500, 8)            600       
_________________________________________________________________
gru_3 (GRU)                  (None, 4)                 156       
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 5         
Total params: 1,006,377
Trainable params: 1,006,377
Non-trainable params: 0
_________________________________________________________________


# Training, Saving, and Evaluating

In [41]:
# Train for 3 epochs in mini-batches of 64 reviews, holding out 5% of the
# training data (1250 of 25000 reviews) for validation.
model.fit(
    x_train_pad,
    y_train,
    validation_split=0.05,
    epochs=3,
    batch_size=64,
)

Train on 23750 samples, validate on 1250 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7fc15c15a8d0>

In [42]:
# Persist the trained model to 'sentiment.h5' (path is relative to the
# current working directory).
model.save('sentiment.h5')

In [43]:
# Score the trained model on the test split; the result is the loss
# followed by the accuracy metric requested at compile time.
model.evaluate(x_test_pad, y_test)



[0.37395566933631896, 0.85036]