In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np
from scipy.spatial.distance import cdist

In [2]:
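# NOTE: "from tf.keras.models import Sequential" does not work, because "tf" is only a local alias (see "import tensorflow as tf"), not an importable module name.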
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import Dense, GRU, Embedding
from tensorflow.python.keras.optimizers import Adam
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

In [3]:
import imdb

In [4]:
# Point the imdb helper module at the dataset directory (presumably where it
# downloads/looks for the data — verify against the imdb module).
# NOTE(review): this is an absolute, machine-specific path; adjust it before
# running the notebook on another machine.
imdb.data_dir = "/media/ruler/A44C-97D3/data/IMDB"

In [6]:
imdb.maybe_download_and_extract()

Data has apparently already been downloaded and unpacked.


In [7]:
# Load the review texts and their labels for the training split and for the
# test split via the imdb helper module.
x_train_text, y_train = imdb.load_data(train=True)
x_test_text, y_test = imdb.load_data(train=False)

In [11]:
print("Train-set size: ", len(x_train_text))
print("Test-set size:  ", len(x_test_text))

Train-set size:  25000
Test-set size:   25000


In [12]:
# Concatenate the training and test reviews into one collection of texts.
# NOTE(review): the tokenizer is later fitted on this combined collection, so
# the vocabulary is built from test text as well — confirm this is intended.
data_text = x_train_text + x_test_text
print("all-dataset size: ", len(data_text))
# print("shape of dataset: ", type(data_text))

all-dataset size:  50000


In [13]:
x_train_text[11]


'Man oh man... I\'ve been foolishly procrastinating (not the right term, there\'s a long list!) to watch this film and finally had the chance to do so. And "news" are: Marvellous labyrinthine spectacle!<br /><br />For any Von Trier\'s "follower": both Rigets, Element of Crime, Dogville, Dancer in The Dark, The Five Obstructions, etc... Europa is probably the differential for its greatness in visual terms. Everything is beautifully somber and claustrophobic! You really get the feeling of being inside this "imaginary" nightmarish time warp. Taking from the masters of surreal cinema like Bunuel, Bergman, till noir films of the 40\'s with acidic drops of avant-guard Von Trier leads the art-film scene as the "well intended totalitarian" movie maker of nowadays. His authoritarian way of dealing with very intricate issues, without being irrational, hits the nerve of the viewer with the intent to cure some of the deepest wounds we feed in our hypocritical world.<br /><br />As Utopian as it see

In [14]:
y_train[11]

1.0

In [15]:
num_words = 10000

In [16]:
tokenizer = Tokenizer(num_words=num_words)

In [17]:
%%time
tokenizer.fit_on_texts(data_text)

CPU times: user 6.14 s, sys: 1.95 ms, total: 6.14 s
Wall time: 6.14 s


In [18]:
# Fallback when no vocabulary cap was given: size the vocabulary from the
# fitted tokenizer. The tokenizer's word indices start at 1 (0 is what the
# padded sequences are filled with), so the largest index equals
# len(word_index). num_words is later used as the Embedding input_dim, which
# must be the largest index + 1 — hence the "+ 1".
if num_words is None:
    num_words = len(tokenizer.word_index) + 1

In [19]:
tokenizer.word_index

{'the': 1,
 'and': 2,
 'a': 3,
 'of': 4,
 'to': 5,
 'is': 6,
 'br': 7,
 'in': 8,
 'it': 9,
 'i': 10,
 'this': 11,
 'that': 12,
 'was': 13,
 'as': 14,
 'for': 15,
 'with': 16,
 'movie': 17,
 'but': 18,
 'film': 19,
 'on': 20,
 'not': 21,
 'you': 22,
 'are': 23,
 'his': 24,
 'have': 25,
 'be': 26,
 'one': 27,
 'he': 28,
 'all': 29,
 'at': 30,
 'by': 31,
 'an': 32,
 'they': 33,
 'so': 34,
 'who': 35,
 'from': 36,
 'like': 37,
 'or': 38,
 'just': 39,
 'her': 40,
 'out': 41,
 'about': 42,
 'if': 43,
 "it's": 44,
 'has': 45,
 'there': 46,
 'some': 47,
 'what': 48,
 'good': 49,
 'when': 50,
 'more': 51,
 'very': 52,
 'up': 53,
 'no': 54,
 'time': 55,
 'my': 56,
 'even': 57,
 'would': 58,
 'she': 59,
 'which': 60,
 'only': 61,
 'really': 62,
 'see': 63,
 'story': 64,
 'their': 65,
 'had': 66,
 'can': 67,
 'me': 68,
 'well': 69,
 'were': 70,
 'than': 71,
 'much': 72,
 'we': 73,
 'bad': 74,
 'been': 75,
 'get': 76,
 'do': 77,
 'great': 78,
 'other': 79,
 'will': 80,
 'also': 81,
 'into': 82,
 'p

In [21]:
x_train_tokens = tokenizer.texts_to_sequences(x_train_text)
# x_train_tokens

In [22]:
x_train_text[1]

"Zentropa is the most original movie I've seen in years. If you like unique thrillers that are influenced by film noir, then this is just the right cure for all of those Hollywood summer blockbusters clogging the theaters these days. Von Trier's follow-ups like Breaking the Waves have gotten more acclaim, but this is really his best work. It is flashy without being distracting and offers the perfect combination of suspense and dark humor. It's too bad he decided handheld cameras were the wave of the future. It's hard to say who talked him away from the style he exhibits here, but it's everyone's loss that he went into his heavily theoretical dogma direction instead."

In [23]:
np.array(x_train_tokens[1])

array([   6,    1,   88,  212,   17,  198,  107,    8,  153,   43,   22,
         37,  959, 3184,   12,   23, 3873,   31,   19, 1494,   91,   11,
          6,   39,    1,  203, 3461,   15,   29,    4,  143,  369, 1495,
       8013,    1, 2194,  132,  483, 2944,  827, 1774,   37, 2168,    1,
       4805,   25, 1943,   51, 8927,   18,   11,    6,   62,   24,  116,
        158,    9,    6, 6126,  208,  109, 4133,    2, 1567,    1,  399,
       2174,    4,  757,    2,  457,  460,   44,   96,   74,   28,  877,
       4103,   70,    1, 3225,    4,    1,  731,   44,  249,    5,  131,
         35, 3353,   87,  243,   36,    1,  396,   28,  133,   18,   44,
       4161, 1919,   12,   28,  414,   82,   24, 2681, 9015,  459,  298])

In [24]:
x_test_tokens = tokenizer.texts_to_sequences(x_test_text)

In [26]:
# Number of tokens in every review, training and test set together.
num_tokens = np.array([len(seq) for seq in x_train_tokens + x_test_tokens])

array([119, 110, 357, ..., 348, 396, 145])

In [27]:
np.mean(num_tokens)

221.27716

In [28]:
np.max(num_tokens)

2209

In [38]:
# Cap on the sequence length: mean + 2 standard deviations of the review
# lengths, rounded down to an integer. Reviews longer than this are truncated
# when the sequences are padded.
max_tokens = int(np.mean(num_tokens) + 2 * np.std(num_tokens))
max_tokens


544

In [39]:
# Percentage of reviews whose token count is strictly below max_tokens
# (about 95%). Note this counts reviews, not individual tokens.
np.sum(num_tokens < max_tokens) / len(num_tokens) * 100

94.53

In [40]:
# Side on which short sequences are padded; the pad_sequences calls in this
# notebook pass the same value for truncating long sequences.
# 'pre' means the start of the sequence.
pad = 'pre'

In [41]:
x_train_pad = pad_sequences(x_train_tokens, 
                            maxlen=max_tokens,
                            padding=pad, 
                            truncating=pad)

In [42]:
x_test_pad = pad_sequences(x_test_tokens, 
                           maxlen=max_tokens,
                           padding=pad, 
                           truncating=pad)

In [43]:
x_train_pad.shape

(25000, 544)

In [44]:
x_test_pad.shape

(25000, 544)

In [45]:
np.array(x_train_tokens[1])

array([   6,    1,   88,  212,   17,  198,  107,    8,  153,   43,   22,
         37,  959, 3184,   12,   23, 3873,   31,   19, 1494,   91,   11,
          6,   39,    1,  203, 3461,   15,   29,    4,  143,  369, 1495,
       8013,    1, 2194,  132,  483, 2944,  827, 1774,   37, 2168,    1,
       4805,   25, 1943,   51, 8927,   18,   11,    6,   62,   24,  116,
        158,    9,    6, 6126,  208,  109, 4133,    2, 1567,    1,  399,
       2174,    4,  757,    2,  457,  460,   44,   96,   74,   28,  877,
       4103,   70,    1, 3225,    4,    1,  731,   44,  249,    5,  131,
         35, 3353,   87,  243,   36,    1,  396,   28,  133,   18,   44,
       4161, 1919,   12,   28,  414,   82,   24, 2681, 9015,  459,  298])

In [46]:
x_train_pad[1]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,   

In [48]:
# Build the reverse lookup (integer token -> word) from the tokenizer's
# word -> token dictionary.
# NOTE(review): newer Keras versions may expose this directly as
# tokenizer.index_word — confirm for the installed version.
idx = tokenizer.word_index
inverse_map = {token: word for word, token in idx.items()}
inverse_map

{1: 'the',
 2: 'and',
 3: 'a',
 4: 'of',
 5: 'to',
 6: 'is',
 7: 'br',
 8: 'in',
 9: 'it',
 10: 'i',
 11: 'this',
 12: 'that',
 13: 'was',
 14: 'as',
 15: 'for',
 16: 'with',
 17: 'movie',
 18: 'but',
 19: 'film',
 20: 'on',
 21: 'not',
 22: 'you',
 23: 'are',
 24: 'his',
 25: 'have',
 26: 'be',
 27: 'one',
 28: 'he',
 29: 'all',
 30: 'at',
 31: 'by',
 32: 'an',
 33: 'they',
 34: 'so',
 35: 'who',
 36: 'from',
 37: 'like',
 38: 'or',
 39: 'just',
 40: 'her',
 41: 'out',
 42: 'about',
 43: 'if',
 44: "it's",
 45: 'has',
 46: 'there',
 47: 'some',
 48: 'what',
 49: 'good',
 50: 'when',
 51: 'more',
 52: 'very',
 53: 'up',
 54: 'no',
 55: 'time',
 56: 'my',
 57: 'even',
 58: 'would',
 59: 'she',
 60: 'which',
 61: 'only',
 62: 'really',
 63: 'see',
 64: 'story',
 65: 'their',
 66: 'had',
 67: 'can',
 68: 'me',
 69: 'well',
 70: 'were',
 71: 'than',
 72: 'much',
 73: 'we',
 74: 'bad',
 75: 'been',
 76: 'get',
 77: 'do',
 78: 'great',
 79: 'other',
 80: 'will',
 81: 'also',
 82: 'into',
 83

In [49]:
def tokens_to_string(tokens):
    """Turn a sequence of integer tokens back into a space-separated string.

    Tokens equal to 0 (the value used for padding) are skipped; every other
    token is looked up in the module-level `inverse_map` dictionary.
    """
    return " ".join(inverse_map[tok] for tok in tokens if tok != 0)

In [50]:
x_train_text[1]

"Zentropa is the most original movie I've seen in years. If you like unique thrillers that are influenced by film noir, then this is just the right cure for all of those Hollywood summer blockbusters clogging the theaters these days. Von Trier's follow-ups like Breaking the Waves have gotten more acclaim, but this is really his best work. It is flashy without being distracting and offers the perfect combination of suspense and dark humor. It's too bad he decided handheld cameras were the wave of the future. It's hard to say who talked him away from the style he exhibits here, but it's everyone's loss that he went into his heavily theoretical dogma direction instead."

In [51]:
tokens_to_string(x_train_tokens[1])

"is the most original movie i've seen in years if you like unique thrillers that are influenced by film noir then this is just the right cure for all of those hollywood summer blockbusters the theaters these days von follow ups like breaking the waves have gotten more acclaim but this is really his best work it is flashy without being distracting and offers the perfect combination of suspense and dark humor it's too bad he decided cameras were the wave of the future it's hard to say who talked him away from the style he here but it's everyone's loss that he went into his heavily dogma direction instead"

# Creating the RNN 

In [65]:
model = Sequential() #sequential framework of keras

In [66]:
embedding_size = 8

In [67]:
# Add the embedding layer: it maps each integer token to a trainable vector
# of length embedding_size. input_dim is the vocabulary size (num_words) and
# input_length is the padded sequence length (max_tokens). The layer is named
# so it can be looked up again later via model.get_layer.

model.add(Embedding(input_dim=num_words,
                    output_dim=embedding_size,
                    input_length=max_tokens,
                    name='layer_embedding'))

In [68]:
model.add(GRU(units=16, return_sequences=True))

In [69]:
model.add(GRU(units=8, return_sequences=True))

In [70]:
model.add(GRU(units=4))

In [71]:
model.add(Dense(1, activation='sigmoid'))

In [72]:
# Adam optimizer with a learning rate of 1e-3.
# NOTE(review): newer Keras releases name this argument `learning_rate`;
# `lr` may be deprecated or rejected there — confirm for the installed version.
optimizer = Adam(lr=1e-3)

In [73]:
model.compile(loss='binary_crossentropy',
              optimizer=optimizer,
              metrics=['accuracy'])

In [74]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
layer_embedding (Embedding)  (None, 544, 8)            80000     
_________________________________________________________________
gru_3 (GRU)                  (None, 544, 16)           1200      
_________________________________________________________________
gru_4 (GRU)                  (None, 544, 8)            600       
_________________________________________________________________
gru_5 (GRU)                  (None, 4)                 156       
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 5         
Total params: 81,961
Trainable params: 81,961
Non-trainable params: 0
_________________________________________________________________


# Training the RNN

In [75]:
%%time
# Train for 3 epochs in batches of 64, holding out 5% of the training data
# for validation.
model.fit(x_train_pad, y_train, 
          validation_split=0.05, 
          epochs=3, 
          batch_size=64)

Train on 23750 samples, validate on 1250 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
CPU times: user 49min 15s, sys: 4min 17s, total: 53min 33s
Wall time: 26min 53s


<tensorflow.python.keras.callbacks.History at 0x7f22ec2e70f0>

# Performance on test set

In [76]:
%%time
result = model.evaluate(x_test_pad, y_test)

CPU times: user 11min 50s, sys: 51.5 s, total: 12min 42s
Wall time: 5min 36s


In [77]:
print("Accuracy: {0:.2%}".format(result[1]))

Accuracy: 86.34%


# Misclassified text example

In [78]:
%%time
y_pred = model.predict(x=x_test_pad[0:1000])
y_pred = y_pred.T[0]

CPU times: user 27.3 s, sys: 1.81 s, total: 29.1 s
Wall time: 14.5 s


In [79]:
# Threshold the predicted values at 0.5 to get class labels (1.0 or 0.0).
cls_pred = np.where(y_pred > 0.5, 1.0, 0.0)

In [80]:
cls_true = np.array(y_test[0:1000])

In [84]:
# Positions (within the first 1000 test reviews) where the predicted class
# differs from the true class.
incorrect = np.where(cls_pred != cls_true)[0]
incorrect

array([  2,  11,  14,  16,  18,  46,  47,  67, 119, 154, 169, 171, 173,
       217, 219, 221, 222, 266, 272, 289, 297, 305, 346, 351, 355, 383,
       408, 428, 444, 449, 469, 479, 481, 484, 500, 511, 516, 522, 523,
       524, 532, 561, 574, 576, 584, 586, 619, 636, 654, 656, 661, 666,
       672, 674, 688, 691, 719, 721, 724, 727, 744, 763, 766, 783, 786,
       795, 812, 817, 835, 836, 837, 840, 841, 860, 861, 864, 871, 874,
       899, 907, 911, 920, 932, 934, 936, 943, 945, 952, 961, 974])

In [85]:
len(incorrect)

90

In [88]:
idx = incorrect[0]
idx

2

In [89]:
text = x_test_text[idx]
text

'This is my first Deepa Mehta film. I saw the film on TV in its Hindi version with its "Sita" character presented as Nita. I also note that it is Radha who underwent the allegorical trial by fire in the film and not Nita/Sita. Yet what I loved about the film was its screenplay by Ms Mehta, not her direction. The characters, big and small, were well-developed and seemed quixotic towards the end--somewhat like the end of Mazursky\'s "An Unmarried Woman." They are brave women surrounded by cardboard men. And one cardboard man (Ashok) seems to come alive in the last shot we see of him---carrying his invalid mother Biji. He seems to finally take on a future responsibility beyond celibacy and adherance to religion. <br /><br />Ms Mehta seems to fumble as a director (however, compared to most Indian mainstream cinema she would seem to be brilliant) as she cannot use her script to go beyond the microscopic joint family she is presenting except presenting a glimpse of the Chinese micro-minority

In [90]:
y_pred[idx]

0.20003621

In [91]:
# True label of the misclassified review. idx indexes the TEST set (it was
# used above for x_test_text and the test predictions), so the label must be
# read from y_test, not y_train.
y_test[idx]

1.0

# Testing the model on made-up data

In [None]:
text1 = "This movie is fantastic! I really like it because it is so good!"
text2 = "Good movie!"
text3 = "Maybe I like this movie."
text4 = "Meh ..."
text5 = "If I were a drunk teenager then this movie might be good."
text6 = "Bad movie!"
text7 = "Not a good movie!"
text8 = "This movie really sucks! Can I get my money back please?"
texts = [text1, text2, text3, text4, text5, text6, text7, text8]

In [93]:
tokens = tokenizer.texts_to_sequences(texts)

In [95]:
tokens_pad = pad_sequences(tokens, 
                           maxlen = max_tokens,
                           padding = pad, 
                           truncating = pad)
tokens_pad.shape


(8, 544)

In [98]:
y_custom_predict = model.predict(tokens_pad)
y_custom_predict

array([[0.90928835],
       [0.69783974],
       [0.27189565],
       [0.65990895],
       [0.12890694],
       [0.16621019],
       [0.3852682 ],
       [0.05609022]], dtype=float32)

In [100]:
# model.predict returns one row per text with a single output column. Take
# that column first (same .T[0] step used for y_pred) and threshold it in one
# vectorised operation, instead of looping over length-1 rows and relying on
# their implicit truth value.
cls_custom_pred = np.where(y_custom_predict.T[0] > 0.5, 1.0, 0.0)
cls_custom_pred

array([1., 1., 0., 1., 0., 0., 0., 0.])

# Analysis of embeddings

In [102]:
layer_embedding = model.get_layer("layer_embedding")

In [112]:
embedding_weights = layer_embedding.get_weights()[0]
# len(embedding_weights)
embedding_weights.shape

(10000, 8)

In [113]:
token_good = tokenizer.word_index['good']
token_good

49

In [114]:
token_great = tokenizer.word_index['great']
token_great

78

In [115]:
embedding_weights[token_good]

array([-0.02938217,  0.07468928,  0.04685117, -0.01904002, -0.04738949,
       -0.00919717, -0.1028757 ,  0.06023201], dtype=float32)

In [116]:
embedding_weights[token_great]

array([-0.0923152 ,  0.09625982,  0.16200343, -0.12702517, -0.12175523,
       -0.11960355, -0.14113873,  0.13574837], dtype=float32)

In [117]:
token_bad = tokenizer.word_index["bad"]
token_horrible = tokenizer.word_index["horrible"]

In [118]:
embedding_weights[token_bad]

array([ 0.10442476, -0.10574416, -0.12214937,  0.16064923,  0.13964306,
        0.02235492,  0.15346052, -0.13871054], dtype=float32)

In [119]:
embedding_weights[token_horrible]

array([ 0.08369003, -0.16367036, -0.2057716 ,  0.20550853,  0.16473329,
        0.0583082 ,  0.17677392, -0.21145464], dtype=float32)

In [122]:
def print_sorted_words(word, metric='cosine', k=10):
    """
    Print the words in the vocabulary sorted according to their
    embedding-distance to the given word.

    Reads the module-level `tokenizer`, `embedding_weights` and
    `inverse_map`.

    Args:
        word: Word to compare against; must be a key of
            tokenizer.word_index.
        metric: Name of the distance metric handed to scipy's cdist,
            e.g. 'cosine' or 'euclidean'.
        k: Number of words to print from the top and from the bottom of
            the sorted list (default 10, the value that was previously
            hard-coded).

    Raises:
        KeyError: If `word` is not in tokenizer.word_index.
    """

    # Get the token (i.e. integer ID) for the given word.
    token = tokenizer.word_index[word]

    # Get the embedding for the given word. The embedding-weight-matrix
    # is indexed by the word-tokens, which are integer IDs.
    embedding = embedding_weights[token]

    # Distance between this word's embedding and every row of the
    # embedding matrix; cdist returns a column, so take it as a 1-D array.
    distances = cdist(embedding_weights, [embedding],
                      metric=metric).T[0]

    # Row indices (= tokens) ordered by increasing distance.
    sorted_index = np.argsort(distances)

    # Token 0 has no word in inverse_map, so drop it. This must happen
    # BEFORE the words and distances are derived: filtering only the word
    # list (as was done previously) shifts it against the distance list
    # and pairs words with the wrong distances.
    sorted_index = sorted_index[sorted_index != 0]

    # Distances and words in the same (filtered) order, so they stay aligned.
    sorted_distances = distances[sorted_index]
    sorted_words = [inverse_map[tok] for tok in sorted_index]

    # Helper-function for printing words and embedding-distances.
    def _print_words(words, dists):
        for shown_word, dist in zip(words, dists):
            print("{0:.3f} - {1}".format(dist, shown_word))

    # Clamp k to a usable range; a plain [-k:] slice would return the
    # whole list for k == 0.
    n_show = max(0, min(k, len(sorted_words)))
    tail_start = len(sorted_words) - n_show

    print("Distance from '{0}':".format(word))

    # Print the words with smallest embedding-distance.
    _print_words(sorted_words[:n_show], sorted_distances[:n_show])

    print("...")

    # Print the words with highest embedding-distance.
    _print_words(sorted_words[tail_start:], sorted_distances[tail_start:])

In [123]:
print_sorted_words('great', metric='cosine')

Distance from 'great':
0.000 - great
0.003 - christy
0.004 - uncompromising
0.005 - atmosphere
0.005 - fears
0.005 - beautifully
0.006 - servants
0.007 - wonderful
0.008 - liberty
0.009 - gently
...
1.986 - poor
1.986 - moronic
1.986 - pope
1.987 - pray
1.987 - baldwin
1.987 - lazy
1.988 - badness
1.990 - writes
1.993 - komodo
1.994 - retrieve


In [124]:
print_sorted_words('worst', metric='cosine')

Distance from 'worst':
0.000 - worst
0.004 - acquired
0.005 - dull
0.005 - mess
0.006 - forgettable
0.008 - imitation
0.008 - motive
0.009 - badly
0.009 - applies
0.009 - parking
...
1.989 - unravel
1.989 - horrors
1.990 - superbly
1.991 - seat
1.991 - wang
1.992 - marie
1.992 - delightful
1.993 - states
1.994 - devotion
1.996 - best
