In [1]:
# RNN: getting started with sentiment analysis

In [1]:
from keras.datasets import imdb
from keras.preprocessing import sequence
import tensorflow as tf
import os
import numpy as np

# Vocabulary cap: passed to imdb.load_data(num_words=...) below and used as the
# input dimension of the Embedding layer further down.
VOCAB_SIZE = 88584

# Every review is padded to this many tokens before it is fed to the model.
MAXLEN = 250
# NOTE(review): BATCH_SIZE is not referenced by any of the cells shown here —
# confirm whether it was meant to be passed to model.fit().
BATCH_SIZE = 64

# Each review is a list of integer word ids; the labels are the sentiment targets.
(train_data, train_labels), (test_data, test_labels) = imdb.load_data(
    num_words=VOCAB_SIZE
)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz


In [2]:
# Token count of the second training review, before any padding is applied.
len(train_data[1])

189

In [3]:
'''
We can't feed reviews of different lengths into our neural network, so we use the sequence module from keras.preprocessing
to pad each review with blank tokens (zeros) so that every review has the same length.
'''

# Bring both splits to a fixed length of MAXLEN tokens in a single pass.
train_data, test_data = (
    sequence.pad_sequences(split, maxlen=MAXLEN)
    for split in (train_data, test_data)
)

In [4]:
# Show the second review again: after padding, the zeros fill the front of the row.
train_data[1]
# To verify the new length instead, evaluate len(train_data[1]) — it should now equal MAXLEN.

array([    0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     1,   194,
        1153,   194,  8255,    78,   228,     5,     6,  1463,  4369,
        5012,   134,    26,     4,   715,     8,   118,  1634,    14,
         394,    20,    13,   119,   954,   189,   102,     5,   207,
         110,  3103,    21,    14,    69,   188,     8,    30,    23,
           7,     4,   249,   126,    93,     4,   114,     9,  2300,
        1523,     5,   647,     4,   116,     9,    35,  8163,     4,
         229,     9,   340,  1322,     4,   118,     9,     4,   130,
        4901,    19,

In [5]:
'''
The first layer is an embedding layer. The dataset is already pre-processed into integer word ids, but the embedding layer
turns each id (including the padding zeros we introduced) into a dense vector, so the model can learn more meaningful relations between words.

The second layer is an LSTM (long short-term memory) layer.

The third layer is the output layer, with a sigmoid activation function.
'''

# Assemble the network layer by layer: word ids -> 32-d embeddings -> LSTM -> sigmoid score.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(VOCAB_SIZE, 32))
model.add(tf.keras.layers.LSTM(32))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

In [6]:
# Print the layer-by-layer overview (output shapes and parameter counts) of the model.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 32)          2834688   
                                                                 
 lstm (LSTM)                 (None, 32)                8320      
                                                                 
 dense (Dense)               (None, 1)                 33        
                                                                 
Total params: 2843041 (10.85 MB)
Trainable params: 2843041 (10.85 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [9]:
# Binary classification: cross-entropy loss, RMSprop optimizer, accuracy tracked as 'acc'.
model.compile(
    optimizer='rmsprop',
    loss='binary_crossentropy',
    metrics=['acc'],
)

# Hold out the last 20% of the training set for validation.
# NOTE(review): no batch_size is passed, so the BATCH_SIZE constant is not used here — confirm intent.
history = model.fit(
    x=train_data,
    y=train_labels,
    epochs=5,
    validation_split=0.2,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [10]:
# Score the trained model on the held-out test split; the result holds the loss
# followed by the compiled metric ('acc').
results = model.evaluate(x=test_data, y=test_labels)
print(results)

[0.37368038296699524, 0.8712400197982788]


In [24]:
'''
To make predictions on new reviews, we need to encode them in the same format as the
training data.
'''

word_index = imdb.get_word_index()

def encode_text(text):
  tokens = tf.keras.preprocessing.text.text_to_word_sequence(text)
  tokens = [word_index[word] if word in word_index else 0 for word in tokens]
  return sequence.pad_sequences([tokens],MAXLEN)[0]


text = 'that movie was just amazing, so beautiful'
encoded = encode_text(text)
print(encoded)


[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0  12  17  13  4

In [25]:
'''
we will make the decoding fucntion as well, so to decode the text we ahve received in the string
'''

reverse_word_index = {value: key for (key,value) in word_index.items()}

def decode_integers(integers):
  PAD = 0
  text = ''
  for num in integers:
    if num!=PAD:
      text += reverse_word_index[num]+' '

  return text[:-1]


print(decode_integers(encoded))


that movie was just amazing so beautiful


In [40]:
'''
Now we create a function to make a prediction on any text we want
'''

def predict(text):
  '''
  Print the model's sentiment score for a raw review string.

  text: the review as a plain string; it is encoded with encode_text().
  Prints the first row of model.predict()'s output and returns nothing.
  '''
  encoded_text = encode_text(text)
  # The model expects a batch, so add a leading batch axis instead of copying
  # into a buffer with a hard-coded width (which breaks if MAXLEN ever changes).
  pred = np.expand_dims(encoded_text, axis=0)
  result = model.predict(pred)
  print(result[0])

positive_review = 'that movie was just amazing, so beautiful,so great, very good, nice acting, award'
predict(positive_review)

negative_review = 'bad movie,cringe,worst'
predict(negative_review)

[0.8152614]
[0.4136334]


In [None]:
'''
The higher the number, the better (more positive) the review;
the lower the number, the worse (more negative) the review.
'''