**Movie Sentiment Analysis using RNN**

In [1]:
# Import libraries
from keras.datasets import imdb
from keras.utils import pad_sequences
from keras import Sequential
from keras.layers import Dense, SimpleRNN, Embedding, Flatten

In [2]:
# IMDB dataset having 50K movie reviews for natural language processing or Text analytics.
#Here we have set of 25,000 highly polar movie reviews for training and 25,000 for testing.
#So, predict the number of positive and negative reviews using either classification or deep learning algorithms.
#only consider the top 10,000 most common words,

In [24]:
# Load the dataset
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=10000)

In [25]:
X_train[0]

[1,
 14,
 22,
 16,
 43,
 530,
 973,
 1622,
 1385,
 65,
 458,
 4468,
 66,
 3941,
 4,
 173,
 36,
 256,
 5,
 25,
 100,
 43,
 838,
 112,
 50,
 670,
 2,
 9,
 35,
 480,
 284,
 5,
 150,
 4,
 172,
 112,
 167,
 2,
 336,
 385,
 39,
 4,
 172,
 4536,
 1111,
 17,
 546,
 38,
 13,
 447,
 4,
 192,
 50,
 16,
 6,
 147,
 2025,
 19,
 14,
 22,
 4,
 1920,
 4613,
 469,
 4,
 22,
 71,
 87,
 12,
 16,
 43,
 530,
 38,
 76,
 15,
 13,
 1247,
 4,
 22,
 17,
 515,
 17,
 12,
 16,
 626,
 18,
 2,
 5,
 62,
 386,
 12,
 8,
 316,
 8,
 106,
 5,
 4,
 2223,
 5244,
 16,
 480,
 66,
 3785,
 33,
 4,
 130,
 12,
 16,
 38,
 619,
 5,
 25,
 124,
 51,
 36,
 135,
 48,
 25,
 1415,
 33,
 6,
 22,
 12,
 215,
 28,
 77,
 52,
 5,
 14,
 407,
 16,
 82,
 2,
 8,
 4,
 107,
 117,
 5952,
 15,
 256,
 4,
 2,
 7,
 3766,
 5,
 723,
 36,
 71,
 43,
 530,
 476,
 26,
 400,
 317,
 46,
 7,
 4,
 2,
 1029,
 13,
 104,
 88,
 4,
 381,
 15,
 297,
 98,
 32,
 2071,
 56,
 26,
 141,
 6,
 194,
 7486,
 18,
 4,
 226,
 22,
 21,
 134,
 476,
 26,
 480,
 5,
 144,
 30,
 5535,
 18,

In [26]:
# To get shape
print('No of samples in training set', X_train.shape)
print('No of samples in test set', X_test.shape)

No of samples in training set (25000,)
No of samples in test set (25000,)


In [27]:
y_train[0]

1

In [28]:
# To get maximum length of review
print(len(max((X_train+ X_test), key=len)))

2697


In [29]:
# To get minimum length of review
print(len(min((X_train+ X_test), key=len)))

70


In [30]:
# Reshape the y value
import numpy as np
y_train = np.asarray(y_train).reshape((-1,1))
y_test = np.asarray(y_test).reshape((-1,1))

In [31]:
# Apply the padding and decide max length
X_train = pad_sequences (X_train, padding='post', maxlen=100)
X_test = pad_sequences (X_test, padding='post', maxlen=100)

In [32]:
X_train.shape

(25000, 100)

In [33]:
# using RNN with Embedding Technique
#Word Embedding: Turns positive integers (indexes) into dense vectors of fixed size.
# A word embedding is a class of approaches for representing words and documents using a dense vector representation.
#It is an improvement over more the traditional bag-of-word, OHE model encoding schemes where large sparse vectors were used
# to represent each word or to score each word within a vector to represent an entire vocabulary.
#There are three parameters to the embedding layer
#input_dim : Size of the vocabulary
#output_dim : Length of the vector for each word
#input_length Maximum length of a sequence



model1 = Sequential()
model1.add(Embedding (10000, 32, input_length=100))  # Unique words (vocabulary size)= 10000, embedding_size=32, maximum length = max_words.
model1.add(SimpleRNN(32)) #return_sequences = False
model1.add(Dense (1, activation='sigmoid'))



In [34]:
# For RNN with embedding
model1.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model1.fit(X_train, y_train, epochs=5, batch_size= 256, validation_data=(X_test, y_test))

Epoch 1/5
[1m98/98[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 81ms/step - accuracy: 0.5320 - loss: 0.6873 - val_accuracy: 0.7365 - val_loss: 0.5408
Epoch 2/5
[1m98/98[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 91ms/step - accuracy: 0.7864 - loss: 0.4716 - val_accuracy: 0.8356 - val_loss: 0.3999
Epoch 3/5
[1m98/98[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 75ms/step - accuracy: 0.8864 - loss: 0.2991 - val_accuracy: 0.8413 - val_loss: 0.3717
Epoch 4/5
[1m98/98[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 63ms/step - accuracy: 0.9356 - loss: 0.1848 - val_accuracy: 0.8349 - val_loss: 0.4027
Epoch 5/5
[1m98/98[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 71ms/step - accuracy: 0.9616 - loss: 0.1210 - val_accuracy: 0.8306 - val_loss: 0.4787


<keras.src.callbacks.history.History at 0x7f2de3ad6ec0>

In [36]:
# To get test accuracy
scores = model1.evaluate(X_test,y_test)
print('test accuracy', scores [1])

[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 8ms/step - accuracy: 0.8314 - loss: 0.4846
test accuracy 0.8306400179862976


In [38]:
# Prediction
y_pred = model1.predict(X_test)
y_pred

[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 8ms/step


array([[0.03859644],
       [0.96425533],
       [0.89469135],
       ...,
       [0.02489125],
       [0.01881581],
       [0.9839836 ]], dtype=float32)

In [39]:
# Covert probability to numbers
t1 = []
for i in range(len(y_pred)):
  if y_pred[i] >= 0.5:
    t1.append(1)
  else:
    t1.append(0)

In [40]:
print(t1)

[0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 

In [41]:
#Actual Values
y_test

array([[0],
       [1],
       [1],
       ...,
       [0],
       [0],
       [0]])