<a href="https://colab.research.google.com/github/safal25/ml_basic_codes/blob/main/IMDB_sentiment_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#importing the necessary libraries
import tensorflow as tf

In [2]:
import tensorflow.keras as keras
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


In [3]:
#short alias for the IMDB reviews dataset module that ships with keras
imdb = keras.datasets.imdb

In [4]:
#fetching the IMDB reviews, restricted to the vocab_size most frequent words
#load_data hands back the train split and the test split as (data,labels) pairs
vocab_size=10000
train_split,test_split=imdb.load_data(num_words=vocab_size)
train_data,train_labels=train_split
test_data,test_labels=test_split

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz


  x_train, y_train = np.array(xs[:idx]), np.array(labels[:idx])
  x_test, y_test = np.array(xs[idx:]), np.array(labels[idx:])


In [5]:
#the reviews are stored as integer ids, so fetch the word -> id lookup table
#and shift every id up by 3, which frees the lowest ids for special tokens
word_index=dict((word,idx+3) for word,idx in imdb.get_word_index().items())

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json


In [6]:
#registering the special tokens on the ids 0-3
#(the ids of all real words were shifted up by 3 to make room for them)
word_index["<PAD>"]=0
#fixed: the key used to be "<START" (closing bracket missing)
word_index["<START>"]=1
word_index["<UNK>"]=2
word_index["<UNUSED>"]=3

In [7]:
#sanity check: look up the ids of a few words
s=["the","movie","was","beautiful"]
arr=list(map(lambda word:word_index[word],s))

In [8]:
#displaying the ids that were looked up for the sample words
arr

[4, 20, 16, 307]

In [None]:
#inverting the lookup table so that ids can be turned back into words
reverse_word_index={idx:word for word,idx in word_index.items()}

In [None]:
#the reviews in the dataset are lists of integer ids
#decode_review turns such a list back into readable text
def decode_review(text):
  """Return the words for the ids in `text`, joined by single spaces.

  Ids that have no entry in reverse_word_index are rendered as '?'.
  """
  words=[]
  for token_id in text:
    words.append(reverse_word_index.get(token_id,'?'))
  return " ".join(words)

review=decode_review(train_data[0])

In [None]:
#printing the first review of the training dataset (no trailing newline)
print(review,end='')

<START this film was just brilliant casting location scenery story direction everyone's really suited the part they played and you could just imagine being there robert <UNK> is an amazing actor and now the same being director <UNK> father came from the same scottish island as myself so i loved the fact there was a real connection with this film the witty remarks throughout the film were great it was just brilliant so much that i bought the film as soon as it was released for <UNK> and would recommend it to everyone to watch and the fly fishing was amazing really cried at the end it was so sad and you know what they say if you cry at a film it must have been good and this definitely was also <UNK> to the two little boy's that played the <UNK> of norman and paul they were just brilliant children are often left out of the <UNK> list i think because the stars that play them all grown up are such a big profile for the whole film but these children are amazing and should be praised for what

In [10]:
#the model needs every review to have the same length, fixed here at 500 tokens
#shorter reviews are filled up at the end with the <PAD> id (padding='post')
#longer reviews are cut down to 500 tokens (pad_sequences truncates from the
#front unless truncating='post' is passed)
pad_id=word_index["<PAD>"]
train_data,test_data=[
    keras.preprocessing.sequence.pad_sequences(split,value=pad_id,padding='post',maxlen=500)
    for split in (train_data,test_data)
]

In [9]:
#building the model, layer by layer
#1. embedding layer: maps each of the vocab_size word ids to a vector of length 16
#2. global average pooling layer: averages those vectors over the 500 positions of a review
#3. dense layer with 16 hidden units and relu as activation function
#4. output layer: a single unit with sigmoid as activation function

model=keras.Sequential()
model.add(keras.layers.Embedding(vocab_size,16,input_length=500))
model.add(keras.layers.GlobalAveragePooling1D())
model.add(keras.layers.Dense(16,activation='relu'))
model.add(keras.layers.Dense(1,activation='sigmoid'))

In [11]:
#compiling the model with
#  optimizer: adam
#  loss:      binary cross entropy
#  metric:    accuracy
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [12]:
#training the model for 30 epochs in batches of 512 reviews
#NOTE(review): the test split is passed as validation data here, so it is not
#a fully held-out set for the evaluation that follows
history=model.fit(
    train_data,
    train_labels,
    epochs=30,
    batch_size=512,
    validation_data=(test_data,test_labels),
)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [13]:
#measuring the loss and the accuracy on the testing data
#an earlier run reported an accuracy of 88.48% on the test data
test_metrics=model.evaluate(test_data,test_labels)
loss,accuracy=test_metrics



In [24]:
#printing the layers of the model with their output shapes and parameter counts
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 500, 16)           160000    
_________________________________________________________________
global_average_pooling1d (Gl (None, 16)                0         
_________________________________________________________________
dense (Dense)                (None, 16)                272       
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 17        
Total params: 160,289
Trainable params: 160,289
Non-trainable params: 0
_________________________________________________________________


In [14]:
#a review copied from imdb; it is encoded and given to the model in the cells below
string="Scam 1992 The Harshad Mehta story is a brilliant web series directed by Hansal Mehta I have been a Hansal Mehta fan since Bose and Omerta His direction is mindblowing Performance by Pratik Gandhi Shreya Dhanwanthary and others are good. In short a definite watch"


In [15]:
#splitting the review on whitespace into a list of words
arr=string.split()

In [18]:
#since this review is in text format
#we need to convert it in integer encoded format before giving it as an input to the model
#review encoder function encodes the review
def review_encoder(text):
  """Encode a list of words as integer ids that the model can consume.

  Each word is looked up in word_index as given and, if that fails, once
  more after lower-casing it and stripping surrounding punctuation. Words
  that are still unknown, and words whose id is not below vocab_size, are
  encoded with the <UNK> id.

  text: iterable of word strings (e.g. the result of str.split)
  returns: list of int ids, one per word, every id smaller than vocab_size
  """
  unk_id=word_index["<UNK>"]
  encoded=[]
  for word in text:
    word_id=word_index.get(word)
    if word_id is None:
      #retry with the word normalised, e.g. "The" -> "the", "good." -> "good"
      word_id=word_index.get(word.lower().strip('.,!?;:"()'),unk_id)
    #ids from vocab_size upwards have no row in the embedding layer
    if word_id>=vocab_size:
      word_id=unk_id
    encoded.append(word_id)
  return encoded

scam_review=review_encoder(arr)

In [21]:
#turning the encoded review into a numpy array with a leading axis of size 1
scam_review=np.array(scam_review)[np.newaxis,:]

In [22]:
#padding the review with the <PAD> id up to a length of 500
scam_review=keras.preprocessing.sequence.pad_sequences(
    scam_review,
    maxlen=500,
    padding='post',
    value=word_index["<PAD>"],
)

In [27]:
#checking the prediction of our model on the random review
#the model output is compared against 0.5 to obtain the class label (0 or 1)
probability=model.predict(scam_review)
(probability > 0.5).astype("int32")

array([[1]], dtype=int32)