Importing required modules 

In [1]:
import numpy as np
import pandas as pd
from string import punctuation
from sklearn.model_selection import train_test_split 
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential     # the model
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import ModelCheckpoint 
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
import cv2
import os

In [2]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

*Importing the dataframe*

In [3]:
df=pd.read_csv('/content/drive/MyDrive/Colab Notebooks/IMDB/dataframe_train.csv')

In [4]:
#Visualizing the dataset
df

Unnamed: 0,file,review,label
0,127_7.txt,Zentropa has much in common with The Third Man...,pos
1,126_10.txt,Zentropa is the most original movie I've seen ...,pos
2,125_7.txt,Lars Von Trier is never backward in trying out...,pos
3,124_10.txt,*Contains spoilers due to me having to describ...,pos
4,123_10.txt,That was the first thing that sprang to mind a...,pos
...,...,...,...
24995,12420_3.txt,There just isn't enough here. There a few funn...,neg
24996,12419_1.txt,Tainted look at kibbutz life<br /><br />This f...,neg
24997,12418_4.txt,"I saw this movie, just now, not when it was re...",neg
24998,12417_1.txt,Any film which begins with a cowhand shagging ...,neg


Pre-processing on the dataset

In [5]:
for i in range(len(df)):
  text=df['review'][i].lower()
  text=text.replace("<br /><br />",'')
  clean_txt=''.join([c for c in text if c not in punctuation])
  df['review'][i]=clean_txt

In [6]:
english_stops = set(stopwords.words('english'))
x_data = df['review']
list_rev = x_data.apply(lambda review: [w for w in review.split() if w not in english_stops])

In [7]:
list_rev=np.asarray(list_rev)

In [8]:
#Encoding the labels
for i in range(len(df)):
  if df['label'][i]== 'pos':
    df['label'][i]=1
  elif df['label'][i]=='neg':
    df['label'][i]=0

In [9]:
encoded_labels=np.asarray(df['label']).astype('float32')

Splitting the Dataset into test(80%), train(10%) and validation(10%) set 

In [10]:
x_train, x_tes, y_train, y_tes = train_test_split(list_rev, encoded_labels, test_size = 0.2)
x_test, x_valid, y_test, y_valid = train_test_split(x_tes, y_tes, test_size = 0.5)

In [11]:
#Function to get maximum length of dataset
def get_max_length():
    review_length = []
    for review in x_train:
        review_length.append(len(review))

    return int(np.ceil(np.mean(review_length)))

In [12]:
# ENCODING REVIEW by Padding the x_train,x_test and x_valid to fed into the LSTM
token = Tokenizer(lower=False)  
token.fit_on_texts(x_train)
x_train = token.texts_to_sequences(x_train)
x_test = token.texts_to_sequences(x_test)
x_valid = token.texts_to_sequences(x_valid)

max_length = get_max_length()

x_train = pad_sequences(x_train, maxlen=max_length, padding='post', truncating='post')
x_test = pad_sequences(x_test, maxlen=max_length, padding='post', truncating='post')
x_valid = pad_sequences(x_valid, maxlen=max_length, padding='post', truncating='post')

total_words = len(token.word_index) + 1   # add 1 because of 0 padding

print('Encoded X Train\n', x_train, '\n')
print('Encoded X Test\n', x_test, '\n')
print('Maximum review length: ', max_length)

Encoded X Train
 [[ 1688  3367     1 ...    41    55   415]
 [    1 51089    83 ...     0     0     0]
 [  311     8   101 ...     0     0     0]
 ...
 [  422   155  1599 ...     0     0     0]
 [  110    30    24 ...     5    10   151]
 [  101   211  4802 ...     0     0     0]] 

Encoded X Test
 [[  1537   2667 110658 ...  26628    833      9]
 [  6132      1   1086 ...   7246    712   8867]
 [ 77895  13258  16039 ...      0      0      0]
 ...
 [    47     19    177 ...      0      0      0]
 [     3    403     24 ...      0      0      0]
 [   329   3819     69 ...      0      0      0]] 

Maximum review length:  122


Creating Model Architecture

In [13]:
# ARCHITECTURE
EMBED_DIM = 32
LSTM_OUT = 64

model = Sequential()
model.add(Embedding(total_words, EMBED_DIM, input_length = max_length))
model.add(LSTM(LSTM_OUT))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

print(model.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 122, 32)           3981248   
_________________________________________________________________
lstm (LSTM)                  (None, 64)                24832     
_________________________________________________________________
dropout (Dropout)            (None, 64)                0         
_________________________________________________________________
dense (Dense)                (None, 1)                 65        
Total params: 4,006,145
Trainable params: 4,006,145
Non-trainable params: 0
_________________________________________________________________
None


In [14]:
checkpoint = ModelCheckpoint(
    'models/LSTM.h5',
    monitor='accuracy',
    save_best_only=True,
    verbose=1
)

In [15]:
model.fit(x_train, y_train, batch_size = 128, epochs = 5,validation_data=(x_test,y_test), callbacks=[checkpoint])

Epoch 1/5

Epoch 00001: accuracy improved from -inf to 0.54685, saving model to models/LSTM.h5
Epoch 2/5

Epoch 00002: accuracy improved from 0.54685 to 0.86280, saving model to models/LSTM.h5
Epoch 3/5

Epoch 00003: accuracy improved from 0.86280 to 0.95770, saving model to models/LSTM.h5
Epoch 4/5

Epoch 00004: accuracy improved from 0.95770 to 0.98560, saving model to models/LSTM.h5
Epoch 5/5

Epoch 00005: accuracy improved from 0.98560 to 0.99410, saving model to models/LSTM.h5


<keras.callbacks.History at 0x7efd291e2910>

**Getting prediction on validation Dataset**

In [18]:
y_pred1 = model.predict(x_valid, batch_size = 128)
y_pred=[int(num>0.5) for num in y_pred1]
true = 0
for i, y in enumerate(y_valid):
    if y == y_pred[i]:
        true += 1

print('Correct Prediction: {}'.format(true))
print('Wrong Prediction: {}'.format(len(y_pred) - true))
print('Accuracy: {}'.format(true/len(y_pred)*100))

Correct Prediction: 2152
Wrong Prediction: 348
Accuracy: 86.08
