In [13]:
#importing libraries

import re
import pandas as pd
import numpy as np
import nltk
import math
import keras
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

In [14]:
# load the dataset in pandas data frame

df = pd.read_csv('IMDB Dataset.csv')
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [15]:
def remove_tags(string):
    removelist = ""
    result = re.sub('','',string)#remove HTML tags
    result = re.sub('https://.*','',result)   #remove URLs
    result = re.sub(r'[^\w'+removelist+']', ' ',result)    #remove non-alphanumeric characters
    result = result.lower()
    return result

df['review']=df['review'].apply(lambda x : x.replace(x,remove_tags(x)))

In [16]:
df

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production br br the...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there s a family where a little boy ...,negative
4,petter mattei s love in the time of money is...,positive
...,...,...
49995,i thought this movie did a down right good job...,positive
49996,bad plot bad dialogue bad acting idiotic di...,negative
49997,i am a catholic taught in parochial elementary...,negative
49998,i m going to have to disagree with the previou...,negative


In [17]:
# removing the stopwords
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

df['review'] = df['review'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [18]:
# performing lemmatization
#obtain root forms of words
nltk.download('wordnet')
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()
def lemmatize_text(text):
    st = ""
    for w in w_tokenizer.tokenize(text):
        st = st + lemmatizer.lemmatize(w) + " "
    return st
df['review'] = df.review.apply(lemmatize_text)
df

[nltk_data] Downloading package wordnet to /root/nltk_data...


Unnamed: 0,review,sentiment
0,one reviewer mentioned watching 1 oz episode h...,positive
1,wonderful little production br br filming tech...,positive
2,thought wonderful way spend time hot summer we...,positive
3,basically family little boy jake think zombie ...,negative
4,petter mattei love time money visually stunnin...,positive
...,...,...
49995,thought movie right good job creative original...,positive
49996,bad plot bad dialogue bad acting idiotic direc...,negative
49997,catholic taught parochial elementary school nu...,negative
49998,going disagree previous comment side maltin on...,negative


In [22]:
# checking if it is a balanced dataset or not

s = 0.0
for i in df['review']:
    w_list = i.split()
    s += len(w_list)

print("Avg length of each review: ",s/df.shape[0])

Avg length of each review:  123.62852


In [23]:
pos = 0
for i in range(df.shape[0]):
    if df.iloc[i]['sentiment']=='positive':
        pos+=1
neg = df.shape[0]-pos
print("positive sentiment percentage: ",pos/df.shape[0]*100)
print("positive sentiment percentage: ",neg/df.shape[0]*100)

positive sentiment percentage:  50.0
positive sentiment percentage:  50.0


In [26]:
#encoding labels and making train-Test Splits

# converting positive and negative to 0s and 1s using LabelEncoder()
reviews = df['review'].values
labels = df['sentiment'].values
encoder = LabelEncoder()
encoded = encoder.fit_transform(labels)

In [28]:
# split the dataset into test and train datasets
X_train, X_test, y_train,y_test = train_test_split(reviews,encoded,stratify = encoded)

In [29]:
'''
    before feeding the data into LSTM model
    it needs to be tokenized and padded
'''

'\n    before feeding the data into LSTM model\n    it needs to be tokenized and padded\n'

In [30]:
# hyperparameters
vocab_size = 3000
oov_tok = ''
embedding_dim = 100

max_length = 200
padding_type = 'post'
trunc_type = 'post'

#tokenizing sentences
tokenizer = Tokenizer(num_words = vocab_size, oov_token = oov_tok)
tokenizer.fit_on_texts(X_train)
word_index = tokenizer.word_index

#convert train dataset to sequence and pad sequences
train_sequences = tokenizer.texts_to_sequences(X_train)
train_padded = pad_sequences(train_sequences,padding = 'post',maxlen = max_length)

#convert Test Dataset to sequence and pad sequences
test_sequences = tokenizer.texts_to_sequences(X_test)
test_padded = pad_sequences(test_sequences,padding = 'post',maxlen = max_length)

In [32]:
#Building the model

model = keras.Sequential([
    keras.layers.Embedding(vocab_size,embedding_dim,input_length = max_length),
    keras.layers.Bidirectional(keras.layers.LSTM(64)),
    keras.layers.Dense(24,activation='relu'),
    keras.layers.Dense(1,activation='sigmoid')
])


#compile model
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

#model summary
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 200, 100)          300000    
                                                                 
 bidirectional (Bidirection  (None, 128)               84480     
 al)                                                             
                                                                 
 dense (Dense)               (None, 24)                3096      
                                                                 
 dense_1 (Dense)             (None, 1)                 25        
                                                                 
Total params: 387601 (1.48 MB)
Trainable params: 387601 (1.48 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [33]:
# model training and evaluation

num_epochs = 5
history = model.fit(train_padded,y_train,
                    epochs = num_epochs, verbose=1,
                    validation_split=0.1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [36]:
prediction = model.predict(test_padded)

#get labels based on probablity 1 if p>=0.5 else 0
pred_label=[]
for i in prediction:
    if i>=0.5:
        pred_label.append(1)
    else:
        pred_label.append(0)
print("Accuray of prediction on test set: ",accuracy_score(y_test,pred_label))

Accuray of prediction on test set:  0.86512


### Using the model

In [39]:
'''
    using model to determine the sentiment
    of unseen movie reviews
'''

sentence = ["The movie was very touching and heart whelming",
            "I have never seen a terrible movie like this",
            "the movie plot is sooo terible but it had good acting"]
#convert to sequence
sequences = tokenizer.texts_to_sequences(sentence)

#pad the sequence
padded = pad_sequences(sequences,padding='post',maxlen=max_length)

# Get labels based on probability 1 if p>= 0.5 else 0
prediction = model.predict(padded)
pred_labels = []
for i in prediction:
    if i >= 0.5:
        pred_labels.append(1)
    else:
        pred_labels.append(0)
for i in range(len(sentence)):
    print(sentence[i])
    if pred_labels[i] == 1:
        s = 'Positive'
    else:
        s = 'Negative'
    print("Predicted sentiment : ",s)

The movie was very touching and heart whelming
Predicted sentiment :  Positive
I have never seen a terrible movie like this
Predicted sentiment :  Negative
the movie plot is sooo terible but it had good acting
Predicted sentiment :  Negative
