In [None]:
import tensorflow as tf
import pandas as pd
from keras.preprocessing import sequence,text
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Dense,Dropout,Embedding,LSTM,Conv1D,GlobalMaxPooling1D,Flatten,MaxPooling1D,GRU,SpatialDropout1D,Bidirectional
from keras.callbacks import EarlyStopping
from tensorflow.keras.utils import to_categorical
from keras.losses import categorical_crossentropy
from tensorflow.keras.optimizers import Adam
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

**LOADING DATASET**

In [None]:
dataset=pd.read_csv("drive/MyDrive/imdb/IMDB Dataset.csv")

In [None]:
dataset

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [None]:
def f(row):
    if row['sentiment'] == 'positive':
        val = 1
    else:
        val = 0
    return val

In [None]:
dataset['Y'] = dataset.apply(f, axis=1)

In [None]:
dataset

Unnamed: 0,review,sentiment,Y
0,One of the other reviewers has mentioned that ...,positive,1
1,A wonderful little production. <br /><br />The...,positive,1
2,I thought this was a wonderful way to spend ti...,positive,1
3,Basically there's a family where a little boy ...,negative,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,1
...,...,...,...
49995,I thought this movie did a down right good job...,positive,1
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative,0
49997,I am a Catholic taught in parochial elementary...,negative,0
49998,I'm going to have to disagree with the previou...,negative,0


**PREPROCESSING**

In [None]:
%%time
import re
#Removes Punctuations
def remove_punctuations(data):
    punct_tag=re.compile(r'[^\w\s]')
    data=punct_tag.sub(r'',data)
    return data

#Removes HTML syntaxes
def remove_html(data):
    html_tag=re.compile(r'<.*?>')
    data=html_tag.sub(r'',data)
    return data

#Removes URL data
def remove_url(data):
    url_clean= re.compile(r"https://\S+|www\.\S+")
    data=url_clean.sub(r'',data)
    return data

#Removes Emojis
def remove_emoji(data):
    emoji_clean= re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    data=emoji_clean.sub(r'',data)
    url_clean= re.compile(r"https://\S+|www\.\S+")
    data=url_clean.sub(r'',data)
    return data

dataset['review']=dataset['review'].apply(lambda z: remove_punctuations(z))

dataset['review']=dataset['review'].apply(lambda z: remove_html(z))
dataset['review']=dataset['review'].apply(lambda z: remove_url(z))
dataset['review']=dataset['review'].apply(lambda z: remove_emoji(z))

CPU times: user 4.98 s, sys: 10.3 ms, total: 4.99 s
Wall time: 5.01 s


In [None]:
dataset

Unnamed: 0,review,sentiment,Y
0,One of the other reviewers has mentioned that ...,positive,1
1,A wonderful little production br br The filmin...,positive,1
2,I thought this was a wonderful way to spend ti...,positive,1
3,Basically theres a family where a little boy J...,negative,0
4,Petter Matteis Love in the Time of Money is a ...,positive,1
...,...,...,...
49995,I thought this movie did a down right good job...,positive,1
49996,Bad plot bad dialogue bad acting idiotic direc...,negative,0
49997,I am a Catholic taught in parochial elementary...,negative,0
49998,Im going to have to disagree with the previous...,negative,0


In [None]:
import nltk
nltk.download('wordnet')

from nltk.stem import WordNetLemmatizer


def lemma_traincorpus(data):
    lemmatizer=WordNetLemmatizer()
    out_data=""
    for words in data:
        out_data+= lemmatizer.lemmatize(words)
    return out_data

dataset['review']=dataset['review'].apply(lambda z: lemma_traincorpus(z))

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [None]:
dataset

Unnamed: 0,review,sentiment,Y
0,One of the other reviewers has mentioned that ...,positive,1
1,A wonderful little production br br The filmin...,positive,1
2,I thought this was a wonderful way to spend ti...,positive,1
3,Basically theres a family where a little boy J...,negative,0
4,Petter Matteis Love in the Time of Money is a ...,positive,1
...,...,...,...
49995,I thought this movie did a down right good job...,positive,1
49996,Bad plot bad dialogue bad acting idiotic direc...,negative,0
49997,I am a Catholic taught in parochial elementary...,negative,0
49998,Im going to have to disagree with the previous...,negative,0


In [None]:
seed = 0

import random
import numpy as np
random.seed(seed)
np.random.seed(seed)
tf.random.set_seed(seed)

In [None]:
sampled_dataset=dataset.sample(frac=0.3, random_state=seed).reset_index(drop=True)

In [None]:
sampled_dataset

Unnamed: 0,review,sentiment,Y
0,John Cassavetes is on the run from the law He ...,positive,1
1,Its not just that the movie is lame Its more t...,negative,0
2,Well if it werent for Ethel Waters and a 7year...,negative,0
3,I find Alan Jacobs review very accurate concer...,positive,1
4,This movie is simply awesome It is so hilariou...,positive,1
...,...,...,...
14995,The most intense and powerful film I have seen...,positive,1
14996,Hello it is I Derrick Cannon and I welcome you...,negative,0
14997,I was surprised as I watched this movie how mu...,positive,1
14998,This movie has its ups and downs but to me the...,positive,1


In [None]:
X=sampled_dataset['review']
Y=sampled_dataset['Y'].values


In [None]:

from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=5000, random_state=seed)

In [None]:
X_train

12838    In the 3rd installment of Left Behind the make...
4772     The sun was not shining it was too wet to play...
5342     I absolutely ADORED this movie as a child and ...
9333     From what I understand Fox was embarrassed the...
6499     The Beauty The Terror The Poetry The Horror Th...
                               ...                        
13123    Busty beauty Stacie Randall plays PVC clad bad...
3264     This movie was rented by a friend Her choice i...
9845     Me and a friend rented this movie because it s...
10799    A romp across a disbelieving outback this outr...
2732     What a disappointmentbr br This film seemed to...
Name: review, Length: 10000, dtype: object

**TOKENISATION AND PADDING**

In [None]:
max_features=10000
max_len=500

X_train = X_train.map(lambda x: x.lower())
X_test = X_test.map(lambda x: x.lower())

tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(X_train)

X_train = tokenizer.texts_to_sequences(X_train)
X_train = pad_sequences(X_train, maxlen=max_len)
X_test = tokenizer.texts_to_sequences(X_test)
X_test = pad_sequences(X_test, maxlen=max_len)

In [None]:
from sklearn.model_selection import train_test_split
X_tr, X_val, Y_tr, Y_val = train_test_split(X_train, Y_train, test_size=0.25, random_state=seed)

**BUILDING LSTM MODEL**

In [None]:
model = Sequential()


model.add(Embedding(max_features,100,mask_zero=True))
model.add(LSTM(64,dropout=0.4, recurrent_dropout=0.4,return_sequences=True))
model.add(LSTM(32,dropout=0.5, recurrent_dropout=0.5,return_sequences=False)) 
model.add(Dense(1,activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer=Adam(lr=0.001),metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 100)         1000000   
                                                                 
 lstm (LSTM)                 (None, None, 64)          42240     
                                                                 
 lstm_1 (LSTM)               (None, 32)                12416     
                                                                 
 dense (Dense)               (None, 1)                 33        
                                                                 
Total params: 1,054,689
Trainable params: 1,054,689
Non-trainable params: 0
_________________________________________________________________


  super(Adam, self).__init__(name, **kwargs)


In [None]:
epochs = 4
batch_size = 32

In [None]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X_tr, Y_tr, validation_data=(X_val, Y_val), epochs=epochs, batch_size=batch_size, verbose=1)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x7f265b1d0350>

**PREDICTION**

In [None]:
from sklearn.metrics import classification_report
import numpy as np

y_pred = model.predict(X_test, batch_size=batch_size, verbose=1)
y_pred_bool = np.argmax(y_pred, axis=1)

print(classification_report(Y_test, y_pred_bool))

              precision    recall  f1-score   support

           0       0.51      1.00      0.67      2537
           1       0.00      0.00      0.00      2463

    accuracy                           0.51      5000
   macro avg       0.25      0.50      0.34      5000
weighted avg       0.26      0.51      0.34      5000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive
