In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelBinarizer
from bs4 import BeautifulSoup
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import tensorflow as tf
from tensorflow import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from keras.callbacks import ModelCheckpoint

In [2]:
# Label encoder used below to turn the `sentiment` column into numeric targets.
lb = LabelBinarizer()

In [3]:
# Load the dataset from a CSV in the working directory; later cells read its
# `review` and `sentiment` columns.
df = pd.read_csv('IMDB Dataset.csv')

In [4]:
# Overwrite the raw `sentiment` labels with the binarizer's encoding
# (the binarizer is fit on this same column).
df['sentiment'] = lb.fit_transform(df['sentiment'])

In [5]:
#Strip HTML markup
def strip_html(text):
    """Parse `text` with BeautifulSoup's html.parser and return its text content."""
    return BeautifulSoup(text, "html.parser").get_text()

#Removing the square brackets
def remove_between_square_brackets(text):
    """Delete every `[...]` span (brackets included) from `text`.

    A span runs from a `[` to the first following `]`. The text around a
    removed span is left untouched, so surrounding whitespace is preserved.

    Args:
        text: Input string.

    Returns:
        `text` with all bracketed spans removed.
    """
    # Raw string: `\[` and `\]` are regex escapes, not Python string escapes.
    # In a plain literal they trigger an invalid-escape-sequence warning.
    return re.sub(r'\[[^]]*\]', '', text)

#Clean up noisy markup in a review
def denoise_text(text):
    """Strip HTML from `text`, then drop any square-bracketed spans."""
    return remove_between_square_brackets(strip_html(text))
#Denoise every entry of the review column
df['review'] = df['review'].apply(denoise_text)



In [6]:
#Define function for removing special characters 
def remove_special_characters(text):
    """Strip non-alphanumeric characters, lowercase, and split into tokens.

    Every character that is not an ASCII letter, a digit, or whitespace is
    deleted (not replaced by a space), so "10/10" becomes "1010".

    Args:
        text: Input string.

    Returns:
        List of lowercase whitespace-separated tokens; empty list for
        empty or all-whitespace input.
    """
    # `A-Z`, not `A-z`: the latter spans ASCII 65-122 and therefore also
    # whitelists the punctuation between 'Z' and 'a' ([ \ ] ^ _ `).
    pattern = r'[^a-zA-Z0-9\s]'
    cleaned = re.sub(pattern, '', text)
    return cleaned.lower().split()
#Clean and tokenize every entry of the review column
df['review'] = df['review'].apply(remove_special_characters)

In [7]:

# The NLTK stopwords corpus is looked up by file id, which is the lowercase
# language name. 'English' only resolves on case-insensitive filesystems.
stop_words = set(stopwords.words('english'))
# Shared Porter stemmer instance used by remove_stopwords below.
ps = PorterStemmer()
def remove_stopwords(text):
    """Drop stop words from a token list and Porter-stem the remaining tokens.

    `text` is an iterable of tokens; membership is tested against the
    module-level `stop_words` set and stemming uses the module-level `ps`.
    """
    kept = []
    for word in text:
        if word in stop_words:
            continue
        kept.append(ps.stem(word))
    return kept

# Filter stop words and stem each review's token list.
df['review'] = df['review'].apply(remove_stopwords)

In [8]:
# Vocabulary size cap; also reused as the embedding layer's input_dim.
num_of_words = 10000
tokenizer = Tokenizer(num_words=num_of_words)

In [10]:
# Build the tokenizer vocabulary from the preprocessed review column.
tokenizer.fit_on_texts(df['review'])

In [11]:
# Encode each review as a sequence of integer word indices.
tokens = tokenizer.texts_to_sequences(df['review'])

In [14]:
# Length of every encoded review, as an array for the statistics below.
num_tokens = np.array([len(sequence) for sequence in tokens])

In [15]:
# Sequence length cap: mean length plus two standard deviations, truncated
# to an integer.
max_tokens = int(np.mean(num_tokens) + 2 * np.std(num_tokens))
max_tokens

274

In [16]:
# Fraction of reviews strictly shorter than max_tokens.
np.mean(num_tokens < max_tokens)

0.9452

In [18]:
# Bring every sequence to exactly max_tokens entries, padding at the front.
updated_tokens = pad_sequences(
    tokens,
    maxlen=max_tokens,
    padding='pre',
)

In [19]:
# Hold out 20% of the reviews for testing. The split is seeded so that
# re-running the notebook reproduces the same train/test partition.
X_train, X_test, Y_train, Y_test = train_test_split(
    updated_tokens,
    df.sentiment.values,
    train_size=0.8,
    random_state=42,
)

In [20]:
# Stacked-LSTM classifier: embedding -> three LSTM layers -> one sigmoid unit.
model = keras.Sequential()
model.add(
    keras.layers.Embedding(
        input_dim=num_of_words,
        output_dim=128,
        input_length=max_tokens,
    )
)
# The first two LSTMs return the full sequence so the next LSTM can consume it.
model.add(keras.layers.LSTM(128, dropout=0.2, return_sequences=True))
model.add(keras.layers.LSTM(64, dropout=0.2, return_sequences=True))
# The last LSTM returns a single vector, which feeds the output unit.
model.add(keras.layers.LSTM(32))
model.add(keras.layers.Dense(1, activation='sigmoid'))

In [21]:
# Binary classification setup: Adam, binary cross-entropy, accuracy metric.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [22]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 274, 128)          1280000   
_________________________________________________________________
lstm (LSTM)                  (None, 274, 128)          131584    
_________________________________________________________________
lstm_1 (LSTM)                (None, 274, 64)           49408     
_________________________________________________________________
lstm_2 (LSTM)                (None, 32)                12416     
_________________________________________________________________
dense (Dense)                (None, 1)                 33        
Total params: 1,473,441
Trainable params: 1,473,441
Non-trainable params: 0
_________________________________________________________________


In [23]:
# Checkpoint callback: write to 'bestmodelIMDB.hdf5' only when val_loss
# reaches a new minimum.
mcp_save = ModelCheckpoint(
    'bestmodelIMDB.hdf5',
    monitor='val_loss',
    mode='min',
    save_best_only=True,
)

In [24]:
# Train for 16 epochs, validating on 5% of the training data and
# checkpointing through mcp_save.
batch_size = 64
model.fit(
    X_train,
    Y_train,
    batch_size=batch_size,
    epochs=16,
    validation_split=0.05,
    verbose=True,
    callbacks=[mcp_save],
)

Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16


<tensorflow.python.keras.callbacks.History at 0x217a7a20220>

In [27]:
# Model outputs for the held-out split (the network ends in one sigmoid unit).
predicted = model.predict(X_test)

In [31]:
# Test-set loss and accuracy (the loss and metric configured in compile()).
# NOTE(review): `model` here is the in-memory model after the final epoch of
# fit(), not the best-val_loss checkpoint written to 'bestmodelIMDB.hdf5' --
# confirm whether the checkpoint should be loaded before evaluating.
model.evaluate(X_test,Y_test)



[0.6325933337211609, 0.859000027179718]