In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, Dropout, Flatten

In [2]:
data = pd.read_csv('IMDB Dataset.csv')

In [3]:
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
data.sentiment.value_counts()

negative    25000
positive    25000
Name: sentiment, dtype: int64

In [5]:
data.isnull().any()

review       False
sentiment    False
dtype: bool

In [6]:
#before data cleaning
data['review'][200]

"Interesting and short television movie describes some of the machinations surrounding Jay Leno's replacing Carson as host of the Tonight Show. Film is currently very topical given the public drama surrounding Conan O'Brien and Jay Leno.<br /><br />The film does a good job of sparking viewers' interest in the events and showing some of the concerns of the stakeholders, particularly of the NBC executives. The portrayal of Ovitz was particularly compelling and interesting, I thought.<br /><br />Still, many of the characters were only very briefly limned or touched upon, and some of the acting seemed perfunctory. Nevertheless, an interesting story."

In [7]:
#data cleaning
def preprocessing_text(text):
    #removing html tags
    re_tags = re.compile(r'<[^>]+>')
    new_text = re_tags.sub('', text)
    
    #removing punctuations and numbers
    new_text = re.sub('[^a-zA-Z]', ' ', new_text)
    
    #removing single character 
    new_text = re.sub(r'\s+[a-zA-Z]\s+', ' ', new_text)


    #removing multiple spaces
    new_text = re.sub(r'\s+', ' ', new_text)
    
    return new_text

In [8]:
X = []
texts = list(data['review'])
for text in texts:
    X.append(preprocessing_text(text))

In [9]:
#after data cleaning
X[200]

'Interesting and short television movie describes some of the machinations surrounding Jay Leno replacing Carson as host of the Tonight Show Film is currently very topical given the public drama surrounding Conan Brien and Jay Leno The film does good job of sparking viewers interest in the events and showing some of the concerns of the stakeholders particularly of the NBC executives The portrayal of Ovitz was particularly compelling and interesting thought Still many of the characters were only very briefly limned or touched upon and some of the acting seemed perfunctory Nevertheless an interesting story '

In [10]:
#label encoding for y value
le = LabelEncoder()

In [11]:
y = le.fit_transform(data.sentiment)

In [12]:
#splitting datasets into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=10)

In [13]:
#word-to-index dictionary
tokenizer = Tokenizer(num_words=1500, split=' ')

In [14]:
tokenizer.fit_on_texts(X_train)

In [15]:
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

In [16]:
#maximum length of list
max_len = max([len(x) for x in X_train])
print(max_len)

1763


In [17]:
#minimum lenght of list
min_len = min([len(x) for x in X_train])
print(min_len)

3


In [18]:
vocab_size = len(tokenizer.word_index) // 2

maxlen = 200

In [19]:
#padding data for equal length
X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

In [20]:
embed_dim = 128
lstm_out = 196

model = Sequential()
model.add(Embedding(vocab_size, embed_dim,input_length = maxlen))
model.add(LSTM(lstm_out, dropout=0.2))
model.add(Flatten())
model.add(Dense(1,activation='sigmoid'))
model.compile(loss = 'binary_crossentropy', optimizer='adam',metrics = ['accuracy'])

In [21]:
print(model.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 200, 128)          5899904   
_________________________________________________________________
lstm (LSTM)                  (None, 196)               254800    
_________________________________________________________________
flatten (Flatten)            (None, 196)               0         
_________________________________________________________________
dense (Dense)                (None, 1)                 197       
Total params: 6,154,901
Trainable params: 6,154,901
Non-trainable params: 0
_________________________________________________________________
None


In [22]:
batch_size = 32
model.fit(X_train, y_train,
          validation_data = (X_test,y_test), 
          epochs = 5, 
          batch_size=batch_size)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f42daf16750>