In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam

  if not hasattr(np, "object"):


In [2]:
df = pd.read_csv("IMDB Dataset.csv")
print(df.shape)
print(df.dtypes)
df.head()

(50000, 2)
review       object
sentiment    object
dtype: object


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [3]:
label = LabelEncoder()
df['sentiment']=label.fit_transform(df['sentiment'])

X=df['review'].values
y=df['sentiment'].values
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=42)

In [4]:
# tokenization (converting text to integer sequence by considering the most frequent words to reduce noise and memory)

max_words = 10000  
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>") # out-of-vocabulary words instead of completely ignoring the top 10,000 words
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

#standardizing sequence lengths
#since neural networks require fixed size inputs( Pad shorter sequences and truncate longer ones to same length)
max_len = 200  
X_train_pad = pad_sequences(X_train_seq, maxlen=max_len, padding="post")
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len, padding="post") # post adds 0s at the end while padding



In [None]:
model = Sequential()
model.add(Embedding(input_dim=max_words,output_dim=128,input_length=max_len)) #converting word indices to dense vectors
# 128 as it is word embedding size
model.add(LSTM(units=128,return_sequences=False)) #getting sequential dependencies
model.add(Dropout(0.5)) #prevents over-fitting
model.add(Dense(1, activation="sigmoid")) #output layer for binary classification, sigmoid returns a value between 0 & 1
model.compile(optimizer=Adam(learning_rate=0.001),loss="binary_crossentropy",metrics=["accuracy"]) #binary_crossentropy chosen for binary classification
# standard and stable learning rate

In [None]:
train_model = model.fit(X_train_pad,y_train,epochs=5,batch_size=64, validation_split=0.2) # epochs chosen to avoid over-fitting
# validation set taken to tune hyperparameters and detect over-fitting
y_pred_prob=model.predict(X_test_pad)
y_pred=(y_pred_prob>=0.5).astype(int)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Epoch 1/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m76s[0m 151ms/step - accuracy: 0.9198 - loss: 0.2220 - val_accuracy: 0.8823 - val_loss: 0.3031
Epoch 2/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m75s[0m 150ms/step - accuracy: 0.9422 - loss: 0.1740 - val_accuracy: 0.8794 - val_loss: 0.3322
Epoch 3/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m296s[0m 593ms/step - accuracy: 0.9580 - loss: 0.1332 - val_accuracy: 0.8811 - val_loss: 0.3457
Epoch 4/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m79s[0m 157ms/step - accuracy: 0.9704 - loss: 0.1040 - val_accuracy: 0.8810 - val_loss: 0.3824
Epoch 5/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m189s[0m 379ms/step - accuracy: 0.9788 - loss: 0.0792 - val_accuracy: 0.8781 - val_loss: 0.4238
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 38ms/step
Accuracy: 0.8758
              precision    recall  f1-score   support

           0       0.89      

In [10]:
# consistent and unbiased predictions concluded from the classification report
model.summary

<bound method Model.summary of <Sequential name=sequential_1, built=True>>