In [1]:
import pandas as pd
import re
import nltk
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from tensorflow.keras.optimizers import Adam

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense,GlobalMaxPooling1D,LSTM
from tensorflow.keras.models import Sequential

In [2]:
# Load the review dataset; later cells read its `review` (text) and
# `sentiment` (label) columns.
df= pd.read_csv("IMDB_Dataset.csv")
df.head()

Unnamed: 0,review,sentiment
0,Once again Mr. Costner has dragged out a movie...,0
1,This is an example of why the majority of acti...,0
2,"First of all I hate those moronic rappers, who...",0
3,Not even the Beatles could write songs everyon...,0
4,Brass pictures (movies is not a fitting word f...,0


In [3]:
# (rows, columns) of the loaded frame.
df.shape

(50000, 2)

In [4]:
## Cleaning up the text
# The stopword corpus is not bundled with NLTK. Fetch it once if it is missing
# so this cell also runs on a fresh environment instead of raising LookupError.
try:
    stopWords = set(nltk.corpus.stopwords.words("english"))
except LookupError:
    nltk.download("stopwords", quiet=True)
    stopWords = set(nltk.corpus.stopwords.words("english"))


def CleanText(text):
    """Normalise one review for tokenisation.

    The text is lower-cased, every character outside ``a-z`` is replaced by a
    space (so digits and punctuation act as word separators), and words found
    in the English stopword set are dropped.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The remaining words joined by single spaces.
    """
    text = text.lower()
    text = re.sub("[^a-z]", " ", text)
    words = [word for word in text.split() if word not in stopWords]
    return " ".join(words)


# Apply column-wise: only the review text is needed, so there is no reason to
# materialise a full row object per record with DataFrame.apply(axis=1).
df["review"] = df["review"].apply(CleanText)
Data = df.review
labels = df.sentiment

In [5]:
### Convert the 10,000 most frequent words to integer ids
frequentWord = 10000
tokenizer = Tokenizer(num_words=frequentWord)
tokenizer.fit_on_texts(Data)

# One list of word ids per review, wrapped in a Series so the pandas helpers
# used in the following cells are available.
encoded_reviews = tokenizer.texts_to_sequences(Data)
IntegerizedData = pd.Series(encoded_reviews)
IntegerizedData.head()

0    [315, 6873, 3237, 2, 117, 996, 1569, 1033, 117...
1    [342, 2080, 100, 30, 3711, 232, 11, 66, 165, 5...
2    [20, 595, 4986, 24, 8579, 369, 844, 7247, 2536...
3    [8, 5767, 24, 749, 582, 180, 304, 147, 2329, 2...
4    [8836, 1155, 25, 3325, 521, 11, 535, 9224, 920...
dtype: object

In [6]:
### Sanity check: the integer sequences can be converted back to text.
### Only the first review is decoded here.
tokenizer.sequences_to_texts(IntegerizedData[0:1])

['mr costner dragged movie far longer necessary aside terrific sea rescue sequences care characters us ghosts closet costner character realized early forgotten much later time care character really care cocky ashton kutcher problem comes kid thinks better anyone else around shows signs closet appears winning costner finally well past half way point stinker costner tells us kutcher ghosts told kutcher driven best prior magic could keep turning hour']

In [7]:
# Number of token ids per review; the summary statistics guide the choice of
# padding length in the next step.
DataLength = IntegerizedData.map(len)
DataLength.describe()

count    50000.000000
mean       110.157100
std         81.213625
min          3.000000
25%         60.000000
50%         83.000000
75%        135.000000
max       1104.000000
dtype: float64

In [8]:
# Zero padding: bring every review to exactly 256 token ids.
# padding/truncating are written out at their Keras defaults ("pre"), i.e.
# short reviews receive leading zeros and long reviews keep their last 256 ids.
IntegerizedData = pad_sequences(
    IntegerizedData,
    maxlen=256,
    padding="pre",
    truncating="pre",
)


In [9]:
# Inspect the padded sequences (now a NumPy array rather than a Series).
IntegerizedData

array([[   0,    0,    0, ...,  269, 1456,  397],
       [   0,    0,    0, ...,  363,    3,   80],
       [   0,    0,    0, ...,   54,   57,   76],
       ...,
       [   0,    0,    0, ...,  973,   99, 9907],
       [   0,    0,    0, ..., 8508, 3506,   13],
       [   0,    0,    0, ...,   11, 2263,   23]])

In [10]:
# Hold out 20% of the data for testing, then take 10% of the remainder for
# validation. Both splits are stratified on the label and use a fixed seed.
x_train_val, x_test, y_train_val, y_test = train_test_split(
    IntegerizedData,
    labels,
    test_size=0.20,
    stratify=labels,
    random_state=42,
)
x_train, x_val, y_train, y_val = train_test_split(
    x_train_val,
    y_train_val,
    test_size=0.10,
    stratify=y_train_val,
    random_state=42,
)

In [11]:
# Confirm the container type of the split features.
type(x_val)

numpy.ndarray

In [12]:
# Label counts for the train / validation / test splits.
print(y_train.shape,y_val.shape,y_test.shape)

(36000,) (4000,) (10000,)


In [13]:
# Feature shapes for the train / validation / test splits.
print(x_train.shape,x_val.shape,x_test.shape)

(36000, 256) (4000, 256) (10000, 256)


a) Use an LSTM with hidden_dimension=64, followed by a one-neuron fully connected (FC) layer
with a sigmoid activation.

In [14]:
# Part (a): Embedding -> LSTM(64) -> Dense(1, sigmoid).
# Seed TensorFlow before any layer is created so weight initialisation is
# repeatable across reruns (same seed value as the train/test split).
tf.random.set_seed(42)

model = Sequential()

# Add an embedding layer. Its vocabulary size is tied to the tokenizer's
# num_words (frequentWord) rather than repeating the literal.
model.add(Embedding(input_dim=frequentWord, output_dim=64))
model.add(LSTM(units=64))
model.add(Dense(units=1, activation='sigmoid'))

# This model uses an explicitly lowered learning rate for Adam.
optimizer = Adam(learning_rate=0.0001)
model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

# Print the summary of the model architecture
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 64)          640000    
                                                                 
 lstm (LSTM)                 (None, 64)                33024     
                                                                 
 dense (Dense)               (None, 1)                 65        
                                                                 
Total params: 673,089
Trainable params: 673,089
Non-trainable params: 0
_________________________________________________________________


In [15]:
# Fit the first LSTM; the returned History object feeds the plots below.
pred_history = model.fit(
    x_train,
    y_train,
    validation_data=(x_val, y_val),
    epochs=10,
    batch_size=64,
)

Epoch 1/10
Epoch 2/10

KeyboardInterrupt: 

In [None]:
# evaluate() yields [loss, accuracy] here because the model was compiled with
# metrics=['accuracy']; index 1 is therefore the accuracy.
test_accuracy = model.evaluate(x_test, y_test, verbose=0)
test_accuracy_percent = test_accuracy[1] * 100.0
print('Test Accuracy: %.2f' % test_accuracy_percent)

In [None]:
# Same evaluation on the validation split; index 1 of the result is accuracy.
val_accuracy = model.evaluate(x_val, y_val, verbose=0)
val_accuracy_percent = val_accuracy[1] * 100.0
print('Val Accuracy: %.2f' % val_accuracy_percent)

In [None]:
#Plot Train vs Validation Accuracy
def TrainVsVal_acc(pred_history):
    """Plot per-epoch training and validation accuracy.

    Parameters
    ----------
    pred_history : object
        Anything exposing a ``history`` dict with "accuracy" and
        "val_accuracy" sequences, such as the value returned by ``model.fit``.
    """
    train_acc = pred_history.history["accuracy"]
    val_acc = pred_history.history["val_accuracy"]

    plt.figure(figsize=(4, 6))
    # Derive the x-axis from what was actually recorded. A fixed range of 20
    # does not match a 10-epoch (or interrupted) run and makes plot() fail.
    plt.plot(np.arange(len(train_acc)), train_acc, label="train_acc")
    plt.plot(np.arange(len(val_acc)), val_acc, label="val_acc")
    plt.title("Training Accuracy vs Validation Accuracy")
    plt.xlabel("Number of Epochs")
    plt.ylabel("Accuracy")
    plt.legend()
    plt.show()

In [None]:
#Plot Train vs Validation Loss
def trainVsVal_Loss(pred_history):
    """Plot per-epoch training and validation loss.

    Parameters
    ----------
    pred_history : object
        Anything exposing a ``history`` dict with "loss" and "val_loss"
        sequences, such as the value returned by ``model.fit``.
    """
    train_loss = pred_history.history["loss"]
    val_loss = pred_history.history["val_loss"]

    plt.figure(figsize=(4, 6))
    # Derive the x-axis from what was actually recorded. A fixed range of 20
    # does not match a 10-epoch (or interrupted) run and makes plot() fail.
    plt.plot(np.arange(len(train_loss)), train_loss, label="train_loss")
    plt.plot(np.arange(len(val_loss)), val_loss, label="val_loss")
    plt.title("Training Loss vs Validation Loss")
    plt.xlabel("Number of Epochs")
    plt.ylabel("Loss")
    plt.legend()
    plt.show()

In [None]:
# Draw the accuracy and loss curves for the first LSTM from its fit history.
print(" Train vs Val Accuracy for LSTM")
TrainVsVal_acc(pred_history)
print(" Train vs Val loss for LSTM")
trainVsVal_Loss(pred_history)

b) Use an LSTM with hidden_dimension=64, followed by GlobalMaxPooling1D, followed by an FC layer
with 16 neurons and ReLU activation, followed by an FC layer with a single output and a sigmoid activation.

In [None]:
# Part (b): Embedding -> LSTM(64, full sequence) -> GlobalMaxPooling1D
#           -> Dense(16, relu) -> Dense(1, sigmoid).
# Seed TensorFlow before any layer is created so weight initialisation is
# repeatable across reruns (same seed value as the train/test split).
tf.random.set_seed(42)

model_LSTM2 = Sequential()
# Vocabulary size is tied to the tokenizer's num_words (frequentWord).
model_LSTM2.add(Embedding(input_dim=frequentWord, output_dim=64))
# return_sequences=True keeps the output of every time step so the pooling
# layer has a time axis to reduce over.
model_LSTM2.add(LSTM(units=64, return_sequences=True))
model_LSTM2.add(GlobalMaxPooling1D())
model_LSTM2.add(Dense(units=16, activation='relu'))
model_LSTM2.add(Dense(units=1, activation='sigmoid'))

# Compile the model with binary cross-entropy loss and Adam optimizer
model_LSTM2.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Print the summary of the model architecture
model_LSTM2.summary()


In [None]:
# Fit the pooled LSTM; the returned History object feeds the plots below.
pred_history2 = model_LSTM2.fit(
    x_train,
    y_train,
    validation_data=(x_val, y_val),
    epochs=10,
    batch_size=64,
)

In [None]:
# evaluate() yields [loss, accuracy] here because the model was compiled with
# metrics=['accuracy']; index 1 is therefore the accuracy.
test_accuracy = model_LSTM2.evaluate(x_test, y_test, verbose=0)
test_accuracy_percent = test_accuracy[1] * 100.0
print('Test Accuracy: %.2f' % test_accuracy_percent)

In [None]:
# Same evaluation on the validation split; index 1 of the result is accuracy.
val_accuracy = model_LSTM2.evaluate(x_val, y_val, verbose=0)
val_accuracy_percent = val_accuracy[1] * 100.0
print('Val Accuracy: %.2f' % val_accuracy_percent)

In [None]:
# Draw the accuracy and loss curves for the pooled LSTM from its fit history.
print(" Train vs Val Accuracy for LSTM2")
TrainVsVal_acc(pred_history2)
print(" Train vs Val loss for LSTM2")
trainVsVal_Loss(pred_history2)

In [None]:

# Third variant: same as model_LSTM2 but with two stacked LSTM layers before
# the pooling layer.
# Seed TensorFlow before any layer is created so weight initialisation is
# repeatable across reruns (same seed value as the train/test split).
tf.random.set_seed(42)

model_LSTM3 = Sequential()
# Vocabulary size is tied to the tokenizer's num_words (frequentWord).
model_LSTM3.add(Embedding(input_dim=frequentWord, output_dim=64))
# Both LSTMs return the full sequence: the first feeds the second, and the
# second feeds the pooling layer, which reduces over the time axis.
model_LSTM3.add(LSTM(units=64, return_sequences=True))
model_LSTM3.add(LSTM(units=64, return_sequences=True))
model_LSTM3.add(GlobalMaxPooling1D())
model_LSTM3.add(Dense(units=16, activation='relu'))
model_LSTM3.add(Dense(units=1, activation='sigmoid'))

# Compile the model with binary cross-entropy loss and Adam optimizer
model_LSTM3.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Print the summary of the model architecture
model_LSTM3.summary()


In [None]:
# Fit the stacked LSTM; the returned History object feeds the plots below.
pred_history3 = model_LSTM3.fit(
    x_train,
    y_train,
    validation_data=(x_val, y_val),
    epochs=10,
    batch_size=64,
)

In [None]:
# evaluate() yields [loss, accuracy] here because the model was compiled with
# metrics=['accuracy']; index 1 is therefore the accuracy.
test_accuracy = model_LSTM3.evaluate(x_test, y_test, verbose=0)
test_accuracy_percent = test_accuracy[1] * 100.0
print('Test Accuracy: %.2f' % test_accuracy_percent)

In [None]:
# Same evaluation on the validation split; index 1 of the result is accuracy.
val_accuracy = model_LSTM3.evaluate(x_val, y_val, verbose=0)
val_accuracy_percent = val_accuracy[1] * 100.0
print('Val Accuracy: %.2f' % val_accuracy_percent)

In [None]:
# Draw the accuracy and loss curves for the stacked LSTM from its fit history.
print(" Train vs Val Accuracy for LSTM3")
TrainVsVal_acc(pred_history3)
print(" Train vs Val loss for LSTM3")
trainVsVal_Loss(pred_history3)