In [None]:
import re
import math
import nltk
import pandas as pd
import numpy as np
import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

In [None]:
# Read the IMDB review dataset from the working directory and preview the first rows
df = pd.read_csv(filepath_or_buffer='IMDB_Dataset.csv')
df.head(n=5)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [None]:
#Fetch the NLTK stopword corpus, then build the English stopword lookup set
nltk.download('stopwords')
from nltk.corpus import stopwords

# A set gives constant-time membership checks when filtering tokens
stop_words = {*stopwords.words('english')}

def remove_stopwords(input_text, stop_words):
    """Return ``input_text`` with every stopword token removed.

    The text is split on whitespace and each token is looked up in
    ``stop_words`` case-insensitively, so capitalised stopwords such as
    "The" or "I" are dropped as well. Tokens that survive keep their
    original casing and are re-joined with single spaces.

    Args:
        input_text: The string to filter.
        stop_words: Collection of stopwords; entries are expected to be
            lower-case, since tokens are lower-cased before the lookup.

    Returns:
        The filtered text as a single space-separated string ("" when
        nothing survives).
    """
    # Lower-case only for the membership test; the emitted token is unchanged.
    filtered_words = [
        word for word in input_text.split()
        if word.lower() not in stop_words
    ]
    return ' '.join(filtered_words)

# Filter every review through remove_stopwords; stop_words is forwarded as the second argument
df['review'] = df['review'].apply(remove_stopwords, args=(stop_words,))
df.head()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,review,sentiment
0,One reviewers mentioned watching 1 Oz episode ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought wonderful way spend time hot summer ...,positive
3,Basically there's family little boy (Jake) thi...,negative
4,"Petter Mattei's ""Love Time Money"" visually stu...",positive


In [None]:
#Tokenize and lemmatize dataframe reviews
# Make sure the WordNet corpus is available before any lemmatization happens
nltk.download('wordnet')

# Shared helpers used by lemmatize_text: a WordNet lemmatizer and a whitespace tokenizer
lemmatizer = nltk.stem.WordNetLemmatizer()
word_tokenizer = nltk.tokenize.WhitespaceTokenizer()

def lemmatize_text(text):
    """Lemmatize every whitespace-separated token of ``text``.

    Tokens are produced by the module-level ``word_tokenizer`` and each one
    is passed through the module-level ``lemmatizer``. The lemmatized tokens
    are joined with single spaces, without a trailing space.

    Args:
        text: The string to lemmatize.

    Returns:
        The lemmatized text ("" when ``text`` contains no tokens).
    """
    # str.join builds the result in one pass instead of repeated concatenation
    return ' '.join(
        lemmatizer.lemmatize(word) for word in word_tokenizer.tokenize(text)
    )

# Replace each review with its lemmatized form, element by element
df['review'] = df['review'].map(lemmatize_text)
df.head()

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,review,sentiment
0,One reviewer mentioned watching 1 Oz episode h...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought wonderful way spend time hot summer ...,positive
3,Basically there's family little boy (Jake) thi...,negative
4,"Petter Mattei's ""Love Time Money"" visually stu...",positive


In [None]:
# Pull the review texts and their sentiment strings out of the frame as arrays
reviews = df.review.values
labels = df.sentiment.values

# Fit the encoder on the sentiment strings, then convert them to encoded labels
encoder = LabelEncoder().fit(labels)
encoded_labels = encoder.transform(labels)

In [None]:
#Perform train-test split, ensuring that data distribution of labels is maintained
# random_state pins the shuffle so the same split (and therefore comparable
# metrics) is produced on every Restart & Run All.
train_inputs, test_inputs, train_labels, test_labels = train_test_split(
    reviews,
    encoded_labels,
    test_size = 0.2,
    stratify = encoded_labels,
    random_state = 42,
)

In [None]:
#Define hyperparameters
vocab_size = 3000        # passed to Tokenizer as num_words and to Embedding as its input size
embedding_dim = 100      # width of each embedding vector
max_length = 300         # every sequence is padded/truncated to this many tokens
padding_type = 'post'    # where pad_sequences adds padding
truncation = 'post'      # where pad_sequences cuts over-long sequences
# NOTE(review): the OOV token is an empty string; confirm this is intended
# (a visible marker such as '<OOV>' may have been meant).
oov = '' #out of vocabulary tokens
no_epochs = 5

#Tokenize train dataset
# The vocabulary is fitted on the training texts only; test texts are merely
# converted with the already-fitted tokenizer.
tokenizer = Tokenizer(num_words = vocab_size, oov_token = oov)
tokenizer.fit_on_texts(train_inputs)
word_index = tokenizer.word_index

# Use the declared padding/truncation hyperparameters rather than hard-coded
# values, so that changing them above actually changes the preprocessing.
train_sentences = tokenizer.texts_to_sequences(train_inputs)
train_padded = pad_sequences(train_sentences, padding = padding_type, truncating = truncation, maxlen = max_length)

test_sentences = tokenizer.texts_to_sequences(test_inputs)
test_padded = pad_sequences(test_sentences, padding = padding_type, truncating = truncation, maxlen = max_length)

In [None]:
#Model initialization
# Layers are stacked one at a time: embedding -> bidirectional LSTM -> dense ReLU -> sigmoid output
# NOTE(review): the Embedding is created without mask_zero, so padded positions
# are presumably fed to the LSTM like ordinary tokens; confirm this is intended.
model = keras.Sequential()
model.add(keras.layers.Embedding(vocab_size, embedding_dim, input_length = max_length))
model.add(keras.layers.Bidirectional(keras.layers.LSTM(64)))
model.add(keras.layers.Dense(24, activation = 'relu'))
model.add(keras.layers.Dense(1, activation = 'sigmoid'))

# Single sigmoid output, so binary cross-entropy is the loss; accuracy is tracked during training
model.compile(
    optimizer = 'adam',
    loss = 'binary_crossentropy',
    metrics = ['accuracy'],
)

#Model summary
model.summary()

Model: "sequential_12"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_12 (Embedding)    (None, 300, 100)          300000    
                                                                 
 bidirectional_12 (Bidirect  (None, 128)               84480     
 ional)                                                          
                                                                 
 dense_24 (Dense)            (None, 24)                3096      
                                                                 
 dense_25 (Dense)            (None, 1)                 25        
                                                                 
Total params: 387601 (1.48 MB)
Trainable params: 387601 (1.48 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
# Train for no_epochs epochs, with validation_split set to 0.15 of the training data
model.fit(
    x = train_padded,
    y = train_labels,
    validation_split = 0.15,
    epochs = no_epochs,
    verbose = 1,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x793f173f1e70>

In [None]:
#Test model performance on test dataset
predictions = model.predict(test_padded)

# Probabilities at or above the threshold are assigned class 1, the rest class 0.
threshold = 0.5

# Vectorised thresholding: flatten the prediction array and compare it in one
# step instead of looping over one-element rows. The result is kept as a plain
# list of Python ints.
predicted_labels = (np.asarray(predictions).ravel() >= threshold).astype(int).tolist()

# np.unique returns the distinct labels in sorted order, so the per-label
# report is always printed in the same order.
unique_labels = np.unique(test_labels)

true_array = np.asarray(test_labels)
predicted_array = np.asarray(predicted_labels)

#Per-label metrics: treat each label in turn as the positive class (one-vs-rest)
for label in unique_labels:
    true_labels_label = (true_array == label).astype(int)
    predicted_labels_label = (predicted_array == label).astype(int)

    accuracy_label = accuracy_score(true_labels_label, predicted_labels_label)
    precision_label = precision_score(true_labels_label, predicted_labels_label)
    recall_label = recall_score(true_labels_label, predicted_labels_label)
    f1_label = f1_score(true_labels_label, predicted_labels_label)

    print(f"\nLabel {label}:")
    print(f"  Accuracy: {accuracy_label:.4f}")
    print(f"  Precision: {precision_label:.4f}")
    print(f"  Recall: {recall_label:.4f}")
    print(f"  F1 Score: {f1_label:.4f}")

#Overall accuracy
accuracy = accuracy_score(test_labels, predicted_labels)
print(f"\nOverall Accuracy: {accuracy}")

#Generate confusion matrix
conf_matrix = confusion_matrix(test_labels, predicted_labels)
print("\nConfusion Matrix:")
print(conf_matrix)


Label 0:
  Accuracy: 0.8797
  Precision: 0.8662
  Recall: 0.8982
  F1 Score: 0.8819

Label 1:
  Accuracy: 0.8797
  Precision: 0.8943
  Recall: 0.8612
  F1 Score: 0.8774

Overall Accuracy: 0.8797

Confusion Matrix:
[[4491  509]
 [ 694 4306]]
