In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, f1_score
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
import os

In [2]:
# Load the IMDB reviews; the code below reads a 'review' text column and a
# 'sentiment' column from the CSV.
df = pd.read_csv('Downloads/IMDB Dataset.csv')
X = df['review']
# Encode the labels as 1 (positive) / 0 (negative).
y = df['sentiment'].map({'positive': 1, 'negative': 0})
# Series.map turns any value missing from the mapping into NaN. Fail here with
# a clear message rather than letting NaN labels reach the models.
if y.isna().any():
    unexpected = sorted(df.loc[y.isna(), 'sentiment'].astype(str).unique())
    raise ValueError(f"Unexpected sentiment labels: {unexpected}")

In [3]:
# Hold out 20% of the reviews for testing; the fixed random_state keeps the
# split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [4]:
# TF-IDF features for the classical models (Naive Bayes, Logistic Regression)
tfidf = TfidfVectorizer(
    stop_words='english',  # drop English stopwords
    max_features=5000,     # cap on the vocabulary size
)
# The vectorizer is fitted on the training reviews only; the test reviews are
# only transformed with it.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

In [5]:
# Integer-sequence inputs for the LSTM
max_words = 5000  # vocabulary cap passed to the tokenizer (and the embedding)
max_len = 100     # every review is padded/truncated to this many tokens

# Build the word index from the training reviews only.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)

# Reviews -> lists of word ids (train first, then test).
X_train_seq, X_test_seq = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)

# Word-id lists -> fixed-length arrays of length max_len.
X_train_pad, X_test_pad = (
    pad_sequences(seqs, maxlen=max_len) for seqs in (X_train_seq, X_test_seq)
)

In [6]:
# Multinomial Naive Bayes on the TF-IDF features
# (fit() returns the estimator itself, so construction and fitting can be chained)
nb_model = MultinomialNB().fit(X_train_tfidf, y_train)
nb_pred = nb_model.predict(X_test_tfidf)

In [7]:
# Logistic Regression on the TF-IDF features; max_iter=1000 is the solver's
# iteration limit.
# (fit() returns the estimator itself, so construction and fitting can be chained)
lr_model = LogisticRegression(max_iter=1000).fit(X_train_tfidf, y_train)
lr_pred = lr_model.predict(X_test_tfidf)

In [8]:
# LSTM Model
# Seed Python's `random`, NumPy and TensorFlow in one call so that weight
# initialisation and the batch shuffling during training are repeatable;
# without it the reported LSTM metrics change on every run.
tf.keras.utils.set_random_seed(42)

lstm_model = Sequential([
    Embedding(max_words, 32),        # word id -> 32-dim vector (no input_length given)
    LSTM(100),                       # 100 recurrent units
    Dense(1, activation='sigmoid'),  # single sigmoid output for the binary label
])
lstm_model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [9]:
import time

In [11]:
# TensorBoard logging: every run writes to its own timestamped folder under logs/fit/
log_dir = f"logs/fit/{time.strftime('%Y%m%d-%H%M%S')}"
tensorboard_callback = tf.keras.callbacks.TensorBoard(
    histogram_freq=1,
    log_dir=log_dir,
)

# Fit the LSTM for 5 epochs with a batch size of 32.
# NOTE(review): the held-out test split is passed as validation_data, so the
# val_* values printed per epoch are test-set numbers.
history = lstm_model.fit(
    x=X_train_pad,
    y=y_train,
    batch_size=32,
    epochs=5,
    validation_data=(X_test_pad, y_test),
    callbacks=[tensorboard_callback],
)

Epoch 1/5
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 22ms/step - accuracy: 0.7473 - loss: 0.4859 - val_accuracy: 0.8505 - val_loss: 0.3500
Epoch 2/5
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 22ms/step - accuracy: 0.8796 - loss: 0.2889 - val_accuracy: 0.8665 - val_loss: 0.3123
Epoch 3/5
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 22ms/step - accuracy: 0.9012 - loss: 0.2436 - val_accuracy: 0.8702 - val_loss: 0.3069
Epoch 4/5
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 22ms/step - accuracy: 0.9191 - loss: 0.2051 - val_accuracy: 0.8685 - val_loss: 0.3133
Epoch 5/5
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 22ms/step - accuracy: 0.9350 - loss: 0.1680 - val_accuracy: 0.8672 - val_loss: 0.3269


In [14]:
# Evaluate Classical Models
def _binary_metrics(y_true, y_pred):
    """Return precision, recall and F1 for one set of binary predictions.

    Args:
        y_true: Ground-truth 0/1 labels.
        y_pred: Predicted 0/1 labels, aligned with ``y_true``.

    Returns:
        dict with the keys ``'precision'``, ``'recall'`` and ``'f1'``, each
        computed by the matching scikit-learn scorer with its default
        arguments.
    """
    return {
        'precision': precision_score(y_true, y_pred),
        'recall': recall_score(y_true, y_pred),
        'f1': f1_score(y_true, y_pred),
    }


# Same three scores for both classical models, computed on the test split.
nb_metrics = _binary_metrics(y_test, nb_pred)
lr_metrics = _binary_metrics(y_test, lr_pred)

In [15]:
# Evaluate LSTM
# The sigmoid outputs are thresholded at 0.5 to obtain hard 0/1 predictions.
lstm_pred = (lstm_model.predict(X_test_pad) > 0.5).astype(int)

# Precision, recall and F1 on the test split, in that order.
lstm_metrics = {
    label: scorer(y_test, lstm_pred)
    for label, scorer in (
        ('precision', precision_score),
        ('recall', recall_score),
        ('f1', f1_score),
    )
}

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step


In [16]:
# Show the three metric dicts, one model per line.
print(
    f"Naive Bayes Metrics: {nb_metrics}",
    f"Logistic Regression Metrics: {lr_metrics}",
    f"LSTM Metrics: {lstm_metrics}",
    sep="\n",
)

Naive Bayes Metrics: {'precision': 0.8513460015835312, 'recall': 0.8535423695177614, 'f1': 0.8524427707858487}
Logistic Regression Metrics: {'precision': 0.8780065422359054, 'recall': 0.9055368128596943, 'f1': 0.891559202813599}
LSTM Metrics: {'precision': 0.8488437676254935, 'recall': 0.8960111133161341, 'f1': 0.8717899208341379}


In [17]:
# Save results
with open('results.txt', 'w') as f:
    f.write(f"Naive Bayes Metrics: {nb_metrics}\n")
    f.write(f"Logistic Regression Metrics: {lr_metrics}\n")
    f.write(f"LSTM Metrics: {lstm_metrics}\n")

In [18]:
# Load the TensorBoard notebook extension, then open TensorBoard on logs/fit,
# the parent folder of the timestamped run directory created before training.
%load_ext tensorboard
%tensorboard --logdir logs/fit