In [29]:
!pip show tensorflow

Name: tensorflow
Version: 2.18.0
Summary: TensorFlow is an open source machine learning framework for everyone.
Home-page: https://www.tensorflow.org/
Author: Google Inc.
Author-email: packages@tensorflow.org
License: Apache 2.0
Location: C:\Users\praka\AppData\Local\Programs\Python\Python312\Lib\site-packages
Requires: tensorflow-intel
Required-by: 


In [30]:
import pandas as pd
import pickle
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

In [31]:
X_train_text, X_test_text, y_train, y_test, vectorizer = pd.read_pickle("../datasets/processed_data.pkl")

In [32]:
log_model = LogisticRegression(max_iter=500)
log_model.fit(X_train_text, y_train)
log_y_pred = log_model.predict(X_test_text)
log_accuracy = accuracy_score(y_test, log_y_pred)
print(f"Logistic Regression Accuracy: {log_accuracy * 100:.2f}%")
print("Logistic Regression Report:\n", classification_report(y_test, log_y_pred))

Logistic Regression Accuracy: 89.70%
Logistic Regression Report:
               precision    recall  f1-score   support

           0       0.89      0.90      0.90      4071
           1       0.90      0.89      0.90      4016

    accuracy                           0.90      8087
   macro avg       0.90      0.90      0.90      8087
weighted avg       0.90      0.90      0.90      8087



In [33]:
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_text, y_train)
rf_y_pred = rf_model.predict(X_test_text)
rf_accuracy = accuracy_score(y_test, rf_y_pred)
print(f"Random Forest Accuracy: {rf_accuracy * 100:.2f}%")
print("Random Forest Report:\n", classification_report(y_test, rf_y_pred))

Random Forest Accuracy: 88.35%
Random Forest Report:
               precision    recall  f1-score   support

           0       0.89      0.88      0.88      4071
           1       0.88      0.89      0.88      4016

    accuracy                           0.88      8087
   macro avg       0.88      0.88      0.88      8087
weighted avg       0.88      0.88      0.88      8087



In [34]:
nb_model = MultinomialNB()
nb_model.fit(X_train_text, y_train)
nb_y_pred = nb_model.predict(X_test_text)
nb_accuracy = accuracy_score(y_test, nb_y_pred)
print(f"Naïve Bayes Accuracy: {nb_accuracy * 100:.2f}%")
print("Naïve Bayes Report:\n", classification_report(y_test, nb_y_pred))

Naïve Bayes Accuracy: 86.97%
Naïve Bayes Report:
               precision    recall  f1-score   support

           0       0.89      0.84      0.87      4071
           1       0.85      0.90      0.87      4016

    accuracy                           0.87      8087
   macro avg       0.87      0.87      0.87      8087
weighted avg       0.87      0.87      0.87      8087



In [35]:
MAX_NUM_WORDS = 5000
MAX_SEQUENCE_LENGTH = 200

In [37]:
from scipy.sparse import issparse

if issparse(X_train_text):
    X_train_text = X_train_text.toarray()


In [39]:
# Convert sparse matrix to text
X_train_text = vectorizer.inverse_transform(X_train_text)
X_train_text = [" ".join(words) for words in X_train_text]  # Convert list of word arrays into sentences

X_test_text = vectorizer.inverse_transform(X_test_text)
X_test_text = [" ".join(words) for words in X_test_text]  # Same for test data

# Tokenization
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(X_train_text)  

# Convert text to sequences
X_train_seq = tokenizer.texts_to_sequences(X_train_text)  
X_test_seq = tokenizer.texts_to_sequences(X_test_text)  

# Pad sequences
X_train_pad = pad_sequences(X_train_seq, maxlen=MAX_SEQUENCE_LENGTH)
X_test_pad = pad_sequences(X_test_seq, maxlen=MAX_SEQUENCE_LENGTH)


In [41]:
lstm_model = Sequential([
    Embedding(MAX_NUM_WORDS, 128, input_length=MAX_SEQUENCE_LENGTH),
    LSTM(64, return_sequences=True),
    Dropout(0.3),
    LSTM(32),
    Dropout(0.3),
    Dense(1, activation='sigmoid')
])

In [42]:
lstm_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [43]:
lstm_model.fit(X_train_pad, np.array(y_train), epochs=5, batch_size=32, validation_data=(X_test_pad, np.array(y_test)))

Epoch 1/5
[1m1011/1011[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m126s[0m 122ms/step - accuracy: 0.8078 - loss: 0.3881 - val_accuracy: 0.8885 - val_loss: 0.2639
Epoch 2/5
[1m1011/1011[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 140ms/step - accuracy: 0.9225 - loss: 0.1909 - val_accuracy: 0.8924 - val_loss: 0.2567
Epoch 3/5
[1m1011/1011[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m192s[0m 190ms/step - accuracy: 0.9376 - loss: 0.1525 - val_accuracy: 0.8841 - val_loss: 0.2768
Epoch 4/5
[1m1011/1011[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m187s[0m 185ms/step - accuracy: 0.9506 - loss: 0.1254 - val_accuracy: 0.8784 - val_loss: 0.3112
Epoch 5/5
[1m1011/1011[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m176s[0m 174ms/step - accuracy: 0.9588 - loss: 0.1052 - val_accuracy: 0.8692 - val_loss: 0.3525


<keras.src.callbacks.history.History at 0x22f0b820620>

In [45]:
lstm_loss, lstm_accuracy = lstm_model.evaluate(X_test_pad, np.array(y_test))
print(f"LSTM Model Accuracy: {lstm_accuracy * 100:.2f}%")

[1m253/253[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 48ms/step - accuracy: 0.8733 - loss: 0.3486
LSTM Model Accuracy: 86.92%


In [46]:
print(f"Vocabulary Size: {len(tokenizer.word_index)}")
X_test_pad = pad_sequences(X_test_seq, maxlen=X_train_pad.shape[1])

Vocabulary Size: 5005


In [47]:
X_test_pad = vectorizer.transform(X_test_text)

In [52]:
accuracy = best_model.score(X_test_pad, y_test)
print(f"Test Accuracy: {accuracy * 100:.2f}%")

Test Accuracy: 86.50%


In [54]:
from sklearn.metrics import classification_report

y_pred = best_model.predict(X_test_pad)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.82      0.94      0.88      4071
           1       0.93      0.79      0.85      4016

    accuracy                           0.86      8087
   macro avg       0.87      0.86      0.86      8087
weighted avg       0.87      0.86      0.86      8087



In [55]:
import pandas as pd

X_test_text_reset = pd.Series(X_test_text).reset_index(drop=True)


In [57]:
# Find misclassified indices
misclassified_idx = np.where(y_test != y_pred)[0]

# Reset index of y_test
y_test_reset = y_test.reset_index(drop=True)

# Print misclassified examples
for idx in misclassified_idx[:5]:  
    print(f"Actual: {y_test_reset[idx]}, Predicted: {y_pred[idx]}")  
    print(f"Review: {X_test_text[idx]}")  # No need for .reset_index()
    print("-" * 80)

Actual: 1, Predicted: 0
Review: would with will when way was wants wait very two to this the strong story so she series see romance review relationship recommend received real reading read plot pace out on of next my mother more meets man main loved love looking look know it is in how honest him her happy happily handsome great good get fun from free forward for flowed first find ever escape doesn do development definitely copy comes characters character can by but book between be author at anything anyone and an after
--------------------------------------------------------------------------------
Actual: 1, Predicted: 0
Review: your yet year with will white use trailer to this then the so see provide problem over or option on off of not no meant long light last it issues is in have had goes for feet enough does dimmable color changing change camping bulb bright blue any and again
--------------------------------------------------------------------------------
Actual: 1, Predicted: 0


In [58]:
best_model = max(
    [(log_model, log_accuracy), (rf_model, rf_accuracy), (nb_model, nb_accuracy)], key=lambda x: x[1]
)[0]

In [59]:
if lstm_accuracy > max(log_accuracy, rf_accuracy, nb_accuracy):
    lstm_model.save("../models/lstm_model.h5")
    best_model = "LSTM Model"
else:
    with open("../models/model.pkl", "wb") as model_file:
        pickle.dump(best_model, model_file)
    with open("../models/vectorizer.pkl", "wb") as vectorizer_file:
        pickle.dump(vectorizer, vectorizer_file)

In [60]:
print(f"Best model selected: {best_model}")
print("Model training complete.")

Best model selected: LogisticRegression(max_iter=500)
Model training complete.
