In [1]:
# Step 1: Install and Import
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout
from tensorflow.keras.models import Sequential
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder



In [None]:
# Step 2: Load Dataset
# Read the CSV, keep only the review text and its rating, drop rows where
# either is missing, then shift the integer ratings down by one so they can
# be used directly as zero-based class labels.
df = (
    pd.read_csv('cleaned_balanced.csv')  # Replace with your file
    .loc[:, ['Review', 'Rating']]
    .dropna()
    .assign(Rating=lambda frame: frame['Rating'].astype(int) - 1)
)

In [85]:
# Class-balance check: how many reviews carry each zero-based rating label.
rating_counts = df['Rating'].value_counts()
rating_counts

Rating
2    22999
3    22998
4    22997
1    22997
0    22994
Name: count, dtype: int64

In [3]:
# Step 3: Preprocess Text
# Pull plain arrays out of the frame; every review is forced to a string first.
y = df['Rating'].values
X = df['Review'].astype(str).values

# Step 4: Encode Labels
# One-hot encode the integer labels. The encoding width is the number of
# distinct labels actually present in the data.
num_classes = np.unique(y).size
y = tf.keras.utils.to_categorical(y, num_classes=num_classes)

In [4]:
# Step 5: Tokenize and Pad
max_len = 100  # target sequence length handed to pad_sequences

# Words the tokenizer does not keep are replaced by the '<OOV>' token.
# NOTE(review): the vocabulary is fit on all reviews, i.e. before the
# train/test split performed in the next cell.
tokenizer = Tokenizer(oov_token='<OOV>', num_words=10000)
tokenizer.fit_on_texts(X)

# Integer-encode each review, then pad at the end of each sequence.
X_seq = tokenizer.texts_to_sequences(X)
X_pad = pad_sequences(X_seq, padding='post', maxlen=max_len)

In [5]:
# Step 6: Train/Test Split
# Hold out 20% of the padded reviews; the fixed random_state keeps the split repeatable.
split_parts = train_test_split(X_pad, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts


In [6]:
# Step 7: Load GloVe Embedding
# Each line of the file is parsed as: <token> <coef_1> ... <coef_n>.
embedding_index = {}
with open('glove.6B.100d.txt', encoding='utf8') as glove_file:
    for raw_line in glove_file:
        fields = raw_line.split()
        embedding_index[fields[0]] = np.asarray(fields[1:], dtype='float32')

embedding_dim = 100
# The 10000 cap mirrors the num_words limit given to the tokenizer; the +1
# makes room for index 0, which word_index never assigns to a word.
vocab_size = min(10000, len(tokenizer.word_index) + 1)
embedding_matrix = np.zeros((vocab_size, embedding_dim))

# Copy the pretrained vector for every in-vocabulary word that GloVe knows.
# Rows for words without a GloVe vector stay all-zero.
for token, token_id in tokenizer.word_index.items():
    if token_id >= vocab_size:
        continue
    pretrained = embedding_index.get(token)
    if pretrained is not None:
        embedding_matrix[token_id] = pretrained

In [7]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import (
    Input, Embedding, Bidirectional, LSTM, Dense, Dropout,
    GlobalMaxPooling1D, BatchNormalization,
)

# Step 8: Build the Model
# Frozen GloVe embeddings -> two stacked BiLSTM blocks -> max-pool over the
# sequence axis -> small dense classifier head with a softmax output.
model = Sequential([
    # Explicit input specification. This replaces the `input_length` argument
    # of Embedding, which current Keras reports as deprecated, and it keeps
    # the model built up front so that model.summary() can be called before
    # training.
    Input(shape=(max_len,), dtype='int32'),

    # Pretrained GloVe Embedding (weights are frozen: trainable=False)
    Embedding(input_dim=vocab_size,
              output_dim=embedding_dim,
              weights=[embedding_matrix],
              trainable=False),

    # First BiLSTM Block
    Bidirectional(LSTM(128, return_sequences=True)),
    Dropout(0.4),  # Slightly higher dropout
    BatchNormalization(),

    # Second BiLSTM Block (still returns the full sequence for the pooling layer)
    Bidirectional(LSTM(64, return_sequences=True)),
    Dropout(0.4),
    BatchNormalization(),

    # Global Max Pooling to reduce sequence dimension
    GlobalMaxPooling1D(),

    # Fully Connected Layers
    Dense(128, activation='relu'),
    Dropout(0.3),
    Dense(64, activation='relu'),

    # Output Layer: one softmax probability per rating class
    Dense(num_classes, activation='softmax')
])



In [8]:
# Step 9: Compile and Train
# One-hot targets with a softmax output: categorical cross-entropy loss,
# Adam optimizer, accuracy reported during training.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [9]:
# Print a summary of the model's layers as a sanity check before training.
model.summary()

In [10]:
# Train for 10 epochs in mini-batches of 64; per-epoch metrics are kept in `history`.
# NOTE(review): the held-out test split is also used as the validation set here.
fit_options = dict(
    validation_data=(X_test, y_test),
    epochs=10,
    batch_size=64,
    verbose=1,
)
history = model.fit(X_train, y_train, **fit_options)

Epoch 1/10
1438/1438 ━━━━━━━━━━━━━━━━━━━━ 344s 232ms/step - accuracy: 0.3796 - loss: 1.3932 - val_accuracy: 0.4274 - val_loss: 1.3222
Epoch 2/10
1438/1438 ━━━━━━━━━━━━━━━━━━━━ 364s 253ms/step - accuracy: 0.4338 - loss: 1.2896 - val_accuracy: 0.4481 - val_loss: 1.2965
Epoch 3/10
1438/1438 ━━━━━━━━━━━━━━━━━━━━ 543s 377ms/step - accuracy: 0.4543 - loss: 1.2474 - val_accuracy: 0.4368 - val_loss: 1.3185
Epoch 4/10
1438/1438 ━━━━━━━━━━━━━━━━━━━━ 753s 524ms/step - accuracy: 0.4685 - loss: 1.2159 - val_accuracy: 0.4737 - val_loss: 1.2527
Epoch 5/10
1438/1438 ━━━━━━━━━━━━━━━━━━━━ 795s 553ms/step - accuracy: 0.4790 - loss: 1.1934 - val_accuracy: 0.4690 - val_loss: 1.2598
Epoch 6/10
1438/1438 ━━━━━━━━━━━━━━━━━━━━ 779s 542ms/step - accuracy: 0.4878 - loss: 1.1731 - val_accuracy: 0.4785 - val_loss:

In [16]:
# Persist the trained model; the evaluation cells load it back from this path.
model_output_path = "balancemodel.h5"
model.save(model_output_path)



In [17]:
import pickle

# Save tokenizer to disk so the exact same word -> index mapping can be
# reused when scoring new text.
with open("balancetoc.pkl", "wb") as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)

In [26]:
# Load new model and tokenizer (replace with your file paths)
from tensorflow.keras.models import load_model

new_tokenizer_path = 'balancetoc.pkl'  # Update with actual path
new_model_path = 'balancemodel.h5'  # Update with actual path

# Restore the trained Keras model from disk.
new_model = load_model(new_model_path)

# Restore the fitted tokenizer.
# NOTE(review): pickle.load can execute code embedded in the file, so only
# load pickles that you created yourself.
with open(new_tokenizer_path, 'rb') as tokenizer_file:
    new_tokenizer = pickle.load(tokenizer_file)



In [27]:
# Load the imbalanced evaluation set.
# Rows with a missing review or rating are dropped, mirroring how the training
# data was prepared. Otherwise a missing review would be cast to the literal
# string 'nan' before tokenization, and a missing rating could not be turned
# into a valid class label.
df1 = pd.read_csv('cleaned_imbalanced.csv').dropna(subset=['Review', 'Rating'])

In [28]:
# Coerce every review to a string so the tokenizer only ever receives text.
df1 = df1.assign(Review=df1['Review'].astype(str))

In [29]:
# Tokenize and pad sequences
# Encode with the reloaded tokenizer and pad to the same length and side
# ('post') that were used for the training sequences.
review_texts = df1['Review']
sequences = new_tokenizer.texts_to_sequences(review_texts)
padded_sequences = pad_sequences(sequences, padding='post', maxlen=max_len)

In [30]:
from tensorflow.keras.utils import to_categorical

# Step 3: Prepare ratings (1-5 to 0-4 for one-hot encoding)
star_ratings = df1['Rating'].values
true_labels = star_ratings - 1  # zero-based class ids
true_labels_one_hot = to_categorical(true_labels, num_classes)

In [31]:
# Step 4: Make predictions
# Each row of `predictions` holds one score per class; the index of the
# largest score is taken as the predicted zero-based label.
predictions = new_model.predict(padded_sequences)
predicted_labels = predictions.argmax(axis=1)

[1m3750/3750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m206s[0m 55ms/step


In [32]:
# classification_report was never imported anywhere in the notebook, so this
# cell raised NameError on a fresh kernel; import it where it is used.
from sklearn.metrics import classification_report

# Step 6: Detailed metrics
# Per-class precision / recall / F1 on the imbalanced evaluation set.
# `labels` is passed explicitly so the five display names always line up with
# class ids 0-4, even if some class never occurs in the labels or predictions.
star_names = ['1', '2', '3', '4', '5']
print("\nClassification Report:")
print(classification_report(
    true_labels,
    predicted_labels,
    labels=list(range(len(star_names))),
    target_names=star_names,
))


Classification Report:
              precision    recall  f1-score   support

           1       0.75      0.59      0.66     30000
           2       0.26      0.23      0.25     12000
           3       0.37      0.23      0.28     18000
           4       0.40      0.44      0.42     24000
           5       0.59      0.81      0.68     36000

    accuracy                           0.53    120000
   macro avg       0.48      0.46      0.46    120000
weighted avg       0.53      0.53      0.52    120000

