In [None]:
import pandas as pd

In [1]:
# Step 1: Install and Import
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout
from tensorflow.keras.models import Sequential
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder



In [None]:
# Step 2: Load Dataset
# Read the review corpus, keep only the text and label columns, discard rows
# where either one is missing, and shift the integer ratings down by one so
# they can be used directly as zero-based class labels.
df = (
    pd.read_csv('cleaned_imbalanced.csv')  # Replace with your file
    .loc[:, ['Review', 'Rating']]
    .dropna()
    .assign(Rating=lambda frame: frame['Rating'].astype(int) - 1)
)

In [35]:
# Show how many reviews fall into each zero-based rating class.
rating_counts = df.loc[:, 'Rating'].value_counts()
rating_counts

Rating
4    35993
0    29989
3    23999
2    17996
1    11998
Name: count, dtype: int64

In [3]:
# Step 3: Preprocess Text
# Cast every review to str so the tokenizer only ever receives strings.
X = df['Review'].astype(str).values

# Step 4: Encode Labels
# The number of classes is the number of distinct label values; the integer
# labels are then expanded into one-hot rows of that width.
class_ids = df['Rating'].values
num_classes = np.unique(class_ids).size
y = tf.keras.utils.to_categorical(class_ids, num_classes=num_classes)

In [4]:
# Step 5: Tokenize and Pad
# Every review becomes a fixed-length row of max_len token ids.
max_len = 100

# Vocabulary is capped via num_words; words outside it map to the '<OOV>' token.
tokenizer = Tokenizer(oov_token='<OOV>', num_words=10000)
tokenizer.fit_on_texts(X)

# Convert text to integer ids, then pad short sequences at the end ('post').
X_seq = tokenizer.texts_to_sequences(X)
X_pad = pad_sequences(sequences=X_seq, maxlen=max_len, padding='post')

In [5]:
# Step 6: Train/Test Split
# Hold out 20% of the padded sequences for evaluation. The rating classes are
# clearly imbalanced, so the split is stratified on the class index
# (recovered from the one-hot rows with argmax) to keep the same class
# proportions in both partitions. random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X_pad,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y.argmax(axis=1),
)


In [6]:
# Step 7: Load GloVe Embedding
# Width of each pretrained vector; must match the GloVe file that is read.
embedding_dim = 100

# Parse the GloVe text file into {token: float32 vector}. Each line holds the
# token followed by its whitespace-separated coefficients.
embedding_index = {}
with open('glove.6B.100d.txt', encoding='utf8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embedding_index[word] = coefs

# Size the matrix from the tokenizer's own vocabulary cap rather than
# repeating the literal, so the two cannot drift apart. If the tokenizer was
# built without a cap (num_words is None), fall back to the full vocabulary.
# The "+ 1" is kept from the original sizing (presumably for reserved id 0).
full_vocab_size = len(tokenizer.word_index) + 1
vocab_size = min(tokenizer.num_words or full_vocab_size, full_vocab_size)
embedding_matrix = np.zeros((vocab_size, embedding_dim))

# Copy the pretrained vector for every in-vocabulary word that GloVe knows;
# rows for words missing from GloVe stay all-zero.
for word, i in tokenizer.word_index.items():
    if i < vocab_size:
        embedding_vector = embedding_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

In [7]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout, GlobalMaxPooling1D, BatchNormalization

# Step 8: Build the model
# Stacked bidirectional-LSTM classifier on top of frozen GloVe embeddings.
model = Sequential([
    # Explicit input spec: one row of max_len integer token ids per review.
    # This replaces the deprecated `input_length` argument on Embedding and
    # lets the model be built (and summarised) before training.
    tf.keras.Input(shape=(max_len,), dtype='int32'),

    # Pretrained GloVe Embedding, loaded through a Constant initializer
    # (portable across Keras versions) and frozen with trainable=False.
    Embedding(input_dim=vocab_size,
              output_dim=embedding_dim,
              embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
              trainable=False),

    # First BiLSTM Block (keeps the full sequence for the next recurrent layer)
    Bidirectional(LSTM(128, return_sequences=True)),
    Dropout(0.4),  # Slightly higher dropout
    BatchNormalization(),

    # Second BiLSTM Block (still returns the sequence so it can be pooled)
    Bidirectional(LSTM(64, return_sequences=True)),
    Dropout(0.4),
    BatchNormalization(),

    # Global Max Pooling to reduce sequence dimension
    GlobalMaxPooling1D(),

    # Fully Connected Layers
    Dense(128, activation='relu'),
    Dropout(0.3),
    Dense(64, activation='relu'),

    # Output Layer: one softmax probability per rating class
    Dense(num_classes, activation='softmax')
])



In [8]:
# Step 9: Compile
# Categorical cross-entropy matches the one-hot encoded targets; accuracy is
# tracked as the reporting metric.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [9]:
# Print the model's layer summary as a sanity check before training.
model.summary()

In [10]:
# Step 10: Train
# NOTE(review): the held-out test split doubles as the validation set here,
# so the val_* metrics are not an independent estimate if they are used to
# pick epochs or hyperparameters.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=64,
    epochs=10,
    validation_data=(X_test, y_test),
    verbose=1,
)

Epoch 1/10
[1m1500/1500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m444s[0m 289ms/step - accuracy: 0.4811 - loss: 1.2768 - val_accuracy: 0.5057 - val_loss: 1.2679
Epoch 2/10
[1m1500/1500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m475s[0m 316ms/step - accuracy: 0.5225 - loss: 1.1801 - val_accuracy: 0.5325 - val_loss: 1.2031
Epoch 3/10
[1m1500/1500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m435s[0m 290ms/step - accuracy: 0.5395 - loss: 1.1392 - val_accuracy: 0.5428 - val_loss: 1.1822
Epoch 4/10
[1m1500/1500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m476s[0m 317ms/step - accuracy: 0.5508 - loss: 1.1130 - val_accuracy: 0.5486 - val_loss: 1.1554
Epoch 5/10
[1m1500/1500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m496s[0m 331ms/step - accuracy: 0.5593 - loss: 1.0910 - val_accuracy: 0.5489 - val_loss: 1.1457
Epoch 6/10
[1m1500/1500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m621s[0m 414ms/step - accuracy: 0.5649 - loss: 1.0710 - val_accuracy: 0.5506 - val_loss:

In [64]:
# Persist the trained model to disk; the reload cell further down reads this
# exact path, so the two must be changed together.
# NOTE(review): the .h5 suffix presumably selects Keras' legacy HDF5 format —
# confirm whether the native .keras format is preferred for this project.
model.save("imbalancemodel.h5")



In [65]:
import pickle

# Pickle the fitted tokenizer so evaluation can reuse the same vocabulary.
with open("imbalancetoc.pkl", "wb") as tokenizer_file:
    pickle.dump(tokenizer, file=tokenizer_file)

In [73]:
# Reload the saved model and tokenizer from disk (replace with your file paths)
from tensorflow.keras.models import load_model

new_model_path = 'imbalancemodel.h5'  # Update with actual path
new_tokenizer_path = 'imbalancetoc.pkl'  # Update with actual path

# Tokenizer first: unpickling can execute arbitrary code, so only load
# pickle files that this notebook (or another trusted source) produced.
with open(new_tokenizer_path, 'rb') as tokenizer_file:
    new_tokenizer = pickle.load(tokenizer_file)

# Then the trained network itself.
new_model = load_model(new_model_path)



In [74]:
# Load the balanced evaluation set. Rows missing either the review text or the
# rating are dropped, mirroring the cleaning applied to the training data, so
# that missing text is not later scored as the literal string 'nan' and
# missing ratings never reach the label-encoding step. All other columns are
# kept untouched.
df1 = pd.read_csv('cleaned_balanced.csv').dropna(subset=['Review', 'Rating'])

In [75]:
# Ensure every evaluation review is a string before tokenization.
df1 = df1.assign(Review=df1['Review'].astype(str))

In [76]:
# Encode the evaluation reviews with the reloaded tokenizer and pad them to
# the same length and padding side that were used for training.
sequences = new_tokenizer.texts_to_sequences(texts=df1['Review'])
padded_sequences = pad_sequences(
    sequences=sequences,
    maxlen=max_len,
    padding='post',
)

In [77]:
from tensorflow.keras.utils import to_categorical
# Step 3: Prepare ratings
# Ratings run 1-5 in the file; shift them to 0-4 so they line up with the
# model's class indices, then build the matching one-hot matrix.
true_labels = np.subtract(df1['Rating'].values, 1)
true_labels_one_hot = to_categorical(true_labels, num_classes)

In [78]:
# Step 4: Make predictions
# The model emits one probability row per review; the predicted class is the
# index of the largest probability in each row.
predictions = new_model.predict(padded_sequences)
predicted_labels = predictions.argmax(axis=1)

[1m3594/3594[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m143s[0m 40ms/step


In [79]:
from sklearn.metrics import accuracy_score
# Step 5: Calculate accuracy
# Fraction of evaluation reviews whose predicted class equals the true class.
accuracy = accuracy_score(y_true=true_labels, y_pred=predicted_labels)
print(f"Accuracy: {accuracy:.4f}")


Accuracy: 0.4523


In [80]:
# Step 6: Detailed metrics
# classification_report is imported here because no earlier cell brings it
# into scope; without this the cell raises NameError on a fresh kernel.
from sklearn.metrics import classification_report

# Per-class precision / recall / F1. `labels` pins the five zero-based class
# ids explicitly so the five display names always line up, even if some class
# happens to be absent from the data or the predictions.
print("\nClassification Report:")
print(classification_report(
    true_labels,
    predicted_labels,
    labels=[0, 1, 2, 3, 4],
    target_names=['1', '2', '3', '4', '5'],
))


Classification Report:
              precision    recall  f1-score   support

           1       0.41      0.88      0.56     23000
           2       0.43      0.01      0.01     23000
           3       0.39      0.36      0.37     23000
           4       0.47      0.29      0.36     23000
           5       0.56      0.73      0.63     23000

    accuracy                           0.45    115000
   macro avg       0.45      0.45      0.39    115000
weighted avg       0.45      0.45      0.39    115000

