In [1]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
df=pd.read_csv('balanced_data_for_DL.csv')

In [5]:
df=df.drop(columns='Unnamed: 0')

In [6]:
df['Review'] = df['Review'].str.lower()

In [7]:

import re

def clean_text(text):
    text = str(text)
    text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)  # remove URLs
    text = re.sub(r'<.*?>', '', text)  # remove HTML tags
    text = re.sub(r'[\U00010000-\U0010ffff]', '', text)  # remove emojis
    text = re.sub(r'[^\w\s]', '', text)  # remove punctuation
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)  # remove special characters
    text = re.sub(r'\s+', ' ', text).strip()  # remove extra whitespace
    return text


In [8]:
df['Review']=df['Review'].apply(clean_text)

In [7]:
!pip install -q transformers datasets accelerate tensorboard spacy tqdm
!python -m spacy download en_core_web_sm

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m123.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m92.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m53.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import os, re, random, numpy as np, pandas as pd, tensorflow as tf, spacy, tqdm

In [3]:
nlp = spacy.load("en_core_web_sm", disable=["ner", "parser"])  # speed: no NER/POS
SEED = 42
random.seed(SEED); np.random.seed(SEED); tf.random.set_seed(SEED)

In [9]:
def spacy_lemma(texts):
    cleaned = []
    for doc in tqdm.tqdm(nlp.pipe(texts, batch_size=1024), total=len(texts)):
        tokens = [t.lemma_.lower() for t in doc
                  if t.is_alpha and not t.is_stop]
        cleaned.append(" ".join(tokens))
    return cleaned

In [10]:
df["Review"] = spacy_lemma(df["Review"].astype(str))


100%|██████████| 115000/115000 [10:11<00:00, 187.91it/s]


In [11]:
df.to_csv("cleaned_balanced.csv", encoding="utf-8-sig")

In [12]:
from tensorflow.keras.utils import to_categorical
# Ensure columns are named correctly
texts = df['Review'].astype(str).values
labels = df['Rating'].values-1

# Encode labels if not already integers (optional, if categorical)
num_classes = len(np.unique(labels))
labels = to_categorical(labels, num_classes=num_classes)


In [13]:
# Split data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.2, random_state=42)

In [14]:
from tensorflow.keras.preprocessing.text import Tokenizer
# Tokenization
vocab_size = 10000
max_len = 100  # You can tune this
tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

In [15]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Convert text to sequences
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Padding
X_train_pad = pad_sequences(X_train_seq, maxlen=max_len, padding='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len, padding='post')

In [16]:
X_train_pad

array([[  12,   22,    3, ...,    0,    0,    0],
       [ 716,    1, 3090, ...,    0,    0,    0],
       [ 135,   15,   97, ...,    0,    0,    0],
       ...,
       [  41,   19,  763, ...,    0,    0,    0],
       [   1,  176,    5, ...,    0,    0,    0],
       [  79, 2181,  765, ...,    0,    0,    0]], dtype=int32)

In [17]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

# Define the updated LSTM model with more layers
model = Sequential([
    Embedding(vocab_size, 128, input_length=max_len),

    # First LSTM Layer (returns sequences to next layer)
    LSTM(128, return_sequences=True),
    Dropout(0.3),

    # Second LSTM Layer (returns sequences to next layer)
    LSTM(64, return_sequences=True),
    Dropout(0.3),

    # Third LSTM Layer (final LSTM)
    LSTM(32),
    Dropout(0.3),

    # Dense layers
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(num_classes, activation='softmax')
])



In [18]:
# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [20]:
# Train the model
history = model.fit(
    X_train_pad, y_train,
    validation_data=(X_test_pad, y_test),
    epochs=3,
    batch_size=64
)

Epoch 1/3
[1m1438/1438[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m727s[0m 506ms/step - accuracy: 0.2008 - loss: 1.6103 - val_accuracy: 0.2132 - val_loss: 1.6041
Epoch 2/3
[1m1438/1438[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m722s[0m 492ms/step - accuracy: 0.2186 - loss: 1.6028 - val_accuracy: 0.2369 - val_loss: 1.5965
Epoch 3/3
[1m1438/1438[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m740s[0m 490ms/step - accuracy: 0.2319 - loss: 1.6009 - val_accuracy: 0.2417 - val_loss: 1.5972


In [25]:
model.summary()

In [22]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [23]:
# Predict classes (convert probabilities to class labels)
y_pred_probs = model.predict(X_test_pad)
y_pred = np.argmax(y_pred_probs, axis=1)
y_true = np.argmax(y_test, axis=1)



[1m719/719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m66s[0m 92ms/step


In [24]:
#Classification Report
print("\nClassification Report:")
print(classification_report(y_true, y_pred))


Classification Report:
              precision    recall  f1-score   support

           0       0.25      0.70      0.36      4510
           1       0.00      0.00      0.00      4638
           2       0.30      0.00      0.00      4728
           3       0.24      0.53      0.33      4531
           4       0.00      0.00      0.00      4593

    accuracy                           0.24     23000
   macro avg       0.16      0.25      0.14     23000
weighted avg       0.16      0.24      0.14     23000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
