In [14]:

import os
import urllib.request
import zipfile

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras.layers import Input, Embedding, Bidirectional, LSTM, Dense, Dropout, Attention
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical

# Load and preprocess the dataset
def load_data(file_path):
    """Read the review CSV and return cleaned review texts with 0-based labels.

    Args:
        file_path: Path to a CSV file with a 'Review' and a 'Rating' column.

    Returns:
        A ``(reviews, ratings)`` tuple. ``reviews`` holds the review strings
        (missing reviews become ``''``); ``ratings`` holds the integer ratings
        shifted from 1..5 down to 0..4. Rows whose rating is not an int/float
        in the closed range [1, 5] are dropped. Both are empty when no row
        survives the filter.
    """
    df = pd.read_csv(file_path)
    # Fill missing reviews *before* casting: astype(str) first would turn NaN
    # into the literal string 'nan', leaving nothing for fillna('') to replace.
    df['Review'] = df['Review'].fillna('').astype(str)

    def _is_valid_rating(value):
        # NaN is a float but fails the range comparison, so it is rejected too.
        return isinstance(value, (int, float)) and 1 <= value <= 5

    print("Validating 'Rating' column...")
    # Evaluate the predicate once and reuse it for reporting and filtering.
    # astype(bool) keeps the mask boolean even when the frame has no rows.
    valid_mask = df['Rating'].apply(_is_valid_rating).astype(bool)
    invalid_rows = df[~valid_mask]
    if not invalid_rows.empty:
        print(f"Found {len(invalid_rows)} invalid ratings:")
        print(invalid_rows[['Review', 'Rating']].head())
        print("Removing rows with invalid ratings...")
        df = df[valid_mask]

    reviews = df['Review'].values
    # Shift 1..5 to 0..4 so the labels can be used directly as class indices.
    ratings = df['Rating'].astype(int).values - 1

    # np.max raises on an empty sequence; skip the statistics in that case so
    # the caller can report the "no valid data" condition itself.
    if len(reviews) > 0:
        review_lengths = [len(review.split()) for review in reviews]
        print(f"Average review length: {np.mean(review_lengths):.1f}, Max length: {np.max(review_lengths)}")
    print(f"Loaded {len(reviews)} valid reviews with ratings.")
    return reviews, ratings

# Download and extract GloVe embeddings
def download_glove():
    """Make sure the GloVe vectors exist locally, downloading them if needed.

    Creates GLOVE_DIR when it is missing. If GLOVE_FILE does not exist, the
    archive at GLOVE_URL is fetched to 'glove.6B.zip' in the current working
    directory, unpacked into GLOVE_DIR, and the archive is deleted afterwards.

    Raises:
        Whatever ``urllib.request.urlretrieve`` or ``zipfile.ZipFile`` raise on
        a failed download or a corrupt archive; the temporary archive is
        removed in that case as well.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(GLOVE_DIR, exist_ok=True)
    if os.path.exists(GLOVE_FILE):
        return

    archive_path = 'glove.6B.zip'
    print("Downloading GloVe embeddings...")
    try:
        urllib.request.urlretrieve(GLOVE_URL, archive_path)
        with zipfile.ZipFile(archive_path, 'r') as zip_ref:
            zip_ref.extractall(GLOVE_DIR)
    finally:
        # Never leave a (possibly partial) archive behind: a failed download
        # or extraction would otherwise strand it in the working directory.
        if os.path.exists(archive_path):
            os.remove(archive_path)
    print("GloVe embeddings downloaded and extracted.")

# Load GloVe embeddings
def load_glove_embeddings():
    """Parse the GloVe text file into a ``{token: vector}`` dictionary.

    Calls ``download_glove()`` first, then reads GLOVE_FILE as UTF-8. Each line
    is split on whitespace: the first field is the token and the remaining
    fields are converted to a float32 NumPy array.

    Returns:
        dict mapping each token to its float32 coefficient array.
    """
    download_glove()
    with open(GLOVE_FILE, encoding='utf-8') as glove_file:
        split_rows = (row.split() for row in glove_file)
        return {
            fields[0]: np.asarray(fields[1:], dtype='float32')
            for fields in split_rows
        }

# Create embedding matrix
def create_embedding_matrix(tokenizer, embeddings_index):
    """Assemble the (MAX_WORDS, EMBEDDING_DIM) matrix of pretrained vectors.

    Args:
        tokenizer: Fitted tokenizer exposing a ``word_index`` mapping of
            token -> integer index.
        embeddings_index: Mapping of token -> embedding vector.

    Returns:
        NumPy array in which row ``i`` holds the vector of the token whose
        index is ``i``. Rows stay all-zero for indices at or above MAX_WORDS
        (those tokens are skipped) and for tokens absent from
        ``embeddings_index``.
    """
    vocabulary = tokenizer.word_index
    matrix = np.zeros((MAX_WORDS, EMBEDDING_DIM))
    for token, row in vocabulary.items():
        if not i_in_range(row) if False else row >= MAX_WORDS:
            continue
        vector = embeddings_index.get(token)
        if vector is None:
            continue
        matrix[row] = vector
    return matrix

# Preprocess text data
def preprocess_data(reviews, ratings):
    """Turn raw reviews and integer labels into model-ready arrays.

    Args:
        reviews: Sequence of review strings.
        ratings: Integer class labels in 0..4.

    Returns:
        ``(padded_sequences, labels, tokenizer)``: the token-id sequences
        padded at the end ('post') to MAX_LEN, the 5-class one-hot labels, and
        the tokenizer fitted on ``reviews`` (limited to MAX_WORDS words).
    """
    text_tokenizer = Tokenizer(num_words=MAX_WORDS)
    text_tokenizer.fit_on_texts(reviews)
    padded = pad_sequences(
        text_tokenizer.texts_to_sequences(reviews),
        maxlen=MAX_LEN,
        padding='post',
    )

    print("Converting ratings to one-hot encoded format...")
    one_hot = to_categorical(ratings, num_classes=5)
    return padded, one_hot, text_tokenizer

# Build BiLSTM model with attention
def build_model(embedding_matrix):
    """Build and compile the stacked-BiLSTM review classifier.

    Layer order: frozen pretrained Embedding -> BiLSTM(128) -> BiLSTM(64) ->
    Keras ``Attention`` with the BiLSTM output as both query and value ->
    global average pooling -> Dense(64, relu, L2 0.01) -> Dropout(0.5) ->
    5-way softmax.

    Args:
        embedding_matrix: Pretrained embedding weights; expected to have shape
            (MAX_WORDS, EMBEDDING_DIM) to match the Embedding layer.

    Returns:
        A compiled ``Model`` using Adam (learning rate 1e-4), categorical
        cross-entropy loss and the accuracy metric.
    """
    inputs = Input(shape=(MAX_LEN,))
    # `input_length` is intentionally not passed: it is deprecated in Keras 3
    # and redundant here because the Input layer already fixes the length.
    # NOTE(review): mask_zero is left at its default, so padded positions are
    # processed by the LSTMs and included in the average pool - confirm intended.
    embedding = Embedding(
        input_dim=MAX_WORDS,
        output_dim=EMBEDDING_DIM,
        weights=[embedding_matrix],
        trainable=False,  # keep the pretrained vectors frozen
    )(inputs)
    bilstm1 = Bidirectional(LSTM(128, return_sequences=True))(embedding)
    bilstm2 = Bidirectional(LSTM(64, return_sequences=True))(bilstm1)
    # Self-attention: the same tensor is supplied as query and value.
    attention = Attention()([bilstm2, bilstm2])
    # Collapse the time axis so the dense head sees one vector per review.
    pooled = tf.keras.layers.GlobalAveragePooling1D()(attention)
    dense = Dense(64, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01))(pooled)
    dropout = Dropout(0.5)(dense)
    outputs = Dense(5, activation='softmax')(dropout)

    model = Model(inputs=inputs, outputs=outputs)
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001),
                  loss='categorical_crossentropy', metrics=['accuracy'])
    return model

In [16]:
# Main execution
def main(file_path='balanced_data_for_DL.csv', seed=42):
    """Run the full pipeline: load, preprocess, train, and evaluate.

    Args:
        file_path: CSV file with 'Review' and 'Rating' columns. Defaults to the
            path previously hard-coded here; pass your own to override.
        seed: Seed applied to the Python, NumPy and TensorFlow RNGs and used as
            ``random_state`` for both data splits.

    Returns:
        ``(model, history)``: the trained Keras model and the ``History``
        object returned by ``model.fit``, so they remain available for further
        inspection after the run.

    Raises:
        ValueError: If no rows remain after rating validation.
    """
    # Seed every RNG involved so weight initialisation, dropout and batch
    # shuffling are repeatable from one run to the next.
    tf.keras.utils.set_random_seed(seed)

    reviews, ratings = load_data(file_path)

    if len(reviews) == 0:
        raise ValueError("No valid data remains after cleaning. Please check 'Rating' column for invalid values.")

    print("Computing class weights...")
    class_weights = compute_class_weight('balanced', classes=np.arange(5), y=ratings)
    class_weights = dict(enumerate(class_weights))
    print("Class weights:", class_weights)

    padded_sequences, labels, tokenizer = preprocess_data(reviews, ratings)

    embeddings_index = load_glove_embeddings()
    embedding_matrix = create_embedding_matrix(tokenizer, embeddings_index)

    # 80% train; the remaining 20% is halved into validation and test.
    X_train, X_temp, y_train, y_temp = train_test_split(
        padded_sequences, labels, test_size=0.2, random_state=seed
    )
    X_val, X_test, y_val, y_test = train_test_split(
        X_temp, y_temp, test_size=0.5, random_state=seed
    )

    # The functional model is already built by build_model, so no explicit
    # model.build(...) call is needed before summary().
    model = build_model(embedding_matrix)
    model.summary()

    early_stopping = tf.keras.callbacks.EarlyStopping(
        monitor='val_loss', patience=5, restore_best_weights=True
    )
    lr_scheduler = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2)

    history = model.fit(
        X_train, y_train,
        epochs=20,
        batch_size=32,
        validation_data=(X_val, y_val),
        callbacks=[early_stopping, lr_scheduler],
        class_weight=class_weights,
        verbose=1
    )

    test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=0)
    print(f"\nTest Accuracy: {test_accuracy:.4f}, Test Loss: {test_loss:.4f}")

    # Convert softmax outputs and one-hot targets back to class indices.
    y_pred = model.predict(X_test)
    y_pred_classes = np.argmax(y_pred, axis=1)
    y_test_classes = np.argmax(y_test, axis=1)
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test_classes, y_pred_classes))

    return model, history

# Entry-point guard: run the whole pipeline only when this code is executed as
# the top-level module, not when it is imported from elsewhere.
if __name__ == '__main__':
    main()

Validating 'Rating' column...
Average review length: 74.5, Max length: 4815
Loaded 115000 valid reviews with ratings.
Computing class weights...
Class weights: {0: np.float64(1.0), 1: np.float64(1.0), 2: np.float64(1.0), 3: np.float64(1.0), 4: np.float64(1.0)}
Converting ratings to one-hot encoded format...




Epoch 1/20
[1m2875/2875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 22ms/step - accuracy: 0.3121 - loss: 1.9581 - val_accuracy: 0.4110 - val_loss: 1.4141 - learning_rate: 1.0000e-04
Epoch 2/20
[1m2875/2875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 22ms/step - accuracy: 0.4127 - loss: 1.4004 - val_accuracy: 0.4360 - val_loss: 1.3104 - learning_rate: 1.0000e-04
Epoch 3/20
[1m2875/2875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 21ms/step - accuracy: 0.4294 - loss: 1.3233 - val_accuracy: 0.4428 - val_loss: 1.2806 - learning_rate: 1.0000e-04
Epoch 4/20
[1m2875/2875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m84s[0m 22ms/step - accuracy: 0.4494 - loss: 1.2765 - val_accuracy: 0.4650 - val_loss: 1.2341 - learning_rate: 1.0000e-04
Epoch 5/20
[1m2875/2875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 21ms/step - accuracy: 0.4582 - loss: 1.2501 - val_accuracy: 0.4610 - val_loss: 1.2354 - learning_rate: 1.0000e-04
Epoch 6/20
[1m2875/2875[0m 