<a href="https://colab.research.google.com/github/rutuja-patil24/CMPE-258-Deep_Learning/blob/main/Assignment_05/03_Advance_Text_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Install required libraries
!pip install --quiet tensorflow datasets transformers scikit-learn

import tensorflow as tf
import numpy as np
import pandas as pd
from datasets import load_dataset
from transformers import TFAutoModel, AutoTokenizer
from sklearn.model_selection import train_test_split

# Load the IMDb dataset through `datasets.load_dataset`
dataset = load_dataset("imdb")

# Turn the training split into a DataFrame holding only the review text and
# an integer sentiment label
df = pd.DataFrame(dataset['train'])[['text', 'label']].astype({'label': int})

# Second label: bucket every review by its character count
#   0 -> fewer than 100 characters
#   1 -> 100 to 299 characters
#   2 -> 300 characters or more
df['length_category'] = np.digitize(df['text'].str.len(), [100, 300])

# Tokenizer for the pretrained checkpoint
model_name = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenization function
def tokenize_data(texts, max_length=128):
    """Tokenize one or more texts into fixed-length TensorFlow tensors.

    Args:
        texts: A single string or an iterable of strings (list, tuple,
            NumPy array, ...).
        max_length: Length every sequence is padded or truncated to.
            Defaults to 128, the input length used by the model built in
            this file.

    Returns:
        The tokenizer's output requested with ``return_tensors="tf"``; the
        rest of this file reads its ``"input_ids"`` and
        ``"attention_mask"`` entries.
    """
    # A bare string has to be wrapped first: list("abc") would split it into
    # single characters and tokenize each character as a separate text.
    if isinstance(texts, str):
        texts = [texts]
    return tokenizer(
        list(texts),
        padding="max_length",
        truncation=True,
        max_length=max_length,
        return_tensors="tf",
    )

# Split the texts and both label columns together (80% train / 20% validation)
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['text'].values, df[['label', 'length_category']].values, test_size=0.2, random_state=42
)

train_encodings = tokenize_data(train_texts)
val_encodings = tokenize_data(val_texts)

# The label arrays are 2-D (one row per review, one column per task), so each
# task's labels are taken with a column slice instead of a per-row Python loop.
train_sentiment_labels = train_labels[:, 0]  # Sentiment (0/1)
train_length_labels = train_labels[:, 1]  # Length category (0/1/2)

val_sentiment_labels = val_labels[:, 0]
val_length_labels = val_labels[:, 1]

# Small batches, chosen to avoid running out of memory during fine-tuning
BATCH_SIZE = 8


def _as_tf_dataset(encodings, sentiment_labels, length_labels):
    """Slice the two model inputs and the two named label arrays into per-example elements."""
    features = {key: encodings[key] for key in ("input_ids", "attention_mask")}
    targets = {"sentiment_output": sentiment_labels, "length_output": length_labels}
    return tf.data.Dataset.from_tensor_slices((features, targets))


# Training pipeline: shuffle across the whole training set, then batch
train_dataset = (
    _as_tf_dataset(train_encodings, train_sentiment_labels, train_length_labels)
    .shuffle(len(train_texts))
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)

# Validation pipeline: same layout, original order preserved
val_dataset = (
    _as_tf_dataset(val_encodings, val_sentiment_labels, val_length_labels)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)

# Load the RoBERTa encoder (no classification head). The checkpoint name is
# taken from `model_name` so the encoder always matches the tokenizer that was
# loaded from the same variable.
roberta_model = TFAutoModel.from_pretrained(model_name, return_dict=True)

# Custom Keras model: shared RoBERTa encoder feeding two classification heads
def build_advanced_model(max_length=128, hidden_units=128, dropout_rate=0.3):
    """Build a two-output classifier on top of the module-level ``roberta_model``.

    Args:
        max_length: Length of the ``input_ids`` / ``attention_mask`` inputs.
            Must equal the length the texts were tokenized to (128 here).
        hidden_units: Width of the shared ReLU layer placed after the encoder.
        dropout_rate: Dropout rate applied to that shared layer.

    Returns:
        A ``tf.keras.Model`` taking inputs named ``"input_ids"`` and
        ``"attention_mask"`` and producing two outputs:
        ``"sentiment_output"`` (one sigmoid unit) and ``"length_output"``
        (three-way softmax).
    """
    input_ids = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32, name="input_ids")
    attention_mask = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32, name="attention_mask")

    # Run both inputs through the shared RoBERTa encoder
    roberta_outputs = roberta_model(input_ids=input_ids, attention_mask=attention_mask)

    # Use the hidden state at sequence position 0 (the "CLS token" embedding)
    # as the sequence summary. The Lambda layer is given the tensor itself
    # rather than the whole model-output object.
    cls_token = tf.keras.layers.Lambda(lambda hidden: hidden[:, 0, :])(
        roberta_outputs.last_hidden_state
    )

    # Shared fully connected layer used by both heads
    dense = tf.keras.layers.Dense(hidden_units, activation="relu")(cls_token)
    dropout = tf.keras.layers.Dropout(dropout_rate)(dense)

    # Two outputs: binary sentiment and three-class review length
    sentiment_output = tf.keras.layers.Dense(1, activation="sigmoid", name="sentiment_output")(dropout)
    length_output = tf.keras.layers.Dense(3, activation="softmax", name="length_output")(dropout)

    model = tf.keras.Model(inputs=[input_ids, attention_mask], outputs=[sentiment_output, length_output])
    return model

# Build the two-headed model
multi_label_model = build_advanced_model()

# One loss and one accuracy metric per output, keyed by output-layer name
optimizer = tf.keras.optimizers.AdamW(learning_rate=2e-5, weight_decay=0.01)
head_losses = {
    "sentiment_output": "binary_crossentropy",
    "length_output": "sparse_categorical_crossentropy",
}
head_metrics = {
    "sentiment_output": "accuracy",
    "length_output": "accuracy",
}
multi_label_model.compile(optimizer=optimizer, loss=head_losses, metrics=head_metrics)

# Fine-tune for a single epoch to keep the run short
history = multi_label_model.fit(train_dataset, validation_data=val_dataset, epochs=1)

# Evaluate on the held-out validation split. Metrics are requested as a dict
# and read by name, so the report does not depend on the order or number of
# values Keras returns from evaluate().
results = multi_label_model.evaluate(val_dataset, return_dict=True)
test_loss = results["loss"]  # combined loss over both outputs
sentiment_loss = results["sentiment_output_loss"]
length_loss = results["length_output_loss"]
sentiment_acc = results["sentiment_output_accuracy"]
length_acc = results["length_output_accuracy"]

# NOTE: the figures below come from `val_dataset`, even though the message
# labels them "Test".
print(f"\nâœ… Test Loss: {test_loss:.4f}")
print(f"âœ… Sentiment Accuracy: {sentiment_acc:.4f} | Review Length Accuracy: {length_acc:.4f}")

# Predict sentiment and length category for raw review text and print them
def predict_movie_review(text_samples):
    """Print the predicted sentiment and length category for each review.

    Args:
        text_samples: A single review string or an iterable of review
            strings. Empty input prints nothing.

    Returns:
        None. One line is printed per review, showing the first 100
        characters of the text and the two predicted labels.
    """
    # Normalise the input: a bare string would otherwise be iterated
    # character by character when paired with the predictions below.
    if isinstance(text_samples, str):
        text_samples = [text_samples]
    else:
        text_samples = list(text_samples)
    if not text_samples:
        return

    # Reuse the shared tokenization helper so prediction inputs are padded
    # and truncated exactly like the training data.
    encodings = tokenize_data(text_samples)

    inputs = {
        "input_ids": encodings["input_ids"],
        "attention_mask": encodings["attention_mask"]
    }

    # predictions[0]: sentiment probabilities, predictions[1]: length-class scores
    predictions = multi_label_model.predict(inputs)

    sentiment_preds = predictions[0].flatten()
    length_preds = np.argmax(predictions[1], axis=1)

    # Class index -> display name, built once instead of once per review
    length_names = ("Short", "Medium", "Long")

    for text, sentiment, length in zip(text_samples, sentiment_preds, length_preds):
        sentiment_label = "Positive" if sentiment > 0.5 else "Negative"
        length_label = length_names[length]
        print(f"ðŸŽ¬ Review: {text[:100]}... â†’ **Sentiment: {sentiment_label}, Length: {length_label}**")

# Example predictions. Every sample below is shorter than 100 characters,
# which is class 0 ("Short") under the thresholds used to build
# `length_category`.
sample_reviews = [
    "Absolutely loved this movie! The cinematography and acting were top-notch!",
    "Worst movie ever. Poor script and terrible acting.",
    "It was okay, not the best but watchable."
]

# Runs the fine-tuned model on the samples and prints one line per review
predict_movie_review(sample_reviews)


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFRobertaModel: ['roberta.embeddings.position_ids', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing TFRobertaModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFRobertaModel were not initialized from the PyTorch model and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


âœ… Test Loss: 0.3439
âœ… Sentiment Accuracy: 0.8816 | Review Length Accuracy: 0.9884
ðŸŽ¬ Review: Absolutely loved this movie! The cinematography and acting were top-notch!... â†’ **Sentiment: Positive, Length: Medium**
ðŸŽ¬ Review: Worst movie ever. Poor script and terrible acting.... â†’ **Sentiment: Negative, Length: Medium**
ðŸŽ¬ Review: It was okay, not the best but watchable.... â†’ **Sentiment: Positive, Length: Medium**
