<a href="https://colab.research.google.com/github/shambhavi1709/Emotion-Aware-Chat-Agent/blob/main/EMOTIONAL_RAG_LLM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [25]:
from datasets import load_dataset

dataset = load_dataset("go_emotions")
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'id'],
        num_rows: 43410
    })
    validation: Dataset({
        features: ['text', 'labels', 'id'],
        num_rows: 5426
    })
    test: Dataset({
        features: ['text', 'labels', 'id'],
        num_rows: 5427
    })
})

In [26]:
# from datasets.fingerprint import random

# label_names = dataset["train"].features["labels"].feature.names

# def get_emotions(row):
#   print("Text:", row['text'])
#   print("Emotion:", label_names[row['labels'][0]])

# n = random.randint(0, len(dataset['train']))
# get_emotions(dataset['train'][n])

In [27]:
labels = dataset["train"].features["labels"].feature.names

id2label = {i: label for i, label in enumerate(labels)}
label2id = {label: i for i, label in enumerate(labels)}

In [28]:
from transformers import AutoTokenizer

model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [29]:
def tokenize(batch):
    tokenized = tokenizer(batch["text"], truncation=True)
    tokenized["labels"] = batch["labels"]   # keep labels
    return tokenized

tokenized_dataset = dataset.map(tokenize, batched=True)
tokenized_dataset

Map:   0%|          | 0/5426 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'id', 'input_ids', 'attention_mask'],
        num_rows: 43410
    })
    validation: Dataset({
        features: ['text', 'labels', 'id', 'input_ids', 'attention_mask'],
        num_rows: 5426
    })
    test: Dataset({
        features: ['text', 'labels', 'id', 'input_ids', 'attention_mask'],
        num_rows: 5427
    })
})

In [30]:
num_labels = 28

In [31]:
import torch

def convert_labels_to_multi_hot(examples):
    batch_multi_hot_labels = []
    for labels_list in examples["labels"]:
        multi_hot_vector = [0.0] * num_labels
        for label_id in labels_list:
            if label_id < num_labels:
                multi_hot_vector[label_id] = 1.0
        batch_multi_hot_labels.append(multi_hot_vector)
    return {"labels": torch.tensor(batch_multi_hot_labels, dtype=torch.float32)}

# Apply the conversion to the entire tokenized_dataset
tokenized_dataset = tokenized_dataset.map(convert_labels_to_multi_hot, batched=True)

# Set the format of the dataset to PyTorch tensors and specify columns/types
tokenized_dataset.set_format(
    type='torch',
    columns=['input_ids', 'attention_mask', 'labels'],
    output_all_columns=False
)

Map:   0%|          | 0/5426 [00:00<?, ? examples/s]

In [32]:
from transformers import DataCollatorWithPadding

# Define a custom DataCollator to ensure labels are always float32
class CustomDataCollator(DataCollatorWithPadding):
    def __call__(self, features):
        # Call the base DataCollatorWithPadding to handle tokenized inputs
        batch = super().__call__(features)
        # Explicitly cast labels to float32 if they exist in the batch
        if "labels" in batch:
            batch["labels"] = batch["labels"].to(torch.float32)
        return batch

# Instantiate the custom data collator
data_collator = CustomDataCollator(tokenizer=tokenizer)

In [33]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id,
    problem_type="multi_label_classification"
)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [34]:
import numpy as np
import torch
import evaluate

metric = evaluate.load("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred

    # Convert logits → probabilities → binary predictions
    probs = torch.sigmoid(torch.tensor(logits))
    preds = (probs > 0.5).int().numpy()

    # Convert references to int
    labels = labels.astype(int)

    # Flatten for macro/micro averaging
    preds_flat = preds.reshape(-1)
    labels_flat = labels.reshape(-1)

    return {
        "f1_macro": metric.compute(
            predictions=preds_flat,
            references=labels_flat,
            average="macro",
        )["f1"],
        "f1_micro": metric.compute(
            predictions=preds_flat,
            references=labels_flat,
            average="micro",
        )["f1"],
    }


In [35]:
# !pip install evaluate
# import evaluate
# import numpy as np

# metric = evaluate.load("f1")

# def compute_metrics(eval_pred):
#     logits, labels = eval_pred
#     preds = (torch.sigmoid(torch.tensor(logits)) > 0.5).int().numpy()
#     # Cast references to int type, as required by the f1 metric for multi-label classification
#     references = labels.int().numpy()
#     return metric.compute(predictions=preds, references=references, average="macro")

In [36]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="emotion_model",
    eval_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    learning_rate=2e-5,
    load_best_model_at_end=True,
    report_to="none"
)

In [37]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

trainer.train()


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1 Macro,F1 Micro
1,0.0962,0.089331,0.758685,0.96913
2,0.0824,0.085367,0.770399,0.969828


TrainOutput(global_step=5428, training_loss=0.10364314186810217, metrics={'train_runtime': 462.6581, 'train_samples_per_second': 187.655, 'train_steps_per_second': 11.732, 'total_flos': 770469756330528.0, 'train_loss': 0.10364314186810217, 'epoch': 2.0})

In [38]:
trainer.save_model("emotion_classifier")
tokenizer.save_pretrained("emotion_classifier")


('emotion_classifier/tokenizer_config.json',
 'emotion_classifier/special_tokens_map.json',
 'emotion_classifier/vocab.txt',
 'emotion_classifier/added_tokens.json',
 'emotion_classifier/tokenizer.json')

In [39]:
from transformers import TextClassificationPipeline

inference_pipeline = TextClassificationPipeline(
    model=model,
    tokenizer=tokenizer,
    return_all_scores=True,
    function_to_apply="sigmoid"
)


Device set to use cuda:0


In [40]:
text = "I am feeling very low today, nothing seems to work."
preds = inference_pipeline(text)

sorted(preds[0], key=lambda x: x['score'], reverse=True)[:5]


[{'label': 'neutral', 'score': 0.2845916152000427},
 {'label': 'disappointment', 'score': 0.26525941491127014},
 {'label': 'sadness', 'score': 0.19944733381271362},
 {'label': 'disapproval', 'score': 0.1925116926431656},
 {'label': 'annoyance', 'score': 0.05946832150220871}]