### Model 2: BART-Base Chatbot with RoBERTa Emotion Detection

In [1]:
import torch
from torch.utils.data import DataLoader
from transformers import (
    TrainingArguments,
    Trainer
)
from datasets import load_dataset, Dataset, DatasetDict
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cpu


In [3]:
dataset = load_dataset('facebook/empathetic_dialogues')

In [3]:
# dataset = load_dataset("empathetic_dialogues")
train_dataset = Dataset.from_file("./empathetic_dialogues-train.arrow")

val_dataset = Dataset.from_file("./empathetic_dialogues-validation.arrow")

test_dataset = Dataset.from_file("./empathetic_dialogues-test.arrow")

In [4]:
dataset = DatasetDict({
    "train": train_dataset,
    "validation": val_dataset,
    "test": test_dataset
})

In [5]:
train_dataset

Dataset({
    features: ['conv_id', 'utterance_idx', 'context', 'prompt', 'speaker_idx', 'utterance', 'selfeval', 'tags'],
    num_rows: 76673
})

In [12]:
train_dataset[0:5]

{'conv_id': ['hit:0_conv:1',
  'hit:0_conv:1',
  'hit:0_conv:1',
  'hit:0_conv:1',
  'hit:0_conv:1'],
 'utterance_idx': [1, 2, 3, 4, 5],
 'context': ['sentimental',
  'sentimental',
  'sentimental',
  'sentimental',
  'sentimental'],
 'prompt': ['I remember going to the fireworks with my best friend. There was a lot of people_comma_ but it only felt like us in the world.',
  'I remember going to the fireworks with my best friend. There was a lot of people_comma_ but it only felt like us in the world.',
  'I remember going to the fireworks with my best friend. There was a lot of people_comma_ but it only felt like us in the world.',
  'I remember going to the fireworks with my best friend. There was a lot of people_comma_ but it only felt like us in the world.',
  'I remember going to the fireworks with my best friend. There was a lot of people_comma_ but it only felt like us in the world.'],
 'speaker_idx': [1, 0, 1, 0, 1],
 'utterance': ['I remember going to see the fireworks with my 

In [4]:
def process_empathetic_dataset(dataset_split, max_turns=3):
    import pandas as pd

    def clean_text(text):
        return text.replace('_comma_', ',').replace('_period_', '.').replace('_exclamation_', '!').strip()

    df = pd.DataFrame(dataset_split)
    df = df.sort_values(by=['conv_id', 'utterance_idx']).reset_index(drop=True)

    pairs = []
    for conv_id, conv in df.groupby('conv_id'):
        history = []
        for _, row in conv.iterrows():
            utterance = clean_text(row['utterance'])
            emotion = row['context']
            speaker = row['speaker_idx']

            if history:
                truncated_history = history[-max_turns:]
                history_str = " ".join(truncated_history)
                input_text = f"<emotion={emotion}> {history_str}"
                pairs.append({
                    "input_bart": input_text,
                    "response": utterance,
                    "emotion": emotion
                })
            history.append(f"[Speaker {speaker}] {utterance}")

    return Dataset.from_pandas(pd.DataFrame(pairs))

In [None]:
# Applying to each split
dataset_processed = DatasetDict({
    "train": process_empathetic_dataset(dataset['train']),
    "validation": process_empathetic_dataset(dataset['validation']),
    "test": process_empathetic_dataset(dataset['test'])
})

In [20]:
dataset_processed['train'][1]

{'input_bart': '<emotion=sentimental> [Speaker 1] I remember going to see the fireworks with my best friend. It was the first time we ever spent time alone together. Although there was a lot of people, we felt like the only people in the world. [Speaker 0] Was this a friend you were in love with, or just a best friend?',
 'response': 'This was a best friend. I miss her.',
 'emotion': 'sentimental'}

In [10]:
from transformers import BartTokenizer, BartForConditionalGeneration, Trainer, TrainingArguments, DataCollatorForSeq2Seq

In [23]:
tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")
model = BartForConditionalGeneration.from_pretrained("facebook/bart-base")

# Add emotion tokens
emotions = list(set(dataset_processed["train"]["emotion"]))
special_tokens = [f"<emotion={e}>" for e in emotions]
tokenizer.add_tokens(special_tokens)
model.resize_token_embeddings(len(tokenizer))


Embedding(50297, 768)

In [None]:
# Tokenizing the dataset
def tokenize_fn(example):
    model_inputs = tokenizer(
        example["input_bart"],
        max_length=128,
        padding="max_length",
        truncation=True,
        return_tensors="pt"
    )

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            example["response"],
            max_length=64,
            padding="max_length",
            truncation=True
        )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [26]:
tokenized_dataset = dataset_processed.map(tokenize_fn, batched=True)

Map:   0%|          | 0/58829 [00:00<?, ? examples/s]

Map:   0%|          | 0/9267 [00:00<?, ? examples/s]

Map:   0%|          | 0/8401 [00:00<?, ? examples/s]

In [28]:
tokenized_dataset['train'][0]

{'input_bart': '<emotion=sentimental> [Speaker 1] I remember going to see the fireworks with my best friend. It was the first time we ever spent time alone together. Although there was a lot of people, we felt like the only people in the world.',
 'response': 'Was this a friend you were in love with, or just a best friend?',
 'emotion': 'sentimental',
 'input_ids': [0,
  50283,
  10975,
  29235,
  4218,
  112,
  742,
  38,
  2145,
  164,
  7,
  192,
  5,
  10756,
  19,
  127,
  275,
  1441,
  4,
  85,
  21,
  5,
  78,
  86,
  52,
  655,
  1240,
  86,
  1937,
  561,
  4,
  2223,
  89,
  21,
  10,
  319,
  9,
  82,
  6,
  52,
  1299,
  101,
  5,
  129,
  82,
  11,
  5,
  232,
  4,
  2,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  

In [33]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['input_bart', 'response', 'emotion', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 58829
    })
    validation: Dataset({
        features: ['input_bart', 'response', 'emotion', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 9267
    })
    test: Dataset({
        features: ['input_bart', 'response', 'emotion', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 8401
    })
})

In [34]:
columns_to_remove = ["input_bart", "response", "emotion"]

tokenized_dataset = tokenized_dataset.remove_columns(columns_to_remove)

In [35]:
tokenized_dataset['train'][0]

{'input_ids': [0,
  50283,
  10975,
  29235,
  4218,
  112,
  742,
  38,
  2145,
  164,
  7,
  192,
  5,
  10756,
  19,
  127,
  275,
  1441,
  4,
  85,
  21,
  5,
  78,
  86,
  52,
  655,
  1240,
  86,
  1937,
  561,
  4,
  2223,
  89,
  21,
  10,
  319,
  9,
  82,
  6,
  52,
  1299,
  101,
  5,
  129,
  82,
  11,
  5,
  232,
  4,
  2,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  

In [38]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

In [39]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./bart_empathetic",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    evaluation_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    save_total_limit=2,
    load_best_model_at_end=True,
    predict_with_generate=True
)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator
)

In [40]:
trainer.train()

***** Running training *****
  Num examples = 58829
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 22062


Epoch,Training Loss,Validation Loss
1,0.8347,0.791759
2,0.6994,0.778504
3,0.6349,0.773866


***** Running Evaluation *****
  Num examples = 9267
  Batch size = 8
Saving model checkpoint to ./bart_empathetic/checkpoint-7354
Configuration saved in ./bart_empathetic/checkpoint-7354/config.json
Model weights saved in ./bart_empathetic/checkpoint-7354/pytorch_model.bin
tokenizer config file saved in ./bart_empathetic/checkpoint-7354/tokenizer_config.json
Special tokens file saved in ./bart_empathetic/checkpoint-7354/special_tokens_map.json
added tokens file saved in ./bart_empathetic/checkpoint-7354/added_tokens.json
***** Running Evaluation *****
  Num examples = 9267
  Batch size = 8
Saving model checkpoint to ./bart_empathetic/checkpoint-14708
Configuration saved in ./bart_empathetic/checkpoint-14708/config.json
Model weights saved in ./bart_empathetic/checkpoint-14708/pytorch_model.bin
tokenizer config file saved in ./bart_empathetic/checkpoint-14708/tokenizer_config.json
Special tokens file saved in ./bart_empathetic/checkpoint-14708/special_tokens_map.json
added tokens file 

TrainOutput(global_step=22062, training_loss=0.7229934566505757, metrics={'train_runtime': 1232.3637, 'train_samples_per_second': 143.21, 'train_steps_per_second': 17.902, 'total_flos': 1.345131978817536e+16, 'train_loss': 0.7229934566505757, 'epoch': 3.0})

In [41]:
trainer.save_model("./bart_empathetic_final")
tokenizer.save_pretrained("./bart_empathetic_final")

Saving model checkpoint to ./bart_empathetic_final
Configuration saved in ./bart_empathetic_final/config.json
Model weights saved in ./bart_empathetic_final/pytorch_model.bin
tokenizer config file saved in ./bart_empathetic_final/tokenizer_config.json
Special tokens file saved in ./bart_empathetic_final/special_tokens_map.json
added tokens file saved in ./bart_empathetic_final/added_tokens.json
tokenizer config file saved in ./bart_empathetic_final/tokenizer_config.json
Special tokens file saved in ./bart_empathetic_final/special_tokens_map.json
added tokens file saved in ./bart_empathetic_final/added_tokens.json


('./bart_empathetic_final/tokenizer_config.json',
 './bart_empathetic_final/special_tokens_map.json',
 './bart_empathetic_final/vocab.json',
 './bart_empathetic_final/merges.txt',
 './bart_empathetic_final/added_tokens.json')

In [None]:
# Loading emotion classifier for inference only
from transformers import RobertaForSequenceClassification,RobertaTokenizer
import torch.nn.functional as F

# 1a) Load a pretrained emotion detector
emo_tokenizer = RobertaTokenizer.from_pretrained("./rob-large-emotion-detector_dedupe/")
emo_model     = RobertaForSequenceClassification.from_pretrained("./rob-large-emotion-detector_dedupe/")
emo_model.to(device)

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-23): 24 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=Tru

In [6]:
def detect_emotion(text: str) -> str:
    inputs = emo_tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=128
    ).to(device)
    
    logits = emo_model(**inputs).logits
    probs  = F.softmax(logits, dim=-1)
    idx    = probs.argmax(dim=-1).item()
    
    return id_to_emotion[idx]

In [11]:
model_path = "./Chatbot Training copy/bart_empathetic_final/"
tokenizer_final = BartTokenizer.from_pretrained(model_path)
model_final = BartForConditionalGeneration.from_pretrained(model_path)

In [12]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

tox_tokenizer = AutoTokenizer.from_pretrained("sentinet/suicidality")
tox_model = AutoModelForSequenceClassification.from_pretrained("sentinet/suicidality")

In [13]:
emotions = [
    'jealous', 'furious', 'disgusted', 'nostalgic', 'impressed', 'faithful',
    'caring', 'confident', 'guilty', 'angry', 'disappointed', 'sentimental',
    'anxious', 'annoyed', 'embarrassed', 'terrified', 'apprehensive', 'grateful',
    'sad', 'afraid', 'ashamed', 'devastated', 'joyful', 'hopeful', 'lonely',
    'prepared', 'trusting', 'anticipating', 'excited', 'surprised', 'content', 'proud'
]

emotion_to_id = {emotion: idx for idx, emotion in enumerate(emotions)}
id_to_emotion = {idx: emotion for emotion, idx in emotion_to_id.items()}


In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Chat function that tracks context and generates a response based on previous conversation
def chat(prompt, emotion, conversation_history=None):
    if conversation_history is None:
        conversation_history = ""
    
    # Formatting the input
    input_text = f"<emotion={emotion}> {conversation_history} [Speaker 0] {prompt}"
    
    # Tokenizing the input text
    inputs = tokenizer_final(input_text, return_tensors="pt", truncation=True, max_length=512).to(device)
    
    # Generate a response using the model
    outputs = model_final.generate(
        **inputs, 
        max_length=64, 
        num_beams=5, 
        early_stopping=True
    )
    
    # Decode the output and skip special tokens
    response = tokenizer_final.decode(outputs[0], skip_special_tokens=True)
    
    # Adding the prompt and response to the conversation history
    conversation_history += f"[Speaker 0] {prompt} [Speaker 1] {response} "
    
    return response, conversation_history

In [22]:
# Example conversation loop
conversation_history = ""
while True:
    prompt = input("You: ")
        # Allow the user to exit the chat by typing 'exit'
    if prompt.lower() == "exit":
        print("Bot: Goodbye! Take care!")
        break
    emotion = detect_emotion(prompt)
    response, conversation_history = chat(prompt, emotion, conversation_history)
    print(f"Bot: {response}")

You:  I feel really lonely. What should I do ?


Bot: I'm sorry to hear that. What are you going to do?


You:  I don't know. Can you help me ?


Bot: I'm sure you can.


You:  No I meant can you do something ?


Bot: I'm sure you can.


You:  /exit


Bot: I'm sorry to hear that. I hope you feel better soon.


You:  exit


Bot: Goodbye! Take care!


#### Evaluation

In [23]:
import evaluate

In [24]:
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(f"Using device: {device}")

Using device: mps


In [25]:
sacrebleu = evaluate.load("sacrebleu")
rouge = evaluate.load("rouge")
bertscore = evaluate.load("bertscore")

In [None]:
def test_loop(model, tokenizer, test_dataset, data_collator):
    dataloader = DataLoader(test_dataset, batch_size=4, collate_fn=data_collator)
    model.eval()
    # model.to(device)

    all_preds = []
    all_labels = []

    for batch in dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}

        with torch.no_grad():
            generated_ids = model.generate(
                input_ids=batch["input_ids"],
                attention_mask=batch["attention_mask"],
                max_length=128,
                num_beams=2,
            )

        preds = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

        labels = batch["labels"]
    
        labels_with_pad = torch.where(labels != -100, labels, tokenizer.pad_token_id)
        decoded_labels = tokenizer.batch_decode(labels_with_pad, skip_special_tokens=True)

        all_preds.extend(preds)
        all_labels.extend(decoded_labels)

    # ROUGE
    rouge_score = rouge.compute(predictions=all_preds, references=all_labels, use_stemmer=True)

    # BLEU
    bleu_score = sacrebleu.compute(predictions=all_preds, references=[[ref] for ref in all_labels])

    # BERTScore
    bert_score = bertscore.compute(predictions=all_preds, references=all_labels, lang="en")
    bertscore_precision = np.mean(bert_score["precision"])
    bertscore_recall = np.mean(bert_score["recall"])
    bertscore_f1 = np.mean(bert_score["f1"])

    return {
        "rouge1": rouge_score["rouge1"],
        "rouge2": rouge_score["rouge2"],
        "rougeL": rouge_score["rougeL"],
        "bleu": bleu_score["score"],
        "bertscore_precision": bertscore_precision,
        "bertscore_recall": bertscore_recall,
        "bertscore_f1": bertscore_f1,
    }

In [27]:
from datasets import load_from_disk

In [29]:
tokenized_dataset = load_from_disk("./bart_tokenized_dataset/")

In [28]:
model_final.to(device)

BartForConditionalGeneration(
  (model): BartModel(
    (shared): BartScaledWordEmbedding(50297, 768, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): BartScaledWordEmbedding(50297, 768, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 768)
      (layers): ModuleList(
        (0-5): 6 x BartEncoderLayer(
          (self_attn): BartSdpaAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (final_lay

In [30]:
data_collator = DataCollatorForSeq2Seq(tokenizer_final, model=model_final)

In [31]:
test_results = test_loop(model_final, tokenizer_final,tokenized_dataset['test'], data_collator)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [32]:
test_results

{'rouge1': np.float64(0.2000004994188989),
 'rouge2': np.float64(0.05244676667429022),
 'rougeL': np.float64(0.17877279791528627),
 'bleu': 2.5243624410752825,
 'bertscore_precision': np.float64(0.8752496373708752),
 'bertscore_recall': np.float64(0.8629908077050754),
 'bertscore_f1': np.float64(0.8689322973018515)}

In [33]:
print("Rouge 1 score is", test_results['rouge1'] * 100)
print("Rouge 2 score is", test_results['rouge2'] * 100)
print("Rouge L score is", test_results['rougeL'] * 100)
print("BLEU score is", test_results['bleu'])
print("Bertscore Precision is", test_results['bertscore_precision'] * 100)
print("Bertscore Recall is", test_results['bertscore_recall'] * 100)
print("Bertscore F1 is", test_results['bertscore_f1'] * 100)

Rouge 1 score is 20.00004994188989
Rouge 2 score is 5.244676667429022
Rouge L score is 17.87727979152863
BLEU score is 2.5243624410752825
Bertscore Precision is 87.52496373708752
Bertscore Recall is 86.29908077050754
Bertscore F1 is 86.89322973018515
