## Empathetic Dialogue System for Mental Health Support

In [None]:
# importing libraries
import torch
from torch.utils.data import DataLoader
from transformers import (
    RobertaForSequenceClassification,
    RobertaTokenizer,
    get_linear_schedule_with_warmup,
    TrainingArguments,
    Trainer
)
from datasets import load_dataset, Dataset, DatasetDict
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [None]:
# Loading the dataset
train_dataset = Dataset.from_file("./empathetic_dialogues-train.arrow")

val_dataset = Dataset.from_file("./empathetic_dialogues-validation.arrow")

test_dataset = Dataset.from_file("./empathetic_dialogues-test.arrow")

In [4]:
dataset = DatasetDict({
    "train": train_dataset,
    "validation": val_dataset,
    "test": test_dataset
})

In [5]:
dataset.column_names

{'train': ['conv_id',
  'utterance_idx',
  'context',
  'prompt',
  'speaker_idx',
  'utterance',
  'selfeval',
  'tags'],
 'validation': ['conv_id',
  'utterance_idx',
  'context',
  'prompt',
  'speaker_idx',
  'utterance',
  'selfeval',
  'tags'],
 'test': ['conv_id',
  'utterance_idx',
  'context',
  'prompt',
  'speaker_idx',
  'utterance',
  'selfeval',
  'tags']}

In [6]:
dataset['train'][0]

{'conv_id': 'hit:0_conv:1',
 'utterance_idx': 1,
 'context': 'sentimental',
 'prompt': 'I remember going to the fireworks with my best friend. There was a lot of people_comma_ but it only felt like us in the world.',
 'speaker_idx': 1,
 'utterance': 'I remember going to see the fireworks with my best friend. It was the first time we ever spent time alone together. Although there was a lot of people_comma_ we felt like the only people in the world.',
 'selfeval': '5|5|5_2|2|5',
 'tags': ''}

In [None]:
def load_empathetic_dialogues():
    # Getting unique emotions from the context column
    emotions = list(set(dataset['train']['context']))
    # Creating mappings
    emotion_to_id = {emotion: idx for idx, emotion in enumerate(emotions)}
    id_to_emotion = {idx: emotion for emotion, idx in emotion_to_id.items()}
    
    def preprocess(example):
        text = example['prompt'].replace("_comma_", ",").replace("_exclamation_", "!").replace("_period_", ".").strip()
        return {
           'text': text,
            'label': emotion_to_id[example['context']],
        }
    
    return dataset.map(preprocess, remove_columns=['conv_id', 'utterance_idx', 'speaker_idx', 'selfeval', 'tags']),emotion_to_id, id_to_emotion

In [8]:
dataset, emotion_to_id, id_to_emotion = load_empathetic_dialogues()

Map:   0%|          | 0/76673 [00:00<?, ? examples/s]

Map:   0%|          | 0/12030 [00:00<?, ? examples/s]

Map:   0%|          | 0/10943 [00:00<?, ? examples/s]

In [9]:
dataset['train'][1:10]

{'context': ['sentimental',
  'sentimental',
  'sentimental',
  'sentimental',
  'sentimental',
  'afraid',
  'afraid',
  'afraid',
  'afraid'],
 'prompt': ['I remember going to the fireworks with my best friend. There was a lot of people_comma_ but it only felt like us in the world.',
  'I remember going to the fireworks with my best friend. There was a lot of people_comma_ but it only felt like us in the world.',
  'I remember going to the fireworks with my best friend. There was a lot of people_comma_ but it only felt like us in the world.',
  'I remember going to the fireworks with my best friend. There was a lot of people_comma_ but it only felt like us in the world.',
  'I remember going to the fireworks with my best friend. There was a lot of people_comma_ but it only felt like us in the world.',
  ' i used to scare for darkness',
  ' i used to scare for darkness',
  ' i used to scare for darkness',
  ' i used to scare for darkness'],
 'utterance': ['Was this a friend you were i

In [None]:
def remove_duplicates(dataset):
    df = dataset['train'].to_pandas()
    df = df.drop_duplicates(subset=['text'])
    return DatasetDict({'train': Dataset.from_pandas(df)})

In [11]:
dataset_train = remove_duplicates(dataset)

In [12]:
dataset_train

DatasetDict({
    train: Dataset({
        features: ['context', 'prompt', 'utterance', 'text', 'label', '__index_level_0__'],
        num_rows: 17565
    })
})

In [None]:
# Loading and combining datasets
train_dataset = dataset_train['train']
val_dataset = dataset['validation']
test_dataset = dataset['test']

In [14]:
train_dataset

Dataset({
    features: ['context', 'prompt', 'utterance', 'text', 'label', '__index_level_0__'],
    num_rows: 17565
})

In [15]:
emotions = list(set(train_dataset['context']))
print(f"Found {len(emotions)} emotions: {emotions}")

Found 32 emotions: ['jealous', 'furious', 'disgusted', 'nostalgic', 'impressed', 'faithful', 'caring', 'confident', 'guilty', 'angry', 'disappointed', 'sentimental', 'anxious', 'annoyed', 'embarrassed', 'terrified', 'apprehensive', 'grateful', 'sad', 'afraid', 'ashamed', 'devastated', 'joyful', 'hopeful', 'lonely', 'prepared', 'trusting', 'anticipating', 'excited', 'surprised', 'content', 'proud']


In [16]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {
        'accuracy': accuracy_score(labels, predictions),
        'f1': f1_score(labels, predictions, average='weighted')
    }


### Emotion Classifier Training

#### 1. Roberta-base model fine-tuning

In [None]:
# Emotion Classifier Training
classifier_model_name = "roberta-base"
classifier_tokenizer = RobertaTokenizer.from_pretrained(classifier_model_name)
classifier_model = RobertaForSequenceClassification.from_pretrained(
    classifier_model_name, num_labels=32
).to(device)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifie

In [15]:
classifier_model

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerN

In [18]:
# Tokenization
def tokenize_classifier(batch):
    return classifier_tokenizer(
        batch['text'],
        padding='max_length',
        truncation=True,
        max_length=128,
        return_tensors='pt' 
    )

In [20]:
train_dataset_clf = train_dataset.rename_column("label", "labels")
val_dataset_clf = val_dataset.rename_column("label", "labels")

In [21]:
train_dataset_clf = train_dataset_clf.map(tokenize_classifier, batched=True)
val_dataset_clf = val_dataset_clf.map(tokenize_classifier, batched=True)

Map:   0%|          | 0/76673 [00:00<?, ? examples/s]

NameError: name 'classifier_tokenizer' is not defined

In [21]:
test_dataset_clf = val_dataset.rename_column("label", "labels")
test_dataset_clf = val_dataset_clf.map(tokenize_classifier, batched=True)

Map:   0%|          | 0/12030 [00:00<?, ? examples/s]

In [None]:
# Training Setup

training_args = TrainingArguments(
    output_dir='./emotion_classifier',
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    learning_rate=2e-5,
    lr_scheduler_type="linear",
    warmup_ratio=0.06,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=8,
    weight_decay=0.1,
    logging_dir='./logs',
    report_to="none",
    fp16=True,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    gradient_accumulation_steps=2,  
)


In [24]:
trainer = Trainer(
    model=classifier_model,
    args=training_args,
    train_dataset=train_dataset_clf,
    eval_dataset=val_dataset_clf,
    compute_metrics=compute_metrics
)

Using cuda_amp half precision backend


In [25]:
# Start Training
trainer.train()

The following columns in the training set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: context, prompt, response, text, utterance. If context, prompt, response, text, utterance are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 76673
  Num Epochs = 8
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 2
  Total optimization steps = 9584


Epoch,Training Loss,Validation Loss,Accuracy,F1
0,1.2973,1.407716,0.580964,0.575242
1,0.7384,1.583053,0.580549,0.576125
2,0.3548,1.811579,0.575478,0.574586
3,0.2145,2.085625,0.563342,0.5629
4,0.1249,2.294411,0.561014,0.560024
5,0.0747,2.456188,0.568163,0.570983
6,0.0524,2.583255,0.572984,0.574035
7,0.0381,2.64691,0.57473,0.576133


The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: context, prompt, response, text, utterance. If context, prompt, response, text, utterance are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 12030
  Batch size = 32
Saving model checkpoint to ./emotion_classifier/checkpoint-1198
Configuration saved in ./emotion_classifier/checkpoint-1198/config.json
Model weights saved in ./emotion_classifier/checkpoint-1198/pytorch_model.bin
Deleting older checkpoint [emotion_classifier/checkpoint-19172] due to args.save_total_limit
The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: context, prompt, response, text, utterance. If context, prompt, response, text, utterance are not expected by `RobertaForSequenceCla

TrainOutput(global_step=9584, training_loss=0.43223734272143277, metrics={'train_runtime': 1922.094, 'train_samples_per_second': 319.123, 'train_steps_per_second': 4.986, 'total_flos': 4.03578298902528e+16, 'train_loss': 0.43223734272143277, 'epoch': 8.0})

In [26]:
val_results = trainer.evaluate()
print(val_results)

The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: context, prompt, response, text, utterance. If context, prompt, response, text, utterance are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 12030
  Batch size = 32


{'eval_loss': 1.4077163934707642, 'eval_accuracy': 0.5809642560266002, 'eval_f1': 0.5752416752697799, 'eval_runtime': 10.8113, 'eval_samples_per_second': 1112.727, 'eval_steps_per_second': 34.779, 'epoch': 8.0}


In [27]:
# Final evaluation
test_results = trainer.evaluate(eval_dataset=test_dataset_clf)
print(test_results)

The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: context, prompt, response, text, utterance. If context, prompt, response, text, utterance are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 12030
  Batch size = 32


{'eval_loss': 1.4077163934707642, 'eval_accuracy': 0.5809642560266002, 'eval_f1': 0.5752416752697799, 'eval_runtime': 11.3499, 'eval_samples_per_second': 1059.917, 'eval_steps_per_second': 33.128, 'epoch': 8.0}


In [30]:
train_dataset_clf.features

{'context': Value(dtype='string', id=None),
 'prompt': Value(dtype='string', id=None),
 'utterance': Value(dtype='string', id=None),
 'text': Value(dtype='string', id=None),
 'labels': Value(dtype='int64', id=None),
 'response': Value(dtype='string', id=None),
 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}

In [None]:
def test_classifier(texts, model, tokenizer, emotion_id_to_name):
    # Preprocess inputs
    inputs = tokenizer(
        texts,
        padding=True,
        truncation=True,
        max_length=128,
        return_tensors="pt"
    ).to(device)
    
    # predictions
    with torch.no_grad():
        outputs = model(**inputs)
    
    # Converting to probabilities
    probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
    predictions = probs.argmax(dim=-1)
    
    # Converting to human-readable labels
    return [{
        "text": texts[i],
        "predicted_emotion": emotion_id_to_name[pred.item()],
        "confidence": probs[i][pred].item()
    } for i, pred in enumerate(predictions)]

In [29]:
emotion_id_to_name = {v: k for k, v in emotion_to_id.items()}
test_texts = [
    "I'm really excited about the upcoming trip!",
    "I feel completely alone in this situation",
    "This constant pressure is making me anxious"
]

results = test_classifier(test_texts, classifier_model, classifier_tokenizer, emotion_id_to_name)
for result in results:
    print(f"Text: {result['text']}")
    print(f"Predicted Emotion: {result['predicted_emotion']} ({result['confidence']:.2%})")
    print()

Text: I'm really excited about the upcoming trip!
Predicted Emotion: excited (66.36%)

Text: I feel completely alone in this situation
Predicted Emotion: lonely (93.42%)

Text: This constant pressure is making me anxious
Predicted Emotion: anxious (76.20%)



In [34]:
from transformers import AutoTokenizer, AutoModelForCausalLM

In [32]:
model_save_path = "./roberta-emotion-detector"

classifier_model.save_pretrained(model_save_path)
classifier_tokenizer.save_pretrained(model_save_path)

Configuration saved in ./roberta-emotion-detector/config.json
Model weights saved in ./roberta-emotion-detector/pytorch_model.bin
tokenizer config file saved in ./roberta-emotion-detector/tokenizer_config.json
Special tokens file saved in ./roberta-emotion-detector/special_tokens_map.json


('./roberta-emotion-detector/tokenizer_config.json',
 './roberta-emotion-detector/special_tokens_map.json',
 './roberta-emotion-detector/vocab.json',
 './roberta-emotion-detector/merges.txt',
 './roberta-emotion-detector/added_tokens.json')

In [34]:
torch.save(classifier_model.state_dict(), './emotion_detection_model.pt')

#### 2. Roberta Detection Dedup

In [60]:
base_model_name = "roberta-base"
base_tokenizer = RobertaTokenizer.from_pretrained(base_model_name)
base_model = RobertaForSequenceClassification.from_pretrained(
    base_model_name, num_labels=32
).to(device)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classi

In [61]:
base_model

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerN

In [74]:
# Tokenization
def tokenize_classifier_base(batch):
    return base_tokenizer(
        batch['text'],
        padding='max_length',
        truncation=True,
        max_length=128,
        return_tensors='pt' 
    )

In [66]:
train_dataset_base = train_dataset.rename_column("label", "labels")
val_dataset_base = val_dataset.rename_column("label", "labels")

In [67]:
train_dataset_base = train_dataset_base.map(tokenize_classifier_base, batched=True)
val_dataset_base = val_dataset_base.map(tokenize_classifier_base, batched=True)

Map:   0%|          | 0/17565 [00:00<?, ? examples/s]

Map:   0%|          | 0/12030 [00:00<?, ? examples/s]

In [68]:
print(train_dataset_base)

Dataset({
    features: ['context', 'prompt', 'utterance', 'text', 'labels', '__index_level_0__', 'input_ids', 'attention_mask'],
    num_rows: 17565
})


In [69]:
test_dataset_base = test_dataset.rename_column("label", "labels")
test_dataset_base = test_dataset_base.map(tokenize_classifier_base, batched=True)

Map:   0%|          | 0/10943 [00:00<?, ? examples/s]

In [None]:
# Training Setup
training_args_base = TrainingArguments(
    output_dir='./emotion_classifier',
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    learning_rate=1e-5,
    lr_scheduler_type="linear",
    warmup_ratio=0.06,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=4,
    weight_decay=0.1,
    logging_dir='./logs',
    report_to="none",
    fp16=True,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    gradient_accumulation_steps=2,  
)


PyTorch: setting up devices


In [76]:
trainer_base = Trainer(
    model=base_model,
    args=training_args_base,
    train_dataset=train_dataset_base,
    eval_dataset=val_dataset_base,
    compute_metrics=compute_metrics
)

Using cuda_amp half precision backend


In [77]:
# Start Training
trainer_base.train()

The following columns in the training set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: __index_level_0__, prompt, utterance, context, text. If __index_level_0__, prompt, utterance, context, text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 17565
  Num Epochs = 4
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 2
  Total optimization steps = 4392


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.7722,1.42017,0.588529,0.582713
2,0.6565,1.45433,0.587947,0.583552
3,0.5573,1.50003,0.590441,0.586751
4,0.4856,1.511049,0.588113,0.585663


The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text, utterance, prompt, context. If text, utterance, prompt, context are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 12030
  Batch size = 8
Saving model checkpoint to ./emotion_classifier/checkpoint-1098
Configuration saved in ./emotion_classifier/checkpoint-1098/config.json
Model weights saved in ./emotion_classifier/checkpoint-1098/pytorch_model.bin
Deleting older checkpoint [emotion_classifier/checkpoint-2196] due to args.save_total_limit
The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text, utterance, prompt, context. If text, utterance, prompt, context are not expected by `RobertaForSequenceClassification.forward`,  you can safely igno

TrainOutput(global_step=4392, training_loss=0.6231915937746809, metrics={'train_runtime': 606.5614, 'train_samples_per_second': 115.833, 'train_steps_per_second': 7.241, 'total_flos': 4622790537216000.0, 'train_loss': 0.6231915937746809, 'epoch': 4.0})

In [81]:
# Final evaluation
test_results_base = trainer_base.evaluate(eval_dataset=test_dataset_base)
print(test_results_base)

The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text, utterance, prompt, context. If text, utterance, prompt, context are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 10943
  Batch size = 8


{'eval_loss': 1.3745455741882324, 'eval_accuracy': 0.5900575710499862, 'eval_f1': 0.5810275450146881, 'eval_runtime': 12.6746, 'eval_samples_per_second': 863.38, 'eval_steps_per_second': 107.932, 'epoch': 4.0}


In [None]:
def test_classifier(texts, model, tokenizer, emotion_id_to_name):
    # Preprocess inputs
    inputs = tokenizer(
        texts,
        padding=True,
        truncation=True,
        max_length=128,
        return_tensors="pt"
    ).to(device)
    
    # Make predictions
    with torch.no_grad():
        outputs = model(**inputs)
    
    # Converting to probabilities
    probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
    predictions = probs.argmax(dim=-1)
    
    # Converting to human-readable labels
    return [{
        "text": texts[i],
        "predicted_emotion": emotion_id_to_name[pred.item()],
        "confidence": probs[i][pred].item()
    } for i, pred in enumerate(predictions)]

In [83]:
emotion_id_to_name = {v: k for k, v in emotion_to_id.items()}
test_texts = [
    "I'm really excited about the upcoming trip!",
    "I feel completely alone in this situation",
    "This constant pressure is making me anxious"
]

results = test_classifier(test_texts, base_model, base_tokenizer, emotion_id_to_name)
for result in results:
    print(f"Text: {result['text']}")
    print(f"Predicted Emotion: {result['predicted_emotion']} ({result['confidence']:.2%})")
    print()

Text: I'm really excited about the upcoming trip!
Predicted Emotion: excited (51.74%)

Text: I feel completely alone in this situation
Predicted Emotion: lonely (98.61%)

Text: This constant pressure is making me anxious
Predicted Emotion: anxious (87.75%)



In [84]:
model_save_path = "./roberta-base-deduplicate-emotion-detector"

base_model.save_pretrained(model_save_path)
base_tokenizer.save_pretrained(model_save_path)

Configuration saved in ./roberta-base-deduplicate-emotion-detector/config.json
Model weights saved in ./roberta-base-deduplicate-emotion-detector/pytorch_model.bin
tokenizer config file saved in ./roberta-base-deduplicate-emotion-detector/tokenizer_config.json
Special tokens file saved in ./roberta-base-deduplicate-emotion-detector/special_tokens_map.json


('./roberta-base-deduplicate-emotion-detector/tokenizer_config.json',
 './roberta-base-deduplicate-emotion-detector/special_tokens_map.json',
 './roberta-base-deduplicate-emotion-detector/vocab.json',
 './roberta-base-deduplicate-emotion-detector/merges.txt',
 './roberta-base-deduplicate-emotion-detector/added_tokens.json')

In [85]:
torch.save(base_model.state_dict(), './emotion_detection_model_dedup.pt')

#### 3. Distilbert-base fine tuning training

In [None]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification

In [14]:
distil_model_name = 'distilbert-base-cased'
distil_model = DistilBertForSequenceClassification.from_pretrained(distil_model_name, num_labels=32)
distil_tokenizer=DistilBertTokenizer.from_pretrained(distil_model_name)

Some weights of the model checkpoint at distilbert-base-cased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['pre_classifier.bias', 'classifier.weight', 'pre_classifier

In [15]:
# Tokenization
def tokenize_distil(batch):
    return distil_tokenizer(
        batch['text'],
        padding='max_length',
        truncation=True,
        max_length=128,
        return_tensors='pt' 
    )

In [16]:
train_dataset_dist = train_dataset.rename_column("label", "labels")
val_dataset_dist = val_dataset.rename_column("label", "labels")
train_dataset_dist = train_dataset_dist.map(tokenize_distil, batched=True)
val_dataset_dist = val_dataset_dist.map(tokenize_distil, batched=True)

Map:   0%|          | 0/76673 [00:00<?, ? examples/s]

Map:   0%|          | 0/12030 [00:00<?, ? examples/s]

In [17]:
test_dataset_dist = test_dataset.rename_column("label", "labels")
test_dataset_dist = test_dataset_dist.map(tokenize_distil, batched=True)

Map:   0%|          | 0/10943 [00:00<?, ? examples/s]

In [None]:
training_args_distil = TrainingArguments(
    output_dir='./emotion_classifier_distil',
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    fp16=False,
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.1,
    logging_dir='./logs',
    report_to="none",
    load_best_model_at_end=True,
)

In [20]:
trainer_distil = Trainer(
    model=distil_model,
    args=training_args_distil,
    train_dataset=train_dataset_dist,
    eval_dataset=val_dataset_dist,
    compute_metrics=compute_metrics
)

In [21]:
trainer_distil.train()

The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: prompt, response, context, utterance, text. If prompt, response, context, utterance, text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 76673
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 47925


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.5974,1.809371,0.531754,0.532197
2,0.1741,2.881241,0.521696,0.522167
3,0.0574,3.802748,0.527348,0.527605
4,0.0459,3.966832,0.521862,0.521402
5,0.0141,4.018108,0.528845,0.527671


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: prompt, response, context, utterance, text. If prompt, response, context, utterance, text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 12030
  Batch size = 8
Saving model checkpoint to ./emotion_classifier_distil/checkpoint-9585
Configuration saved in ./emotion_classifier_distil/checkpoint-9585/config.json
Model weights saved in ./emotion_classifier_distil/checkpoint-9585/pytorch_model.bin
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: prompt, response, context, utterance, text. If prompt, response, context, utterance, text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
**

TrainOutput(global_step=47925, training_loss=0.32992685496091967, metrics={'train_runtime': 1608.4458, 'train_samples_per_second': 238.345, 'train_steps_per_second': 29.796, 'total_flos': 1.270263344050176e+16, 'train_loss': 0.32992685496091967, 'epoch': 5.0})

In [22]:
# Final evaluation
test_results_dist = trainer_distil.evaluate(eval_dataset=test_dataset_dist)
print(test_results_dist)

The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: prompt, response, context, utterance, text. If prompt, response, context, utterance, text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 10943
  Batch size = 8


{'eval_loss': 1.8682000637054443, 'eval_accuracy': 0.5225258155898748, 'eval_f1': 0.5224260241003112, 'eval_runtime': 7.4766, 'eval_samples_per_second': 1463.636, 'eval_steps_per_second': 182.971, 'epoch': 5.0}


In [23]:
torch.save(distil_model.state_dict(), './emotion_detection_model_distil.pt')

In [24]:
model_save_path = "./distil-emotion-detector"

distil_model.save_pretrained(model_save_path)
distil_tokenizer.save_pretrained(model_save_path)

Configuration saved in ./distil-emotion-detector/config.json
Model weights saved in ./distil-emotion-detector/pytorch_model.bin
tokenizer config file saved in ./distil-emotion-detector/tokenizer_config.json
Special tokens file saved in ./distil-emotion-detector/special_tokens_map.json


('./distil-emotion-detector/tokenizer_config.json',
 './distil-emotion-detector/special_tokens_map.json',
 './distil-emotion-detector/vocab.txt',
 './distil-emotion-detector/added_tokens.json')

#### 4. Distilbert-base fine tuning training- dedupe

In [None]:
from datasets import load_dataset
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification

In [89]:
dist_model_name = 'distilbert-base-uncased'
dist_model = DistilBertForSequenceClassification.from_pretrained(dist_model_name, num_labels=32)
dist_tokenizer=DistilBertTokenizerFast.from_pretrained(dist_model_name)

loading configuration file https://huggingface.co/distilbert-base-uncased/resolve/main/config.json from cache at /user/siyer8/.cache/huggingface/transformers/23454919702d26495337f3da04d1655c7ee010d5ec9d77bdb9e399e00302c0a1.91b885ab15d631bf9cee9dc9d25ece0afd932f2f5130eba28f2055b2220c0333
Model config DistilBertConfig {
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12",
    "13": "LABEL_13",
    "14": "LABEL_14",
    "15": "LABEL_15",
    "16": "LABEL_16",
    "17": "LABEL_17",
    "18": "LABEL_18",
    "19": "LABEL_19",
    "20": "LABEL_20",
    "21": "LABEL_21",
    "22": "LABEL_22",
    "23": "LABEL_2

Downloading tokenizer.json:   0%|          | 0.00/455k [00:00<?, ?B/s]

storing https://huggingface.co/distilbert-base-uncased/resolve/main/tokenizer.json in cache at /user/siyer8/.cache/huggingface/transformers/75abb59d7a06f4f640158a9bfcde005264e59e8d566781ab1415b139d2e4c603.7f2721073f19841be16f41b0a70b600ca6b880c8f3df6f3535cbc704371bdfa4
creating metadata file for /user/siyer8/.cache/huggingface/transformers/75abb59d7a06f4f640158a9bfcde005264e59e8d566781ab1415b139d2e4c603.7f2721073f19841be16f41b0a70b600ca6b880c8f3df6f3535cbc704371bdfa4
loading file https://huggingface.co/distilbert-base-uncased/resolve/main/vocab.txt from cache at /user/siyer8/.cache/huggingface/transformers/0e1bbfda7f63a99bb52e3915dcf10c3c92122b827d92eb2d34ce94ee79ba486c.d789d64ebfe299b0e416afc4a169632f903f693095b4629a7ea271d5a0cf2c99
loading file https://huggingface.co/distilbert-base-uncased/resolve/main/tokenizer.json from cache at /user/siyer8/.cache/huggingface/transformers/75abb59d7a06f4f640158a9bfcde005264e59e8d566781ab1415b139d2e4c603.7f2721073f19841be16f41b0a70b600ca6b880c8f3df

In [90]:
# Tokenization
def tokenize_dist(batch):
    return dist_tokenizer(
        batch['text'],
        padding='max_length',
        truncation=True,
        max_length=128,
        return_tensors='pt' 
    )

In [92]:
train_dataset_distil

Dataset({
    features: ['context', 'prompt', 'utterance', 'text', 'labels', '__index_level_0__', 'input_ids', 'attention_mask'],
    num_rows: 17565
})

In [91]:
train_dataset_distil = train_dataset.rename_column("label", "labels")
val_dataset_distil = val_dataset.rename_column("label", "labels")
train_dataset_distil = train_dataset_distil.map(tokenize_dist, batched=True)
val_dataset_distil = val_dataset_distil.map(tokenize_dist, batched=True)

Map:   0%|          | 0/17565 [00:00<?, ? examples/s]

Map:   0%|          | 0/12030 [00:00<?, ? examples/s]

In [93]:
test_dataset_distil = test_dataset.rename_column("label", "labels")
test_dataset_distil = test_dataset_distil.map(tokenize_dist, batched=True)

Map:   0%|          | 0/10943 [00:00<?, ? examples/s]

In [94]:
training_args_dist = TrainingArguments(
    output_dir='./emotion_classifier_distil',
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    fp16=False,
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.1,
    logging_dir='./logs',
    report_to="none",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    gradient_accumulation_steps=2
)

PyTorch: setting up devices


In [95]:
trainer_dist = Trainer(
    model=dist_model,
    args=training_args_dist,
    train_dataset=train_dataset_distil,
    eval_dataset=val_dataset_distil,
    compute_metrics=compute_metrics
)

In [96]:
trainer_dist.train()

The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: __index_level_0__, prompt, utterance, context, text. If __index_level_0__, prompt, utterance, context, text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 17565
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 2
  Total optimization steps = 5490


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,2.0248,1.767966,0.491521,0.474235
2,1.4181,1.549787,0.525603,0.509233
3,1.1431,1.512816,0.543641,0.532504
4,0.9389,1.522292,0.551455,0.545881
5,0.8266,1.531215,0.55054,0.544618


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text, utterance, prompt, context. If text, utterance, prompt, context are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 12030
  Batch size = 8
Saving model checkpoint to ./emotion_classifier_distil/checkpoint-1098
Configuration saved in ./emotion_classifier_distil/checkpoint-1098/config.json
Model weights saved in ./emotion_classifier_distil/checkpoint-1098/pytorch_model.bin
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text, utterance, prompt, context. If text, utterance, prompt, context are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examp

TrainOutput(global_step=5490, training_loss=1.3508777833810053, metrics={'train_runtime': 430.989, 'train_samples_per_second': 203.775, 'train_steps_per_second': 12.738, 'total_flos': 2910043384012800.0, 'train_loss': 1.3508777833810053, 'epoch': 5.0})

In [None]:
# Final evaluation
test_results_distil = trainer_dist.evaluate(eval_dataset=test_dataset_distil)
print(test_results_distil)

The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text, utterance, prompt, context. If text, utterance, prompt, context are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 10943
  Batch size = 8


{'eval_loss': 1.4878315925598145, 'eval_accuracy': 0.5438179658229005, 'eval_f1': 0.5301760790082249, 'eval_runtime': 8.8527, 'eval_samples_per_second': 1236.125, 'eval_steps_per_second': 154.53, 'epoch': 5.0}


In [98]:
torch.save(dist_model.state_dict(), './emotion_detection_model_dist_dedup.pt')

In [99]:
model_save_path = "./distil-emotion-detector-dedup"

dist_model.save_pretrained(model_save_path)
dist_tokenizer.save_pretrained(model_save_path)

Configuration saved in ./distil-emotion-detector-dedup/config.json
Model weights saved in ./distil-emotion-detector-dedup/pytorch_model.bin
tokenizer config file saved in ./distil-emotion-detector-dedup/tokenizer_config.json
Special tokens file saved in ./distil-emotion-detector-dedup/special_tokens_map.json


('./distil-emotion-detector-dedup/tokenizer_config.json',
 './distil-emotion-detector-dedup/special_tokens_map.json',
 './distil-emotion-detector-dedup/vocab.txt',
 './distil-emotion-detector-dedup/added_tokens.json',
 './distil-emotion-detector-dedup/tokenizer.json')

#### 5. Roberta Large- Dedupe

In [None]:
# importing libraries
from transformers import RobertaTokenizer, RobertaForSequenceClassification

In [34]:
rob_model_name = 'roberta-large'
rob_model = RobertaForSequenceClassification.from_pretrained(rob_model_name, num_labels=32)
rob_tokenizer=RobertaTokenizer.from_pretrained(rob_model_name)

loading configuration file https://huggingface.co/roberta-large/resolve/main/config.json from cache at /user/siyer8/.cache/huggingface/transformers/dea67b44b38d504f2523f3ddb6acb601b23d67bee52c942da336fa1283100990.94cae8b3a8dbab1d59b9d4827f7ce79e73124efa6bb970412cd503383a95f373
Model config RobertaConfig {
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12",
    "13": "LABEL_13",
    "14": "LABEL_14",
    "15": "LABEL_15",
    "16": "LABEL_16",
    "17": "LABEL_17",
    "18": "LABEL_18",
    "19": "LABEL_19",
    "20": "LABEL_2

In [35]:
def tokenize_roberta(batch):
    return rob_tokenizer(
        batch['text'],  
        padding='max_length',
        truncation=True,
        max_length=128,  
        return_tensors="pt"
    )

In [36]:
train_dataset_roberta = train_dataset.rename_column("label", "labels")
val_dataset_roberta = val_dataset.rename_column("label", "labels")
train_dataset_roberta = train_dataset_roberta.map(tokenize_roberta, batched=True)
val_dataset_roberta = val_dataset_roberta.map(tokenize_roberta, batched=True)

Map:   0%|          | 0/17565 [00:00<?, ? examples/s]

Map:   0%|          | 0/12030 [00:00<?, ? examples/s]

In [37]:
test_dataset_roberta = test_dataset.rename_column("label", "labels")
test_dataset_roberta = test_dataset_roberta.map(tokenize_roberta, batched=True)

Map:   0%|          | 0/10943 [00:00<?, ? examples/s]

In [None]:
training_args_roberta = TrainingArguments(
    output_dir='./emotion_classifier_roberta',
    evaluation_strategy="epoch",  
    save_strategy="no",        
    save_steps=None,              
    save_total_limit=1,           
    fp16=True,
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.1,
    report_to="none",

    logging_strategy="steps",     # Loggin every X steps
    logging_steps=50,             # training loss every 50 steps
    logging_dir=None,             
    log_level="error",
    metric_for_best_model="eval_loss",
    gradient_accumulation_steps=2

)


PyTorch: setting up devices


In [44]:
trainer_roberta = Trainer(
    model=rob_model,
    args=training_args_roberta,
    train_dataset=train_dataset_roberta,
    eval_dataset=val_dataset_roberta,
    compute_metrics=compute_metrics
)

In [45]:
trainer_roberta.train()



Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.8246,1.386564,0.609559,0.595945
2,0.5665,1.495291,0.604988,0.599661
3,0.5151,1.46842,0.612053,0.610287


TrainOutput(global_step=3294, training_loss=0.645306488203872, metrics={'train_runtime': 504.3691, 'train_samples_per_second': 104.477, 'train_steps_per_second': 6.531, 'total_flos': 1.227827524276224e+16, 'train_loss': 0.645306488203872, 'epoch': 3.0})

In [46]:
# Final evaluation
test_results_roberta = trainer_roberta.evaluate(eval_dataset=test_dataset_roberta)
print(test_results_roberta)

{'eval_loss': 1.4027531147003174, 'eval_accuracy': 0.619848304852417, 'eval_f1': 0.6153772270324698, 'eval_runtime': 19.2223, 'eval_samples_per_second': 569.288, 'eval_steps_per_second': 71.167, 'epoch': 3.0}


In [47]:
torch.save(rob_model.state_dict(), './emotion_detection_model_rob_dedupe.pt')

In [48]:
model_save_path = "./rob-large-emotion-detector_dedupe"

rob_model.save_pretrained(model_save_path)
rob_tokenizer.save_pretrained(model_save_path)

('./rob-large-emotion-detector_dedupe/tokenizer_config.json',
 './rob-large-emotion-detector_dedupe/special_tokens_map.json',
 './rob-large-emotion-detector_dedupe/vocab.json',
 './rob-large-emotion-detector_dedupe/merges.txt',
 './rob-large-emotion-detector_dedupe/added_tokens.json')

In [None]:
def test_classifier(texts, model, tokenizer, emotion_id_to_name):
    # Preprocess inputs
    inputs = tokenizer(
        texts,
        padding=True,
        truncation=True,
        max_length=128,
        return_tensors="pt"
    ).to(device)
    
    # Making predictions
    with torch.no_grad():
        outputs = model(**inputs)
    
    # Converting to probabilities
    probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
    predictions = probs.argmax(dim=-1)
    
    # Converting to human-readable labels
    return [{
        "text": texts[i],
        "predicted_emotion": emotion_id_to_name[pred.item()],
        "confidence": probs[i][pred].item()
    } for i, pred in enumerate(predictions)]

The history saving thread hit an unexpected error (OperationalError('unable to open database file')).History will not be written to the database.


In [50]:
emotion_id_to_name = {v: k for k, v in emotion_to_id.items()}
test_texts = [
    "I'm really excited about the upcoming trip!",
    "I feel completely alone in this situation",
    "This constant pressure is making me anxious"
]

results = test_classifier(test_texts, rob_model, rob_tokenizer, emotion_id_to_name)
for result in results:
    print(f"Text: {result['text']}")
    print(f"Predicted Emotion: {result['predicted_emotion']} ({result['confidence']:.2%})")
    print()

Text: I'm really excited about the upcoming trip!
Predicted Emotion: excited (72.85%)

Text: I feel completely alone in this situation
Predicted Emotion: lonely (99.68%)

Text: This constant pressure is making me anxious
Predicted Emotion: anxious (93.76%)



### ----- Ongoing & Future Work on project -------

In [38]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

model_path = "./roberta-emotion-detector"
emotion_tokenizer = AutoTokenizer.from_pretrained(model_path)
emotion_model = AutoModelForSequenceClassification.from_pretrained(model_path)

Didn't find file ./roberta-emotion-detector/tokenizer.json. We won't load it.
Didn't find file ./roberta-emotion-detector/added_tokens.json. We won't load it.
loading file ./roberta-emotion-detector/vocab.json
loading file ./roberta-emotion-detector/merges.txt
loading file None
loading file None
loading file ./roberta-emotion-detector/special_tokens_map.json
loading file ./roberta-emotion-detector/tokenizer_config.json
loading configuration file ./roberta-emotion-detector/config.json
Model config RobertaConfig {
  "_name_or_path": "./roberta-emotion-detector",
  "architectures": [
    "RobertaForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "

In [39]:
emotion_model

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerN

In [45]:
import torch.nn.functional as F

def detect_emotion(user_input):
    """Predict emotion using your trained RoBERTa model."""
    inputs = emotion_tokenizer(user_input, return_tensors="pt")
    outputs = emotion_model(**inputs)
    probs = F.softmax(outputs.logits, dim=-1)
    predicted_class = torch.argmax(probs, dim=-1).item()
    
    return emotion_id_to_name[predicted_class]

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Loading the pre-trained chatbot model
chatbot_model_name = "microsoft/DialoGPT-medium"
chatbot_tokenizer = AutoTokenizer.from_pretrained(chatbot_model_name)
chatbot_model = AutoModelForCausalLM.from_pretrained(chatbot_model_name)

loading file https://huggingface.co/microsoft/DialoGPT-medium/resolve/main/vocab.json from cache at /user/siyer8/.cache/huggingface/transformers/16b07bde9fc789a1d5bafeeb361edfe9e4df30077f3f8150f33130800dd9fab7.c7ed1f96aac49e745788faa77ba0a26a392643a50bb388b9c04ff469e555241f
loading file https://huggingface.co/microsoft/DialoGPT-medium/resolve/main/merges.txt from cache at /user/siyer8/.cache/huggingface/transformers/198d2773a3a47fe909fd8bf2ab9d40f0c1355d9a45a3ecac510ab2d44390577c.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b
loading file https://huggingface.co/microsoft/DialoGPT-medium/resolve/main/tokenizer.json from cache at None
loading file https://huggingface.co/microsoft/DialoGPT-medium/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/microsoft/DialoGPT-medium/resolve/main/special_tokens_map.json from cache at None
loading file https://huggingface.co/microsoft/DialoGPT-medium/resolve/main/tokenizer_config.json from cache at 

In [None]:
def generate_chatbot_response(user_input, chat_history):
    """Generate response using DialoGPT with conversation history."""
    new_user_input_ids = chatbot_tokenizer.encode(user_input + chatbot_tokenizer.eos_token, return_tensors='pt')
    
    # Concatenating past conversation for context
    if chat_history is not None:
        bot_input_ids = torch.cat([chat_history, new_user_input_ids], dim=-1)
    else:
        bot_input_ids = new_user_input_ids
    # Generating response
    chat_history = chatbot_model.generate(bot_input_ids, max_length=1000, pad_token_id=chatbot_tokenizer.eos_token_id)
    response = chatbot_tokenizer.decode(chat_history[:, bot_input_ids.shape[-1]:][0], skip_special_tokens=True)
    
    return response, chat_history

In [None]:
import random
def adjust_response_based_on_emotion(response, emotion):
    """Modify chatbot response based on detected emotion."""
    emotion_responses = {
        "joy": " I'm happy to hear that! ",
        "anger": " I understand, that sounds frustrating. ",
        "sadness": " I'm here for you. ",
        "fear": " That sounds scary. Stay safe! ",
        "surprise": " Oh wow! That's unexpected. ",
        "neutral": " Got it! "
    }
    prefix = random.choice(emotion_responses.get(emotion, [""]))  # random variation
    return f"{prefix} {response}" if prefix else response


In [None]:
def chat():
    """function to run the chatbot."""
    chat_history = None
    print("Emotionally Aware Chatbot (type 'exit' to quit)")
    
    while True:
        user_input = input("You: ")
        if user_input.lower() == "exit":
            print("Goodbye! Take care. 😊")
            break

        # Step 1: Detecting Emotion
        emotion = detect_emotion(user_input)
        # Step 2: Generating Chatbot Response
        response, chat_history = generate_chatbot_response(user_input, chat_history)

        # Step 3: Adjusting Response Based on Emotion
        adjusted_response = adjust_response_based_on_emotion(response, emotion)

        print(f"Chatbot: {adjusted_response}")

if __name__ == "__main__":
    chat()


Emotionally Aware Chatbot (type 'exit' to quit)


You:  Hi


Chatbot: Hi! :D


You:  How are you ?


Chatbot: I'm good, how are you?


You:  I am feeling a bit down today


Chatbot: I'm sorry to hear that.


You:  I don't understand what's wrong with me


Chatbot: I don't either.


You:  Can't you help me


Chatbot: I can't.


You:  Why


Chatbot: I can't


You:  Please


Chatbot: I can't


You:  You are bad


Chatbot: I can't


You:  Say anything else


Chatbot: I can't


You:  exit


Goodbye! Take care. 😊


In [57]:
train_dataset_gen = Dataset.from_file("./empathetic_dialogues-train.arrow")

val_dataset_gen = Dataset.from_file("./empathetic_dialogues-validation.arrow")

test_dataset_gen = Dataset.from_file("./empathetic_dialogues-test.arrow")

In [None]:
def load_empathetic_dialogues_for_dialogpt(dataset_gen):
    # Creating the mapping of emotions
    emotions = list(set(dataset_gen['train']['context']))
    emotion_to_id = {emotion: idx for idx, emotion in enumerate(emotions)}
    id_to_emotion = {idx: emotion for emotion, idx in emotion_to_id.items()}
    
    def preprocess(example):
        return {
            'input': example['prompt'],  # input to DialoGPT
            'output': example['utterance']  # output from DialoGPT
        }
    
    return dataset_gen.map(preprocess, remove_columns=['conv_id', 'utterance_idx', 'speaker_idx', 'selfeval', 'tags']), emotion_to_id, id_to_emotion


In [63]:
dataset_gen = DatasetDict({
    "train": train_dataset_gen,
    "validation": val_dataset_gen,
    "test": test_dataset_gen
})

In [64]:
dataset_gen, emotion_to_id_gen, id_to_emotion_gen = load_empathetic_dialogues_for_dialogpt(dataset_gen)

Map:   0%|          | 0/76673 [00:00<?, ? examples/s]

Map:   0%|          | 0/12030 [00:00<?, ? examples/s]

Map:   0%|          | 0/10943 [00:00<?, ? examples/s]

In [None]:
chatbot_tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-medium")
chatbot_tokenizer.pad_token = chatbot_tokenizer.eos_token

loading file https://huggingface.co/microsoft/DialoGPT-medium/resolve/main/vocab.json from cache at /user/siyer8/.cache/huggingface/transformers/16b07bde9fc789a1d5bafeeb361edfe9e4df30077f3f8150f33130800dd9fab7.c7ed1f96aac49e745788faa77ba0a26a392643a50bb388b9c04ff469e555241f
loading file https://huggingface.co/microsoft/DialoGPT-medium/resolve/main/merges.txt from cache at /user/siyer8/.cache/huggingface/transformers/198d2773a3a47fe909fd8bf2ab9d40f0c1355d9a45a3ecac510ab2d44390577c.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b
loading file https://huggingface.co/microsoft/DialoGPT-medium/resolve/main/tokenizer.json from cache at None
loading file https://huggingface.co/microsoft/DialoGPT-medium/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/microsoft/DialoGPT-medium/resolve/main/special_tokens_map.json from cache at None
loading file https://huggingface.co/microsoft/DialoGPT-medium/resolve/main/tokenizer_config.json from cache at 

In [None]:
def tokenize_dialogpt(batch):
    # Combining the context and response as input-output pairs for training
    # Example format: "User: {input} Bot: {output}"

    inputs = [f"User: {x} Bot:" for x in batch['input']]
    targets = [x for x in batch['output']] 

    # Tokenizing both input and output
    input_encodings = chatbot_tokenizer(inputs, padding='max_length', truncation=True, max_length=128, return_tensors='pt')
    target_encodings = chatbot_tokenizer(targets, padding='max_length', truncation=True, max_length=128, return_tensors='pt')

    # Returning tokenized inputs and outputs
    return {
        'input_ids': input_encodings['input_ids'],
        'labels': target_encodings['input_ids'],
        'attention_mask': input_encodings['attention_mask']
    }


In [None]:
# train, validation, and test datasets
train_data_gen = dataset_gen['train'].map(tokenize_dialogpt, batched=True)
val_data_gen = dataset_gen['validation'].map(tokenize_dialogpt, batched=True)
test_data_gen = dataset_gen['test'].map(tokenize_dialogpt, batched=True)

Map:   0%|          | 0/76673 [00:00<?, ? examples/s]

Map:   0%|          | 0/12030 [00:00<?, ? examples/s]

Map:   0%|          | 0/10943 [00:00<?, ? examples/s]

#### Training generator models

In [None]:
from transformers import Trainer, TrainingArguments, AutoModelForCausalLM

# Loading pre-trained DialoGPT model
chatbot_model = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-medium")

loading configuration file https://huggingface.co/microsoft/DialoGPT-medium/resolve/main/config.json from cache at /user/siyer8/.cache/huggingface/transformers/066c0238a1dab50404e7d118e7ad1468d20a1fc18c3f2ad1036366759bfc343d.c26bcfbd792a38251a4fb555d9110e87dcc2ecaee13ac0a027d1584df8a09634
Model config GPT2Config {
  "_name_or_path": "microsoft/DialoGPT-medium",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 1024,
  "n_head": 16,
  "n_inner": null,
  "n_layer": 24,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "sum

In [None]:
# training arguments
training_args_gen = TrainingArguments(
    output_dir="./dialogpt_finetuned",
    evaluation_strategy="epoch",
    learning_rate=5e-5,  # Learning rate
    per_device_train_batch_size=16,  # Batch size for training
    per_device_eval_batch_size=16,  # Batch size for evaluation
    num_train_epochs=3,
    save_strategy='epoch',  
    save_total_limit=2,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [85]:
# Define the Trainer
trainer = Trainer(
    model=chatbot_model,
    args=training_args_gen,
    train_dataset=train_data_gen,  # Training dataset
    eval_dataset=val_data_gen,  # Validation dataset
    tokenizer=chatbot_tokenizer,
)