# Model Training

In [None]:
import pandas as pd
import numpy as np
from datasets import Dataset, DatasetDict, load_dataset, load_from_disk, Sequence, Value
from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer
)
import evaluate
import torch
from torch.nn import BCEWithLogitsLoss
from typing import Optional, Dict
from torch import FloatTensor, tensor
import logging
from sklearn.metrics import classification_report
from sklearn.metrics import hamming_loss

## Dataset Preparation

Three models will be trained, one per dataset variant, and their performance compared:
1. goEmotions + other datasets + textattack data augmentation
2. goEmotions + other datasets
3. original goEmotions dataset

In [2]:
# Single seed reused wherever this notebook asks for deterministic randomness
# (e.g. when shuffling the training split).
random_seed = 1234

In [4]:
# The 28 label names; a label's position in this list is its integer id.
labels = [
    'admiration',
    'amusement',
    'anger',
    'annoyance',
    'approval',
    'caring',
    'confusion',
    'curiosity',
    'desire',
    'disappointment',
    'disapproval',
    'disgust',
    'embarrassment',
    'excitement',
    'fear',
    'gratitude',
    'grief',
    'joy',
    'love',
    'nervousness',
    'optimism',
    'pride',
    'realization',
    'relief',
    'remorse',
    'sadness',
    'surprise',
    'neutral',
]

# Reverse and forward lookups between positional ids and label names.
id2label = dict(enumerate(labels))
label2id = {name: position for position, name in id2label.items()}

### Load base model and tokenizer

In [9]:
# The same checkpoint name feeds both the tokenizer and the classifier.
model_name = "distilbert/distilroberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# One output per label. With the "multi_label_classification" problem type the
# model's built-in loss is BCEWithLogitsLoss.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    label2id=label2id,
    id2label=id2label,
    problem_type="multi_label_classification",
    num_labels=len(labels),
)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilbert/distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def tokenize_func(examples):
    """Tokenize the ``'text'`` field of ``examples`` with truncation enabled.

    No padding is requested here on purpose: padding is applied dynamically
    per batch by DataCollatorWithPadding.
    """
    texts = examples['text']
    return tokenizer(texts, truncation=True)

### Load datasets
A model will be trained on each of 3 dataset variants:
- goEmotions (go)
- goEmotions + other datasets (merged)
- goEmotions + other datasets + textattack data augmentation (augmented)

In [None]:
# On-disk location of every dataset variant compared in this notebook.
DATASET_PATHS = {
    'augmented': './datasets/cleaned_hf/augmented_hf',   # goEmotions + other datasets + textattack augmentation
    'merged': './datasets/cleaned_hf/merged_hf',         # goEmotions + other datasets
    'go': './datasets/cleaned_hf/goEmotions_cleaned',    # original goEmotions
}

# Change this single value (instead of toggling commented-out lines) to train
# on a different variant.
dataset_variant = 'augmented'

dataset = load_from_disk(DATASET_PATHS[dataset_variant])
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 88944
    })
    validation: Dataset({
        features: ['text', 'labels'],
        num_rows: 10426
    })
    test: Dataset({
        features: ['text', 'labels'],
        num_rows: 12721
    })
})

In [None]:
# Inspect the Python type of one label value. Indexing the row first avoids
# materialising the entire 'labels' column just to look at a single element.
type(dataset['train'][1]['labels'][0])

int

In [None]:
# BCEWithLogitsLoss requires floating-point targets; integer labels make it error out.
# References:
# https://github.com/NielsRogge/Transformers-Tutorials/blob/master/BERT/Fine_tuning_BERT_(and_friends)_for_multi_label_text_classification.ipynb
# https://discuss.pytorch.org/t/multi-label-binary-classification-result-type-float-cant-be-cast-to-the-desired-output-type-long/117915/3
float_labels_feature = Sequence(feature=Value(dtype='float32'))
dataset = dataset.cast_column('labels', float_labels_feature)

Casting the dataset:   0%|          | 0/88944 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/10426 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/12721 [00:00<?, ? examples/s]

In [None]:
# Confirm the cast took effect. Indexing the row first avoids materialising
# the entire 'labels' column just to look at a single element.
type(dataset['train'][1]['labels'][6])

float

In [None]:
# before shuffle
# Slice the rows first so only five examples are read, not the whole 'text' column.
dataset['train'][:5]['text']

["I'm sorry to hear, come see me when you die.",
 'i feel terrified because even if i have the time to write out how i feel about mr',
 'I asked at the Bodies Reveled show if they used prisoner bodies.. I got an awkward no.',
 'i feel awful for making this all about me and my flawed academia instilled value system but my brain won t shut up about it',
 'i feel so repressed when compared to dear a href http eurodancemix']

In [None]:
# shuffle training set
# The shuffle is seeded with the notebook-wide random_seed so the resulting
# row order is reproducible.
dataset['train'] = dataset['train'].shuffle(seed=random_seed)

In [None]:
# after shuffle
# Slice the rows first so only five examples are read, not the whole 'text' column.
dataset['train'][:5]['text']

["I gave up trying to be 'normal' years ago. But I get your point though.",
 '@blue_north27 http://twitpic.com/4jcjr - Mmm yummy... looks like an invitation to me',
 'I feel a connection to this woman',
 'My friends are awesome! @JNBlack @koreantomcruise -- and the non Twitter ones here right now too!!',
 'People like you is also why no players want to play and stay in Orlando']

#### Calculate Class Weights

In [63]:
from generate_class_weights import generate_class_weights 

In [None]:
# Derive one weight per label index from the training labels.
# NOTE(review): the weighting formula lives in the local generate_class_weights
# module, which is not shown here — confirm it matches the intended scheme.
train_label_rows = dataset['train']['labels']
class_weights = generate_class_weights(
    train_label_rows,
    multi_class=False,
    one_hot_encoded=True,
)
class_weights

{0: 0.7691456243514355,
 1: 1.3645066273932254,
 2: 0.49946091644204854,
 3: 1.2860613071139386,
 4: 1.0808341029504691,
 5: 1.779591836734694,
 6: 1.401840877569033,
 7: 1.4498272152311404,
 8: 2.988308023115173,
 9: 1.549547038327526,
 10: 1.5710046629927936,
 11: 0.9356616873553545,
 12: 6.0391091797935905,
 13: 2.286948472693613,
 14: 0.8414758751182593,
 15: 1.1933025652033917,
 16: 25.210884353741495,
 17: 0.3419344917730278,
 18: 0.4463357353620105,
 19: 12.078218359587181,
 20: 0.8910438789821679,
 21: 19.488168273444348,
 22: 1.7916364515349288,
 23: 1.1817602040816326,
 24: 3.397402597402597,
 25: 0.26198527245949926,
 26: 0.8437108708025043,
 27: 0.15054127427948574}

#### Tokenization

In [None]:
# tokenize in batch
# batched=True passes tokenize_func a batch of rows per call instead of one
# row at a time; the tokenizer's outputs are added as new columns.
tokenized_dataset = dataset.map(tokenize_func, batched=True)
tokenized_dataset

Map:   0%|          | 0/88944 [00:00<?, ? examples/s]

Map:   0%|          | 0/10426 [00:00<?, ? examples/s]

Map:   0%|          | 0/12721 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 88944
    })
    validation: Dataset({
        features: ['text', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 10426
    })
    test: Dataset({
        features: ['text', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 12721
    })
})

#### Batching and Dynamic padding

In [None]:
# Pads each batch at collation time, complementing tokenize_func, which
# tokenizes without padding.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

## Training
- `Trainer` is used instead of `SFTTrainer`, because `SFTTrainer` is geared towards supervised fine-tuning of generative LLMs rather than sequence classification.

In [None]:
# batching and dynamic padding
# NOTE(review): this repeats the identical data_collator assignment from the
# "Batching and Dynamic padding" section; one of the two cells can be dropped.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# Checkpoints go to ./models/ and evaluation runs once per epoch.
# `seed` is tied to the notebook-wide random_seed so training randomness is
# controlled by the same value as the dataset shuffle, instead of silently
# falling back to the Trainer's own default seed.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    output_dir='./models/',
    evaluation_strategy="epoch",
    seed=random_seed,
)

#### Compute metrics function

In [None]:
# Load metrics
# NOTE(review): compute_metrics derives its numbers from sklearn's
# classification_report rather than from these objects — confirm whether
# they are still needed.
precision = evaluate.load("precision")
recall = evaluate.load("recall")
f1 = evaluate.load("f1")

def compute_metrics(eval_preds, threshold=0.5):
    """Compute micro/macro/weighted precision, recall and F1 for multi-label output.

    Each label is decided independently: a label is predicted when its sigmoid
    probability reaches ``threshold``. (Taking ``argmax`` over the logits would
    collapse every example to a single class index, which does not match the
    multi-hot reference labels.)

    Args:
        eval_preds: ``(logits, labels)`` pair. ``logits`` holds raw scores and
            ``labels`` the multi-hot reference matrix of the same shape
            (assumed ``(n_examples, n_labels)``). If ``logits`` is a tuple,
            its first element is used.
        threshold: Probability cut-off in the open interval (0, 1) at which a
            label counts as predicted. Defaults to 0.5.

    Returns:
        dict with keys ``{micro,macro,weighted}_{precision,recall,f1}``.

    Raises:
        ValueError: If ``threshold`` is not strictly between 0 and 1.
    """
    if not 0.0 < threshold < 1.0:
        raise ValueError("threshold must be strictly between 0 and 1")

    logits, labels = eval_preds
    if isinstance(logits, tuple):
        logits = logits[0]
    logits = np.asarray(logits)

    # sigmoid(z) >= t  <=>  z >= log(t / (1 - t)). Comparing in logit space
    # gives the same decision without exponentiating large-magnitude logits.
    logit_threshold = np.log(threshold) - np.log1p(-threshold)
    predictions = (logits >= logit_threshold).astype(int)

    # References arrive as floats (needed by the BCE loss); sklearn expects a
    # binary indicator matrix.
    references = np.asarray(labels).astype(int)

    report = classification_report(
        y_true=references,
        y_pred=predictions,
        output_dict=True,  # dictionary instead of a formatted string
        zero_division=0,  # labels with no predictions/support score 0 without warnings
    )

    # Flatten the three averaged rows into "<average>_<metric>" keys.
    metrics = {}
    for average in ("micro", "macro", "weighted"):
        summary = report[f"{average} avg"]
        metrics[f"{average}_precision"] = summary["precision"]
        metrics[f"{average}_recall"] = summary["recall"]
        metrics[f"{average}_f1"] = summary["f1-score"]

    return metrics

#### Custom Class Weighting Trainer Class

In [None]:
# Reference: https://discuss.huggingface.co/t/mullti-label-text-classification/44233/3
class WeightedTrainer(Trainer):
    """Trainer whose loss is BCEWithLogitsLoss with optional per-label ``pos_weight``.

    Args:
        *args: Forwarded unchanged to ``Trainer.__init__``.
        class_weights: Per-label weights handed to ``BCEWithLogitsLoss`` as
            ``pos_weight``. Either a dict mapping label index -> weight or a
            tensor with one entry per label. ``None`` gives an unweighted loss.
        **kwargs: Forwarded unchanged to ``Trainer.__init__``.

    Raises:
        ValueError: If ``class_weights`` is neither a dict nor a tensor.
    """

    def __init__(self, *args, class_weights: Optional[Dict[int, float]] = None, **kwargs):
        super().__init__(*args, **kwargs)
        if class_weights is not None:
            if isinstance(class_weights, dict):
                # Order by label index, not dict insertion order, so that
                # weight i is guaranteed to line up with logit i.
                ordered_weights = [class_weights[idx] for idx in sorted(class_weights)]
                class_weights = torch.tensor(ordered_weights, dtype=torch.float)
                logging.info(f"Converted class_weights to FloatTensor: {class_weights}")
            elif isinstance(class_weights, torch.Tensor):
                # Accept any tensor (CUDA, float64, ...) and normalise to float32;
                # an isinstance check against FloatTensor only matches CPU float32.
                class_weights = class_weights.to(dtype=torch.float)
            else:
                raise ValueError("class_weights must be a dict or a FloatTensor")
            # self.args only exists after Trainer.__init__ has run.
            class_weights = class_weights.to(self.args.device)

        self.loss_fct = BCEWithLogitsLoss(pos_weight=class_weights)

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the (optionally weighted) BCE-with-logits loss for one batch.

        Args:
            model: Model to run; may be wrapped (e.g. in DataParallel).
            inputs: Batch dict; its ``"labels"`` entry is popped before the
                remaining entries are passed to the model.
            return_outputs: When true, return ``(loss, outputs)``.
            num_items_in_batch: Accepted for compatibility with Trainer
                versions that pass it; not used by this loss.

        Returns:
            The loss tensor, or ``(loss, outputs)`` if ``return_outputs``.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits

        # Read the label count from the logits themselves: this works whether or
        # not the model is wrapped, so no model.num_labels / model.module lookup
        # (and no AttributeError fallback) is needed.
        num_labels = logits.size(-1)
        loss = self.loss_fct(
            logits.view(-1, num_labels),
            labels.view(-1, num_labels).to(dtype=logits.dtype),
        )

        return (loss, outputs) if return_outputs else loss

### Augmented

In [None]:
# class_weights must be passed explicitly: without it WeightedTrainer builds
# BCEWithLogitsLoss with pos_weight=None and the computed weights are ignored.
trainer = WeightedTrainer(
    model,
    training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    class_weights=class_weights,
)

trainer.train()

In [None]:
# Score the trained model on the validation split, i.e. the eval_dataset the
# trainer was constructed with.
trainer.evaluate()

#### Evaluate on test

In [None]:
# Keep the full prediction output for later analysis, but display only the
# metrics so the cell output is not flooded with raw prediction arrays.
test_output = trainer.predict(tokenized_dataset['test'])
test_output.metrics