# Model Training

In [36]:
import pandas as pd
import numpy as np
from datasets import Dataset, DatasetDict, load_dataset, load_from_disk, Sequence, Value
from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer
)
import evaluate
import torch
from torch.utils.data import DataLoader

## Dataset Preparation

Will be training 3 datasets and comparing performances:
1. goEmotions + other datasets + textattack data augmentation
2. goEmotions + other datasets
3. original goEmotions dataset

In [2]:
random_seed = 1234

In [4]:
labels = [
    'admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring',
    'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval',
    'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief',
    'joy', 'love', 'nervousness', 'optimism', 'pride', 'realization', 'relief', 'remorse', 'sadness', 'surprise', 'neutral'
]

label2id = {label:idx for idx, label in enumerate(labels)}
id2label = {idx:label for idx, label in enumerate(labels)}

### Load base model and tokenizer

In [9]:
model_name = "distilbert/distilroberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name, 
    num_labels=len(labels),     
    problem_type="multi_label_classification", # uses BCEWithLogitsLoss by default)
    id2label=id2label,
    label2id=label2id
) 

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilbert/distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# padding will be dynamically done in batching with DataCollatorWithPadding
def tokenize_func(examples):
    return tokenizer(examples['text'], truncation=True)

### Load Augmented dataset
- goEmotions + other datasets + textattack data augmentation

In [34]:
augmented_dataset = load_from_disk('./datasets/cleaned_hf/augmented_hf')
augmented_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 88944
    })
    validation: Dataset({
        features: ['text', 'labels'],
        num_rows: 10426
    })
    test: Dataset({
        features: ['text', 'labels'],
        num_rows: 12721
    })
})

In [37]:
type(augmented_dataset['train']['labels'][1][0])

int

In [38]:
# In order to use BCEWithLogitsLoss, we need to convert labels to float or it'll give errors
# https://github.com/NielsRogge/Transformers-Tutorials/blob/master/BERT/Fine_tuning_BERT_(and_friends)_for_multi_label_text_classification.ipynb
# https://discuss.pytorch.org/t/multi-label-binary-classification-result-type-float-cant-be-cast-to-the-desired-output-type-long/117915/3
augmented_dataset = augmented_dataset.cast_column('labels', Sequence(feature=Value(dtype='float32')))

Casting the dataset:   0%|          | 0/88944 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/10426 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/12721 [00:00<?, ? examples/s]

In [42]:
type(augmented_dataset['train']['labels'][1][6])

float

In [None]:
# before shuffle
augmented_dataset['train']['text'][:5]

["I'm sorry to hear, come see me when you die.",
 'i feel terrified because even if i have the time to write out how i feel about mr',
 'I asked at the Bodies Reveled show if they used prisoner bodies.. I got an awkward no.',
 'i feel awful for making this all about me and my flawed academia instilled value system but my brain won t shut up about it',
 'i feel so repressed when compared to dear a href http eurodancemix']

In [42]:
# shuffle training set
augmented_dataset['train'] = augmented_dataset['train'].shuffle(seed=random_seed)

In [None]:
# after shuffle
augmented_dataset['train']['text'][:5]

["I gave up trying to be 'normal' years ago. But I get your point though.",
 '@blue_north27 http://twitpic.com/4jcjr - Mmm yummy... looks like an invitation to me',
 'I feel a connection to this woman',
 'My friends are awesome! @JNBlack @koreantomcruise -- and the non Twitter ones here right now too!!',
 'People like you is also why no players want to play and stay in Orlando']

#### Calculate Class Weights

In [63]:
from generate_class_weights import generate_class_weights 

In [None]:
augmented_class_weights = generate_class_weights(augmented_dataset['train']['labels'], multi_class=False, one_hot_encoded=True)
augmented_class_weights

{0: 0.7691456243514355,
 1: 1.3645066273932254,
 2: 0.49946091644204854,
 3: 1.2860613071139386,
 4: 1.0808341029504691,
 5: 1.779591836734694,
 6: 1.401840877569033,
 7: 1.4498272152311404,
 8: 2.988308023115173,
 9: 1.549547038327526,
 10: 1.5710046629927936,
 11: 0.9356616873553545,
 12: 6.0391091797935905,
 13: 2.286948472693613,
 14: 0.8414758751182593,
 15: 1.1933025652033917,
 16: 25.210884353741495,
 17: 0.3419344917730278,
 18: 0.4463357353620105,
 19: 12.078218359587181,
 20: 0.8910438789821679,
 21: 19.488168273444348,
 22: 1.7916364515349288,
 23: 1.1817602040816326,
 24: 3.397402597402597,
 25: 0.26198527245949926,
 26: 0.8437108708025043,
 27: 0.15054127427948574}

#### Tokenization

In [None]:
# tokenize in batch
tokenized_augmented = augmented_dataset.map(tokenize_func, batched=True)
tokenized_augmented

Map:   0%|          | 0/88944 [00:00<?, ? examples/s]

Map:   0%|          | 0/10426 [00:00<?, ? examples/s]

Map:   0%|          | 0/12721 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 88944
    })
    validation: Dataset({
        features: ['text', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 10426
    })
    test: Dataset({
        features: ['text', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 12721
    })
})

### Load merged dataset
- goEmotions + other datasets

In [47]:
merged_dataset = load_from_disk('./datasets/cleaned_hf/merged_hf')
merged_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 83016
    })
    validation: Dataset({
        features: ['text', 'labels'],
        num_rows: 10426
    })
    test: Dataset({
        features: ['text', 'labels'],
        num_rows: 12721
    })
})

In [48]:
type(merged_dataset['test']['labels'][1][9])

int

In [49]:
merged_dataset = merged_dataset.cast_column('labels', Sequence(feature=Value(dtype='float32')))

Casting the dataset:   0%|          | 0/83016 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/10426 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/12721 [00:00<?, ? examples/s]

In [54]:
type(merged_dataset['test']['labels'][1][9])

float

In [46]:
# before shuffle
merged_dataset['train']['text'][:5]

["My favourite food is anything I didn't have to cook myself.",
 'Now if he does off himself, everyone will think hes having a laugh screwing with people instead of actually dead',
 'WHY THE FUCK IS BAYLESS ISOING',
 'To make her feel threatened',
 'Dirty Southern Wankers']

In [47]:
# shuffle training set
merged_dataset['train'] = merged_dataset['train'].shuffle(seed=random_seed)

In [48]:
# after shuffle
merged_dataset['train']['text'][:5]

["[NAME] is way more straight forward than [NAME], she doesn't have tons of mixups and stances etc. Her strings strong but simple with limited options ",
 'She is married to a banker and I posted about her views that any trump fan would love',
 'We got killed by the 7-11 guy yesterday so [NAME] had to bring out the valet guy.',
 '@730Fam yeah you are right! but my mom can cook like she is straight from the island! make sure you invite me to your fam restaurant',
 'Thank you. [NAME] killed it, earned his spot. Love my packers but sometimes people in this sub are ridiculous ']

#### Calculate class weights

In [70]:
merged_class_weights = generate_class_weights(merged_dataset['train']['labels'], multi_class=False, one_hot_encoded=True)
merged_class_weights

{0: 0.7178830854375648,
 1: 1.2735640648011781,
 2: 0.4661725067385445,
 3: 1.200347021399653,
 4: 1.0087979390463229,
 5: 2.7275594690498095,
 6: 2.167293233082707,
 7: 1.3531981482688922,
 8: 4.625362157343437,
 9: 2.3363728470111447,
 10: 1.466299279355659,
 11: 0.8733010730065222,
 12: 9.785007072135786,
 13: 3.4757996985429576,
 14: 0.7853926206244087,
 15: 1.1137705269936675,
 16: 38.50463821892394,
 17: 0.31914500999538675,
 18: 0.41658804873642585,
 19: 18.078397212543553,
 20: 0.8316569825686235,
 21: 26.71042471042471,
 22: 2.671042471042471,
 23: 2.157829070492826,
 24: 5.4401048492791615,
 25: 0.24452430044182621,
 26: 0.787478656801366,
 27: 0.14050789739145741}

#### Toeknization

In [None]:
# tokenize in batch
tokenized_merged = merged_dataset.map(tokenize_func, batched=True)
tokenized_merged

Map:   0%|          | 0/83016 [00:00<?, ? examples/s]

Map:   0%|          | 0/10426 [00:00<?, ? examples/s]

Map:   0%|          | 0/12721 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 83016
    })
    validation: Dataset({
        features: ['text', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 10426
    })
    test: Dataset({
        features: ['text', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 12721
    })
})

### Load goEmotions dataset
- goEmotions

In [55]:
go_dataset = load_from_disk('./datasets/cleaned_hf/goEmotions_cleaned')
go_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 43410
    })
    validation: Dataset({
        features: ['text', 'labels'],
        num_rows: 5426
    })
    test: Dataset({
        features: ['text', 'labels'],
        num_rows: 5427
    })
})

In [None]:
type(go_dataset['test']['labels'][8][4])

int

In [58]:
go_dataset = go_dataset.cast_column('labels', Sequence(feature=Value(dtype='float32')))

Casting the dataset:   0%|          | 0/43410 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/5426 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/5427 [00:00<?, ? examples/s]

In [59]:
type(go_dataset['test']['labels'][8][4])

float

In [51]:
# before shuffle
go_dataset['train']['text'][:5]

["My favourite food is anything I didn't have to cook myself.",
 'Now if he does off himself, everyone will think hes having a laugh screwing with people instead of actually dead',
 'WHY THE FUCK IS BAYLESS ISOING',
 'To make her feel threatened',
 'Dirty Southern Wankers']

In [52]:
# shuffle training set
go_dataset['train'] = go_dataset['train'].shuffle(seed=random_seed)

In [53]:
# after shuffle
go_dataset['train']['text'][:5]

['The hardship of having the future meet the present. (or something)',
 'That or spot up shooting. I like the confidence though, he’s not scared to expand his game.',
 'For serial rapists, its not about the physical pleasure. Its all head games. Damn freaks',
 'No room. Get lost. I cc u!',
 "I did that my first day of middle school (got off at high school instead) but that's probably because I'm an idiot"]

#### Calculate class weights

In [71]:
go_class_weights = generate_class_weights(merged_dataset['train']['labels'], multi_class=False, one_hot_encoded=True)
go_class_weights

{0: 0.7178830854375648,
 1: 1.2735640648011781,
 2: 0.4661725067385445,
 3: 1.200347021399653,
 4: 1.0087979390463229,
 5: 2.7275594690498095,
 6: 2.167293233082707,
 7: 1.3531981482688922,
 8: 4.625362157343437,
 9: 2.3363728470111447,
 10: 1.466299279355659,
 11: 0.8733010730065222,
 12: 9.785007072135786,
 13: 3.4757996985429576,
 14: 0.7853926206244087,
 15: 1.1137705269936675,
 16: 38.50463821892394,
 17: 0.31914500999538675,
 18: 0.41658804873642585,
 19: 18.078397212543553,
 20: 0.8316569825686235,
 21: 26.71042471042471,
 22: 2.671042471042471,
 23: 2.157829070492826,
 24: 5.4401048492791615,
 25: 0.24452430044182621,
 26: 0.787478656801366,
 27: 0.14050789739145741}

#### Tokenization

In [None]:
# tokenize in batch
tokenized_go = go_dataset.map(tokenize_func, batched=True)
tokenized_go

Map:   0%|          | 0/43410 [00:00<?, ? examples/s]

Map:   0%|          | 0/5426 [00:00<?, ? examples/s]

Map:   0%|          | 0/5427 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 43410
    })
    validation: Dataset({
        features: ['text', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 5426
    })
    test: Dataset({
        features: ['text', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 5427
    })
})

#### Batching and Dynamic padding

In [None]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

## Training
- Will be using `Trainer` instead of `SFTTrainer` because `SFTTrainer` is often for llms.

In [None]:
# batching and dynamic padding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
training_args = TrainingArguments(
    './models',
    evaluation_strategy="epoch"
    )

#### Compute metrics function

In [None]:
# Load metrics
precision = evaluate.load("precision")
recall = evaluate.load("recall")
f1 = evaluate.load("f1")

def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    precision_result = precision.compute(predictions=predictions, references=labels)
    recall_result = recall.compute(predictions=predictions, references=labels)
    f1_weighted = f1.compute(predictions=predictions, references=labels, average="weighted")
    f1_micro = f1.compute(predictions=predictions, references=labels, average="micro")
    f1_macro = f1.compute(predictions=predictions, references=labels, average="macro")


    
    return metric.compute(prediction)

### Augmented

In [None]:
trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_augmented["train"],
    eval_dataset=tokenized_augmented["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()

### Merged

### Go