### Multi-Class Text Classification for Emotions using BERT

In [None]:
# ! pip install datasets huggingface_hub ipywidgets evaluate 'transformers[torch]' torch xformers plotnine

In [3]:
import numpy as np
import pandas as pd

# We need the sys package to load modules from another directory:
import sys
sys.path.append('../')
from preprocessing.preprocessors import *

import random
import evaluate

from datasets import Dataset
from sklearn.metrics import classification_report
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import pipeline
from tqdm import tqdm

from plotnine import ggplot, aes, geom_tile, coord_flip,theme,geom_line,labs,element_text
from plotnine import scale_x_discrete,geom_vline


#### Get the data

In [5]:
df = pd.read_csv("../data/GoEmotions.csv")
df_clean = clean_df(df)
df_clean

Unnamed: 0,text,id,author,subreddit,rater_id,admiration,amusement,anger,annoyance,approval,...,love,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral
0,That game hurt.,eew5j0j,Brdd9,nrl,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,"You do right, if you don't care then fuck 'em!",ed2mah1,Labalool,confessions,37,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,Man I love reddit.,eeibobj,MrsRobertshaw,facepalm,18,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,"[NAME] was nowhere near them, he was by the Fa...",eda6yn6,American_Fascist713,starwarsspeculation,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
5,Right? Considering it’s such an important docu...,eespn2i,ImperialBoss,TrueReddit,61,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
211219,"Well, I'm glad you're out of all that now. How...",ed89acy,pompompompi,raisedbynarcissists,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
211220,Everyone likes [NAME].,ee6pagw,Senshado,heroesofthestorm,16,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
211221,Well when you’ve imported about a gazillion of...,ef28nod,5inchloser,nottheonion,15,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
211222,That looks amazing,ee8hse1,springt1me,shittyfoodporn,70,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
clustered_df = create_clustered_df(df_clean)
clustered_df

Unnamed: 0,text,id,author,subreddit,rater_id,level0,level1,level2,level3
0,That game hurt.,eew5j0j,Brdd9,nrl,1,sadness,dis_sad,dis_sad_gri,rem_emb_dis_sad_gri
2,"You do right, if you don't care then fuck 'em!",ed2mah1,Labalool,confessions,37,neutral,neutral,neutral,neutral
3,Man I love reddit.,eeibobj,MrsRobertshaw,facepalm,18,love,love,exc_joy_lov,amu_exc_joy_lov
4,"[NAME] was nowhere near them, he was by the Fa...",eda6yn6,American_Fascist713,starwarsspeculation,2,neutral,neutral,neutral,neutral
5,Right? Considering it’s such an important docu...,eespn2i,ImperialBoss,TrueReddit,61,gratitude,gra_rel,pri_adm_gra_rel,pri_adm_gra_rel_app_rea
...,...,...,...,...,...,...,...,...,...
211219,"Well, I'm glad you're out of all that now. How...",ed89acy,pompompompi,raisedbynarcissists,2,joy,exc_joy,exc_joy_lov,amu_exc_joy_lov
211220,Everyone likes [NAME].,ee6pagw,Senshado,heroesofthestorm,16,love,love,exc_joy_lov,amu_exc_joy_lov
211221,Well when you’ve imported about a gazillion of...,ef28nod,5inchloser,nottheonion,15,caring,caring,des_opt_car,des_opt_car
211222,That looks amazing,ee8hse1,springt1me,shittyfoodporn,70,admiration,pri_adm,pri_adm_gra_rel,pri_adm_gra_rel_app_rea


In [9]:
plutchik_df = create_plutchik_df(df_clean)
plutchik_df

Unnamed: 0,text,id,author,subreddit,rater_id,level0,plutchik
0,That game hurt.,eew5j0j,Brdd9,nrl,1,sadness,betrübt
2,"You do right, if you don't care then fuck 'em!",ed2mah1,Labalool,confessions,37,neutral,neutral
3,Man I love reddit.,eeibobj,MrsRobertshaw,facepalm,18,love,verliebt
4,"[NAME] was nowhere near them, he was by the Fa...",eda6yn6,American_Fascist713,starwarsspeculation,2,neutral,neutral
5,Right? Considering it’s such an important docu...,eespn2i,ImperialBoss,TrueReddit,61,gratitude,ehrfürchtig
...,...,...,...,...,...,...,...
211219,"Well, I'm glad you're out of all that now. How...",ed89acy,pompompompi,raisedbynarcissists,2,joy,begeistert
211220,Everyone likes [NAME].,ee6pagw,Senshado,heroesofthestorm,16,love,verliebt
211221,Well when you’ve imported about a gazillion of...,ef28nod,5inchloser,nottheonion,15,caring,bewundernd
211222,That looks amazing,ee8hse1,springt1me,shittyfoodporn,70,admiration,bewundernd


### BERT for level 0 -> 27 emotions
following: https://huggingface.co/docs/transformers/tasks/sequence_classification

In [14]:
clustered_df.level0.unique()

array(['sadness', 'neutral', 'love', 'gratitude', 'disapproval',
       'amusement', 'disappointment', 'realization', 'admiration',
       'annoyance', 'confusion', 'optimism', 'excitement', 'caring',
       'remorse', 'joy', 'approval', 'embarrassment', 'surprise',
       'curiosity', 'anger', 'grief', 'disgust', 'pride', 'desire',
       'relief', 'fear', 'nervousness'], dtype=object)

In [46]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased") # no differentiation between upper and lower case

In [47]:
id2label = {0: 'sadness', 1: 'neutral', 2: 'love', 3: 'gratitude', 4: 'disapproval',
       5: 'amusement', 6: 'disappointment', 7: 'realization', 8: 'admiration', 9:
       'annoyance', 10: 'confusion', 11: 'optimism', 12: 'excitement', 13: 'caring',
       14: 'remorse', 15: 'joy', 16: 'approval', 17: 'embarrassment', 18: 'surprise',
       19: 'curiosity', 20: 'anger', 21: 'grief', 22: 'disgust', 23: 'pride', 24: 'desire',
       25: 'relief', 26: 'fear', 27: 'nervousness'}
label2id = {'sadness': 0, 'neutral': 1, 'love': 2, 'gratitude': 3, 'disapproval': 4,
       'amusement': 5, 'disappointment': 6, 'realization': 7, 'admiration': 8,
       'annoyance': 9, 'confusion': 10, 'optimism': 11, 'excitement': 12, 'caring': 13,
       'remorse': 14, 'joy': 15, 'approval': 16, 'embarrassment': 17, 'surprise': 18,
       'curiosity': 19, 'anger': 20, 'grief': 21, 'disgust': 22, 'pride': 23, 'desire': 24,
       'relief': 25, 'fear': 26, 'nervousness': 27}

In [18]:
# create a sample for lokal tests
random.seed(123)
dataset = clustered_df.sample(n=1000, replace=False)
dataset["label0"] = dataset["level0"].map(label2id.get) # to add column label map individual entries of emotions to ID
dataset

Unnamed: 0,text,id,author,subreddit,rater_id,level0,level1,level2,level3,label0
26641,"if HL in offensive stance dodges it, then does...",eef45jf,GaolGhoul,CompetitiveForHonor,17,realization,app_rea,app_rea,pri_adm_gra_rel_app_rea,7
190616,No. It IS her condo and she CAN smoke inside h...,ed71emb,Applesaucedontlie,legaladvice,4,approval,app_rea,app_rea,pri_adm_gra_rel_app_rea,16
52488,"Love those clips on YouTube, does this add any...",eczl7i9,toxicbrew,netflix,46,annoyance,ang_ann,dis_ang_ann,dis_ang_ann_dis,9
190939,"Don't know if it's a full moon tonight, but ho...",ed4708f,sritte02,TalesFromRetail,36,annoyance,ang_ann,dis_ang_ann,dis_ang_ann_dis,9
27240,Hello!!,ees7cfh,sassypuff01,antiMLM,18,neutral,neutral,neutral,neutral,1
...,...,...,...,...,...,...,...,...,...,...
131015,She was HOPING she was gone!,eeeoavh,RealHausFrau,loveafterlockup,30,desire,des_opt,des_opt_car,des_opt_car,24
201100,I don't really understand what they are even p...,eez4aas,Nuclearfrog,ukpolitics,35,confusion,cur_con,sur_cur_con,sur_cur_con,10
145855,"Yeah, surely this is *exactly* what democracy ...",ee5gal8,NearbyBush,Scotland,81,curiosity,cur_con,sur_cur_con,sur_cur_con,19
182196,"Eastern suburbs, the mullet never went out of ...",edk0ssq,_Dindu__Nuffin_,perth,3,approval,app_rea,app_rea,pri_adm_gra_rel_app_rea,16


In [19]:
# split the training data

training_data = dataset.groupby("level0").sample(frac=0.8, random_state=25) # stratified sampling
testing_data = dataset.drop(training_data.index)

training_data = Dataset.from_pandas(training_data) # create transformers compatible dataset from dataframe
testing_data = Dataset.from_pandas(testing_data)

def tokenize_function(examples): # replace representation of data, convert column text to tensor-based representation
    return tokenizer(examples["text"], padding="max_length", truncation=True)

tokenized_training_data = training_data.map(tokenize_function, batched=True) # convert text to tensor form
tokenized_testing_data = testing_data.map(tokenize_function, batched=True)

Map:   0%|          | 0/798 [00:00<?, ? examples/s]

Map:   0%|          | 0/202 [00:00<?, ? examples/s]

In [20]:
tokenized_training_data

Dataset({
    features: ['text', 'id', 'author', 'subreddit', 'rater_id', 'level0', 'level1', 'level2', 'level3', 'label0', '__index_level_0__', 'input_ids', 'attention_mask'],
    num_rows: 798
})

In [32]:
pd.options.display.max_colwidth = None # default value is 50, max would be "None"
pd.set_option('display.max_rows', 50) # default value is 10, max would be "None"

annoyance_example = random.sample(list(dataset.id[dataset.level0 == "annoyance"]), k=1) # example for annoyance
dataset.query('id==@annoyance_example')  


Unnamed: 0,text,id,author,subreddit,rater_id,level0,level1,level2,level3,label0
110753,No idea what this means but the fact that you said sweetie lets me know that you’re probably really obnoxious or just being sarcastic,edxzz1l,Plasma454345,iamverysmart,72,annoyance,ang_ann,dis_ang_ann,dis_ang_ann_dis,9


In [33]:
desire_example = random.sample(list(dataset.id[dataset.level0 == "desire"]), k=1) # example for desire
dataset.query('id==@desire_example') 

Unnamed: 0,text,id,author,subreddit,rater_id,level0,level1,level2,level3,label0
173766,"Bro, too close to home. Thing is I wish I could just chill where this monk is at, with a lot of open space.",edx2c5f,BlueLanternSupes,2meirl4meirl,63,desire,des_opt,des_opt_car,des_opt_car,24


In [34]:
# check if data set is balanced
classCounts = dataset.level0.value_counts() 
print(classCounts)
# -> not balanced

neutral           305
approval           59
annoyance          56
admiration         54
disapproval        48
gratitude          41
confusion          38
curiosity          37
love               34
surprise           30
anger              30
disappointment     28
optimism           27
sadness            26
amusement          24
excitement         23
desire             23
joy                23
caring             21
disgust            19
realization        18
embarrassment      11
relief              8
remorse             5
fear                5
nervousness         4
grief               2
pride               1
Name: level0, dtype: int64


In [35]:
numberOfDocuments = len(dataset)
numberOfDocuments

1000

In [None]:
"""
#Classifier
from huggingface_hub import notebook_login
notebook_login()
"""

In [36]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer) # Padding -> map all tensors to the same size
data_collator

DataCollatorWithPadding(tokenizer=DistilBertTokenizerFast(name_or_path='distilbert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True), padding=True, max_length=None, pad_to_multiple_of=None, return_tensors='pt')

In [37]:
accuracy = evaluate.load("accuracy") # define evaluation method -> quality

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [38]:
def compute_metrics(eval_pred): # function calculation metric

    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [48]:
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=28, id2label=id2label, label2id=label2id
)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.weight', 'classifier

In [49]:
# training

training_args = TrainingArguments(
    output_dir="../models/model_level0",
    learning_rate=2e-5,  # standard
    per_device_train_batch_size=16, # size in which chunks are entered into the network, on how many data parallel weights are trained
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    evaluation_strategy="epoch", # save model per epoch
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False #,
    #label_names=["level0"],
)

# IMPORTANT: Set: Model, dataset, ... , define learning process, metrics, ...

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_training_data,
    eval_dataset=tokenized_testing_data,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics  
)

#checkpointing
#use cuda
trainer.train()

  0%|          | 0/100 [00:00<?, ?it/s]

RuntimeError: [enforce fail at C:\b\abs_abjetg6_iu\croot\pytorch_1686932924616\work\c10\core\impl\alloc_cpu.cpp:72] data. DefaultCPUAllocator: not enough memory: you tried to allocate 201326592 bytes.

In [None]:
trainer.save_model("../models/model_level0_1epoch")

In [None]:
classifier = pipeline("text-classification", model="../models/model_level0_1epoch",device=0) # method pipeline -> sting for textclassificaton, folder, device (graphics card)
results = [classifier(text,truncation=True) for text in tqdm(dataset.text.to_list())] # listcomprehension over all texts, tokenization in model, truncation -> padding too long texts

In [None]:
results = [tmp[0] for tmp in results]

In [None]:
results # list of dictionaries with labels, score -> decision and how high activation function for decision was
pd.DataFrame(results).to_pickle("../results/results_level0_1epoch.pkl")  # convert as dataframe, pick, safe

### Evaluation

In [None]:
data_classifies = pd.concat([dataset,pd.DataFrame.from_dict(results)],axis=1)
# merge classified data with original training data
# combine data with training data, concatenate results results and training data
# compare -> calculate f1

In [None]:
data_classifies.to_pickle("../results/data_classified_level0_1epoch.pkl")  
# data_classifies = pd.read_pickle("../results/data_classified_level0_1epoch.pkl")

In [None]:
data_classifies # contain goldstandard and ergbnis cluster -> calculate F1, Precision, Recall
# label -> assigned by classifier (?????)
# level0 -> original label

In [None]:
test_data = data_classifies.query(f'id in {tokenized_testing_data["id"]}')
# tokenized_testing_data: daten der Testdaten

In [None]:
target_names = ['sadness', 'neutral', 'love', 'gratitude', 'disapproval',
       'amusement', 'disappointment', 'realization', 'admiration',
       'annoyance', 'confusion', 'optimism', 'excitement', 'caring',
       'remorse', 'joy', 'approval', 'embarrassment', 'surprise',
       'curiosity', 'anger', 'grief', 'disgust', 'pride', 'desire',
       'relief', 'fear', 'nervousness']
print(classification_report(test_data.level0, test_data.label, target_names=target_names))
# level0 -> gold standard , label -> prediction

In [None]:
pd.DataFrame.from_dict(results)

In [None]:
#Final Classification/Viz
final = pd.concat([dataset, pd.DataFrame.from_dict(results)],axis=1) # attach classified label to data
final['label'].value_counts()/final['label'].value_counts().sum() # ratio

In [None]:
list(range(1,12))