### Multi-Class Text Classification for Emotions using BERT

In [1]:
! pip install datasets huggingface_hub ipywidgets evaluate 'transformers[torch]' torch xformers plotnine

Defaulting to user installation because normal site-packages is not writeable


In [None]:
import numpy as np
import pandas as pd

# We need the sys package to load modules from another directory:
import sys
sys.path.append('../')
from preprocessing.preprocessors import *

import random
import evaluate

from datasets import Dataset
from sklearn.metrics import classification_report
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import pipeline
from tqdm import tqdm

from plotnine import ggplot, aes, geom_tile, coord_flip,theme,geom_line,labs,element_text
from plotnine import scale_x_discrete,geom_vline


#### Get the data

In [None]:
df = pd.read_csv("../data/GoEmotions.csv")
df_clean = clean_df(df)
r, c = df_clean.shape
print(f"The data has {r} row and {c} columns")
df_clean

The data has 67486 row and 33 columns


Unnamed: 0,text,id,author,subreddit,rater_id,admiration,amusement,anger,annoyance,approval,...,love,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral
0,That game hurt.,eew5j0j,Brdd9,nrl,1,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,"You do right, if you don't care then fuck 'em!",ed2mah1,Labalool,confessions,37,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,Man I love reddit.,eeibobj,MrsRobertshaw,facepalm,18,0,0,0,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"[NAME] was nowhere near them, he was by the Fa...",eda6yn6,American_Fascist713,starwarsspeculation,2,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
5,Right? Considering it’s such an important docu...,eespn2i,ImperialBoss,TrueReddit,61,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
82855,"Welcome back, [NAME]",edy10i8,DenverNugs,denvernuggets,46,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
82856,"Clear your notifications, good [NAME]!!",ef9wesm,[deleted],Tinder,18,1,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
82857,"Thanks, I hate it.",ed8x1lm,SKEFFboy,gay_irl,3,0,0,0,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
82858,No need for pity! It was a happy moment. Tears...,edtvm5p,danemo1897,confessions,40,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
clustered_df = create_clustered_df(df_clean)

r, c = clustered_df.shape
print(f"The data has {r} row and {c} columns")
clustered_df

The data has 67486 row and 9 columns


Unnamed: 0,text,id,author,subreddit,rater_id,level0,level1,level2,level3
0,That game hurt.,eew5j0j,Brdd9,nrl,1,sadness,dis_sad,dis_sad_gri,rem_emb_dis_sad_gri
2,"You do right, if you don't care then fuck 'em!",ed2mah1,Labalool,confessions,37,neutral,neutral,neutral,neutral
3,Man I love reddit.,eeibobj,MrsRobertshaw,facepalm,18,love,love,exc_joy_lov,amu_exc_joy_lov
4,"[NAME] was nowhere near them, he was by the Fa...",eda6yn6,American_Fascist713,starwarsspeculation,2,neutral,neutral,neutral,neutral
5,Right? Considering it’s such an important docu...,eespn2i,ImperialBoss,TrueReddit,61,gratitude,gra_rel,pri_adm_gra_rel,pri_adm_gra_rel_app_rea
...,...,...,...,...,...,...,...,...,...
82855,"Welcome back, [NAME]",edy10i8,DenverNugs,denvernuggets,46,joy,exc_joy,exc_joy_lov,amu_exc_joy_lov
82856,"Clear your notifications, good [NAME]!!",ef9wesm,[deleted],Tinder,18,admiration,pri_adm,pri_adm_gra_rel,pri_adm_gra_rel_app_rea
82857,"Thanks, I hate it.",ed8x1lm,SKEFFboy,gay_irl,3,annoyance,ang_ann,dis_ang_ann,dis_ang_ann_dis
82858,No need for pity! It was a happy moment. Tears...,edtvm5p,danemo1897,confessions,40,joy,exc_joy,exc_joy_lov,amu_exc_joy_lov


In [5]:
plutchik_df = create_plutchik_df(df_clean)

r, c = plutchik_df.shape
print(f"The data has {r} row and {c} columns")
plutchik_df

The data has 67486 row and 7 columns


Unnamed: 0,text,id,author,subreddit,rater_id,level0,plutchik
0,That game hurt.,eew5j0j,Brdd9,nrl,1,sadness,betrübt
2,"You do right, if you don't care then fuck 'em!",ed2mah1,Labalool,confessions,37,neutral,neutral
3,Man I love reddit.,eeibobj,MrsRobertshaw,facepalm,18,love,verliebt
4,"[NAME] was nowhere near them, he was by the Fa...",eda6yn6,American_Fascist713,starwarsspeculation,2,neutral,neutral
5,Right? Considering it’s such an important docu...,eespn2i,ImperialBoss,TrueReddit,61,gratitude,ehrfürchtig
...,...,...,...,...,...,...,...
82855,"Welcome back, [NAME]",edy10i8,DenverNugs,denvernuggets,46,joy,begeistert
82856,"Clear your notifications, good [NAME]!!",ef9wesm,[deleted],Tinder,18,admiration,bewundernd
82857,"Thanks, I hate it.",ed8x1lm,SKEFFboy,gay_irl,3,annoyance,wütend
82858,No need for pity! It was a happy moment. Tears...,edtvm5p,danemo1897,confessions,40,joy,begeistert


### BERT for level 0 -> 27 emotions
following: https://huggingface.co/docs/transformers/tasks/sequence_classification

In [6]:
clustered_df.level0.unique()

array(['sadness', 'neutral', 'love', 'gratitude', 'disapproval',
       'amusement', 'disappointment', 'realization', 'admiration',
       'annoyance', 'confusion', 'optimism', 'excitement', 'caring',
       'remorse', 'joy', 'approval', 'embarrassment', 'surprise',
       'curiosity', 'anger', 'grief', 'disgust', 'pride', 'desire',
       'relief', 'fear', 'nervousness'], dtype=object)

In [7]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased") # no differentiation between upper and lower case

In [8]:
id2label = {0: 'sadness', 1: 'neutral', 2: 'love', 3: 'gratitude', 4: 'disapproval',
       5: 'amusement', 6: 'disappointment', 7: 'realization', 8: 'admiration', 9:
       'annoyance', 10: 'confusion', 11: 'optimism', 12: 'excitement', 13: 'caring',
       14: 'remorse', 15: 'joy', 16: 'approval', 17: 'embarrassment', 18: 'surprise',
       19: 'curiosity', 20: 'anger', 21: 'grief', 22: 'disgust', 23: 'pride', 24: 'desire',
       25: 'relief', 26: 'fear', 27: 'nervousness'}
label2id = {'sadness': 0, 'neutral': 1, 'love': 2, 'gratitude': 3, 'disapproval': 4,
       'amusement': 5, 'disappointment': 6, 'realization': 7, 'admiration': 8,
       'annoyance': 9, 'confusion': 10, 'optimism': 11, 'excitement': 12, 'caring': 13,
       'remorse': 14, 'joy': 15, 'approval': 16, 'embarrassment': 17, 'surprise': 18,
       'curiosity': 19, 'anger': 20, 'grief': 21, 'disgust': 22, 'pride': 23, 'desire': 24,
       'relief': 25, 'fear': 26, 'nervousness': 27}

In [9]:
# create a sample for lokal tests
dataset = clustered_df.sample(n=1000, replace=False, random_state=123)
dataset["label"] = dataset["level0"].map(label2id.get) # to add column label map individual entries of emotions to ID
dataset

Unnamed: 0,text,id,author,subreddit,rater_id,level0,level1,level2,level3,label
27054,"Oh, I always thought it was gum that had been ...",eez4phj,YuenHsiaoTieng,TheSimpsons,12,realization,app_rea,app_rea,pri_adm_gra_rel_app_rea,7
15473,She was using her lighter to heat up the cold ...,eeejsh7,8549176320,instant_regret,37,neutral,neutral,neutral,neutral,1
53734,"everyone else managed to stop, he was just car...",ed8auqx,M9ow,yesyesyesyesno,62,remorse,rem_emb,rem_emb,rem_emb_dis_sad_gri,14
15094,people have built such a stigma around it that...,ee103qd,_xJRHNBRx_,gay,15,realization,app_rea,app_rea,pri_adm_gra_rel_app_rea,7
67607,"I think the hulk could do all of that at once,...",efaudck,sixpacksitdown,youseeingthisshit,40,approval,app_rea,app_rea,pri_adm_gra_rel_app_rea,16
...,...,...,...,...,...,...,...,...,...,...
24621,"Those refs were terrible, god damn. GG Buckeye...",eetak09,touchadafishy,CollegeBasketball,39,anger,ang_ann,dis_ang_ann,dis_ang_ann_dis,20
74234,Oh well I guess. Can't blame you for that. We ...,edlin87,TimesPlus,lewronggeneration,55,neutral,neutral,neutral,neutral,1
55555,what does this mean?,ed23fr7,SNGULARITY,ENLIGHTENEDCENTRISM,27,neutral,neutral,neutral,neutral,1
54332,You go back to school... Except this time you ...,eeb063d,cchings,lostgeneration,5,neutral,neutral,neutral,neutral,1


In [11]:
# split the training data
from datasets import Dataset

training_data = dataset.groupby("level0").sample(frac=0.8, random_state=25) # stratified sampling
testing_data = dataset.drop(training_data.index)

training_data = Dataset.from_pandas(training_data) # create transformers compatible dataset from dataframe
testing_data = Dataset.from_pandas(testing_data)

def tokenize_function(examples): # replace representation of data, convert column text to tensor-based representation
    return tokenizer(examples["text"], padding="max_length", truncation=True)

tokenized_training_data = training_data.map(tokenize_function, batched=True) # convert text to tensor form
tokenized_testing_data = testing_data.map(tokenize_function, batched=True)

Map:   0%|          | 0/803 [00:00<?, ? examples/s]

Map:   0%|          | 0/197 [00:00<?, ? examples/s]

In [12]:
tokenized_training_data

Dataset({
    features: ['text', 'id', 'author', 'subreddit', 'rater_id', 'level0', 'level1', 'level2', 'level3', 'label', '__index_level_0__', 'input_ids', 'attention_mask'],
    num_rows: 803
})

In [13]:
pd.options.display.max_colwidth = None # default value is 50, max would be "None"
pd.set_option('display.max_rows', 50) # default value is 10, max would be "None"

annoyance_example = random.sample(list(dataset.id[dataset.level0 == "annoyance"]), k=1) # example for annoyance
dataset.query('id==@annoyance_example')  


Unnamed: 0,text,id,author,subreddit,rater_id,level0,level1,level2,level3,label
50296,"I feel bad for her, talk to her :(",eeu5xfv,frazuri,oldpeoplefacebook,24,annoyance,ang_ann,dis_ang_ann,dis_ang_ann_dis,9


In [14]:
desire_example = random.sample(list(dataset.id[dataset.level0 == "desire"]), k=1) # example for desire
dataset.query('id==@desire_example') 

Unnamed: 0,text,id,author,subreddit,rater_id,level0,level1,level2,level3,label
25560,I really wanted this to be a real sub.,ee5vknc,infinitemonkeytyping,holdmybeer,28,desire,des_opt,des_opt_car,des_opt_car,24


In [15]:
# check if data set is balanced
classCounts = dataset.level0.value_counts() 
print(classCounts)
# -> not balanced

level0
neutral           314
approval           77
admiration         62
disapproval        47
gratitude          46
annoyance          39
curiosity          36
anger              32
amusement          30
optimism           30
love               30
surprise           28
realization        28
sadness            27
disappointment     22
joy                22
disgust            20
caring             17
excitement         17
confusion          16
desire             15
fear               11
remorse            10
embarrassment       8
relief              6
pride               5
nervousness         4
grief               1
Name: count, dtype: int64


In [16]:
numberOfDocuments = len(dataset)
numberOfDocuments

1000

In [None]:
"""
#Classifier
from huggingface_hub import notebook_login
notebook_login()
"""

In [17]:
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer) # Padding -> map all tensors to the same size
data_collator

DataCollatorWithPadding(tokenizer=DistilBertTokenizerFast(name_or_path='distilbert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True), padding=True, max_length=None, pad_to_multiple_of=None, return_tensors='pt')

In [18]:
accuracy = evaluate.load("accuracy") # define evaluation method -> quality

In [19]:
def compute_metrics(eval_pred): # function calculation metric

    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [20]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=28, id2label=id2label, label2id=label2id
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.weight', 'classifier.bias', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
# training

training_args = TrainingArguments(
    output_dir="../models/model_level0",
    learning_rate=2e-5,  # standard
    per_device_train_batch_size=16, # size in which chunks are entered into the network, on how many data parallel weights are trained
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    evaluation_strategy="epoch", # save model per epoch
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False #,
    #label_names=["level0"],
)

# IMPORTANT: Set: Model, dataset, ... , define learning process, metrics, ...

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_training_data,
    eval_dataset=tokenized_testing_data,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics  
)

#checkpointing
#use cuda
trainer.train()

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,2.767331,0.319797
2,No log,2.668823,0.319797
3,No log,2.573031,0.360406
4,No log,2.502936,0.385787
5,No log,2.483721,0.375635
6,No log,2.460483,0.350254
7,No log,2.443411,0.360406
8,No log,2.446507,0.365482
9,No log,2.453969,0.340102
10,2.248800,2.455822,0.345178


TrainOutput(global_step=510, training_loss=2.2406121871050666, metrics={'train_runtime': 224.2682, 'train_samples_per_second': 35.805, 'train_steps_per_second': 2.274, 'total_flos': 1064206426398720.0, 'train_loss': 2.2406121871050666, 'epoch': 10.0})

In [22]:
trainer.save_model("../models/model_level0_sample")

In [23]:
classifier = pipeline("text-classification", model="../models/model_level0_sample",device=0) # method pipeline -> sting for textclassificaton, folder, device (graphics card)
results = [classifier(text,truncation=True) for text in tqdm(dataset.text.to_list())] # listcomprehension over all texts, tokenization in model, truncation -> padding too long texts

100%|██████████| 1000/1000 [00:17<00:00, 58.15it/s]


In [24]:
results = [tmp[0] for tmp in results]

In [26]:
results # list of dictionaries with labels, score -> decision and how high activation function for decision was
pd.DataFrame(results).to_pickle("../results/results_level0_sample.pkl")  # convert as dataframe, pick, safe

### Evaluation

In [72]:
df_results = pd.DataFrame.from_dict(results)
df_id =  pd.DataFrame(dataset["id"])
df_id = df_id.reset_index()
df_results["id"] = df_id["id"]
df_results

Unnamed: 0,label,score,id
0,neutral,0.176830,eez4phj
1,neutral,0.703029,eeejsh7
2,neutral,0.079363,ed8auqx
3,neutral,0.420149,ee103qd
4,neutral,0.331734,efaudck
...,...,...,...
995,admiration,0.081957,eetak09
996,approval,0.110026,edlin87
997,neutral,0.214870,ed23fr7
998,neutral,0.735096,eeb063d


In [76]:
data_classifies = pd.merge(dataset, df_results, on='id', how='left')

# merge classified data with original training data
# combine data with training data, concatenate results results and training data
# compare -> calculate f1

In [28]:
data_classifies.to_pickle("../results/data_classified_level0_sample.pkl")  
# data_classifies = pd.read_pickle("../results/data_classified_level0_1epoch.pkl")

In [77]:
data_classifies # contain goldstandard and ergbnis cluster -> calculate F1, Precision, Recall
# label -> assigned by classifier (?????)
# level0 -> original label

Unnamed: 0,text,id,author,subreddit,rater_id,level0,level1,level2,level3,label_x,label_y,score
0,"Oh, I always thought it was gum that had been ""pre-gooed.""",eez4phj,YuenHsiaoTieng,TheSimpsons,12,realization,app_rea,app_rea,pri_adm_gra_rel_app_rea,7,neutral,0.176830
1,She was using her lighter to heat up the cold pump handle.,eeejsh7,8549176320,instant_regret,37,neutral,neutral,neutral,neutral,1,neutral,0.703029
2,"everyone else managed to stop, he was just careless and too fast on the approach",ed8auqx,M9ow,yesyesyesyesno,62,remorse,rem_emb,rem_emb,rem_emb_dis_sad_gri,14,neutral,0.079363
3,people have built such a stigma around it that it's what comes to mind first instead of positive thoughts.,ee103qd,_xJRHNBRx_,gay,15,realization,app_rea,app_rea,pri_adm_gra_rel_app_rea,7,neutral,0.420149
4,"I think the hulk could do all of that at once, and fold the wrench with his finger tips",efaudck,sixpacksitdown,youseeingthisshit,40,approval,app_rea,app_rea,pri_adm_gra_rel_app_rea,16,neutral,0.331734
...,...,...,...,...,...,...,...,...,...,...,...,...
1015,"Those refs were terrible, god damn. GG Buckeyes, it was much closer than the score suggests.",eetak09,touchadafishy,CollegeBasketball,39,anger,ang_ann,dis_ang_ann,dis_ang_ann_dis,20,admiration,0.081957
1016,Oh well I guess. Can't blame you for that. We aren't all [NAME] though.,edlin87,TimesPlus,lewronggeneration,55,neutral,neutral,neutral,neutral,1,approval,0.110026
1017,what does this mean?,ed23fr7,SNGULARITY,ENLIGHTENEDCENTRISM,27,neutral,neutral,neutral,neutral,1,neutral,0.214870
1018,"You go back to school... Except this time you don't qualify for ANY financial aid, because you already have a degree",eeb063d,cchings,lostgeneration,5,neutral,neutral,neutral,neutral,1,neutral,0.735096


In [78]:
test_data = data_classifies.query(f'id in {tokenized_testing_data["id"]}')
# tokenized_testing_data: daten der Testdaten

In [83]:
target_names = ['sadness', 'neutral', 'love', 'gratitude', 'disapproval',
       'amusement', 'disappointment', 'realization', 'admiration',
       'annoyance', 'confusion', 'optimism', 'excitement', 'caring',
       'remorse', 'joy', 'approval', 'embarrassment', 'surprise',
       'curiosity', 'anger', 'grief', 'disgust', 'pride', 'desire',
       'relief', 'fear', 'nervousness']
print(classification_report(test_data.level0, test_data.label_y, target_names=target_names))
# level0 -> gold standard , label -> prediction

                precision    recall  f1-score   support

       sadness       0.25      0.75      0.38        12
          love       0.00      0.00      0.00         6
     gratitude       0.00      0.00      0.00         7
   disapproval       0.50      0.12      0.20         8
     amusement       0.09      0.07      0.08        15
disappointment       0.00      0.00      0.00         3
   realization       0.00      0.00      0.00         3
    admiration       0.09      0.14      0.11         7
     annoyance       0.00      0.00      0.00         3
     confusion       0.00      0.00      0.00         4
      optimism       0.33      0.11      0.17         9
    excitement       0.00      0.00      0.00         5
        caring       0.00      0.00      0.00         2
       remorse       0.00      0.00      0.00         3
           joy       0.00      0.00      0.00         2
      approval       0.40      0.89      0.55         9
 embarrassment       0.00      0.00      0.00  



In [84]:
pd.DataFrame.from_dict(results)

Unnamed: 0,label,score
0,neutral,0.176830
1,neutral,0.703029
2,neutral,0.079363
3,neutral,0.420149
4,neutral,0.331734
...,...,...
995,admiration,0.081957
996,approval,0.110026
997,neutral,0.214870
998,neutral,0.735096


In [87]:
#Final Classification/Viz
#final = pd.concat([dataset, pd.DataFrame.from_dict(results)],axis=1) # attach classified label to data
final = data_classifies.copy()
final['label_y'].value_counts()/final['label_y'].value_counts().sum() # ratio

label_y
neutral        0.534314
admiration     0.174510
gratitude      0.091176
approval       0.059804
curiosity      0.038235
disapproval    0.024510
anger          0.024510
amusement      0.019608
annoyance      0.015686
love           0.005882
sadness        0.004902
caring         0.003922
optimism       0.001961
disgust        0.000980
Name: count, dtype: float64