### Multi-Class Text Classification for Emotions using BERT

In [None]:
# ! pip install datasets huggingface_hub ipywidgets evaluate 'transformers[torch]' torch xformers plotnine

In [1]:
import numpy as np
import pandas as pd

# We need the sys package to load modules from another directory:
import sys
sys.path.append('../')
from preprocessing.preprocessors import *
from preprocessing.bert_func import *

import random
import evaluate

from datasets import Dataset
from sklearn.metrics import classification_report
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import pipeline
from tqdm import tqdm

from plotnine import ggplot, aes, geom_tile, coord_flip,theme,geom_line,labs,element_text
from plotnine import scale_x_discrete,geom_vline


#### Get the data

In [2]:
df = pd.read_csv("../data/GoEmotions.csv")
df_clean = clean_df(df)

In [3]:
r, c = df_clean.shape
print(f"The data has {r} row and {c} columns")
df_clean.head(3)

The data has 171820 row and 33 columns


Unnamed: 0,text,id,author,subreddit,rater_id,admiration,amusement,anger,annoyance,approval,...,love,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral
0,That game hurt.,eew5j0j,Brdd9,nrl,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,"You do right, if you don't care then fuck 'em!",ed2mah1,Labalool,confessions,37,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,Man I love reddit.,eeibobj,MrsRobertshaw,facepalm,18,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [4]:
clustered_df = create_df_all_labels(df_clean)

In [5]:

r, c = clustered_df.shape
print(f"The data has {r} row and {c} columns")
clustered_df.head(3)

The data has 171820 row and 10 columns


Unnamed: 0,text,id,author,subreddit,rater_id,level0,level1,level2,level3,plutchik
0,That game hurt.,eew5j0j,Brdd9,nrl,1,sadness,dis_sad,dis_sad_gri,rem_emb_dis_sad_gri,grief
2,"You do right, if you don't care then fuck 'em!",ed2mah1,Labalool,confessions,37,neutral,neutral,neutral,neutral,neutral
3,Man I love reddit.,eeibobj,MrsRobertshaw,facepalm,18,love,love,exc_joy_lov,amu_exc_joy_lov,love


In [None]:
"""
#Classifier
from huggingface_hub import notebook_login
notebook_login()
"""

In [None]:
bert = "bert-base-cased"
models_dir = "../models/bert_base_cased/"
results_dir = "../results/bert_base_cased/"

### BERT for level 0 -> 27 +1 emotions
following: https://huggingface.co/docs/transformers/tasks/sequence_classification

In [None]:
pd.options.display.max_colwidth = None # default value is 50, max would be "None"
pd.set_option('display.max_rows', 50) # default value is 10, max would be "None"

annoyance_example = random.sample(list(clustered_df.id[clustered_df.level0 == "annoyance"]), k=1) # example for annoyance
clustered_df.query('id==@annoyance_example')  

In [None]:
desire_example = random.sample(list(clustered_df.id[clustered_df.level0 == "desire"]), k=1) # example for desire
clustered_df.query('id==@desire_example')

In [12]:
# check if data set is balanced
classCounts_0 = clustered_df.level0.value_counts() 
print(classCounts_0)
# -> not balanced

neutral           55298
approval          11259
admiration        10531
annoyance          8342
disapproval        7686
gratitude          7075
amusement          6130
curiosity          5885
anger              5202
love               4957
confusion          4938
realization        4714
disappointment     4706
optimism           4519
joy                4329
sadness            3827
caring             3523
surprise           3472
excitement         3020
disgust            2914
desire             2147
fear               1778
remorse            1510
embarrassment      1433
nervousness         796
relief              788
pride               690
grief               351
Name: level0, dtype: int64


In [13]:
numberOfDocuments_0 = len(clustered_df)
numberOfDocuments_0

171820

In [None]:
dataset_0, results_0, tokenized_testing_data_0 = get_bert(clustered_df, "level0", bert, models_dir, results_dir)

#### Evaluation

In [72]:
df_results_0 = pd.DataFrame.from_dict(results_0)
df_id_0 =  pd.DataFrame(dataset_0["id"])
df_id_0 = df_id_0.reset_index()
df_results_0["id"] = df_id_0["id"]
df_results_0

Unnamed: 0,label,score,id
0,neutral,0.176830,eez4phj
1,neutral,0.703029,eeejsh7
2,neutral,0.079363,ed8auqx
3,neutral,0.420149,ee103qd
4,neutral,0.331734,efaudck
...,...,...,...
995,admiration,0.081957,eetak09
996,approval,0.110026,edlin87
997,neutral,0.214870,ed23fr7
998,neutral,0.735096,eeb063d


In [76]:
data_classifies_0 = pd.merge(dataset_0, df_results_0, on='id', how='left') # merge classified data with original training data
data_classifies_0.to_pickle(results_dir + "data_classified_level0.pkl")  
data_classifies_0 # contain sgoldstandard and cluster of results

In [78]:
test_data_0 = data_classifies_0.query(f'id in {tokenized_testing_data_0["id"]}')
# tokenized_testing_data: daten der Testdaten

In [83]:
target_names_0 = clustered_df.level0.unique().tolist()
print(classification_report(test_data_0.level0, test_data_0.label_y, target_names=target_names_0))
# level0 -> gold standard , label -> prediction

                precision    recall  f1-score   support

       sadness       0.25      0.75      0.38        12
          love       0.00      0.00      0.00         6
     gratitude       0.00      0.00      0.00         7
   disapproval       0.50      0.12      0.20         8
     amusement       0.09      0.07      0.08        15
disappointment       0.00      0.00      0.00         3
   realization       0.00      0.00      0.00         3
    admiration       0.09      0.14      0.11         7
     annoyance       0.00      0.00      0.00         3
     confusion       0.00      0.00      0.00         4
      optimism       0.33      0.11      0.17         9
    excitement       0.00      0.00      0.00         5
        caring       0.00      0.00      0.00         2
       remorse       0.00      0.00      0.00         3
           joy       0.00      0.00      0.00         2
      approval       0.40      0.89      0.55         9
 embarrassment       0.00      0.00      0.00  



In [87]:
#Final Classification/Viz

final_0 = data_classifies_0.copy()
final_0['label_y'].value_counts()/final_0['label_y'].value_counts().sum() # ratio

label_y
neutral        0.534314
admiration     0.174510
gratitude      0.091176
approval       0.059804
curiosity      0.038235
disapproval    0.024510
anger          0.024510
amusement      0.019608
annoyance      0.015686
love           0.005882
sadness        0.004902
caring         0.003922
optimism       0.001961
disgust        0.000980
Name: count, dtype: float64

### BERT for level 1 -> 17 + 1 emotions

In [32]:
pd.options.display.max_colwidth = None # default value is 50, max would be "None"
pd.set_option('display.max_rows', 50) # default value is 10, max would be "None"

dis_sad_example = random.sample(list(clustered_df.id[clustered_df.level1 == "dis_sad"]), k=1) # example for annoyance
clustered_df.query('id==@dis_sad_example')  


Unnamed: 0,text,id,author,subreddit,rater_id,level0,level1,level2,level3
47072,and [NAME] got blessed with a LocationBot cat fact?! There is no justice in the world...,eefk2mt,themaskedserpent,bestoflegaladvice,2,sadness,dis_sad,dis_sad_gri,rem_emb_dis_sad_gri
49953,and [NAME] got blessed with a LocationBot cat fact?! There is no justice in the world...,eefk2mt,themaskedserpent,bestoflegaladvice,5,disappointment,dis_sad,dis_sad_gri,rem_emb_dis_sad_gri


In [30]:
# check if data set is balanced
classCounts_1 = clustered_df.level1.value_counts() 
print(classCounts_1)
# -> not balanced

neutral          55298
app_rea          15973
ang_ann          13544
pri_adm          11221
cur_con          10823
dis_sad           8533
gra_rel           7863
disapproval       7686
exc_joy           7349
des_opt           6666
amusement         6130
love              4957
caring            3523
surprise          3472
disgust           2914
fea_ner           2574
rem_emb           1510
embarrassment     1433
grief              351
Name: level1, dtype: int64


In [31]:
numberOfDocuments_1 = len(clustered_df)
numberOfDocuments_1

171820

In [None]:
dataset_1, results_1, tokenized_testing_data_1 = get_bert(clustered_df, "level1", bert, models_dir, results_dir)

#### Evaluation

In [None]:
df_results_1 = pd.DataFrame.from_dict(results_1)
df_id_1 =  pd.DataFrame(dataset_1["id"])
df_id_1 = df_id_1.reset_index()
df_results_1["id"] = df_id_1["id"]
df_results_1

In [None]:
data_classifies_1 = pd.merge(dataset_1, df_results_1, on='id', how='left') # merge classified data with original training data
data_classifies_1.to_pickle(results_dir +"data_classified_level1.pkl") # save
data_classifies_1 # contains goldstandard and cluster of results -> calculate F1, Precision, Recall

In [None]:
test_data_1 = data_classifies_1.query(f'id in {tokenized_testing_data_1["id"]}')

In [None]:
target_names_1 = clustered_df.level1.unique().tolist()
print(classification_report(test_data_1.level1, test_data_1.label_y, target_names=target_names_1))
# level1 -> gold standard , label -> prediction

In [None]:
#Final Classification/Viz
#final = pd.concat([dataset, pd.DataFrame.from_dict(results)],axis=1) # attach classified label to data
final_1 = data_classifies_1.copy()
final_1['label_y'].value_counts()/final_1['label_y'].value_counts().sum() # ratio

### BERT for level 2 -> 11 + 1 emotions

In [None]:
pd.options.display.max_colwidth = None # default value is 50, max would be "None"
pd.set_option('display.max_rows', 50) # default value is 10, max would be "None"

dis_sad_gri_example = random.sample(list(clustered_df.id[clustered_df.level2 == "dis_sad_gri"]), k=1) # example for annoyance
clustered_df.query('id==@dis_sad_gri_example')

In [None]:
# check if data set is balanced
classCounts_2 = clustered_df.level2.value_counts() 
print(classCounts_2)
# -> not balanced

In [None]:
numberOfDocuments_2 = len(clustered_df)
numberOfDocuments_2

In [None]:
dataset_2, results_2, tokenized_testing_data_2 = get_bert(clustered_df, "level2", bert, models_dir, results_dir)

#### Evaluation

In [None]:
df_results_2 = pd.DataFrame.from_dict(results_2)
df_id_2 =  pd.DataFrame(dataset_2["id"])
df_id_2 = df_id_2.reset_index()
df_results_2["id"] = df_id_2["id"]
df_results_2

In [None]:
data_classifies_2 = pd.merge(dataset_2, df_results_2, on='id', how='left')
data_classifies_2.to_pickle(results_dir +"data_classified_level2.pkl")  
data_classifies_2

In [None]:
test_data_2 = data_classifies_2.query(f'id in {tokenized_testing_data_2["id"]}')
# tokenized_testing_data: daten der Testdaten

In [None]:
target_names_2 = clustered_df.level2.unique().tolist()
print(classification_report(test_data_2.level2, test_data_2.label_y, target_names=target_names_2))

In [None]:
#Final Classification/Viz

final_2 = data_classifies_2.copy()
final_2['label_y'].value_counts()/final_2['label_y'].value_counts().sum() # ratio

### BERT for level 3 -> 7 + 1 emotions

In [None]:
pd.options.display.max_colwidth = None # default value is 50, max would be "None"
pd.set_option('display.max_rows', 50) # default value is 10, max would be "None"

rem_emb_dis_sad_gri_example = random.sample(list(clustered_df.id[plutchik_df.plutchik == "rem_emb_dis_sad_gri"]), k=1) # example for annoyance
plutchik_df.query('id==@rem_emb_dis_sad_gri_example')

In [None]:
# check if data set is balanced
classCounts_3 = clustered_df.level3.value_counts() 
print(classCounts_3)
# -> not balanced

In [None]:
numberOfDocuments_3 = len(clustered_df)
numberOfDocuments_3

In [None]:
dataset_3, results_3, tokenized_testing_data_3 = get_bert(clustered_df, "level3", bert, models_dir, results_dir)

#### Evaluation

In [None]:
df_results_3 = pd.DataFrame.from_dict(results_3)
df_id_3 =  pd.DataFrame(dataset_3["id"])
df_id_3 = df_id_3.reset_index()
df_results_3["id"] = df_id_3["id"]
df_results_3

In [None]:
data_classifies_3 = pd.merge(dataset_3, df_results_3, on='id', how='left')
data_classifies_3.to_pickle(results_dir +"data_classified_level3.pkl")  
data_classifies_3

In [None]:
test_data_3 = data_classifies_3.query(f'id in {tokenized_testing_data_3["id"]}')

In [None]:
target_names_3 = clustered_df.level3.unique().tolist()
print(classification_report(test_data_3.level0, test_data_3.label_y, target_names=target_names_3))

In [None]:
#Final Classification/Viz

final_3 = data_classifies_3.copy()
final_3['label_y'].value_counts()/final_3['label_y'].value_counts().sum() # ratio

### BERT for plutchik -> 14 + 1 emotions

In [None]:
pd.options.display.max_colwidth = None # default value is 50, max would be "None"
pd.set_option('display.max_rows', 50) # default value is 10, max would be "None"

grief_example = random.sample(list(clustered_df.id[clustered_df.plutchik == "grief"]), k=1) # example for annoyance
clustered_df.query('id==@grief_example')

In [None]:
# check if data set is balanced
classCounts_p = clustered_df.plutchik.value_counts() 
print(classCounts_p)
# -> not balanced

In [None]:
numberOfDocuments_p = len(clustered_df)
numberOfDocuments_p

In [None]:
dataset_p, results_p, tokenized_testing_data_p = get_bert(clustered_df, "plutchik", bert, models_dir, results_dir)

#### Evaluation

In [None]:
df_results_p = pd.DataFrame.from_dict(results_p)
df_id_p =  pd.DataFrame(dataset_p["id"])
df_id_p = df_id_p.reset_index()
df_results_p["id"] = df_id_p["id"]
df_results_p

In [None]:
data_classifies_p = pd.merge(dataset_p, df_results_p, on='id', how='left')
data_classifies_p.to_pickle(results_dir +"data_classified_plutchik.pkl")  
data_classifies_p

In [None]:
test_data_p = data_classifies_p.query(f'id in {tokenized_testing_data_p["id"]}')

In [None]:
target_names_p = clustered_df.plutchik.unique().tolist()
print(classification_report(test_data_p.plutchik, test_data_p.label_y, target_names=target_names_p))

In [None]:
#Final Classification/Viz

final_p = data_classifies_p.copy()
final_p['label_y'].value_counts()/final_p['label_y'].value_counts().sum() # ratio