### Multi-Class Text Classification for Emotions using BERT

In [None]:
# ! pip install datasets huggingface_hub ipywidgets evaluate 'transformers[torch]' torch xformers plotnine

In [1]:
import numpy as np
import pandas as pd

# We need the sys package to load modules from another directory:
import sys
sys.path.append('../')
from preprocessing.preprocessors import *
from training.bert_func import *

import random
import evaluate

from datasets import Dataset
from sklearn.metrics import classification_report
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import pipeline
from tqdm import tqdm

from plotnine import ggplot, aes, geom_tile, coord_flip,theme,geom_line,labs,element_text
from plotnine import scale_x_discrete,geom_vline


#### Get the data

In [2]:
df = pd.read_csv("../data/GoEmotions.csv")
df_clean = clean_df(df)

In [3]:
r, c = df_clean.shape
print(f"The data has {r} row and {c} columns")
df_clean.head(3)

The data has 171820 row and 33 columns


Unnamed: 0,text,id,author,subreddit,rater_id,admiration,amusement,anger,annoyance,approval,...,love,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral
0,That game hurt.,eew5j0j,Brdd9,nrl,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,"You do right, if you don't care then fuck 'em!",ed2mah1,Labalool,confessions,37,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,Man I love reddit.,eeibobj,MrsRobertshaw,facepalm,18,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [6]:
pivoted_df = create_pivoted_df(df_clean)
hierarchical_df = add_hierarchical_levels(pivoted_df)

In [7]:
r, c = hierarchical_df.shape
print(f"The data has {r} row and {c} columns")
hierarchical_df.head(3)

The data has 171820 row and 10 columns


Unnamed: 0,text,id,author,subreddit,rater_id,level0,level1,level2,level3,plutchik
0,That game hurt.,eew5j0j,Brdd9,nrl,1,sadness,dis_sad,dis_sad_gri,rem_emb_dis_sad_gri,grief
2,"You do right, if you don't care then fuck 'em!",ed2mah1,Labalool,confessions,37,neutral,neutral,neutral,neutral,neutral
3,Man I love reddit.,eeibobj,MrsRobertshaw,facepalm,18,love,love,exc_joy_lov,amu_exc_joy_lov,love


In [8]:
# use majority vote for texts with more than one label (from different raters) to only have one gold standard
majority_vote_df = majority_voted_df(hierarchical_df)
r, c = majority_vote_df.shape
print(f"The majority voted data has {r} row and {c} columns")

clustered_df = hierarchical_df.merge(majority_vote_df, on=['id', 'level0'], how='inner')

The majority voted data has 43379 row and 2 columns


In [9]:
r, c = clustered_df.shape
print(f"The data has {r} row and {c} columns")
clustered_df

The data has 93683 row and 10 columns


Unnamed: 0,text,id,author,subreddit,rater_id,level0,level1,level2,level3,plutchik
0,That game hurt.,eew5j0j,Brdd9,nrl,1,sadness,dis_sad,dis_sad_gri,rem_emb_dis_sad_gri,grief
1,That game hurt.,eew5j0j,Brdd9,nrl,52,sadness,dis_sad,dis_sad_gri,rem_emb_dis_sad_gri,grief
2,"You do right, if you don't care then fuck 'em!",ed2mah1,Labalool,confessions,37,neutral,neutral,neutral,neutral,neutral
3,"You do right, if you don't care then fuck 'em!",ed2mah1,Labalool,confessions,50,neutral,neutral,neutral,neutral,neutral
4,Man I love reddit.,eeibobj,MrsRobertshaw,facepalm,18,love,love,exc_joy_lov,amu_exc_joy_lov,love
...,...,...,...,...,...,...,...,...,...,...
93678,i hate that it's over an interstate bridge so ...,eetqysc,second_ary,houston,76,anger,ang_ann,dis_ang_ann,dis_ang_ann_dis,rage
93679,He called [NAME] to the Steelers and was outsp...,eed7qdq,sw337,steelers,35,neutral,neutral,neutral,neutral,neutral
93680,He called [NAME] to the Steelers and was outsp...,eed7qdq,sw337,steelers,36,neutral,neutral,neutral,neutral,neutral
93681,That guy who said that it's a good idea to nev...,edwrnhz,Plays-0-Cost-Cards,seduction,27,admiration,pri_adm,pri_adm_gra_rel,pri_adm_gra_rel_app_rea,admiration


In [20]:
# create a sample for tests
sample_clustered_df = clustered_df.sample(n=1000, replace=False, random_state=123)

In [None]:
"""
#Classifier
from huggingface_hub import notebook_login
notebook_login()
"""

In [10]:
bert = "bert-base-cased"
models_dir = "../models/bert_base_cased/"
results_dir = "../results/bert_base_cased/"

### BERT for level 0 -> 27 +1 emotions
following: https://huggingface.co/docs/transformers/tasks/sequence_classification

In [11]:
pd.options.display.max_colwidth = None # default value is 50, max would be "None"
pd.set_option('display.max_rows', 50) # default value is 10, max would be "None"

annoyance_example = random.sample(list(clustered_df.id[clustered_df.level0 == "annoyance"]), k=1) # example for annoyance
clustered_df.query('id==@annoyance_example')  

Unnamed: 0,text,id,author,subreddit,rater_id,level0,level1,level2,level3,plutchik
40932,"Play stupid games, win stupid prizes.",ed6v0kb,BoredDuccReddit,HadToHurt,4,annoyance,ang_ann,dis_ang_ann,dis_ang_ann_dis,rage
40933,"Play stupid games, win stupid prizes.",ed6v0kb,BoredDuccReddit,HadToHurt,60,annoyance,ang_ann,dis_ang_ann,dis_ang_ann_dis,rage


In [12]:
desire_example = random.sample(list(clustered_df.id[clustered_df.level0 == "desire"]), k=1) # example for desire
clustered_df.query('id==@desire_example')

Unnamed: 0,text,id,author,subreddit,rater_id,level0,level1,level2,level3,plutchik
20310,i wish i could be an ex someone wanted to get back with.,edv4ef9,Tobor_Yllems,OkCupid,51,desire,des_opt,des_opt_car,des_opt_car,love
20311,i wish i could be an ex someone wanted to get back with.,edv4ef9,Tobor_Yllems,OkCupid,18,desire,des_opt,des_opt_car,des_opt_car,love


In [13]:
# check if data set is balanced
classCounts_0 = clustered_df.level0.value_counts() 
print(classCounts_0)
# -> not balanced

level0
neutral           40230
admiration         6653
gratitude          5657
approval           4148
amusement          4028
love               3533
disapproval        3147
curiosity          3044
annoyance          2755
anger              2248
confusion          1893
optimism           1885
joy                1743
sadness            1655
surprise           1590
caring             1404
disappointment     1322
realization        1171
disgust            1022
excitement         1000
fear                889
desire              850
remorse             784
embarrassment       448
relief              194
nervousness         186
pride               126
grief                78
Name: count, dtype: int64


In [14]:
numberOfDocuments_0 = len(clustered_df)
numberOfDocuments_0

93683

In [None]:
dataset_0, results_0, tokenized_testing_data_0 = get_bert(clustered_df, "level0", bert, models_dir, results_dir)

Map:   0%|          | 0/74944 [00:00<?, ? examples/s]

Map:   0%|          | 0/18739 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.9869,0.916371,0.73339
2,0.6475,0.696397,0.810609
3,0.3664,0.592896,0.859704
4,0.2236,0.62538,0.878542
5,0.1463,0.637337,0.891456


 59%|█████▉    | 55735/93683 [10:35<07:12, 87.66it/s]

In [15]:
from transformers import AutoModelForSequenceClassification

# Replace 'models_dir' and 'level' with your actual paths and level
model_path = models_dir + "model_level0"

# Load the saved model
loaded_model = AutoModelForSequenceClassification.from_pretrained(model_path)


In [19]:
id2label = {0: 'sadness', 1: 'neutral', 2: 'love', 3: 'gratitude', 4: 'disapproval',
                    5: 'amusement', 6: 'disappointment', 7: 'realization', 8: 'admiration', 9:
                    'annoyance', 10: 'confusion', 11: 'optimism', 12: 'excitement', 13: 'caring',
                    14: 'remorse', 15: 'joy', 16: 'approval', 17: 'embarrassment', 18: 'surprise',
                    19: 'curiosity', 20: 'anger', 21: 'grief', 22: 'disgust', 23: 'pride', 24: 'desire',
                    25: 'relief', 26: 'fear', 27: 'nervousness'}
label2id = {value: key for key, value in id2label.items()}

tokenizer = AutoTokenizer.from_pretrained(bert)

dataset = clustered_df.copy()
dataset["label"] = dataset["level0"].map(label2id.get)

training_data = dataset.groupby("level0").sample(frac=0.8, random_state=25) # stratified sampling
testing_data = dataset.drop(training_data.index)

training_data = Dataset.from_pandas(training_data) # create transformers compatible dataset from dataframe
testing_data = Dataset.from_pandas(testing_data)

def tokenize_function(examples): # replace representation of data, convert column text to tensor-based representation
    return tokenizer(examples["text"], padding="max_length", truncation=True)

tokenized_training_data = training_data.map(tokenize_function, batched=True) # convert text to tensor form
tokenized_testing_data = testing_data.map(tokenize_function, batched=True)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer) # Padding -> map all tensors to the same size
accuracy = evaluate.load("accuracy") # define evaluation method -> quality

def compute_metrics(eval_pred): # function calculation metric
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)
    
classifier = pipeline("text-classification", model=models_dir+"model_level0", device=0) # method pipeline -> sting for textclassificaton, folder, device (graphics card)
results = [classifier(text,truncation=True) for text in tqdm(dataset.text.to_list())] # listcomprehension over all texts, tokenization in model, truncation -> padding too long texts

results = [tmp[0] for tmp in results]
pd.DataFrame(results).to_pickle(results_dir+"results_level0.pkl")  # convert as dataframe, pick, safe

Map:   0%|          | 0/74944 [00:00<?, ? examples/s]

Map:   0%|          | 0/18739 [00:00<?, ? examples/s]

100%|██████████| 93683/93683 [18:20<00:00, 85.12it/s]


In [20]:
dataset_0 = dataset
results_0 = results
tokenized_testing_data_0 = tokenized_testing_data

#### Evaluation

In [31]:
pd.DataFrame(testing_data)

Unnamed: 0,text,id,author,subreddit,rater_id,level0,level1,level2,level3,plutchik,label,__index_level_0__
0,That game hurt.,eew5j0j,Brdd9,nrl,1,sadness,dis_sad,dis_sad_gri,rem_emb_dis_sad_gri,grief,0,0
1,Man I love reddit.,eeibobj,MrsRobertshaw,facepalm,81,love,love,exc_joy_lov,amu_exc_joy_lov,love,2,5
2,Man I love reddit.,eeibobj,MrsRobertshaw,facepalm,2,love,love,exc_joy_lov,amu_exc_joy_lov,love,2,7
3,"[NAME] was nowhere near them, he was by the Falcon.",eda6yn6,American_Fascist713,starwarsspeculation,2,neutral,neutral,neutral,neutral,neutral,1,8
4,BUT IT'S HER TURN! /s,ef7tl7i,Genesis2001,SandersForPresident,17,neutral,neutral,neutral,neutral,neutral,1,17
...,...,...,...,...,...,...,...,...,...,...,...,...
18734,Oh [NAME] i forgot about those. I used to love them.,ed0052l,Simbabz,australia,32,love,love,exc_joy_lov,amu_exc_joy_lov,love,2,93667
18735,"Thanks, [NAME]",edirq0m,iFacialedanOrphan,unpopularopinion,61,gratitude,gra_rel,pri_adm_gra_rel,pri_adm_gra_rel_app_rea,awe,3,93672
18736,"Thanks, [NAME]",edirq0m,iFacialedanOrphan,unpopularopinion,70,gratitude,gra_rel,pri_adm_gra_rel,pri_adm_gra_rel_app_rea,awe,3,93673
18737,i hate that it's over an interstate bridge so people traveling from the west from other states have to look at it edit* because it's fucking corny,eetqysc,second_ary,houston,52,anger,ang_ann,dis_ang_ann,dis_ang_ann_dis,rage,20,93677


In [37]:
clustered_df

Unnamed: 0,text,id,author,subreddit,rater_id,level0,level1,level2,level3,plutchik
0,That game hurt.,eew5j0j,Brdd9,nrl,1,sadness,dis_sad,dis_sad_gri,rem_emb_dis_sad_gri,grief
1,That game hurt.,eew5j0j,Brdd9,nrl,52,sadness,dis_sad,dis_sad_gri,rem_emb_dis_sad_gri,grief
2,"You do right, if you don't care then fuck 'em!",ed2mah1,Labalool,confessions,37,neutral,neutral,neutral,neutral,neutral
3,"You do right, if you don't care then fuck 'em!",ed2mah1,Labalool,confessions,50,neutral,neutral,neutral,neutral,neutral
4,Man I love reddit.,eeibobj,MrsRobertshaw,facepalm,18,love,love,exc_joy_lov,amu_exc_joy_lov,love
...,...,...,...,...,...,...,...,...,...,...
93678,i hate that it's over an interstate bridge so people traveling from the west from other states have to look at it edit* because it's fucking corny,eetqysc,second_ary,houston,76,anger,ang_ann,dis_ang_ann,dis_ang_ann_dis,rage
93679,He called [NAME] to the Steelers and was outspoken with how good the Juju pick was.,eed7qdq,sw337,steelers,35,neutral,neutral,neutral,neutral,neutral
93680,He called [NAME] to the Steelers and was outspoken with how good the Juju pick was.,eed7qdq,sw337,steelers,36,neutral,neutral,neutral,neutral,neutral
93681,"That guy who said that it's a good idea to never smile towards women you like? Pass, the book was ultra-overrated",edwrnhz,Plays-0-Cost-Cards,seduction,27,admiration,pri_adm,pri_adm_gra_rel,pri_adm_gra_rel_app_rea,admiration


In [38]:
df_results_0 = pd.DataFrame.from_dict(results_0)
df_id_0 =  pd.DataFrame(dataset_0["id"])
df_id_0 = df_id_0.reset_index()
df_results_0["id"] = df_id_0["id"]
df_results_0

Unnamed: 0,label,score
0,sadness,0.979451
1,sadness,0.979451
2,neutral,0.945702
3,neutral,0.945702
4,love,0.996631
...,...,...
93678,annoyance,0.576295
93679,neutral,0.990888
93680,neutral,0.990888
93681,admiration,0.957383


In [22]:
data_classifies_0 = pd.merge(dataset_0, df_results_0, on='id', how='left') # merge classified data with original training data
data_classifies_0.to_pickle(results_dir + "data_classified_level0.pkl")  
data_classifies_0 # contain sgoldstandard and cluster of results

Unnamed: 0,text,id,author,subreddit,rater_id,level0,level1,level2,level3,plutchik,label_x,label_y,score
0,That game hurt.,eew5j0j,Brdd9,nrl,1,sadness,dis_sad,dis_sad_gri,rem_emb_dis_sad_gri,grief,0,sadness,0.979451
1,That game hurt.,eew5j0j,Brdd9,nrl,1,sadness,dis_sad,dis_sad_gri,rem_emb_dis_sad_gri,grief,0,sadness,0.979451
2,That game hurt.,eew5j0j,Brdd9,nrl,52,sadness,dis_sad,dis_sad_gri,rem_emb_dis_sad_gri,grief,0,sadness,0.979451
3,That game hurt.,eew5j0j,Brdd9,nrl,52,sadness,dis_sad,dis_sad_gri,rem_emb_dis_sad_gri,grief,0,sadness,0.979451
4,"You do right, if you don't care then fuck 'em!",ed2mah1,Labalool,confessions,37,neutral,neutral,neutral,neutral,neutral,1,neutral,0.945702
...,...,...,...,...,...,...,...,...,...,...,...,...,...
221468,He called [NAME] to the Steelers and was outspoken with how good the Juju pick was.,eed7qdq,sw337,steelers,35,neutral,neutral,neutral,neutral,neutral,1,neutral,0.990888
221469,He called [NAME] to the Steelers and was outspoken with how good the Juju pick was.,eed7qdq,sw337,steelers,36,neutral,neutral,neutral,neutral,neutral,1,neutral,0.990888
221470,He called [NAME] to the Steelers and was outspoken with how good the Juju pick was.,eed7qdq,sw337,steelers,36,neutral,neutral,neutral,neutral,neutral,1,neutral,0.990888
221471,"That guy who said that it's a good idea to never smile towards women you like? Pass, the book was ultra-overrated",edwrnhz,Plays-0-Cost-Cards,seduction,27,admiration,pri_adm,pri_adm_gra_rel,pri_adm_gra_rel_app_rea,admiration,8,admiration,0.957383


In [28]:
test_data_0 = data_classifies_0.query(f'id in {tokenized_testing_data_0["id"]}')
# tokenized_testing_data: daten der Testdaten
test_data_0

Unnamed: 0,text,id,author,subreddit,rater_id,level0,level1,level2,level3,plutchik,label_x,label_y,score
0,That game hurt.,eew5j0j,Brdd9,nrl,1,sadness,dis_sad,dis_sad_gri,rem_emb_dis_sad_gri,grief,0,sadness,0.979451
1,That game hurt.,eew5j0j,Brdd9,nrl,1,sadness,dis_sad,dis_sad_gri,rem_emb_dis_sad_gri,grief,0,sadness,0.979451
2,That game hurt.,eew5j0j,Brdd9,nrl,52,sadness,dis_sad,dis_sad_gri,rem_emb_dis_sad_gri,grief,0,sadness,0.979451
3,That game hurt.,eew5j0j,Brdd9,nrl,52,sadness,dis_sad,dis_sad_gri,rem_emb_dis_sad_gri,grief,0,sadness,0.979451
8,Man I love reddit.,eeibobj,MrsRobertshaw,facepalm,18,love,love,exc_joy_lov,amu_exc_joy_lov,love,2,love,0.996631
...,...,...,...,...,...,...,...,...,...,...,...,...,...
221458,"Thanks, [NAME]",edirq0m,iFacialedanOrphan,unpopularopinion,51,gratitude,gra_rel,pri_adm_gra_rel,pri_adm_gra_rel_app_rea,awe,3,gratitude,0.996808
221463,i hate that it's over an interstate bridge so people traveling from the west from other states have to look at it edit* because it's fucking corny,eetqysc,second_ary,houston,52,anger,ang_ann,dis_ang_ann,dis_ang_ann_dis,rage,20,annoyance,0.576295
221464,i hate that it's over an interstate bridge so people traveling from the west from other states have to look at it edit* because it's fucking corny,eetqysc,second_ary,houston,52,anger,ang_ann,dis_ang_ann,dis_ang_ann_dis,rage,20,annoyance,0.576295
221465,i hate that it's over an interstate bridge so people traveling from the west from other states have to look at it edit* because it's fucking corny,eetqysc,second_ary,houston,76,anger,ang_ann,dis_ang_ann,dis_ang_ann_dis,rage,20,annoyance,0.576295


In [27]:
target_names_0 = test_data_0.level0.unique().tolist()
print(classification_report(test_data_0.level0, test_data_0.label_y, target_names=target_names_0))
# level0 -> gold standard , label -> prediction

                precision    recall  f1-score   support

       sadness       0.94      0.95      0.95      6744
          love       0.92      0.98      0.95      4147
       neutral       0.86      0.86      0.86      2020
    excitement       0.76      0.75      0.76      2225
        caring       0.83      0.86      0.84      3629
       remorse       0.78      0.91      0.84      1211
           joy       0.82      0.85      0.83      1668
     gratitude       0.83      0.93      0.88      2925
      optimism       0.94      0.83      0.88       742
      surprise       0.86      0.69      0.76      1130
     curiosity       0.89      0.86      0.87      2676
     amusement       0.82      0.88      0.85       902
   disapproval       0.96      0.88      0.92       387
    admiration       0.85      0.79      0.82       953
     annoyance       0.91      0.95      0.93       844
   realization       0.98      0.98      0.98      6797
      approval       0.95      0.32      0.48  

In [25]:
#Final Classification/Viz

final_0 = data_classifies_0.copy()
final_0['label_y'].value_counts()/final_0['label_y'].value_counts().sum() # ratio

label_y
neutral           0.447427
admiration        0.071837
gratitude         0.065340
amusement         0.045518
approval          0.041572
love              0.039946
curiosity         0.034072
disapproval       0.030261
annoyance         0.025231
anger             0.022499
confusion         0.019203
optimism          0.017045
sadness           0.016336
surprise          0.016327
joy               0.016101
caring            0.015347
disappointment    0.011369
disgust           0.010417
realization       0.010250
fear              0.009414
excitement        0.009319
remorse           0.008525
desire            0.007929
embarrassment     0.004028
relief            0.001671
nervousness       0.001671
pride             0.000993
grief             0.000352
Name: count, dtype: float64

### BERT for level 1 -> 17 + 1 emotions

In [32]:
pd.options.display.max_colwidth = None # default value is 50, max would be "None"
pd.set_option('display.max_rows', 50) # default value is 10, max would be "None"

dis_sad_example = random.sample(list(clustered_df.id[clustered_df.level1 == "dis_sad"]), k=1) # example for annoyance
clustered_df.query('id==@dis_sad_example')  


Unnamed: 0,text,id,author,subreddit,rater_id,level0,level1,level2,level3
47072,and [NAME] got blessed with a LocationBot cat fact?! There is no justice in the world...,eefk2mt,themaskedserpent,bestoflegaladvice,2,sadness,dis_sad,dis_sad_gri,rem_emb_dis_sad_gri
49953,and [NAME] got blessed with a LocationBot cat fact?! There is no justice in the world...,eefk2mt,themaskedserpent,bestoflegaladvice,5,disappointment,dis_sad,dis_sad_gri,rem_emb_dis_sad_gri


In [30]:
# check if data set is balanced
classCounts_1 = clustered_df.level1.value_counts() 
print(classCounts_1)
# -> not balanced

neutral          55298
app_rea          15973
ang_ann          13544
pri_adm          11221
cur_con          10823
dis_sad           8533
gra_rel           7863
disapproval       7686
exc_joy           7349
des_opt           6666
amusement         6130
love              4957
caring            3523
surprise          3472
disgust           2914
fea_ner           2574
rem_emb           1510
embarrassment     1433
grief              351
Name: level1, dtype: int64


In [31]:
numberOfDocuments_1 = len(clustered_df)
numberOfDocuments_1

171820

In [None]:
dataset_1, results_1, tokenized_testing_data_1 = get_bert(clustered_df, "level1", bert, models_dir, results_dir)

#### Evaluation

In [None]:
df_results_1 = pd.DataFrame.from_dict(results_1)
df_id_1 =  pd.DataFrame(dataset_1["id"])
df_id_1 = df_id_1.reset_index()
df_results_1["id"] = df_id_1["id"]
df_results_1

In [None]:
data_classifies_1 = pd.merge(dataset_1, df_results_1, on='id', how='left') # merge classified data with original training data
data_classifies_1.to_pickle(results_dir +"data_classified_level1.pkl") # save
data_classifies_1 # contains goldstandard and cluster of results -> calculate F1, Precision, Recall

In [None]:
test_data_1 = data_classifies_1.query(f'id in {tokenized_testing_data_1["id"]}')

In [None]:
target_names_1 = test_data_1.level1.unique().tolist()
print(classification_report(test_data_1.level1, test_data_1.label_y, target_names=target_names_1))
# level1 -> gold standard , label -> prediction

In [None]:
#Final Classification/Viz
#final = pd.concat([dataset, pd.DataFrame.from_dict(results)],axis=1) # attach classified label to data
final_1 = data_classifies_1.copy()
final_1['label_y'].value_counts()/final_1['label_y'].value_counts().sum() # ratio

### BERT for level 2 -> 11 + 1 emotions

In [None]:
pd.options.display.max_colwidth = None # default value is 50, max would be "None"
pd.set_option('display.max_rows', 50) # default value is 10, max would be "None"

dis_sad_gri_example = random.sample(list(clustered_df.id[clustered_df.level2 == "dis_sad_gri"]), k=1) # example for annoyance
clustered_df.query('id==@dis_sad_gri_example')

In [None]:
# check if data set is balanced
classCounts_2 = clustered_df.level2.value_counts() 
print(classCounts_2)
# -> not balanced

In [None]:
numberOfDocuments_2 = len(clustered_df)
numberOfDocuments_2

In [None]:
dataset_2, results_2, tokenized_testing_data_2 = get_bert(clustered_df, "level2", bert, models_dir, results_dir)

#### Evaluation

In [None]:
df_results_2 = pd.DataFrame.from_dict(results_2)
df_id_2 =  pd.DataFrame(dataset_2["id"])
df_id_2 = df_id_2.reset_index()
df_results_2["id"] = df_id_2["id"]
df_results_2

In [None]:
data_classifies_2 = pd.merge(dataset_2, df_results_2, on='id', how='left')
data_classifies_2.to_pickle(results_dir +"data_classified_level2.pkl")  
data_classifies_2

In [None]:
test_data_2 = data_classifies_2.query(f'id in {tokenized_testing_data_2["id"]}')
# tokenized_testing_data: daten der Testdaten

In [None]:
target_names_2 = test_data_2.level2.unique().tolist()
print(classification_report(test_data_2.level2, test_data_2.label_y, target_names=target_names_2))

In [None]:
#Final Classification/Viz

final_2 = data_classifies_2.copy()
final_2['label_y'].value_counts()/final_2['label_y'].value_counts().sum() # ratio

### BERT for level 3 -> 7 + 1 emotions

In [None]:
pd.options.display.max_colwidth = None # default value is 50, max would be "None"
pd.set_option('display.max_rows', 50) # default value is 10, max would be "None"

rem_emb_dis_sad_gri_example = random.sample(list(clustered_df.id[plutchik_df.plutchik == "rem_emb_dis_sad_gri"]), k=1) # example for annoyance
plutchik_df.query('id==@rem_emb_dis_sad_gri_example')

In [None]:
# check if data set is balanced
classCounts_3 = clustered_df.level3.value_counts() 
print(classCounts_3)
# -> not balanced

In [None]:
numberOfDocuments_3 = len(clustered_df)
numberOfDocuments_3

In [None]:
dataset_3, results_3, tokenized_testing_data_3 = get_bert(clustered_df, "level3", bert, models_dir, results_dir)

#### Evaluation

In [None]:
df_results_3 = pd.DataFrame.from_dict(results_3)
df_id_3 =  pd.DataFrame(dataset_3["id"])
df_id_3 = df_id_3.reset_index()
df_results_3["id"] = df_id_3["id"]
df_results_3

In [None]:
data_classifies_3 = pd.merge(dataset_3, df_results_3, on='id', how='left')
data_classifies_3.to_pickle(results_dir +"data_classified_level3.pkl")  
data_classifies_3

In [None]:
test_data_3 = data_classifies_3.query(f'id in {tokenized_testing_data_3["id"]}')

In [None]:
target_names_3 = test_data_3.level3.unique().tolist()
print(classification_report(test_data_3.level0, test_data_3.label_y, target_names=target_names_3))

In [None]:
#Final Classification/Viz

final_3 = data_classifies_3.copy()
final_3['label_y'].value_counts()/final_3['label_y'].value_counts().sum() # ratio

### BERT for plutchik -> 14 + 1 emotions

In [None]:
pd.options.display.max_colwidth = None # default value is 50, max would be "None"
pd.set_option('display.max_rows', 50) # default value is 10, max would be "None"

grief_example = random.sample(list(clustered_df.id[clustered_df.plutchik == "grief"]), k=1) # example for annoyance
clustered_df.query('id==@grief_example')

In [None]:
# check if data set is balanced
classCounts_p = clustered_df.plutchik.value_counts() 
print(classCounts_p)
# -> not balanced

In [None]:
numberOfDocuments_p = len(clustered_df)
numberOfDocuments_p

In [None]:
dataset_p, results_p, tokenized_testing_data_p = get_bert(clustered_df, "plutchik", bert, models_dir, results_dir)

#### Evaluation

In [None]:
df_results_p = pd.DataFrame.from_dict(results_p)
df_id_p =  pd.DataFrame(dataset_p["id"])
df_id_p = df_id_p.reset_index()
df_results_p["id"] = df_id_p["id"]
df_results_p

In [None]:
data_classifies_p = pd.merge(dataset_p, df_results_p, on='id', how='left')
data_classifies_p.to_pickle(results_dir +"data_classified_plutchik.pkl")  
data_classifies_p

In [None]:
test_data_p = data_classifies_p.query(f'id in {tokenized_testing_data_p["id"]}')

In [None]:
target_names_p = test_data_p.plutchik.unique().tolist()
print(classification_report(test_data_p.plutchik, test_data_p.label_y, target_names=target_names_p))

In [None]:
#Final Classification/Viz

final_p = data_classifies_p.copy()
final_p['label_y'].value_counts()/final_p['label_y'].value_counts().sum() # ratio