### Multi-Class Text Classification for Emotions using BERT

In [2]:
# One-time environment setup: uncomment and run to install the notebook's dependencies.
# ! pip install datasets huggingface_hub ipywidgets evaluate 'transformers[torch]' torch xformers plotnine sentencepiece

In [1]:
import numpy as np
import pandas as pd

# We need the sys package to load modules from another directory:
import sys
# Make the parent directory importable so the project-local `preprocessing` package resolves.
sys.path.append('../')
# NOTE(review): the wildcard import hides where helper names come from; the helpers
# called below (clean_df, create_pivoted_df, add_hierarchical_levels, majority_voted_df,
# backtranslated_df, bracktranslate_emo) are presumably all defined in this module.
from preprocessing.preprocessors import *

import random
import evaluate

from datasets import Dataset
from sklearn.metrics import classification_report
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import pipeline
from tqdm import tqdm

from plotnine import ggplot, aes, geom_tile, coord_flip,theme,geom_line,labs,element_text
from plotnine import scale_x_discrete,geom_vline


#### Get the data

In [2]:
# Read the raw GoEmotions export and pass it through the project's clean_df helper.
df = pd.read_csv("../data/GoEmotions.csv")
df_clean = clean_df(df)

# Report the size of the cleaned frame, then preview its first three rows.
r = df_clean.shape[0]
c = df_clean.shape[1]
print(f"The data has {r} row and {c} columns")
df_clean.head(n=3)

The data has 171820 row and 33 columns


Unnamed: 0,text,id,author,subreddit,rater_id,admiration,amusement,anger,annoyance,approval,...,love,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral
0,That game hurt.,eew5j0j,Brdd9,nrl,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,"You do right, if you don't care then fuck 'em!",ed2mah1,Labalool,confessions,37,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,Man I love reddit.,eeibobj,MrsRobertshaw,facepalm,18,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [6]:
# Run the two project helpers in sequence; the displayed result carries the
# level0..level3 and plutchik columns in place of the one-column-per-emotion layout.
pivoted_df = create_pivoted_df(df_clean)
hierarchical_df = add_hierarchical_levels(pivoted_df)

# Report the size of the resulting frame, then preview its first three rows.
r = hierarchical_df.shape[0]
c = hierarchical_df.shape[1]
print(f"The data has {r} row and {c} columns")
hierarchical_df.head(n=3)

The data has 171820 row and 10 columns


Unnamed: 0,text,id,author,subreddit,rater_id,level0,level1,level2,level3,plutchik
0,That game hurt.,eew5j0j,Brdd9,nrl,1,sadness,dis_sad,dis_sad_gri,rem_emb_dis_sad_gri,grief
2,"You do right, if you don't care then fuck 'em!",ed2mah1,Labalool,confessions,37,neutral,neutral,neutral,neutral,neutral
3,Man I love reddit.,eeibobj,MrsRobertshaw,facepalm,18,love,love,exc_joy_lov,amu_exc_joy_lov,love


In [7]:
# Texts labelled by several raters can carry more than one label; the project's
# majority_voted_df helper reduces them so that each text has a single gold label.
majority_vote_df = majority_voted_df(hierarchical_df)
r = len(majority_vote_df.index)
c = len(majority_vote_df.columns)
print(f"The majority voted data has {r} row and {c} columns")

The majority voted data has 43379 row and 2 columns


In [8]:
# Inner-join the per-rater rows with the majority-vote result on (id, level0): only
# rows whose level0 label matches the voted label for that text id are kept.
clustered_df = hierarchical_df.merge(majority_vote_df, on=['id', 'level0'], how='inner')
r, c = clustered_df.shape
print(f"The data has {r} row and {c} columns")
clustered_df

The data has 93683 row and 10 columns


Unnamed: 0,text,id,author,subreddit,rater_id,level0,level1,level2,level3,plutchik
0,That game hurt.,eew5j0j,Brdd9,nrl,1,sadness,dis_sad,dis_sad_gri,rem_emb_dis_sad_gri,grief
1,That game hurt.,eew5j0j,Brdd9,nrl,52,sadness,dis_sad,dis_sad_gri,rem_emb_dis_sad_gri,grief
2,"You do right, if you don't care then fuck 'em!",ed2mah1,Labalool,confessions,37,neutral,neutral,neutral,neutral,neutral
3,"You do right, if you don't care then fuck 'em!",ed2mah1,Labalool,confessions,50,neutral,neutral,neutral,neutral,neutral
4,Man I love reddit.,eeibobj,MrsRobertshaw,facepalm,18,love,love,exc_joy_lov,amu_exc_joy_lov,love
...,...,...,...,...,...,...,...,...,...,...
93678,i hate that it's over an interstate bridge so ...,eetqysc,second_ary,houston,76,anger,ang_ann,dis_ang_ann,dis_ang_ann_dis,rage
93679,He called [NAME] to the Steelers and was outsp...,eed7qdq,sw337,steelers,35,neutral,neutral,neutral,neutral,neutral
93680,He called [NAME] to the Steelers and was outsp...,eed7qdq,sw337,steelers,36,neutral,neutral,neutral,neutral,neutral
93681,That guy who said that it's a good idea to nev...,edwrnhz,Plays-0-Cost-Cards,seduction,27,admiration,pri_adm,pri_adm_gra_rel,pri_adm_gra_rel_app_rea,admiration


In [9]:
# Reduce to one row per text id: every row whose id has already appeared earlier
# in the frame is masked out, so the first occurrence of each id is retained.
distinct_df = clustered_df[~clustered_df.duplicated(subset='id', keep='first')]
r = distinct_df.shape[0]
c = distinct_df.shape[1]
print(f"The data has {r} row and {c} columns")
distinct_df

The data has 43379 row and 10 columns


Unnamed: 0,text,id,author,subreddit,rater_id,level0,level1,level2,level3,plutchik
0,That game hurt.,eew5j0j,Brdd9,nrl,1,sadness,dis_sad,dis_sad_gri,rem_emb_dis_sad_gri,grief
2,"You do right, if you don't care then fuck 'em!",ed2mah1,Labalool,confessions,37,neutral,neutral,neutral,neutral,neutral
4,Man I love reddit.,eeibobj,MrsRobertshaw,facepalm,18,love,love,exc_joy_lov,amu_exc_joy_lov,love
8,"[NAME] was nowhere near them, he was by the Fa...",eda6yn6,American_Fascist713,starwarsspeculation,2,neutral,neutral,neutral,neutral,neutral
11,Right? Considering it’s such an important docu...,eespn2i,ImperialBoss,TrueReddit,61,gratitude,gra_rel,pri_adm_gra_rel,pri_adm_gra_rel_app_rea,awe
...,...,...,...,...,...,...,...,...,...,...
93675,Everyone likes [NAME].,ee6pagw,Senshado,heroesofthestorm,57,love,love,exc_joy_lov,amu_exc_joy_lov,love
93677,i hate that it's over an interstate bridge so ...,eetqysc,second_ary,houston,52,anger,ang_ann,dis_ang_ann,dis_ang_ann_dis,rage
93679,He called [NAME] to the Steelers and was outsp...,eed7qdq,sw337,steelers,35,neutral,neutral,neutral,neutral,neutral
93681,That guy who said that it's a good idea to nev...,edwrnhz,Plays-0-Cost-Cards,seduction,27,admiration,pri_adm,pri_adm_gra_rel,pri_adm_gra_rel_app_rea,admiration


In [10]:
# create a sample for tests
# Draw 5% of the rows within each level0 class; the fixed random_state makes the draw repeatable.
sample_df = distinct_df.groupby("level0").sample(frac=0.05, random_state=25) # stratified sampling
sample_df

Unnamed: 0,text,id,author,subreddit,rater_id,level0,level1,level2,level3,plutchik
7925,Or woman who would look beautiful without her ...,ed6um9g,jackalooz,WhitePeopleTwitter,35,admiration,pri_adm,pri_adm_gra_rel,pri_adm_gra_rel_app_rea,admiration
61532,Looks like the nightingale armour from Sky rim...,ef5dv7t,LemonWedgeTheGuy,ExpectationVsReality,33,admiration,pri_adm,pri_adm_gra_rel,pri_adm_gra_rel_app_rea,admiration
93659,[NAME] at least has a pretty face and great sk...,eezzs4q,badbatch,loveafterlockup,23,admiration,pri_adm,pri_adm_gra_rel,pri_adm_gra_rel_app_rea,admiration
41278,"Real nice, swearing up a storm right in front ...",edw9jvd,Mochipants,My600lbLife,5,admiration,pri_adm,pri_adm_gra_rel,pri_adm_gra_rel_app_rea,admiration
14092,omg [NAME] is so pretty,edcnd0l,tomfoolery_69,vanderpumprules,33,admiration,pri_adm,pri_adm_gra_rel,pri_adm_gra_rel_app_rea,admiration
...,...,...,...,...,...,...,...,...,...,...
26931,Can't believe people like this actually exist.,eed5id6,iamfromLisbon,ChoosingBeggars,27,surprise,surprise,sur_cur_con,sur_cur_con,amazement
85926,Wow calm down,efctll7,dsadaOG,sadcringe,55,surprise,surprise,sur_cur_con,sur_cur_con,amazement
28013,I'm so surprised they haven't done it yet tbh!,edpcz7e,kittylovesblog,BeautyGuruChatter,15,surprise,surprise,sur_cur_con,sur_cur_con,amazement
21519,Thats beyond shockingly bad. A new level of te...,eejn7ai,WatsTatorsPrecious,DetroitPistons,67,surprise,surprise,sur_cur_con,sur_cur_con,amazement


In [11]:
# check if data set is balanced
classCounts_0 = sample_df.level0.value_counts() 
print(classCounts_0)

level0
neutral           873
admiration        155
gratitude         130
approval          103
amusement          93
love               81
disapproval        78
curiosity          71
annoyance          70
anger              56
confusion          47
optimism           47
joy                45
sadness            41
surprise           39
caring             35
disappointment     34
realization        30
disgust            26
excitement         25
fear               22
desire             21
remorse            20
embarrassment      11
nervousness         5
relief              5
pride               3
grief               2
Name: count, dtype: int64


In [12]:
# Augment the sample with the project's back-translation helper.
# The helper is called through its module because the result is stored under the same
# name as the function: with the bare name, a second run of this cell would try to call
# the returned DataFrame instead of the function and fail.
from preprocessing import preprocessors
backtranslated_df = preprocessors.backtranslated_df(sample_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

Downloading (…)okenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.33k [00:00<?, ?B/s]

Downloading (…)olve/main/source.spm:   0%|          | 0.00/768k [00:00<?, ?B/s]

Downloading (…)olve/main/target.spm:   0%|          | 0.00/797k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.27M [00:00<?, ?B/s]



Downloading pytorch_model.bin:   0%|          | 0.00/298M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

Downloading (…)olve/main/source.spm:   0%|          | 0.00/797k [00:00<?, ?B/s]

Downloading (…)olve/main/target.spm:   0%|          | 0.00/768k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.27M [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/298M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Downloading (…)okenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.47k [00:00<?, ?B/s]

Downloading (…)olve/main/source.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

Downloading (…)olve/main/target.spm:   0%|          | 0.00/826k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.59M [00:00<?, ?B/s]



Downloading pytorch_model.bin:   0%|          | 0.00/312M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.44k [00:00<?, ?B/s]

Downloading (…)olve/main/source.spm:   0%|          | 0.00/826k [00:00<?, ?B/s]

Downloading (…)olve/main/target.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.59M [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/312M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

Downloading (…)okenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

Downloading (…)olve/main/source.spm:   0%|          | 0.00/806k [00:00<?, ?B/s]

Downloading (…)olve/main/target.spm:   0%|          | 0.00/805k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.62M [00:00<?, ?B/s]



Downloading pytorch_model.bin:   0%|          | 0.00/312M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

Downloading (…)olve/main/source.spm:   0%|          | 0.00/805k [00:00<?, ?B/s]

Downloading (…)olve/main/target.spm:   0%|          | 0.00/807k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.62M [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/312M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Downloading (…)okenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

Downloading (…)olve/main/source.spm:   0%|          | 0.00/809k [00:00<?, ?B/s]

Downloading (…)olve/main/target.spm:   0%|          | 0.00/756k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.19M [00:00<?, ?B/s]



Downloading pytorch_model.bin:   0%|          | 0.00/289M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

Downloading (…)olve/main/source.spm:   0%|          | 0.00/756k [00:00<?, ?B/s]

Downloading (…)olve/main/target.spm:   0%|          | 0.00/809k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.19M [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/289M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

Downloading (…)okenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

Downloading (…)olve/main/source.spm:   0%|          | 0.00/803k [00:00<?, ?B/s]

Downloading (…)olve/main/target.spm:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/2.60M [00:00<?, ?B/s]



Downloading pytorch_model.bin:   0%|          | 0.00/307M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

Downloading (…)olve/main/source.spm:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

Downloading (…)olve/main/target.spm:   0%|          | 0.00/803k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/2.60M [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/307M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Downloading (…)okenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

Downloading (…)olve/main/source.spm:   0%|          | 0.00/789k [00:00<?, ?B/s]

Downloading (…)olve/main/target.spm:   0%|          | 0.00/814k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/2.35M [00:00<?, ?B/s]



Downloading pytorch_model.bin:   0%|          | 0.00/343M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

Downloading (…)olve/main/source.spm:   0%|          | 0.00/814k [00:00<?, ?B/s]

Downloading (…)olve/main/target.spm:   0%|          | 0.00/790k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/2.37M [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/344M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Downloading (…)okenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

Downloading (…)olve/main/source.spm:   0%|          | 0.00/790k [00:00<?, ?B/s]

Downloading (…)olve/main/target.spm:   0%|          | 0.00/814k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.66M [00:00<?, ?B/s]



Downloading pytorch_model.bin:   0%|          | 0.00/316M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

Downloading (…)olve/main/source.spm:   0%|          | 0.00/814k [00:00<?, ?B/s]

Downloading (…)olve/main/target.spm:   0%|          | 0.00/790k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.66M [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/316M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Downloading (…)okenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.36k [00:00<?, ?B/s]

Downloading (…)olve/main/source.spm:   0%|          | 0.00/803k [00:00<?, ?B/s]

Downloading (…)olve/main/target.spm:   0%|          | 0.00/842k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.59M [00:00<?, ?B/s]



Downloading pytorch_model.bin:   0%|          | 0.00/312M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

Downloading (…)olve/main/source.spm:   0%|          | 0.00/832k [00:00<?, ?B/s]

Downloading (…)olve/main/target.spm:   0%|          | 0.00/790k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.38M [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/301M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [13]:
# Display the augmented frame returned by the back-translation step.
backtranslated_df

Unnamed: 0,text,id,author,subreddit,rater_id,level0,level1,level2,level3,plutchik
0,Or woman who would look beautiful without her ...,ed6um9g,jackalooz,WhitePeopleTwitter,35,admiration,pri_adm,pri_adm_gra_rel,pri_adm_gra_rel_app_rea,admiration
1,Looks like the nightingale armour from Sky rim...,ef5dv7t,LemonWedgeTheGuy,ExpectationVsReality,33,admiration,pri_adm,pri_adm_gra_rel,pri_adm_gra_rel_app_rea,admiration
2,[NAME] at least has a pretty face and great sk...,eezzs4q,badbatch,loveafterlockup,23,admiration,pri_adm,pri_adm_gra_rel,pri_adm_gra_rel_app_rea,admiration
3,"Real nice, swearing up a storm right in front ...",edw9jvd,Mochipants,My600lbLife,5,admiration,pri_adm,pri_adm_gra_rel,pri_adm_gra_rel_app_rea,admiration
4,omg [NAME] is so pretty,edcnd0l,tomfoolery_69,vanderpumprules,33,admiration,pri_adm,pri_adm_gra_rel,pri_adm_gra_rel_app_rea,admiration
...,...,...,...,...,...,...,...,...,...,...
2237,"<unk> nta spiritotons, the miglior contratto d...",ef54wy8_it,DraziBlack,leafs,11,grief,grief,dis_sad_gri,rem_emb_dis_sad_gri,grief
2238,I'm sorry about your grandmother.,ednta6j_nl,partytimetyler,AskMenOver30,12,grief,grief,dis_sad_gri,rem_emb_dis_sad_gri,grief
2239,"He was great, the best contract ever, so sad w...",ef54wy8_nl,DraziBlack,leafs,11,grief,grief,dis_sad_gri,rem_emb_dis_sad_gri,grief
2240,"»■ tavalla Damn iloinen, että voitE.",ednta6j_fi,partytimetyler,AskMenOver30,12,grief,grief,dis_sad_gri,rem_emb_dis_sad_gri,grief


In [28]:
# Spot check: back-translate only the 'grief' rows of the sample through one pivot language.
# Every pivot language needs an English->pivot and a pivot->English Helsinki-NLP model;
# all three parallel lists are derived from the same ordered list of language codes.
pivot_codes = ["fr", "de", "es", "da", "sv", "ru", "id", "nl", "cs"]
src_model_name = [f"Helsinki-NLP/opus-mt-en-{code}" for code in pivot_codes]
tgt_model_name = [f"Helsinki-NLP/opus-mt-{code}-en" for code in pivot_codes]
language_short = [f"_{code}" for code in pivot_codes]

nr = 3  # position in the lists above; 3 selects "da"
example = bracktranslate_emo(
    sample_df[sample_df['level0'] == 'grief'],
    language_short[nr],
    src_model_name[nr],
    tgt_model_name[nr],
)
example

Downloading (…)okenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

Downloading (…)olve/main/source.spm:   0%|          | 0.00/788k [00:00<?, ?B/s]

Downloading (…)olve/main/target.spm:   0%|          | 0.00/820k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.38M [00:00<?, ?B/s]



Downloading pytorch_model.bin:   0%|          | 0.00/300M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

Downloading (…)olve/main/source.spm:   0%|          | 0.00/820k [00:00<?, ?B/s]

Downloading (…)olve/main/target.spm:   0%|          | 0.00/788k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.38M [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/300M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,text,id,author,subreddit,rater_id,level0,level1,level2,level3,plutchik
85165,I'm glad you're feeling better.,ednta6j_da,partytimetyler,AskMenOver30,12,grief,grief,dis_sad_gri,rem_emb_dis_sad_gri,grief
15929,"He was just amazing, the best contract ever, s...",ef54wy8_da,DraziBlack,leafs,11,grief,grief,dis_sad_gri,rem_emb_dis_sad_gri,grief


In [14]:
# Count the rows per level0 class after back-translation to see how the augmentation
# changed the class distribution.
classCounts_0 = backtranslated_df["level0"].value_counts()
print(classCounts_0)

level0
neutral           873
admiration        155
gratitude         130
approval          103
amusement          93
love               81
disapproval        78
curiosity          71
annoyance          70
anger              56
confusion          47
optimism           47
joy                45
sadness            41
surprise           39
caring             35
disappointment     34
realization        30
disgust            26
excitement         25
embarrassment      22
fear               22
desire             21
grief              20
nervousness        20
relief             20
remorse            20
pride              18
Name: count, dtype: int64


### BERT for level 0 -> 28 classes (27 emotions + neutral)
Following the Hugging Face sequence-classification guide: https://huggingface.co/docs/transformers/tasks/sequence_classification

In [None]:
# List the distinct level0 labels in order of first appearance.
clustered_df["level0"].unique()

In [None]:
# AutoTokenizer is already imported in the notebook's top import cell, so it is not
# re-imported here.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased") # no differentiation between upper and lower case

In [None]:
# Class-id <-> emotion-name lookups for level 0. A label's position in the list below
# is its class id; label2id is derived as the exact inverse of id2label.
id2label = dict(enumerate([
    'sadness', 'neutral', 'love', 'gratitude', 'disapproval',
    'amusement', 'disappointment', 'realization', 'admiration', 'annoyance',
    'confusion', 'optimism', 'excitement', 'caring', 'remorse',
    'joy', 'approval', 'embarrassment', 'surprise', 'curiosity',
    'anger', 'grief', 'disgust', 'pride', 'desire',
    'relief', 'fear', 'nervousness',
]))
label2id = {name: idx for idx, name in id2label.items()}

In [None]:
# create a sample for local tests
# Sample from distinct_df (one row per text id) rather than clustered_df: clustered_df
# repeats a text once per rater that agreed with the majority label, so the same text
# could otherwise end up in both the training and the testing split.
dataset = distinct_df.sample(n=1000, replace=False, random_state=123)
dataset["label"] = dataset["level0"].map(label2id) # to add column label map individual entries of emotions to ID
# An emotion missing from label2id would silently become NaN here; fail early instead.
assert dataset["label"].notna().all(), "level0 contains an emotion that is missing from label2id"
dataset

In [None]:
# split the training data
from datasets import Dataset

training_data = dataset.groupby("level0").sample(frac=0.8, random_state=25) # stratified sampling
testing_data = dataset.drop(training_data.index)

training_data = Dataset.from_pandas(training_data) # create transformers compatible dataset from dataframe
testing_data = Dataset.from_pandas(testing_data)

def tokenize_function(examples):
    """Tokenize the ``text`` field of a batch of examples.

    Texts are truncated to the tokenizer's maximum length but deliberately not
    padded here. Padding every example to the maximum length would make each
    batch as long as the model limit; the ``DataCollatorWithPadding`` given to
    the Trainer pads each batch only to its own longest sequence instead.

    Args:
        examples: Batch mapping that contains a ``"text"`` list of strings.

    Returns:
        The tokenizer output for the batch (token ids and attention masks).
    """
    return tokenizer(examples["text"], truncation=True)

# Apply the tokenizer batch-wise; the tokenizer's output fields are added as new columns.
tokenized_training_data = training_data.map(tokenize_function, batched=True) # convert text to token-id form
tokenized_testing_data = testing_data.map(tokenize_function, batched=True)

In [None]:
# Show the dataset summary (columns and number of rows) after tokenization.
tokenized_training_data

In [None]:
pd.options.display.max_colwidth = None # default value is 50; None removes the limit so full texts are shown
pd.set_option('display.max_rows', 50) # pandas default is 60; None would remove the limit

# Pick one random "annoyance" row by id and display it.
# NOTE(review): `random` is not seeded here, so a different row is shown on every run.
annoyance_example = random.sample(list(dataset.id[dataset.level0 == "annoyance"]), k=1) # one-element list of ids
dataset.query('id==@annoyance_example')  


In [None]:
# Pick one random "desire" row by id and display it (unseeded, so the row varies between runs).
desire_example = random.sample(list(dataset.id[dataset.level0 == "desire"]), k=1) # one-element list of ids
dataset.query('id==@desire_example') 

In [49]:
# Check whether the data set is balanced: number of rows per level0 class.
# Note that this counts `distinct_df`, not the 1000-row `dataset` sample used for training above.
classCounts = distinct_df.level0.value_counts() 
print(classCounts)
# -> not balanced: "neutral" dominates and the rarest classes have fewer than 110 rows

neutral           17458
admiration         3102
gratitude          2595
approval           2058
amusement          1857
love               1624
disapproval        1563
curiosity          1411
annoyance          1408
anger              1121
optimism            943
confusion           941
joy                 895
sadness             829
surprise            777
caring              704
disappointment      673
realization         602
disgust             514
excitement          503
fear                433
desire              420
remorse             406
embarrassment       229
relief              103
nervousness          99
pride                69
grief                42
Name: level0, dtype: int64


In [None]:
# Number of rows in the working sample.
numberOfDocuments = len(dataset)
numberOfDocuments

In [None]:
# Disabled Hugging Face Hub login, kept inside a string literal so it does not run.
# The string is the cell's only expression, so it is echoed as the cell output.
"""
#Classifier
from huggingface_hub import notebook_login
notebook_login()
"""

In [None]:
from transformers import DataCollatorWithPadding
# Collator that pads the examples of a batch to a common length when the batch is built.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer) # Padding -> map all tensors to the same size
data_collator

In [None]:
# Load the accuracy metric; it is used by compute_metrics during evaluation.
accuracy = evaluate.load("accuracy") # define evaluation method -> quality

In [None]:
def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    ``eval_pred`` unpacks into a pair of (per-class scores, reference label
    ids). The predicted class of each row is the index of its highest score.

    Returns:
        Whatever ``accuracy.compute`` returns for the predicted vs. reference ids.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
# Pre-trained DistilBERT with a sequence-classification head. The number of
# output classes is derived from the label mapping, so it cannot drift from
# id2label/label2id when the label set changes.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=len(id2label), id2label=id2label, label2id=label2id
)

In [None]:
# Training: configure the run, build the Trainer and fine-tune the model.

training_args = TrainingArguments(
    output_dir="../models/model_level0",  # output directory for the run
    learning_rate=2e-5,  # standard
    per_device_train_batch_size=16, # number of examples per device in one training step
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    evaluation_strategy="epoch", # evaluate on the eval dataset after every epoch
    save_strategy="epoch", # save a checkpoint after every epoch
    load_best_model_at_end=True, # reload the best checkpoint when training finishes
    push_to_hub=False #,
    #label_names=["level0"],
)

# IMPORTANT: the Trainer ties everything together: model, datasets, tokenizer,
# batch collation and the metric function used at evaluation time.

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_training_data,
    eval_dataset=tokenized_testing_data,  # the held-out 20% split doubles as the evaluation set
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics  
)

#checkpointing
#use cuda
trainer.train()

In [None]:
# Persist the trained model; the inference pipeline below loads it from this folder.
trainer.save_model("../models/model_level0_sample")

In [None]:
# Build a text-classification pipeline from the saved model and classify every text of the sample.
# NOTE(review): device=0 selects the first GPU; this presumably fails on a CPU-only machine - confirm.
classifier = pipeline("text-classification", model="../models/model_level0_sample",device=0) # arguments: task string, model folder, device (graphics card)
results = [classifier(text,truncation=True) for text in tqdm(dataset.text.to_list())] # one call per text; truncation cuts texts that are too long for the model

In [None]:
# Each pipeline call returned a one-element list; unwrap it to the bare result dict.
# Entries that are already unwrapped are kept as they are, so re-running this
# cell is harmless instead of failing on `dict[0]`.
results = [entry[0] if isinstance(entry, list) else entry for entry in results]

In [None]:
results # list of dictionaries with label and score -> the predicted class and how strongly the model scored it
pd.DataFrame(results).to_pickle("../results/results_level0_sample.pkl")  # convert to a DataFrame, pickle it, save to disk

### Evaluation

In [None]:
# One row per classified text, in the same order as the rows of `dataset`.
df_results = pd.DataFrame.from_dict(results)
# The sample ids with a fresh 0..n-1 index (the old index is kept as a column),
# so the assignment below lines up with df_results row by row.
df_id = dataset[["id"]].reset_index()
df_results["id"] = df_id["id"]
df_results

In [None]:
# Attach the predictions to the sample rows via the shared "id" column.
# Both frames have a "label" column, so pandas suffixes them:
# label_x comes from `dataset`, label_y comes from the classifier results.
data_classifies = pd.merge(dataset, df_results, on='id', how='left')

# merge classified data with original training data
# combine data with training data, concatenate results and training data
# compare -> calculate f1

In [None]:
# Save the merged gold-standard/prediction table; the commented line reloads a stored run instead.
data_classifies.to_pickle("../results/data_classified_level0_sample.pkl")  
# data_classifies = pd.read_pickle("../results/data_classified_level0_1epoch.pkl")

In [None]:
data_classifies # contains the gold standard and the classifier result -> basis for F1, precision, recall
# label_y -> label assigned by the classifier (label_x is the integer id of the gold label)
# level0 -> original (gold) label

In [None]:
# Keep only the rows whose id belongs to the testing split, so the evaluation
# uses texts the model was not trained on. `isin` does the membership test
# directly; formatting the id list into a query string would break on ids
# containing quotes and forces pandas to parse one very long expression.
test_ids = set(tokenized_testing_data["id"])
test_data = data_classifies[data_classifies["id"].isin(test_ids)]

In [None]:
# Class names in label-id order; they fix the row order of the report.
target_names = ['sadness', 'neutral', 'love', 'gratitude', 'disapproval',
       'amusement', 'disappointment', 'realization', 'admiration',
       'annoyance', 'confusion', 'optimism', 'excitement', 'caring',
       'remorse', 'joy', 'approval', 'embarrassment', 'surprise',
       'curiosity', 'anger', 'grief', 'disgust', 'pride', 'desire',
       'relief', 'fear', 'nervousness']
# level0 -> gold standard, label_y -> prediction; both columns hold label strings.
# The class list is passed as `labels`, not `target_names`: `target_names` is only
# a positional renaming of the (alphabetically sorted) labels found in the data,
# so an id-ordered list would attach wrong names to the rows, and it raises an
# error when a class is absent from the sample.
# zero_division=0 reports 0 for classes without predictions instead of warning.
print(classification_report(
    test_data.level0,
    test_data.label_y,
    labels=target_names,
    zero_division=0,
))

In [None]:
# Show the raw classifier results as a table.
pd.DataFrame.from_dict(results)

In [None]:
# Final classification / visualisation input
#final = pd.concat([dataset, pd.DataFrame.from_dict(results)],axis=1) # attach classified label to data
final = data_classifies.copy()
# Share of each predicted label among all predictions.
predicted_counts = final['label_y'].value_counts()
predicted_counts / predicted_counts.sum()

### Translation for backtranslation

In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Three parallel lists, indexed by language: the English->X checkpoint, the
# matching X->English checkpoint, and the id suffix used for that language.
src_model_name = ["Helsinki-NLP/opus-mt-en-fr", "Helsinki-NLP/opus-mt-en-de", "Helsinki-NLP/opus-mt-en-es", "Helsinki-NLP/opus-mt-en-zh", "Helsinki-NLP/opus-mt-en-vi", "Helsinki-NLP/opus-mt-en-ru", "Helsinki-NLP/opus-mt-en-it", "Helsinki-NLP/opus-mt-en-nl", "Helsinki-NLP/opus-mt-en-fi"]
tgt_model_name = ["Helsinki-NLP/opus-mt-fr-en", "Helsinki-NLP/opus-mt-de-en", "Helsinki-NLP/opus-mt-es-en", "Helsinki-NLP/opus-mt-zh-en", "Helsinki-NLP/opus-mt-vi-en", "Helsinki-NLP/opus-mt-ru-en", "Helsinki-NLP/opus-mt-it-en", "Helsinki-NLP/opus-mt-nl-en", "Helsinki-NLP/opus-mt-fi-en"]
language_short = ["_fr", "_de", "_es", "_zh", "_vi", "_ru", "_it", "_nl", "_fi"]

# Index into the three lists above; 8 selects the last entry (Finnish).
language_nr = 8

# Tokenizer/model pair for the forward direction (English -> selected language).
src_tokenizer = AutoTokenizer.from_pretrained(src_model_name[language_nr])
src_model = AutoModelForSeq2SeqLM.from_pretrained(src_model_name[language_nr])

# Tokenizer/model pair for the backward direction (selected language -> English).
tgt_tokenizer = AutoTokenizer.from_pretrained(tgt_model_name[language_nr])
tgt_model = AutoModelForSeq2SeqLM.from_pretrained(tgt_model_name[language_nr])


In [None]:
def bracktranslate_emo(df, language, src_model_name, tgt_model_name):
    """Return a back-translated copy of ``df``.

    Every entry of the ``text`` column is translated with the
    ``src_model_name`` checkpoint and translated back with the
    ``tgt_model_name`` checkpoint. ``language`` is appended to every ``id`` so
    the new rows can be told apart from the originals.

    Args:
        df: DataFrame with string columns ``text`` and ``id``. It is left
            unmodified; the function works on a copy.
        language: Suffix appended to each ``id`` (e.g. ``"_fr"``).
        src_model_name: Checkpoint name of the forward translation model.
        tgt_model_name: Checkpoint name of the backward translation model.

    Returns:
        A new DataFrame with back-translated ``text`` and suffixed ``id``.
    """
    src_tokenizer = AutoTokenizer.from_pretrained(src_model_name)
    src_model = AutoModelForSeq2SeqLM.from_pretrained(src_model_name)

    tgt_tokenizer = AutoTokenizer.from_pretrained(tgt_model_name)
    tgt_model = AutoModelForSeq2SeqLM.from_pretrained(tgt_model_name)

    def backtranslate(text, src_tokenizer, src_model, tgt_tokenizer, tgt_model):
        # Forward pass. The generated ids belong to the vocabulary of the model
        # that produced them, so they are decoded with that model's tokenizer.
        src_input = src_tokenizer(text, return_tensors="pt", padding=True, truncation=True)
        tgt_translation = src_model.generate(**src_input)
        tgt_translation_text = src_tokenizer.decode(tgt_translation[0], skip_special_tokens=True)

        # Backward pass, again encoding and decoding with the tokenizer of the
        # model that is doing the translation.
        tgt_input = tgt_tokenizer(tgt_translation_text, return_tensors="pt", padding=True, truncation=True)
        src_backtranslation = tgt_model.generate(**tgt_input)
        src_backtranslation_text = tgt_tokenizer.decode(src_backtranslation[0], skip_special_tokens=True)

        return src_backtranslation_text #, tgt_translation_text

    # Work on a copy so the caller's frame (often a filtered slice) is untouched.
    translated = df.copy()

    # Apply backtranslation to the 'text' column
    translated['text'] = translated['text'].apply(
        lambda x: backtranslate(x, src_tokenizer, src_model, tgt_tokenizer, tgt_model)
    )

    # Remove any leftover subword marker from the decoded text.
    translated['text'] = translated['text'].str.replace("▁", " ", regex=False)

    # Append the language suffix to the id column of the backtranslated rows.
    translated['id'] = translated['id'] + language

    return translated


In [None]:
def backtranslate(text, src_tokenizer, src_model, tgt_tokenizer, tgt_model):
    """Translate ``text`` to another language and back again.

    Args:
        text: Text to paraphrase.
        src_tokenizer: Tokenizer belonging to ``src_model``.
        src_model: Model for the forward translation.
        tgt_tokenizer: Tokenizer belonging to ``tgt_model``.
        tgt_model: Model for the backward translation.

    Returns:
        The back-translated text as a string.
    """
    # Forward pass. The ids generated by a model belong to that model's own
    # vocabulary, so they are decoded with the tokenizer that encoded its input.
    src_input = src_tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    tgt_translation = src_model.generate(**src_input)
    tgt_translation_text = src_tokenizer.decode(tgt_translation[0], skip_special_tokens=True)

    # Backward pass with the second tokenizer/model pair.
    tgt_input = tgt_tokenizer(tgt_translation_text, return_tensors="pt", padding=True, truncation=True)
    src_backtranslation = tgt_model.generate(**tgt_input)
    src_backtranslation_text = tgt_tokenizer.decode(src_backtranslation[0], skip_special_tokens=True)

    return src_backtranslation_text #, tgt_translation_text


In [None]:
# Sample DataFrame (replace this with your actual DataFrame)
data = {
    'text': ["That game hurt.", "You do right, if you don't care then fuck 'em!", "Man I love reddit."],
    'id': ["eew5j0j", "ed2mah1", "eeibobj"],
    'author': ["Brdd9", "Labalool", "MrsRobertshaw"],
    'subreddit': ["nrl", "confessions", "facepalm"],
    'rater_id': [1, 37, 18],
    'level0': ["sadness", "neutral", "love"],
    'level1': ["dis_sad", "neutral", "love"],
    'level2': ["dis_sad_gri", "neutral", "exc_joy_lov"],
    'level3': ["rem_emb_dis_sad_gri", "neutral", "amu_exc_joy_lov"],
    'plutchik': ["grief", "neutral", "love"]
}

df = pd.DataFrame(data)

In [None]:
# Apply backtranslation to the 'text' column, using the globally loaded
# tokenizer/model pairs; the result is stored in a new column on `df`.
df['backtranslated_text'] = df['text'].apply(lambda x: backtranslate(x, src_tokenizer, src_model, tgt_tokenizer, tgt_model))

# Remove the "▁" subword marker left over from tokenization.
df['backtranslated_text'] = df['backtranslated_text'].str.replace("▁", " ")

# Display the resulting DataFrame
df


In [None]:
# Filter texts with "sadness" as level0 and add "_fr" to their ids.
# `.assign` returns a new frame, so nothing is written into a filtered slice
# of `df` (which would trigger pandas' SettingWithCopyWarning).
sadness_texts = df[df['level0'] == 'sadness'].assign(id=lambda rows: rows['id'] + "_fr")
sadness_texts

In [None]:
# language_short = ["_fr", "_de", "_es", "_zh", "_vi", "_ru", "_it", "_nl", "_fi"]

# The string below lists the five least frequent level0 classes with their row
# counts; these are the classes that are back-translated in the following cells.
"""
embarrassment       229
relief              103
nervousness          99
pride                69
grief                42
"""

In [None]:
# Back-translate the "embarrassment" rows via French. The function needs the
# filtered rows themselves, not the boolean mask; a copy is passed so that
# `distinct_df` cannot be altered.
embarrassment_fr = bracktranslate_emo(
    distinct_df[distinct_df['level0'] == 'embarrassment'].copy(),
    language_short[0], src_model_name[0], tgt_model_name[0],
)

In [None]:
# Back-translate the "relief" rows via French, German and Spanish (list
# positions 0-2). The filtered rows are passed, not the boolean mask, and each
# call gets its own copy so `distinct_df` cannot be altered.
relief_rows = distinct_df[distinct_df['level0'] == 'relief']
relief_fr, relief_de, relief_es = [
    bracktranslate_emo(relief_rows.copy(), language_short[i], src_model_name[i], tgt_model_name[i])
    for i in range(3)
]

In [None]:
# Back-translate the "nervousness" rows via French, German and Spanish (list
# positions 0-2). The filtered rows are passed, not the boolean mask, and each
# call gets its own copy so `distinct_df` cannot be altered.
nervousness_rows = distinct_df[distinct_df['level0'] == 'nervousness']
nervousness_fr, nervousness_de, nervousness_es = [
    bracktranslate_emo(nervousness_rows.copy(), language_short[i], src_model_name[i], tgt_model_name[i])
    for i in range(3)
]

In [None]:
# Back-translate the "pride" rows via the first five languages (list positions
# 0-4: fr, de, es, zh, vi). The filtered rows are passed, not the boolean mask,
# and each call gets its own copy so `distinct_df` cannot be altered.
pride_rows = distinct_df[distinct_df['level0'] == 'pride']
pride_fr, pride_de, pride_es, pride_zh, pride_vi = [
    bracktranslate_emo(pride_rows.copy(), language_short[i], src_model_name[i], tgt_model_name[i])
    for i in range(5)
]

In [None]:
# Back-translate the "grief" rows via all nine languages (list positions 0-8:
# fr, de, es, zh, vi, ru, it, nl, fi). The filtered rows are passed, not the
# boolean mask, and each call gets its own copy so `distinct_df` cannot be altered.
grief_rows = distinct_df[distinct_df['level0'] == 'grief']
(grief_fr, grief_de, grief_es, grief_zh, grief_vi,
 grief_ru, grief_it, grief_nl, grief_fi) = [
    bracktranslate_emo(grief_rows.copy(), language_short[i], src_model_name[i], tgt_model_name[i])
    for i in range(9)
]

### Looking at the Results

In [4]:
# Load the stored level-0 report (per-class precision, recall, f1-score and support).
level0_report = pd.read_csv("../results/bert_base_cased/model_level0_report.csv")
level0_report

Unnamed: 0.1,Unnamed: 0,precision,recall,f1-score,support
0,admiration,0.672464,0.748387,0.708397,620.0
1,amusement,0.69526,0.830189,0.756757,371.0
2,anger,0.444954,0.433036,0.438914,224.0
3,annoyance,0.329412,0.198582,0.247788,282.0
4,approval,0.465217,0.259709,0.333333,412.0
5,caring,0.410072,0.404255,0.407143,141.0
6,confusion,0.524138,0.404255,0.456456,188.0
7,curiosity,0.516026,0.570922,0.542088,282.0
8,desire,0.615385,0.571429,0.592593,84.0
9,disappointment,0.290323,0.2,0.236842,135.0
