### Multi-Class Text Classification for Emotions using BERT

In [1]:
# ! pip install datasets huggingface_hub ipywidgets evaluate 'transformers[torch]' torch xformers plotnine sentencepiece

In [10]:
import numpy as np
import pandas as pd

# We need the sys package to load modules from another directory:
import sys
sys.path.append('../')
from preprocessing.preprocessors import *
from training.bert_func import *

import random
import evaluate

from datasets import Dataset
from sklearn.metrics import classification_report
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import pipeline
from tqdm import tqdm

from plotnine import ggplot, aes, geom_tile, coord_flip,theme,geom_line,labs,element_text
from plotnine import scale_x_discrete,geom_vline

#### Get the data

In [11]:
# Read the raw GoEmotions export and run it through the project's cleaning helper.
df = pd.read_csv("../data/GoEmotions.csv")
df_clean = clean_df(df)

# Report the dimensions of the cleaned frame, then preview its first three rows.
r, c = len(df_clean.index), len(df_clean.columns)
print(f"The data has {r} row and {c} columns")
df_clean.iloc[:3]

The data has 171820 row and 33 columns


Unnamed: 0,text,id,author,subreddit,rater_id,admiration,amusement,anger,annoyance,approval,...,love,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral
0,That game hurt.,eew5j0j,Brdd9,nrl,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,"You do right, if you don't care then fuck 'em!",ed2mah1,Labalool,confessions,37,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,Man I love reddit.,eeibobj,MrsRobertshaw,facepalm,18,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [12]:
# Reshape the cleaned annotations, then attach the coarser label columns
# (level1..level3 and plutchik show up next to level0 in the preview below).
pivoted_df = create_pivoted_df(df_clean)
hierarchical_df = add_hierarchical_levels(pivoted_df)

r, c = len(hierarchical_df.index), len(hierarchical_df.columns)
print(f"The data has {r} row and {c} columns")
hierarchical_df.iloc[:3]

The data has 171820 row and 10 columns


Unnamed: 0,text,id,author,subreddit,rater_id,level0,level1,level2,level3,plutchik
0,That game hurt.,eew5j0j,Brdd9,nrl,1,sadness,dis_sad,dis_sad_gri,rem_emb_dis_sad_gri,grief
2,"You do right, if you don't care then fuck 'em!",ed2mah1,Labalool,confessions,37,neutral,neutral,neutral,neutral,neutral
3,Man I love reddit.,eeibobj,MrsRobertshaw,facepalm,18,love,love,exc_joy_lov,amu_exc_joy_lov,love


In [13]:
# Different raters may assign different labels to the same text; reduce them
# to a single gold label per text by majority vote.
majority_vote_df = majority_voted_df(hierarchical_df)

r, c = len(majority_vote_df.index), len(majority_vote_df.columns)
print(f"The majority voted data has {r} row and {c} columns")

The majority voted data has 43379 row and 2 columns


In [14]:
# Keep only the annotation rows whose (id, level0) pair matches the
# majority-voted label, i.e. drop the ratings that lost the vote.
clustered_df = hierarchical_df.merge(majority_vote_df, on=['id', 'level0'], how='inner')

# Report the size of the filtered frame and preview it.
r, c = clustered_df.shape
print(f"The data has {r} row and {c} columns")
clustered_df.head(3)

The data has 93683 row and 10 columns


Unnamed: 0,text,id,author,subreddit,rater_id,level0,level1,level2,level3,plutchik
0,That game hurt.,eew5j0j,Brdd9,nrl,1,sadness,dis_sad,dis_sad_gri,rem_emb_dis_sad_gri,grief
1,That game hurt.,eew5j0j,Brdd9,nrl,52,sadness,dis_sad,dis_sad_gri,rem_emb_dis_sad_gri,grief
2,"You do right, if you don't care then fuck 'em!",ed2mah1,Labalool,confessions,37,neutral,neutral,neutral,neutral,neutral


In [15]:
# Collapse to a single row per comment id; the first occurrence is the one kept.
distinct_df = clustered_df.drop_duplicates(subset=['id'])

r, c = len(distinct_df.index), len(distinct_df.columns)
print(f"The data has {r} row and {c} columns")
distinct_df.iloc[:3]

The data has 43379 row and 10 columns


Unnamed: 0,text,id,author,subreddit,rater_id,level0,level1,level2,level3,plutchik
0,That game hurt.,eew5j0j,Brdd9,nrl,1,sadness,dis_sad,dis_sad_gri,rem_emb_dis_sad_gri,grief
2,"You do right, if you don't care then fuck 'em!",ed2mah1,Labalool,confessions,37,neutral,neutral,neutral,neutral,neutral
4,Man I love reddit.,eeibobj,MrsRobertshaw,facepalm,18,love,love,exc_joy_lov,amu_exc_joy_lov,love


In [8]:
"""
# create a sample for tests
sample_df = distinct_df.groupby("level0").sample(frac=0.05, random_state=25) # stratified sampling
sample_df
"""

'\n# create a sample for tests\nsample_df = distinct_df.groupby("level0").sample(frac=0.05, random_state=25) # stratified sampling\nsample_df\n'

In [10]:
# Label distribution of the de-duplicated gold data (before any augmentation).
classCounts_0 = distinct_df["level0"].value_counts()
print(classCounts_0)

level0
neutral           17458
admiration         3102
gratitude          2595
approval           2058
amusement          1857
love               1624
disapproval        1563
curiosity          1411
annoyance          1408
anger              1121
optimism            943
confusion           941
joy                 895
sadness             829
surprise            777
caring              704
disappointment      673
realization         602
disgust             514
excitement          503
fear                433
desire              420
remorse             406
embarrassment       229
relief              103
nervousness          99
pride                69
grief                42
Name: count, dtype: int64


In [9]:
# Back-translation configuration: for every pivot language there is one
# English->pivot model, one pivot->English model and a short suffix.
# The three lists are index-aligned.
pivot_language_codes = ["fr", "de", "es", "da", "sv", "ru", "id", "nl", "cs"]
src_model_name = [f"Helsinki-NLP/opus-mt-en-{code}" for code in pivot_language_codes]
tgt_model_name = [f"Helsinki-NLP/opus-mt-{code}-en" for code in pivot_language_codes]
language_short = [f"_{code}" for code in pivot_language_codes]
# Work on an independent copy of the de-duplicated data for the augmentation cells.
# NOTE: this rebinds `df`, which earlier in the notebook held the raw CSV contents.
df = distinct_df.copy()

In [10]:
    # Create Backtranslation and concatenate DataFrames
# Back-translate the 'embarrassment' rows via French and append them to the data.
# The helper is handed an explicit copy: the recorded SettingWithCopyWarning
# suggests it assigns columns on the frame it receives, and a boolean-indexed
# slice is exactly what triggers that warning (presumably inside
# bracktranslate_emo -- verify in preprocessing/training helpers).
embarrassment_rows = df[df['level0'] == 'embarrassment'].copy()
embarrassment_fr = bracktranslate_emo(embarrassment_rows, language_short[0], src_model_name[0], tgt_model_name[0])
result_df = pd.concat([df, embarrassment_fr], ignore_index=True)

result_df.to_csv('../data/backtranslated_df.csv', index=False)  # save dataframe as csv

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [None]:
# Back-translate the 'relief' rows through the first three pivot languages
# (fr, de, es) and append all variants to the running result in one concat.
# A fresh, explicit copy of the slice is built for every call so that
#   * each translation starts from the untouched English rows, and
#   * the helper does not assign onto a slice (source of SettingWithCopyWarning).
n_relief_languages = 3
relief_parts = [
    bracktranslate_emo(df[df['level0'] == 'relief'].copy(), suffix, en_to_pivot, pivot_to_en)
    for suffix, en_to_pivot, pivot_to_en in zip(
        language_short[:n_relief_languages],
        src_model_name[:n_relief_languages],
        tgt_model_name[:n_relief_languages],
    )
]
result_df = pd.concat([result_df, *relief_parts], ignore_index=True)

result_df.to_csv('../data/backtranslated_df.csv', index=False)  # save dataframe as csv



In [None]:
# Back-translate the 'nervousness' rows through the first three pivot languages
# (fr, de, es) and append all variants to the running result in one concat.
# Every call receives its own explicit copy of the slice: the English source
# rows stay untouched between languages and the helper never assigns onto a
# slice (source of SettingWithCopyWarning).
n_nervousness_languages = 3
nervousness_parts = [
    bracktranslate_emo(df[df['level0'] == 'nervousness'].copy(), suffix, en_to_pivot, pivot_to_en)
    for suffix, en_to_pivot, pivot_to_en in zip(
        language_short[:n_nervousness_languages],
        src_model_name[:n_nervousness_languages],
        tgt_model_name[:n_nervousness_languages],
    )
]
result_df = pd.concat([result_df, *nervousness_parts], ignore_index=True)

result_df.to_csv('../data/backtranslated_df.csv', index=False)  # save dataframe as csv

In [None]:
# Back-translate the 'pride' rows through the first five pivot languages
# (fr, de, es, da, sv) and append all variants to the running result at once.
# Every call receives its own explicit copy of the slice: the English source
# rows stay untouched between languages and the helper never assigns onto a
# slice (source of SettingWithCopyWarning).
n_pride_languages = 5
pride_parts = [
    bracktranslate_emo(df[df['level0'] == 'pride'].copy(), suffix, en_to_pivot, pivot_to_en)
    for suffix, en_to_pivot, pivot_to_en in zip(
        language_short[:n_pride_languages],
        src_model_name[:n_pride_languages],
        tgt_model_name[:n_pride_languages],
    )
]
result_df = pd.concat([result_df, *pride_parts], ignore_index=True)

result_df.to_csv('../data/backtranslated_df.csv', index=False)  # save dataframe as csv

In [None]:
# 'grief' is the smallest class, so it is back-translated through every
# configured pivot language (all nine entries of the index-aligned lists).
# Every call receives its own explicit copy of the slice: the English source
# rows stay untouched between languages and the helper never assigns onto a
# slice (source of SettingWithCopyWarning).
grief_parts = [
    bracktranslate_emo(df[df['level0'] == 'grief'].copy(), suffix, en_to_pivot, pivot_to_en)
    for suffix, en_to_pivot, pivot_to_en in zip(language_short, src_model_name, tgt_model_name)
]
result_df = pd.concat([result_df, *grief_parts], ignore_index=True)

result_df.to_csv('../data/backtranslated_df.csv', index=False)  # save dataframe as csv

In [16]:
# Reload the augmented data set from the CSV written by the back-translation
# cells, so the (slow) translation does not have to be re-run in this session.
result_df = pd.read_csv("../data/backtranslated_df.csv")
result_df

# Disabled alternative export. NOTE(review): it references `csv.QUOTE_NONNUMERIC`,
# but the visible import cell does not import `csv` -- add that before re-enabling.
# result_df.to_csv('../data/backtranslated_df2.csv', index=False, quoting=csv.QUOTE_NONNUMERIC)  # save dataframe as csv

Unnamed: 0,text,id,author,subreddit,rater_id,level0,level1,level2,level3,plutchik
0,That game hurt.,eew5j0j,Brdd9,nrl,1,sadness,dis_sad,dis_sad_gri,rem_emb_dis_sad_gri,grief
1,"You do right, if you don't care then fuck 'em!",ed2mah1,Labalool,confessions,37,neutral,neutral,neutral,neutral,neutral
2,Man I love reddit.,eeibobj,MrsRobertshaw,facepalm,18,love,love,exc_joy_lov,amu_exc_joy_lov,love
3,"[NAME] was nowhere near them, he was by the Falcon.",eda6yn6,American_Fascist713,starwarsspeculation,2,neutral,neutral,neutral,neutral,neutral
4,"Right? Considering it’s such an important document, I should know the damned thing backwards and forwards... thanks again for the help!",eespn2i,ImperialBoss,TrueReddit,61,gratitude,gra_rel,pri_adm_gra_rel,pri_adm_gra_rel_app_rea,awe
...,...,...,...,...,...,...,...,...,...,...
44932,I regret this contribution so many debates and anger lol,ef9n3ld_cs,NinjaMoleRat,ComedyCemetery,40,grief,grief,dis_sad_gri,rem_emb_dis_sad_gri,grief
44933,You're gonna miss a begging old man asking for a spare coin.,eeq2udw_cs,indy6548,reddeadredemption2,49,grief,grief,dis_sad_gri,rem_emb_dis_sad_gri,grief
44934,"It's been two hours, OP died for a noble cause.",edp46n9_cs,-CROFL-,tifu,49,grief,grief,dis_sad_gri,rem_emb_dis_sad_gri,grief
44935,Your uncle sounds fun.,eefvuvg_cs,deputy_dingdong,ProtectAndServe,3,grief,grief,dis_sad_gri,rem_emb_dis_sad_gri,grief


In [11]:
# Label distribution before augmentation, for comparison with the augmented data.
classCounts_0 = distinct_df["level0"].value_counts()
print(classCounts_0)

level0
neutral           17458
admiration         3102
gratitude          2595
approval           2058
amusement          1857
love               1624
disapproval        1563
curiosity          1411
annoyance          1408
anger              1121
optimism            943
confusion           941
joy                 895
sadness             829
surprise            777
caring              704
disappointment      673
realization         602
disgust             514
excitement          503
fear                433
desire              420
remorse             406
embarrassment       229
relief              103
nervousness          99
pride                69
grief                42
Name: count, dtype: int64


In [12]:
# Label distribution after the back-translated rows were appended.
classCounts_0 = result_df["level0"].value_counts()
print(classCounts_0)

level0
neutral           17458
admiration         3102
gratitude          2595
approval           2058
amusement          1857
love               1624
disapproval        1563
curiosity          1411
annoyance          1408
anger              1121
optimism            943
confusion           941
joy                 895
sadness             829
surprise            777
caring              704
disappointment      673
realization         602
disgust             514
excitement          503
embarrassment       458
fear                433
desire              420
grief               420
pride               414
relief              412
remorse             406
nervousness         396
Name: count, dtype: int64


In [17]:
# Drop rows whose text already occurred earlier in the frame (e.g. a
# back-translation that reproduced an existing sentence); first occurrence wins.
distinct_backtranslated_df = result_df.drop_duplicates(subset='text')
distinct_backtranslated_df.iloc[:3]

Unnamed: 0,text,id,author,subreddit,rater_id,level0,level1,level2,level3,plutchik
0,That game hurt.,eew5j0j,Brdd9,nrl,1,sadness,dis_sad,dis_sad_gri,rem_emb_dis_sad_gri,grief
1,"You do right, if you don't care then fuck 'em!",ed2mah1,Labalool,confessions,37,neutral,neutral,neutral,neutral,neutral
2,Man I love reddit.,eeibobj,MrsRobertshaw,facepalm,18,love,love,exc_joy_lov,amu_exc_joy_lov,love


In [14]:
# Label distribution after removing duplicate texts from the augmented data.
classCounts_0 = distinct_backtranslated_df["level0"].value_counts()
print(classCounts_0)

level0
neutral           17408
admiration         3087
gratitude          2532
approval           2052
amusement          1854
love               1596
disapproval        1560
curiosity          1407
annoyance          1407
anger              1116
optimism            939
confusion           938
joy                 889
sadness             827
surprise            774
caring              699
disappointment      671
realization         602
disgust             510
excitement          497
embarrassment       450
fear                431
desire              420
remorse             405
relief              382
nervousness         373
grief               359
pride               346
Name: count, dtype: int64


In [18]:
# Checkpoint to fine-tune and the output locations for this experiment.
# Models and results share the same run sub-directory.
bert = "bert-base-cased"
run_subdir = "bert_base_cased/backtranslated/"
models_dir = "../models/" + run_subdir
results_dir = "../results/" + run_subdir

### BERT for level 0 -> 27 + 1 emotions on backtranslated data
Following the Hugging Face sequence-classification guide: https://huggingface.co/docs/transformers/tasks/sequence_classification

In [15]:
# Show complete texts and up to 50 rows when frames are rendered.
pd.set_option('display.max_colwidth', None)  # default value is 50, max would be "None"
pd.options.display.max_rows = 50  # default value is 10, max would be "None"

# Pick one random id labelled "sadness" and display its row (unseeded, so the
# example differs between runs).
sadness_ids = distinct_backtranslated_df.loc[distinct_backtranslated_df["level0"] == "sadness", "id"]
sadness_example = random.sample(sadness_ids.tolist(), k=1) # example for sadness
distinct_backtranslated_df[distinct_backtranslated_df["id"].isin(sadness_example)]

Unnamed: 0,text,id,author,subreddit,rater_id,level0,level1,level2,level3,plutchik
23684,I’m so sorry to hear that! 😂😂😂,efb22py,matt46255,oldpeoplefacebook,37,sadness,dis_sad,dis_sad_gri,rem_emb_dis_sad_gri,grief


In [16]:
# Label distribution of the frame that is handed to the level-0 model.
classCounts_0 = distinct_backtranslated_df["level0"].value_counts()
print(classCounts_0)
# -> still imbalanced: 'neutral' dominates

level0
neutral           17408
admiration         3087
gratitude          2532
approval           2052
amusement          1854
love               1596
disapproval        1560
curiosity          1407
annoyance          1407
anger              1116
optimism            939
confusion           938
joy                 889
sadness             827
surprise            774
caring              699
disappointment      671
realization         602
disgust             510
excitement          497
embarrassment       450
fear                431
desire              420
remorse             405
relief              382
nervousness         373
grief               359
pride               346
Name: count, dtype: int64


In [17]:
# Number of documents (rows) available for the level-0 experiment.
numberOfDocuments_0 = distinct_backtranslated_df.shape[0]
numberOfDocuments_0

44531

In [20]:
# Run the level-0 experiment. `get_bert` is not defined in this notebook;
# presumably it comes from the star import of training.bert_func -- verify.
# Per this cell's recorded output it splits the data, fine-tunes for three
# epochs and then produces predictions for every row of the input frame.
# Returned values, as used by the following cells: the labelled data
# (dataset_0), the predictions (results_0), the tokenized and raw test split,
# and the label -> integer id mapping (label2id_0).
dataset_0, results_0, tokenized_testing_data_0, testing_data_0, label2id_0 = get_bert(distinct_backtranslated_df, "level0", bert, models_dir, results_dir)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Map:   0%|          | 0/35627 [00:00<?, ? examples/s]

Map:   0%|          | 0/8904 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.4137,1.293608,0.634434
2,1.0935,1.246798,0.636905
3,0.83,1.283874,0.634996


100%|██████████| 44531/44531 [08:31<00:00, 87.01it/s]


In [21]:
# Label distribution inside the held-out test split (overwrites classCounts_0).
classCounts_0 = pd.DataFrame(testing_data_0)["level0"].value_counts()
print(classCounts_0)
# -> the test split is imbalanced as well

level0
neutral           3482
admiration         617
gratitude          506
approval           410
amusement          371
love               319
disapproval        312
annoyance          281
curiosity          281
anger              223
optimism           188
confusion          188
joy                178
sadness            165
surprise           155
caring             140
disappointment     134
realization        120
disgust            102
excitement          99
embarrassment       90
fear                86
desire              84
remorse             81
relief              76
nervousness         75
grief               72
pride               69
Name: count, dtype: int64


#### Evaluation

In [22]:
# Turn the predictions into a frame and attach the document ids by position:
# resetting the index of the id column lines it up with the 0..n-1 index of
# the prediction frame.
df_results_0 = pd.DataFrame.from_dict(results_0)
df_id_0 = pd.DataFrame(dataset_0["id"]).reset_index()
df_results_0["id"] = df_id_0["id"]
df_results_0

Unnamed: 0,label,score,id
0,sadness,0.812212,eew5j0j
1,anger,0.333172,ed2mah1
2,love,0.969891,eeibobj
3,neutral,0.961064,eda6yn6
4,gratitude,0.947168,eespn2i
...,...,...,...
44526,grief,0.870773,edrcnmk_cs
44527,grief,0.633948,eesyrax_cs
44528,grief,0.372783,ednta6j_cs
44529,remorse,0.492712,ef9n3ld_cs


In [23]:
# Join the predictions onto the labelled data by id. Both frames carry a
# `label` column, so the merge yields label_x (gold id) and label_y (predicted
# name); the latter is renamed and additionally mapped to its integer id.
data_classifies_0 = (
    pd.merge(dataset_0, df_results_0, on='id', how='left')
    .rename(columns={'label_y': 'LABEL_pred'})  # frame now holds gold standard and prediction
    .assign(LABEL_pred_num=lambda frame: frame["LABEL_pred"].map(label2id_0.get))
)
data_classifies_0.to_pickle(results_dir + "data_classified_level0.pkl") 
data_classifies_0

Unnamed: 0,text,id,author,subreddit,rater_id,level0,level1,level2,level3,plutchik,label_x,LABEL_pred,score,LABEL_pred_num
0,That game hurt.,eew5j0j,Brdd9,nrl,1,sadness,dis_sad,dis_sad_gri,rem_emb_dis_sad_gri,grief,0,sadness,0.812212,0
1,"You do right, if you don't care then fuck 'em!",ed2mah1,Labalool,confessions,37,neutral,neutral,neutral,neutral,neutral,1,anger,0.333172,20
2,Man I love reddit.,eeibobj,MrsRobertshaw,facepalm,18,love,love,exc_joy_lov,amu_exc_joy_lov,love,2,love,0.969891,2
3,"[NAME] was nowhere near them, he was by the Falcon.",eda6yn6,American_Fascist713,starwarsspeculation,2,neutral,neutral,neutral,neutral,neutral,1,neutral,0.961064,1
4,"Right? Considering it’s such an important document, I should know the damned thing backwards and forwards... thanks again for the help!",eespn2i,ImperialBoss,TrueReddit,61,gratitude,gra_rel,pri_adm_gra_rel,pri_adm_gra_rel_app_rea,awe,3,gratitude,0.947168,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44526,Hmm. Today I'm going to listen to death and destruction.,edrcnmk_cs,zane_lame,weezer,24,grief,grief,dis_sad_gri,rem_emb_dis_sad_gri,grief,21,grief,0.870773,21
44527,And turning someone's hands into spaghetti is cruel and inhumane and not sensible punishment for stealing.,eesyrax_cs,AgentPaper0,HadToHurt,62,grief,grief,dis_sad_gri,rem_emb_dis_sad_gri,grief,21,grief,0.633948,21
44528,I'm glad you're doing a little better. I'm sorry about Grandma. Good luck.,ednta6j_cs,partytimetyler,AskMenOver30,12,grief,grief,dis_sad_gri,rem_emb_dis_sad_gri,grief,21,grief,0.372783,21
44529,I regret this contribution so many debates and anger lol,ef9n3ld_cs,NinjaMoleRat,ComedyCemetery,40,grief,grief,dis_sad_gri,rem_emb_dis_sad_gri,grief,21,remorse,0.492712,14


In [None]:
"""
df_results_0d = pd.DataFrame.from_dict(results_0d)
# df_results_0d["id"] = pd.DataFrame(dataset_0d["id"])
df_results_0d.rename(columns={'label': 'LABEL'}, inplace=True)
df_results_0d

data_classifies_0d = pd.concat([dataset_0d,df_results_0d],axis=1) # merge classified data with original training data
data_classifies_0d.to_pickle(results_dir +"data_classified_level0.pkl") # save
data_classifies_0d # contains goldstandard and cluster of results -> calculate F1, Precision, Recall
"""

In [24]:
# Restrict the classified data to the documents of the held-out test split.
# Membership is tested with Series.isin instead of interpolating the repr of
# the id collection into a query string: that approach breaks as soon as the
# repr is not a plain Python list literal and is slow for thousands of ids.
test_ids_0 = list(tokenized_testing_data_0["id"])
test_data_0 = data_classifies_0[data_classifies_0["id"].isin(test_ids_0)]
test_data_0

Unnamed: 0,text,id,author,subreddit,rater_id,level0,level1,level2,level3,plutchik,label_x,LABEL_pred,score,LABEL_pred_num
7,BUT IT'S HER TURN! /s,ef7tl7i,Genesis2001,SandersForPresident,17,neutral,neutral,neutral,neutral,neutral,1,neutral,0.709331,1
22,By far the coolest thing I've seen on this thread yet,edm3k6w,W8nd3rW8man,popping,78,joy,exc_joy,exc_joy_lov,amu_exc_joy_lov,ecstasy,15,admiration,0.752058,8
31,Gotta protect’em!,ed7dq81,Mvm321,AnimalsBeingBros,34,caring,caring,des_opt_car,des_opt_car,admiration,13,neutral,0.632656,1
46,I didn't like the family ending but I liked the Atlantis ending especially as we know we are going to see more of it in DLC2.,ee6mb6z,[deleted],assassinscreed,56,approval,app_rea,app_rea,pri_adm_gra_rel_app_rea,admiration,16,admiration,0.308320,8
51,My fans on patreon will be rewarded soon,ef8s6ku,[deleted],nrl,33,neutral,neutral,neutral,neutral,neutral,1,neutral,0.390220,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44509,"Unfortunately, I don't think half the people watching the game will hear about it.",edgcyz2_cs,Ezo31202,chicagobulls,55,grief,grief,dis_sad_gri,rem_emb_dis_sad_gri,grief,21,grief,0.749865,21
44517,"Oh, my God, put that away.",edj2o0u_cs,sahali735,90DayFiance,12,grief,grief,dis_sad_gri,rem_emb_dis_sad_gri,grief,21,grief,0.700538,21
44524,"I'm really sorry for your loss, my friend, I hope everything's all right on your side.",edcpzgq_cs,SwampBollocks,mentalhealth,3,grief,grief,dis_sad_gri,rem_emb_dis_sad_gri,grief,21,grief,0.455582,21
44525,I'm sorry your wife suffered a tragic injury.,ee23ppc_cs,bideaweebaby,DeadBedrooms,15,grief,grief,dis_sad_gri,rem_emb_dis_sad_gri,grief,21,grief,0.485596,21


In [25]:
# Size of the test split as returned by the training helper.
r, c = pd.DataFrame(testing_data_0).shape
print(f"The test data has {r} row and {c} columns")

# Per-class precision / recall / F1 on the test subset:
# level0 is the gold standard, LABEL_pred the model prediction.
report_0 = pd.DataFrame(
    classification_report(test_data_0["level0"], test_data_0["LABEL_pred"], output_dict=True)
).T
report_0.to_csv(results_dir + "model_level0_report.csv")
print(report_0)

The test data has 8904 row and 12 columns
                precision    recall  f1-score      support
admiration       0.656381  0.758509  0.703759   617.000000
amusement        0.724537  0.843666  0.779577   371.000000
anger            0.500000  0.497758  0.498876   223.000000
annoyance        0.344828  0.177936  0.234742   281.000000
approval         0.490741  0.258537  0.338658   410.000000
caring           0.443182  0.278571  0.342105   140.000000
confusion        0.424051  0.356383  0.387283   188.000000
curiosity        0.382413  0.665480  0.485714   281.000000
desire           0.637681  0.523810  0.575163    84.000000
disappointment   0.379310  0.164179  0.229167   134.000000
disapproval      0.487685  0.317308  0.384466   312.000000
disgust          0.569620  0.441176  0.497238   102.000000
embarrassment    0.697368  0.588889  0.638554    90.000000
excitement       0.518519  0.282828  0.366013    99.000000
fear             0.700000  0.651163  0.674699    86.000000
gratitude     

In [26]:
# Final classification / viz: share of each predicted label in the test subset.
predicted_counts_0 = test_data_0['LABEL_pred'].value_counts()
final_0 = (predicted_counts_0 / predicted_counts_0.sum()).to_frame() # ratio
final_0.to_csv(results_dir + "model_level0_testdata_frequency.csv")
print(final_0.shape)
final_0

(28, 1)


Unnamed: 0_level_0,count
LABEL_pred,Unnamed: 1_level_1
neutral,0.419924
admiration,0.080076
gratitude,0.057165
curiosity,0.054919
amusement,0.048518
love,0.045822
anger,0.024933
approval,0.024259
disapproval,0.022799
surprise,0.021676


### BERT for level 1 -> 17 + 1 emotions

In [14]:
# Show complete texts and up to 50 rows when frames are rendered.
pd.set_option('display.max_colwidth', None)  # default value is 50, max would be "None"
pd.options.display.max_rows = 50  # default value is 10, max would be "None"

# Pick one random id from the level-1 cluster "dis_sad" and display its row
# (unseeded, so the example differs between runs).
dis_sad_ids = distinct_backtranslated_df.loc[distinct_backtranslated_df["level1"] == "dis_sad", "id"]
dis_sad_example = random.sample(dis_sad_ids.tolist(), k=1) # example for dis_sad
distinct_backtranslated_df[distinct_backtranslated_df["id"].isin(dis_sad_example)]


Unnamed: 0,text,id,author,subreddit,rater_id,level0,level1,level2,level3,plutchik
42518,She must have a very boring life,efald82,Sniper_Chicken_,terriblefacebookmemes,3,sadness,dis_sad,dis_sad_gri,rem_emb_dis_sad_gri,grief


In [15]:
# Label distribution at level 1 of the emotion hierarchy.
classCounts_1 = distinct_backtranslated_df["level1"].value_counts()
print(classCounts_1)
# -> imbalanced: 'neutral' dominates

level1
neutral        17408
pri_adm         3433
gra_rel         2914
app_rea         2654
ang_ann         2523
cur_con         2345
amusement       1854
love            1596
disapproval     1560
dis_sad         1498
exc_joy         1386
des_opt         1359
rem_emb          855
fea_ner          804
surprise         774
caring           699
disgust          510
grief            359
Name: count, dtype: int64


In [16]:
# Number of documents (rows) available for the level-1 experiment.
numberOfDocuments_1 = distinct_backtranslated_df.shape[0]
numberOfDocuments_1

44531

In [17]:
# Run the level-1 experiment with the same helper, checkpoint and output
# directories as level 0; only the target column changes.
# NOTE(review): models_dir / results_dir are shared across levels -- presumably
# get_bert separates its artefacts by level name; verify nothing is overwritten.
dataset_1, results_1, tokenized_testing_data_1, testing_data_1, label2id_1 = get_bert(distinct_backtranslated_df, "level1", bert, models_dir, results_dir)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Map:   0%|          | 0/35622 [00:00<?, ? examples/s]

Map:   0%|          | 0/8909 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.2013,1.142087,0.645415
2,0.9325,1.106961,0.653047
3,0.7109,1.181114,0.643282


100%|██████████| 44531/44531 [08:44<00:00, 84.92it/s]


#### Evaluation

In [None]:
# Turn the level-1 predictions into a frame and attach the document ids by
# position: resetting the index of the id column lines it up with the 0..n-1
# index of the prediction frame.
df_results_1 = pd.DataFrame.from_dict(results_1)
df_id_1 = pd.DataFrame(dataset_1["id"]).reset_index()
df_results_1["id"] = df_id_1["id"]
df_results_1

Unnamed: 0,label,score,id
0,dis_sad,0.888890,eew5j0j
1,ang_ann,0.700729,ed2mah1
2,love,0.965565,eeibobj
3,neutral,0.974597,eda6yn6
4,gra_rel,0.971078,eespn2i
...,...,...,...
44526,grief,0.874987,edrcnmk_cs
44527,grief,0.631932,eesyrax_cs
44528,gra_rel,0.302578,ednta6j_cs
44529,grief,0.465727,ef9n3ld_cs


In [None]:
# Join the level-1 predictions onto the labelled data by id. Both frames carry
# a `label` column, so the merge yields label_x (gold id) and label_y
# (predicted name); the latter is renamed and also mapped to its integer id.
data_classifies_1 = (
    pd.merge(dataset_1, df_results_1, on='id', how='left')
    .rename(columns={'label_y': 'LABEL_pred'})  # frame now holds gold standard and prediction
    .assign(LABEL_pred_num=lambda frame: frame["LABEL_pred"].map(label2id_1.get))
)
data_classifies_1.to_pickle(results_dir + "data_classified_level1.pkl") 
data_classifies_1

Unnamed: 0,text,id,author,subreddit,rater_id,level0,level1,level2,level3,plutchik,label_x,LABEL_pred,score,LABEL_pred_num
0,That game hurt.,eew5j0j,Brdd9,nrl,1,sadness,dis_sad,dis_sad_gri,rem_emb_dis_sad_gri,grief,0,dis_sad,0.888890,0
1,"You do right, if you don't care then fuck 'em!",ed2mah1,Labalool,confessions,37,neutral,neutral,neutral,neutral,neutral,1,ang_ann,0.700729,8
2,Man I love reddit.,eeibobj,MrsRobertshaw,facepalm,18,love,love,exc_joy_lov,amu_exc_joy_lov,love,2,love,0.965565,2
3,"[NAME] was nowhere near them, he was by the Falcon.",eda6yn6,American_Fascist713,starwarsspeculation,2,neutral,neutral,neutral,neutral,neutral,1,neutral,0.974597,1
4,"Right? Considering it’s such an important document, I should know the damned thing backwards and forwards... thanks again for the help!",eespn2i,ImperialBoss,TrueReddit,61,gratitude,gra_rel,pri_adm_gra_rel,pri_adm_gra_rel_app_rea,awe,3,gra_rel,0.971078,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44526,Hmm. Today I'm going to listen to death and destruction.,edrcnmk_cs,zane_lame,weezer,24,grief,grief,dis_sad_gri,rem_emb_dis_sad_gri,grief,15,grief,0.874987,15
44527,And turning someone's hands into spaghetti is cruel and inhumane and not sensible punishment for stealing.,eesyrax_cs,AgentPaper0,HadToHurt,62,grief,grief,dis_sad_gri,rem_emb_dis_sad_gri,grief,15,grief,0.631932,15
44528,I'm glad you're doing a little better. I'm sorry about Grandma. Good luck.,ednta6j_cs,partytimetyler,AskMenOver30,12,grief,grief,dis_sad_gri,rem_emb_dis_sad_gri,grief,15,gra_rel,0.302578,3
44529,I regret this contribution so many debates and anger lol,ef9n3ld_cs,NinjaMoleRat,ComedyCemetery,40,grief,grief,dis_sad_gri,rem_emb_dis_sad_gri,grief,15,grief,0.465727,15


In [None]:
# Restrict the classified data to the documents of the held-out test split.
# Series.isin replaces the former query string that embedded the repr of the
# whole id collection (fragile when the repr is not a list literal, and slow).
test_data_1 = data_classifies_1[data_classifies_1["id"].isin(list(tokenized_testing_data_1["id"]))]

In [None]:
# Display the level-1 test subset (gold labels next to predictions).
test_data_1

Unnamed: 0,text,id,author,subreddit,rater_id,level0,level1,level2,level3,plutchik,label_x,LABEL_pred,score,LABEL_pred_num
0,That game hurt.,eew5j0j,Brdd9,nrl,1,sadness,dis_sad,dis_sad_gri,rem_emb_dis_sad_gri,grief,0,dis_sad,0.888890,0
1,"You do right, if you don't care then fuck 'em!",ed2mah1,Labalool,confessions,37,neutral,neutral,neutral,neutral,neutral,1,ang_ann,0.700729,8
7,BUT IT'S HER TURN! /s,ef7tl7i,Genesis2001,SandersForPresident,17,neutral,neutral,neutral,neutral,neutral,1,neutral,0.846001,1
13,"Yup, not anymore. Keep your blood sugar up! It really helps and DRINK water...",ee64ows,atomicchuckle,Teachers,60,caring,caring,des_opt_car,des_opt_car,admiration,12,caring,0.764619,12
18,nice!! I'll try this one,edk0nru,travellingsaleslady,AskWomen,23,admiration,pri_adm,pri_adm_gra_rel,pri_adm_gra_rel_app_rea,admiration,7,pri_adm,0.742670,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44508,"My father's been dead for two years, and it's still hard not to talk about him like he's still alive.",efalo7t_cs,pinkshowerwater,depression,81,grief,grief,dis_sad_gri,rem_emb_dis_sad_gri,grief,15,grief,0.921463,15
44510,6 and died of cancer irk,eczuvim_cs,howdoiusereddit1,FrankOcean,2,grief,grief,dis_sad_gri,rem_emb_dis_sad_gri,grief,15,grief,0.692904,15
44511,Get this guy out of the psych ward.,eedxtl7_cs,Radical_Wasabi,greysanatomy,50,grief,grief,dis_sad_gri,rem_emb_dis_sad_gri,grief,15,ang_ann,0.615256,8
44520,He died 4 days later of dehydration.,edwloev_cs,SickofInternetProgs,cringe,50,grief,grief,dis_sad_gri,rem_emb_dis_sad_gri,grief,15,grief,0.933160,15


In [None]:
# Size of the level-1 test split as returned by the training helper.
test_frame_1 = pd.DataFrame(testing_data_1)
r, c = len(test_frame_1.index), len(test_frame_1.columns)
print(f"The test data has {r} row and {c} columns")

The test data has 8909 row and 12 columns


In [None]:
# Per-class precision / recall / F1 on the level-1 test subset:
# level1 is the gold standard, LABEL_pred the model prediction.
report_1 = pd.DataFrame(
    classification_report(test_data_1["level1"], test_data_1["LABEL_pred"], output_dict=True)
).T
report_1.to_csv(results_dir + "model_level1_report.csv")
print(report_1)

              precision    recall  f1-score      support
amusement      0.713959  0.840970  0.772277   371.000000
ang_ann        0.518908  0.489109  0.503568   505.000000
app_rea        0.471698  0.235405  0.314070   531.000000
caring         0.463768  0.457143  0.460432   140.000000
cur_con        0.532319  0.597015  0.562814   469.000000
des_opt        0.580420  0.610294  0.594982   272.000000
dis_sad        0.582960  0.433333  0.497132   300.000000
disapproval    0.445498  0.301282  0.359465   312.000000
disgust        0.710145  0.480392  0.573099   102.000000
exc_joy        0.596413  0.480144  0.532000   277.000000
fea_ner        0.631868  0.714286  0.670554   161.000000
gra_rel        0.863262  0.898799  0.880672   583.000000
grief          0.724138  0.875000  0.792453    72.000000
love           0.718016  0.862069  0.783476   319.000000
neutral        0.679543  0.717404  0.697960  3482.000000
pri_adm        0.670025  0.774381  0.718433   687.000000
rem_emb        0.656977  0.6608

In [None]:
# Final classification / visualisation input:
# relative frequency of every predicted label within test_data_1.
final_1 = (
    test_data_1["LABEL_pred"]
    .value_counts()
    .pipe(lambda counts: counts / counts.sum())  # absolute counts -> ratio
    .to_frame()
)
final_1.to_csv(results_dir + "model_level1_testdata_frequency.csv")
print(final_1.shape)
final_1

(18, 1)


Unnamed: 0_level_0,count
LABEL_pred,Unnamed: 1_level_1
neutral,0.412616
pri_adm,0.089123
gra_rel,0.068133
cur_con,0.059041
ang_ann,0.053429
amusement,0.049052
love,0.04299
des_opt,0.032102
app_rea,0.029745
exc_joy,0.025031


### BERT for level 2 -> 11 + 1 emotions

In [14]:
# Display settings so that the example text below is shown in full.
pd.set_option("display.max_colwidth", None)  # pandas default is 50; None disables truncation
pd.set_option("display.max_rows", 50)  # pandas default is 60; None would show every row

# Draw one random document id whose level-2 label is "dis_sad_gri" and show its row.
# random.sample returns a one-element list, which the membership filter accepts as-is.
dis_sad_gri_example = random.sample(
    distinct_backtranslated_df.loc[
        distinct_backtranslated_df["level2"] == "dis_sad_gri", "id"
    ].tolist(),
    k=1,
)
distinct_backtranslated_df[distinct_backtranslated_df["id"].isin(dis_sad_gri_example)]

Unnamed: 0,text,id,author,subreddit,rater_id,level0,level1,level2,level3,plutchik
15506,Very sad news for the man. Widespread travel disruption is expected for the evening commute,ed8cmox,legendfriend,unitedkingdom,33,sadness,dis_sad,dis_sad_gri,rem_emb_dis_sad_gri,grief


In [15]:
# Class distribution of the level-2 gold labels, most frequent first.
classCounts_2 = distinct_backtranslated_df["level2"].value_counts()
print(classCounts_2)
# -> the data set is not balanced

level2
neutral            17408
pri_adm_gra_rel     6347
sur_cur_con         3119
dis_ang_ann         3033
exc_joy_lov         2982
app_rea             2654
des_opt_car         2058
dis_sad_gri         1857
amusement           1854
disapproval         1560
rem_emb              855
fea_ner              804
Name: count, dtype: int64


In [16]:
# Number of documents (rows) used for the level-2 experiment.
numberOfDocuments_2 = distinct_backtranslated_df.shape[0]
numberOfDocuments_2

44531

In [17]:
# Run the BERT pipeline on the level-2 labels.
# NOTE(review): get_bert comes from training.bert_func; judging by the cell output it
# fine-tunes for three epochs and then scores every document -- confirm in that module.
(
    dataset_2,
    results_2,
    tokenized_testing_data_2,
    testing_data_2,
    label2id_2,
) = get_bert(distinct_backtranslated_df, "level2", bert, models_dir, results_dir)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Map:   0%|          | 0/35624 [00:00<?, ? examples/s]

Map:   0%|          | 0/8907 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.071,1.073981,0.656562
2,0.8563,1.056469,0.658359
3,0.6256,1.122537,0.653082


100%|██████████| 44531/44531 [09:20<00:00, 79.48it/s]


#### Evaluation

In [18]:
# Turn the raw predictions into a frame and attach the document ids.
# The ids are matched to the predictions purely by position, so results_2 is
# assumed to be in the same order as dataset_2 -- TODO confirm against get_bert.
df_results_2 = pd.DataFrame(results_2)
df_id_2 = dataset_2[["id"]].reset_index()  # fresh 0..n-1 index; old index kept as a column
df_results_2["id"] = df_id_2["id"]
df_results_2

Unnamed: 0,label,score,id
0,dis_sad_gri,0.956711,eew5j0j
1,dis_ang_ann,0.713933,ed2mah1
2,exc_joy_lov,0.979985,eeibobj
3,neutral,0.969840,eda6yn6
4,pri_adm_gra_rel,0.972447,eespn2i
...,...,...,...
44526,dis_sad_gri,0.938792,edrcnmk_cs
44527,dis_ang_ann,0.379392,eesyrax_cs
44528,dis_sad_gri,0.668152,ednta6j_cs
44529,dis_sad_gri,0.544050,ef9n3ld_cs


In [19]:
# Join the predictions onto the full data set (left join on the document id).
# Both frames carry a "label" column, so the merge yields label_x (from dataset_2)
# and label_y (from df_results_2); the latter is renamed to LABEL_pred.
# The result holds the gold standard next to the predicted label and its score.
data_classifies_2 = (
    pd.merge(dataset_2, df_results_2, on="id", how="left")
    .rename(columns={"label_y": "LABEL_pred"})
)
# Numeric id of the predicted label, looked up in the label -> id mapping.
data_classifies_2["LABEL_pred_num"] = data_classifies_2["LABEL_pred"].map(label2id_2.get)
data_classifies_2.to_pickle(results_dir + "data_classified_level2.pkl")
data_classifies_2

Unnamed: 0,text,id,author,subreddit,rater_id,level0,level1,level2,level3,plutchik,label_x,LABEL_pred,score,LABEL_pred_num
0,That game hurt.,eew5j0j,Brdd9,nrl,1,sadness,dis_sad,dis_sad_gri,rem_emb_dis_sad_gri,grief,0,dis_sad_gri,0.956711,0
1,"You do right, if you don't care then fuck 'em!",ed2mah1,Labalool,confessions,37,neutral,neutral,neutral,neutral,neutral,1,dis_ang_ann,0.713933,7
2,Man I love reddit.,eeibobj,MrsRobertshaw,facepalm,18,love,love,exc_joy_lov,amu_exc_joy_lov,love,2,exc_joy_lov,0.979985,2
3,"[NAME] was nowhere near them, he was by the Falcon.",eda6yn6,American_Fascist713,starwarsspeculation,2,neutral,neutral,neutral,neutral,neutral,1,neutral,0.969840,1
4,"Right? Considering it’s such an important document, I should know the damned thing backwards and forwards... thanks again for the help!",eespn2i,ImperialBoss,TrueReddit,61,gratitude,gra_rel,pri_adm_gra_rel,pri_adm_gra_rel_app_rea,awe,3,pri_adm_gra_rel,0.972447,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44526,Hmm. Today I'm going to listen to death and destruction.,edrcnmk_cs,zane_lame,weezer,24,grief,grief,dis_sad_gri,rem_emb_dis_sad_gri,grief,0,dis_sad_gri,0.938792,0
44527,And turning someone's hands into spaghetti is cruel and inhumane and not sensible punishment for stealing.,eesyrax_cs,AgentPaper0,HadToHurt,62,grief,grief,dis_sad_gri,rem_emb_dis_sad_gri,grief,0,dis_ang_ann,0.379392,7
44528,I'm glad you're doing a little better. I'm sorry about Grandma. Good luck.,ednta6j_cs,partytimetyler,AskMenOver30,12,grief,grief,dis_sad_gri,rem_emb_dis_sad_gri,grief,0,dis_sad_gri,0.668152,0
44529,I regret this contribution so many debates and anger lol,ef9n3ld_cs,NinjaMoleRat,ComedyCemetery,40,grief,grief,dis_sad_gri,rem_emb_dis_sad_gri,grief,0,dis_sad_gri,0.544050,0


In [20]:
# Keep only the rows whose id belongs to the tokenized test split.
# A vectorised membership test replaces the former f-string query, which pasted the
# repr of the whole id column into the expression: that only works while the repr is
# a valid Python list literal and forces pandas to parse thousands of string literals.
test_data_2 = data_classifies_2[
    data_classifies_2["id"].isin(list(tokenized_testing_data_2["id"]))
]

In [21]:
# Preview the test rows: gold labels (level2) next to the prediction (LABEL_pred) and its score.
test_data_2

Unnamed: 0,text,id,author,subreddit,rater_id,level0,level1,level2,level3,plutchik,label_x,LABEL_pred,score,LABEL_pred_num
2,Man I love reddit.,eeibobj,MrsRobertshaw,facepalm,18,love,love,exc_joy_lov,amu_exc_joy_lov,love,2,exc_joy_lov,0.979985,2
14,"Lots, play store or apple store vpn. Nord is good",edli9sc,RangoFandang,MorbidReality,51,admiration,pri_adm,pri_adm_gra_rel,pri_adm_gra_rel_app_rea,admiration,3,pri_adm_gra_rel,0.755713,3
19,"I can't stand [NAME]. Especially since her ""tatooing my own face"" video.",ed1ag17,trowellslut,BeautyGuruChatter,70,disapproval,disapproval,disapproval,dis_ang_ann_dis,loathing,4,disapproval,0.524703,4
22,By far the coolest thing I've seen on this thread yet,edm3k6w,W8nd3rW8man,popping,78,joy,exc_joy,exc_joy_lov,amu_exc_joy_lov,ecstasy,2,pri_adm_gra_rel,0.803153,3
25,If [NAME] has similar role to end year then no way am I taking those over him ...especially Sony,edo1hi2,noobiepoobie,fantasyfootball,52,neutral,neutral,neutral,neutral,neutral,1,neutral,0.577884,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44511,Get this guy out of the psych ward.,eedxtl7_cs,Radical_Wasabi,greysanatomy,50,grief,grief,dis_sad_gri,rem_emb_dis_sad_gri,grief,0,neutral,0.458468,1
44517,"Oh, my God, put that away.",edj2o0u_cs,sahali735,90DayFiance,12,grief,grief,dis_sad_gri,rem_emb_dis_sad_gri,grief,0,dis_ang_ann,0.300499,7
44519,Civilized people should only have one answer to such behavior: violence and death.,eepty8y_cs,No1451,TopMindsOfReddit,56,grief,grief,dis_sad_gri,rem_emb_dis_sad_gri,grief,0,dis_sad_gri,0.941735,0
44520,He died 4 days later of dehydration.,edwloev_cs,SickofInternetProgs,cringe,50,grief,grief,dis_sad_gri,rem_emb_dis_sad_gri,grief,0,dis_sad_gri,0.945385,0


In [22]:
# Report the dimensions (rows, columns) of the test split returned by get_bert.
r, c = pd.DataFrame(testing_data_2).shape
print(f"The test data has {r} row and {c} columns")

The test data has 8907 row and 12 columns


In [23]:
# Per-class precision / recall / F1 for the rows in test_data_2.
# level2 -> gold standard, LABEL_pred -> prediction
report_2 = pd.DataFrame(
    classification_report(
        y_true=test_data_2["level2"],
        y_pred=test_data_2["LABEL_pred"],
        output_dict=True,
    )
).T  # transpose: one row per class / summary entry, one column per metric
report_2.to_csv(results_dir + "model_level2_report.csv")
print(report_2)

                 precision    recall  f1-score      support
amusement         0.678723  0.859838  0.758621   371.000000
app_rea           0.492248  0.239171  0.321926   531.000000
des_opt_car       0.558696  0.623786  0.589450   412.000000
dis_ang_ann       0.495924  0.601318  0.543559   607.000000
dis_sad_gri       0.592697  0.568733  0.580468   371.000000
disapproval       0.419355  0.416667  0.418006   312.000000
exc_joy_lov       0.625526  0.748322  0.681436   596.000000
fea_ner           0.663043  0.757764  0.707246   161.000000
neutral           0.709893  0.669730  0.689227  3482.000000
pri_adm_gra_rel   0.806651  0.821907  0.814208  1269.000000
rem_emb           0.660920  0.672515  0.666667   171.000000
sur_cur_con       0.594311  0.636218  0.614551   624.000000
accuracy          0.658359  0.658359  0.658359     0.658359
macro avg         0.608166  0.634664  0.615447  8907.000000
weighted avg      0.657241  0.658359  0.653593  8907.000000


In [24]:
# Final classification / visualisation input:
# relative frequency of every predicted label within test_data_2.
final_2 = (
    test_data_2["LABEL_pred"]
    .value_counts()
    .pipe(lambda counts: counts / counts.sum())  # absolute counts -> ratio
    .to_frame()
)
final_2.to_csv(results_dir + "model_level2_testdata_frequency.csv")
print(final_2.shape)
final_2

(12, 1)


Unnamed: 0_level_0,count
LABEL_pred,Unnamed: 1_level_1
neutral,0.368811
pri_adm_gra_rel,0.145167
dis_ang_ann,0.082632
exc_joy_lov,0.080049
sur_cur_con,0.074997
amusement,0.052767
des_opt_car,0.051645
dis_sad_gri,0.039969
disapproval,0.034804
app_rea,0.028966


### BERT for level 3 -> 7 + 1 emotions

In [19]:
# Display settings so that the example text below is shown in full.
pd.set_option("display.max_colwidth", None)  # pandas default is 50; None disables truncation
pd.set_option("display.max_rows", 50)  # pandas default is 60; None would show every row

# Draw one random document id whose level-3 label is "rem_emb_dis_sad_gri" and show its row.
# random.sample returns a one-element list, which the membership filter accepts as-is.
rem_emb_dis_sad_gri_example = random.sample(
    distinct_backtranslated_df.loc[
        distinct_backtranslated_df["level3"] == "rem_emb_dis_sad_gri", "id"
    ].tolist(),
    k=1,
)
distinct_backtranslated_df[distinct_backtranslated_df["id"].isin(rem_emb_dis_sad_gri_example)]

Unnamed: 0,text,id,author,subreddit,rater_id,level0,level1,level2,level3,plutchik
40786,And I've no monkeys left.... Sorry man,efdctt3,FlacidButPlacid,ChoosingBeggars,34,disappointment,dis_sad,dis_sad_gri,rem_emb_dis_sad_gri,disapproval


In [20]:
# Class distribution of the level-3 gold labels, most frequent first.
classCounts_3 = distinct_backtranslated_df["level3"].value_counts()
print(classCounts_3)
# -> the data set is not balanced

level3
neutral                    17408
pri_adm_gra_rel_app_rea     9001
amu_exc_joy_lov             4836
dis_ang_ann_dis             4593
sur_cur_con                 3119
rem_emb_dis_sad_gri         2712
des_opt_car                 2058
fea_ner                      804
Name: count, dtype: int64


In [21]:
# Number of documents (rows) used for the level-3 experiment.
numberOfDocuments_3 = distinct_backtranslated_df.shape[0]
numberOfDocuments_3

44531

In [22]:
# Run the BERT pipeline on the level-3 labels.
# NOTE(review): get_bert comes from training.bert_func; judging by the cell output it
# fine-tunes for three epochs and then scores every document -- confirm in that module.
(
    dataset_3,
    results_3,
    tokenized_testing_data_3,
    testing_data_3,
    label2id_3,
) = get_bert(distinct_backtranslated_df, "level3", bert, models_dir, results_dir)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Map:   0%|          | 0/35624 [00:00<?, ? examples/s]

Map:   0%|          | 0/8907 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.9495,0.898614,0.681262
2,0.7337,0.912634,0.686426
3,0.5097,1.014398,0.680701


100%|██████████| 44531/44531 [08:41<00:00, 85.36it/s]


#### Evaluation

In [23]:
# Turn the raw predictions into a frame and attach the document ids.
# The ids are matched to the predictions purely by position, so results_3 is
# assumed to be in the same order as dataset_3 -- TODO confirm against get_bert.
df_results_3 = pd.DataFrame(results_3)
df_id_3 = dataset_3[["id"]].reset_index()  # fresh 0..n-1 index; old index kept as a column
df_results_3["id"] = df_id_3["id"]
df_results_3

Unnamed: 0,label,score,id
0,rem_emb_dis_sad_gri,0.907100,eew5j0j
1,dis_ang_ann_dis,0.456828,ed2mah1
2,amu_exc_joy_lov,0.922782,eeibobj
3,neutral,0.928002,eda6yn6
4,pri_adm_gra_rel_app_rea,0.958941,eespn2i
...,...,...,...
44526,rem_emb_dis_sad_gri,0.859186,edrcnmk_cs
44527,dis_ang_ann_dis,0.464114,eesyrax_cs
44528,rem_emb_dis_sad_gri,0.336438,ednta6j_cs
44529,rem_emb_dis_sad_gri,0.533425,ef9n3ld_cs


In [24]:
# Join the predictions onto the full data set (left join on the document id).
# Both frames carry a "label" column, so the merge yields label_x (from dataset_3)
# and label_y (from df_results_3); the latter is renamed to LABEL_pred.
# The result holds the gold standard next to the predicted label and its score.
data_classifies_3 = (
    pd.merge(dataset_3, df_results_3, on="id", how="left")
    .rename(columns={"label_y": "LABEL_pred"})
)
# Numeric id of the predicted label, looked up in the label -> id mapping.
data_classifies_3["LABEL_pred_num"] = data_classifies_3["LABEL_pred"].map(label2id_3.get)
data_classifies_3.to_pickle(results_dir + "data_classified_level3.pkl")
data_classifies_3

Unnamed: 0,text,id,author,subreddit,rater_id,level0,level1,level2,level3,plutchik,label_x,LABEL_pred,score,LABEL_pred_num
0,That game hurt.,eew5j0j,Brdd9,nrl,1,sadness,dis_sad,dis_sad_gri,rem_emb_dis_sad_gri,grief,0,rem_emb_dis_sad_gri,0.907100,0
1,"You do right, if you don't care then fuck 'em!",ed2mah1,Labalool,confessions,37,neutral,neutral,neutral,neutral,neutral,1,dis_ang_ann_dis,0.456828,4
2,Man I love reddit.,eeibobj,MrsRobertshaw,facepalm,18,love,love,exc_joy_lov,amu_exc_joy_lov,love,2,amu_exc_joy_lov,0.922782,2
3,"[NAME] was nowhere near them, he was by the Falcon.",eda6yn6,American_Fascist713,starwarsspeculation,2,neutral,neutral,neutral,neutral,neutral,1,neutral,0.928002,1
4,"Right? Considering it’s such an important document, I should know the damned thing backwards and forwards... thanks again for the help!",eespn2i,ImperialBoss,TrueReddit,61,gratitude,gra_rel,pri_adm_gra_rel,pri_adm_gra_rel_app_rea,awe,3,pri_adm_gra_rel_app_rea,0.958941,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44526,Hmm. Today I'm going to listen to death and destruction.,edrcnmk_cs,zane_lame,weezer,24,grief,grief,dis_sad_gri,rem_emb_dis_sad_gri,grief,0,rem_emb_dis_sad_gri,0.859186,0
44527,And turning someone's hands into spaghetti is cruel and inhumane and not sensible punishment for stealing.,eesyrax_cs,AgentPaper0,HadToHurt,62,grief,grief,dis_sad_gri,rem_emb_dis_sad_gri,grief,0,dis_ang_ann_dis,0.464114,4
44528,I'm glad you're doing a little better. I'm sorry about Grandma. Good luck.,ednta6j_cs,partytimetyler,AskMenOver30,12,grief,grief,dis_sad_gri,rem_emb_dis_sad_gri,grief,0,rem_emb_dis_sad_gri,0.336438,0
44529,I regret this contribution so many debates and anger lol,ef9n3ld_cs,NinjaMoleRat,ComedyCemetery,40,grief,grief,dis_sad_gri,rem_emb_dis_sad_gri,grief,0,rem_emb_dis_sad_gri,0.533425,0


In [25]:
# Keep only the rows whose id belongs to the tokenized test split.
# A vectorised membership test replaces the former f-string query, which pasted the
# repr of the whole id column into the expression: that only works while the repr is
# a valid Python list literal and forces pandas to parse thousands of string literals.
test_data_3 = data_classifies_3[
    data_classifies_3["id"].isin(list(tokenized_testing_data_3["id"]))
]

In [26]:
# Preview the test rows: gold labels (level3) next to the prediction (LABEL_pred) and its score.
test_data_3

Unnamed: 0,text,id,author,subreddit,rater_id,level0,level1,level2,level3,plutchik,label_x,LABEL_pred,score,LABEL_pred_num
8,Build a wall? /jk,edsqvyx,[deleted],AskMenOver30,10,neutral,neutral,neutral,neutral,neutral,1,neutral,0.588105,1
13,"Yup, not anymore. Keep your blood sugar up! It really helps and DRINK water...",ee64ows,atomicchuckle,Teachers,60,caring,caring,des_opt_car,des_opt_car,admiration,6,pri_adm_gra_rel_app_rea,0.441573,3
29,"There has to be some repercussions for this judge, right? Considering the crimes, that bail seems incredibly low...",effsj1p,momlostinfuzz,nottheonion,74,disapproval,disapproval,disapproval,dis_ang_ann_dis,loathing,4,neutral,0.599494,1
40,I feel this. For me since I’m using those gamethreads to let out frustration but realistically I know we need to keep patient in this re-rebuild,edlsovm,Shams_wojnarowski,OrlandoMagic,26,neutral,neutral,neutral,neutral,neutral,1,neutral,0.390917,1
42,Best side quest ever!,eevt2yy,Brrringsaythealiens,truegaming,61,admiration,pri_adm,pri_adm_gra_rel,pri_adm_gra_rel_app_rea,admiration,3,pri_adm_gra_rel_app_rea,0.915054,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44513,My thoughts look just like the dead.,eestrba_cs,Marouli29,StarWarsBattlefront,35,grief,grief,dis_sad_gri,rem_emb_dis_sad_gri,grief,0,neutral,0.457106,1
44517,"Oh, my God, put that away.",edj2o0u_cs,sahali735,90DayFiance,12,grief,grief,dis_sad_gri,rem_emb_dis_sad_gri,grief,0,rem_emb_dis_sad_gri,0.591108,0
44524,"I'm really sorry for your loss, my friend, I hope everything's all right on your side.",edcpzgq_cs,SwampBollocks,mentalhealth,3,grief,grief,dis_sad_gri,rem_emb_dis_sad_gri,grief,0,rem_emb_dis_sad_gri,0.852528,0
44525,I'm sorry your wife suffered a tragic injury.,ee23ppc_cs,bideaweebaby,DeadBedrooms,15,grief,grief,dis_sad_gri,rem_emb_dis_sad_gri,grief,0,rem_emb_dis_sad_gri,0.886416,0


In [27]:
# Report the dimensions (rows, columns) of the test split returned by get_bert.
r, c = pd.DataFrame(testing_data_3).shape
print(f"The test data has {r} row and {c} columns")

The test data has 8907 row and 12 columns


In [28]:
# Per-class precision / recall / F1 for the rows in test_data_3.
# level3 -> gold standard, LABEL_pred -> prediction
report_3 = pd.DataFrame(
    classification_report(
        y_true=test_data_3["level3"],
        y_pred=test_data_3["LABEL_pred"],
        output_dict=True,
    )
).T  # transpose: one row per class / summary entry, one column per metric
report_3.to_csv(results_dir + "model_level3_report.csv")
print(report_3)

                         precision    recall  f1-score      support
amu_exc_joy_lov           0.787419  0.750776  0.768661   967.000000
des_opt_car               0.650641  0.492718  0.560773   412.000000
dis_ang_ann_dis           0.618182  0.406964  0.490814   919.000000
fea_ner                   0.662252  0.621118  0.641026   161.000000
neutral                   0.660063  0.784607  0.716966  3482.000000
pri_adm_gra_rel_app_rea   0.714442  0.725556  0.719956  1800.000000
rem_emb_dis_sad_gri       0.671429  0.607011  0.637597   542.000000
sur_cur_con               0.647826  0.477564  0.549815   624.000000
accuracy                  0.681262  0.681262  0.681262     0.681262
macro avg                 0.676532  0.608289  0.635701  8907.000000
weighted avg              0.679996  0.681262  0.674712  8907.000000


In [29]:
# Final classification / visualisation input:
# relative frequency of every predicted label within test_data_3.
final_3 = (
    test_data_3["LABEL_pred"]
    .value_counts()
    .pipe(lambda counts: counts / counts.sum())  # absolute counts -> ratio
    .to_frame()
)
final_3.to_csv(results_dir + "model_level3_testdata_frequency.csv")
print(final_3.shape)
final_3

(8, 1)


Unnamed: 0_level_0,count
LABEL_pred,Unnamed: 1_level_1
neutral,0.464691
pri_adm_gra_rel_app_rea,0.205232
amu_exc_joy_lov,0.103514
dis_ang_ann_dis,0.067924
rem_emb_dis_sad_gri,0.055013
sur_cur_con,0.051645
des_opt_car,0.035029
fea_ner,0.016953


### BERT for plutchik -> 14 + 1 emotions

In [30]:
# Display settings so that the example text below is shown in full.
pd.set_option("display.max_colwidth", None)  # pandas default is 50; None disables truncation
pd.set_option("display.max_rows", 50)  # pandas default is 60; None would show every row

# Draw one random document id whose plutchik label is "grief" and show its row.
# random.sample returns a one-element list, which the membership filter accepts as-is.
grief_example = random.sample(
    distinct_backtranslated_df.loc[
        distinct_backtranslated_df["plutchik"] == "grief", "id"
    ].tolist(),
    k=1,
)
distinct_backtranslated_df[distinct_backtranslated_df["id"].isin(grief_example)]

Unnamed: 0,text,id,author,subreddit,rater_id,level0,level1,level2,level3,plutchik,label
11816,"I play those characters all of the time and have never, ever experienced this.",eepkge5,ThStyleofDemons,StarWarsBattlefront,56,sadness,dis_sad,dis_sad_gri,rem_emb_dis_sad_gri,grief,0


In [31]:
# Class distribution of the plutchik gold labels, most frequent first.
classCounts_p = distinct_backtranslated_df["plutchik"].value_counts()
print(classCounts_p)
# -> the data set is not balanced

plutchik
neutral           17408
admiration         5838
ecstasy            3240
awe                2982
rage               2523
loathing           2070
love               2016
vigilance          2009
amazement          1712
optimism           1321
grief              1186
terror              804
disapproval         671
remorse             405
aggressiveness      346
Name: count, dtype: int64


In [32]:
# Number of documents (rows) used for the plutchik experiment.
numberOfDocuments_p = distinct_backtranslated_df.shape[0]
numberOfDocuments_p

44531

In [33]:
# Run the BERT pipeline on the plutchik labels.
# NOTE(review): get_bert comes from training.bert_func; judging by the cell output it
# fine-tunes for three epochs and then scores every document -- confirm in that module.
(
    dataset_p,
    results_p,
    tokenized_testing_data_p,
    testing_data_p,
    label2id_p,
) = get_bert(distinct_backtranslated_df, "plutchik", bert, models_dir, results_dir)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Map:   0%|          | 0/35625 [00:00<?, ? examples/s]

Map:   0%|          | 0/8906 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.1675,1.115874,0.645969
2,0.9406,1.076242,0.660341
3,0.6989,1.136521,0.652706


100%|██████████| 44531/44531 [08:38<00:00, 85.85it/s]


#### Evaluation

In [34]:
# Turn the raw predictions into a frame and attach the document ids.
# The ids are matched to the predictions purely by position, so results_p is
# assumed to be in the same order as dataset_p -- TODO confirm against get_bert.
df_results_p = pd.DataFrame(results_p)
df_id_p = dataset_p[["id"]].reset_index()  # fresh 0..n-1 index; old index kept as a column
df_results_p["id"] = df_id_p["id"]
df_results_p

Unnamed: 0,label,score,id
0,grief,0.868497,eew5j0j
1,rage,0.700160,ed2mah1
2,love,0.970830,eeibobj
3,neutral,0.957046,eda6yn6
4,awe,0.982999,eespn2i
...,...,...,...
44526,grief,0.911037,edrcnmk_cs
44527,grief,0.790627,eesyrax_cs
44528,grief,0.443117,ednta6j_cs
44529,grief,0.599169,ef9n3ld_cs


In [35]:
# Join the predictions onto the full data set (left join on the document id).
# Both frames carry a "label" column, so the merge yields label_x (from dataset_p)
# and label_y (from df_results_p); the latter is renamed to LABEL_pred.
# The result holds the gold standard next to the predicted label and its score.
data_classifies_p = (
    pd.merge(dataset_p, df_results_p, on="id", how="left")
    .rename(columns={"label_y": "LABEL_pred"})
)
# Numeric id of the predicted label, looked up in the label -> id mapping.
data_classifies_p["LABEL_pred_num"] = data_classifies_p["LABEL_pred"].map(label2id_p.get)
data_classifies_p.to_pickle(results_dir + "data_classified_plutchik.pkl")
data_classifies_p

Unnamed: 0,text,id,author,subreddit,rater_id,level0,level1,level2,level3,plutchik,label_x,LABEL_pred,score,LABEL_pred_num
0,That game hurt.,eew5j0j,Brdd9,nrl,1,sadness,dis_sad,dis_sad_gri,rem_emb_dis_sad_gri,grief,0,grief,0.868497,0
1,"You do right, if you don't care then fuck 'em!",ed2mah1,Labalool,confessions,37,neutral,neutral,neutral,neutral,neutral,1,rage,0.700160,9
2,Man I love reddit.,eeibobj,MrsRobertshaw,facepalm,18,love,love,exc_joy_lov,amu_exc_joy_lov,love,2,love,0.970830,2
3,"[NAME] was nowhere near them, he was by the Falcon.",eda6yn6,American_Fascist713,starwarsspeculation,2,neutral,neutral,neutral,neutral,neutral,1,neutral,0.957046,1
4,"Right? Considering it’s such an important document, I should know the damned thing backwards and forwards... thanks again for the help!",eespn2i,ImperialBoss,TrueReddit,61,gratitude,gra_rel,pri_adm_gra_rel,pri_adm_gra_rel_app_rea,awe,3,awe,0.982999,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44526,Hmm. Today I'm going to listen to death and destruction.,edrcnmk_cs,zane_lame,weezer,24,grief,grief,dis_sad_gri,rem_emb_dis_sad_gri,grief,0,grief,0.911037,0
44527,And turning someone's hands into spaghetti is cruel and inhumane and not sensible punishment for stealing.,eesyrax_cs,AgentPaper0,HadToHurt,62,grief,grief,dis_sad_gri,rem_emb_dis_sad_gri,grief,0,grief,0.790627,0
44528,I'm glad you're doing a little better. I'm sorry about Grandma. Good luck.,ednta6j_cs,partytimetyler,AskMenOver30,12,grief,grief,dis_sad_gri,rem_emb_dis_sad_gri,grief,0,grief,0.443117,0
44529,I regret this contribution so many debates and anger lol,ef9n3ld_cs,NinjaMoleRat,ComedyCemetery,40,grief,grief,dis_sad_gri,rem_emb_dis_sad_gri,grief,0,grief,0.599169,0


In [36]:
# Keep only the rows whose id belongs to the tokenized test split.
# A vectorised membership test replaces the former f-string query, which pasted the
# repr of the whole id column into the expression: that only works while the repr is
# a valid Python list literal and forces pandas to parse thousands of string literals.
test_data_p = data_classifies_p[
    data_classifies_p["id"].isin(list(tokenized_testing_data_p["id"]))
]

In [37]:
# Preview the test rows: gold labels (plutchik) next to the prediction (LABEL_pred) and its score.
test_data_p

Unnamed: 0,text,id,author,subreddit,rater_id,level0,level1,level2,level3,plutchik,label_x,LABEL_pred,score,LABEL_pred_num
1,"You do right, if you don't care then fuck 'em!",ed2mah1,Labalool,confessions,37,neutral,neutral,neutral,neutral,neutral,1,rage,0.700160,9
4,"Right? Considering it’s such an important document, I should know the damned thing backwards and forwards... thanks again for the help!",eespn2i,ImperialBoss,TrueReddit,61,gratitude,gra_rel,pri_adm_gra_rel,pri_adm_gra_rel_app_rea,awe,3,awe,0.982999,3
10,this definitely fits in r/BoneAppleTea.,ee9jw7b,sparkthelynx,KidsAreFuckingStupid,67,neutral,neutral,neutral,neutral,neutral,1,admiration,0.653405,8
19,"I can't stand [NAME]. Especially since her ""tatooing my own face"" video.",ed1ag17,trowellslut,BeautyGuruChatter,70,disapproval,disapproval,disapproval,dis_ang_ann_dis,loathing,4,neutral,0.472601,1
21,"Pity. I had some decent lunches there, but never went there at night.",ee04wu6,The_Inflicted,nashville,16,remorse,rem_emb,rem_emb,rem_emb_dis_sad_gri,remorse,12,disapproval,0.300584,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44503,"The experiment was successful, it cries and doesn't talk to them because they're dead.",eddtnpk_cs,MayaLou09,TwoSentenceHorror,1,grief,grief,dis_sad_gri,rem_emb_dis_sad_gri,grief,0,grief,0.829756,0
44507,"Hell, I've played this game almost every day for over two years, and I loved the regime of domination.",ee6360j_cs,Tesseract14,gifs,52,grief,grief,dis_sad_gri,rem_emb_dis_sad_gri,grief,0,love,0.623781,2
44515,That pathetic compassion.,eep7dfq_cs,ForenzaAsmr,Overwatch,1,grief,grief,dis_sad_gri,rem_emb_dis_sad_gri,grief,0,grief,0.823062,0
44521,"You were a nice girl, it's been over 10 years, but I still miss you.",edjhszz_cs,ClearanceItem,FunnyandSad,74,grief,grief,dis_sad_gri,rem_emb_dis_sad_gri,grief,0,grief,0.859303,0


In [38]:
# Report the dimensions (rows, columns) of the test split returned by get_bert.
r, c = pd.DataFrame(testing_data_p).shape
print(f"The test data has {r} row and {c} columns")

The test data has 8906 row and 12 columns


In [39]:
# Per-class precision / recall / F1 for the rows in test_data_p.
# plutchik -> gold standard, LABEL_pred -> prediction
report_p = pd.DataFrame(
    classification_report(
        y_true=test_data_p["plutchik"],
        y_pred=test_data_p["LABEL_pred"],
        output_dict=True,
    )
).T  # transpose: one row per class / summary entry, one column per metric
report_p.to_csv(results_dir + "model_plutchik_report.csv")
print(report_p)

                precision    recall  f1-score      support
admiration       0.617021  0.620719  0.618865  1168.000000
aggressiveness   0.850000  0.739130  0.790698    69.000000
amazement        0.614754  0.438596  0.511945   342.000000
awe              0.843960  0.843960  0.843960   596.000000
disapproval      0.373333  0.208955  0.267943   134.000000
ecstasy          0.671105  0.777778  0.720515   648.000000
grief            0.605166  0.691983  0.645669   237.000000
loathing         0.514493  0.342995  0.411594   414.000000
love             0.676768  0.831266  0.746102   403.000000
neutral          0.698474  0.723148  0.710597  3482.000000
optimism         0.584906  0.587121  0.586011   264.000000
rage             0.569161  0.497030  0.530655   505.000000
remorse          0.566372  0.790123  0.659794    81.000000
terror           0.708861  0.695652  0.702194   161.000000
vigilance        0.469816  0.445274  0.457216   402.000000
accuracy         0.660341  0.660341  0.660341     0.6603

In [40]:
# Final classification / visualisation input:
# relative frequency of every predicted label within test_data_p.
final_p = (
    test_data_p["LABEL_pred"]
    .value_counts()
    .pipe(lambda counts: counts / counts.sum())  # absolute counts -> ratio
    .to_frame()
)
final_p.to_csv(results_dir + "model_plutchik_testdata_frequency.csv")
print(final_p.shape)
final_p

(15, 1)


Unnamed: 0_level_0,count
LABEL_pred,Unnamed: 1_level_1
neutral,0.404783
admiration,0.131934
ecstasy,0.084325
awe,0.066921
love,0.055581
rage,0.049517
vigilance,0.04278
loathing,0.03099
grief,0.030429
optimism,0.029755
