# Results

In this notebook we show the results of our experiments for both tasks

In [191]:
%load_ext autoreload
%autoreload 2

base_path = "../evaluations/es"
!ls $base_path

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
bertin.json	   birnn_cc.json       ffn_twitter.json    rnn_twitter.json
beto_cased.json    birnn_twitter.json  mbert_uncased.json  roberta.json
beto_uncased.json  ffn_cc.json	       rnn_cc.json	   robertuito.json


In [192]:
import os
import glob
import json

def clean_key(k):
    return k.split("_", 1)[1]


evaluation_paths = glob.glob(f"{base_path}/*.json")

models = {}

for path in evaluation_paths:
    print(path)
    name = os.path.basename(path).split(".")[0]
    with open(path) as f:
        model_evaluation = json.load(f)
        clean_evaluations = []
        for task in model_evaluation["evaluations"].keys():
            task_evaluations = model_evaluation["evaluations"][task]
            clean_evaluations = [
                {clean_key(k): v for k, v in ev.items()} 
                for ev in task_evaluations
            ]

            model_evaluation["evaluations"][task] = clean_evaluations
        models[name] = model_evaluation
        
models.keys()

../evaluations/es/birnn_twitter.json
../evaluations/es/ffn_twitter.json
../evaluations/es/roberta.json
../evaluations/es/beto_uncased.json
../evaluations/es/mbert_uncased.json
../evaluations/es/birnn_cc.json
../evaluations/es/robertuito.json
../evaluations/es/bertin.json
../evaluations/es/rnn_twitter.json
../evaluations/es/beto_cased.json
../evaluations/es/rnn_cc.json
../evaluations/es/ffn_cc.json


dict_keys(['birnn_twitter', 'ffn_twitter', 'roberta', 'beto_uncased', 'mbert_uncased', 'birnn_cc', 'robertuito', 'bertin', 'rnn_twitter', 'beto_cased', 'rnn_cc', 'ffn_cc'])

In [194]:
metrics = {
    "hate_speech": "macro_f1",
    "sentiment": "macro_f1",
    "emotion": "macro_f1",
    "irony": "macro_f1",
}

results = []

for model, model_evaluation in models.items():
    print("="*50)
    print(model)
    for task, task_evaluations in model_evaluation["evaluations"].items():
        print(task)
        print(len(task_evaluations))
        for evaluation in task_evaluations:
            metric = metrics[task]

            ## TODO
            if metric not in evaluation:
                metric = "hateful_f1"
            results.append({
                "model": model,
                "task": task,
                "metric": evaluation[metric],
            })

birnn_twitter
hate_speech
10
sentiment
10
emotion
10
irony
10
ffn_twitter
hate_speech
10
sentiment
10
emotion
10
irony
10
roberta
hate_speech
10
sentiment
10
emotion
10
irony
10
beto_uncased
hate_speech
10
sentiment
10
emotion
10
irony
10
mbert_uncased
hate_speech
10
sentiment
10
emotion
10
irony
10
birnn_cc
hate_speech
10
sentiment
10
emotion
10
irony
10
robertuito
hate_speech
10
sentiment
10
emotion
10
irony
10
bertin
hate_speech
10
sentiment
10
emotion
10
irony
9
rnn_twitter
hate_speech
10
sentiment
10
emotion
10
irony
10
beto_cased
hate_speech
10
sentiment
10
emotion
10
irony
10
rnn_cc
hate_speech
10
sentiment
10
emotion
10
irony
10
ffn_cc
hate_speech
10
sentiment
10
emotion
10
irony
10


In [195]:
import pandas as pd

pd.options.display.float_format = '{:.3f}'.format

mean_df = pd.DataFrame(results).groupby(["model", "task"]).mean().stack()
std_df = pd.DataFrame(results).groupby(["model", "task"]).mean().stack()
# Magia negra
mean_df.index = mean_df.index.droplevel(-1)
std_df.index = std_df.index.droplevel(-1)

mean_df = mean_df.unstack(1)
std_df = std_df.unstack(1)

mean_df["score"] = mean_df.mean(1)

mean_df.sort_values("score", ascending=False)


task,emotion,hate_speech,irony,sentiment,score
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
robertuito,0.56,0.759,0.739,0.705,0.691
roberta,0.527,0.741,0.721,0.67,0.665
bertin,0.524,0.738,0.713,0.666,0.66
beto_uncased,0.532,0.727,0.701,0.651,0.653
beto_cased,0.516,0.724,0.705,0.662,0.652
mbert_uncased,0.493,0.718,0.681,0.617,0.627
birnn_twitter,0.264,0.592,0.631,0.585,0.518
rnn_twitter,0.269,0.538,0.628,0.602,0.509
birnn_cc,0.231,0.534,0.625,0.553,0.486
rnn_cc,0.237,0.516,0.581,0.564,0.474


In [198]:
mean_df.to_string(

[0;31mSignature:[0m
[0mmean_df[0m[0;34m.[0m[0mto_string[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mbuf[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mstr[0m[0;34m,[0m [0mpathlib[0m[0;34m.[0m[0mPath[0m[0;34m,[0m [0mIO[0m[0;34m[[0m[0mstr[0m[0;34m][0m[0;34m,[0m [0mNoneType[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcolumns[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mSequence[0m[0;34m[[0m[0mstr[0m[0;34m][0m[0;34m,[0m [0mNoneType[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcol_space[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mint[0m[0;34m,[0m [0mNoneType[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mheader[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mbool[0m[0;34m,[0m [0mSequence[0m[0;34m[[0m[0mstr[0m[0;34m][0m[0;34m][0m [0;34m=[0m [0;32mTrue[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mindex[0m[0;34m

In [76]:
data = []



for model_name, model_info in models.items():
    print("===========")
    print(f"{model_name}")
    for task, task_runs in model_info["evaluations"].items():
        print(len(task_runs))    

ffn_twitter
10
10
10
10
mbert_uncased
1
1
0
0
birnn_cc
5
4
4
4
rnn_twitter
10
10
10
9
beto_cased
4
4
3
3
rnn_cc
10
10
10
10
ffn_cc
10
10
10
10


In [10]:

import glob 
import json 

tasks = ["sentiment", "emotion"]

files = [(task, f) for task in tasks for f in glob.glob(f"../evaluations/{task}_*.json")]

evaluations = []

for task, file in files:
    print(file)
    with open(file) as f:
        evaluation = json.load(f)
        evaluation["task"] = task
        evaluation["file"] = file.split("/")[-1]
        evaluations.append(evaluation)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
../evaluations/sentiment_beto.json
../evaluations/sentiment_roberta_base.json
../evaluations/sentiment_mbert_en.json
../evaluations/sentiment_bert_base.json
../evaluations/sentiment_distilbert_es.json
../evaluations/sentiment_bertweet_base.json
../evaluations/sentiment_mbert_es.json
../evaluations/sentiment_distilbert_en.json
../evaluations/emotion_bertweet_base.json
../evaluations/emotion_beto.json
../evaluations/emotion_mbert_es.json
../evaluations/emotion_mbert_en.json
../evaluations/emotion_roberta.json
../evaluations/emotion_bert_base.json
../evaluations/emotion_distilbert_en.json
../evaluations/emotion_distilbert_es.json


In [11]:
import pandas as pd 


dfs = {}

for task in tasks:
    df =  pd.DataFrame([
        {**evaluation, **evaluation["metrics"]} for evaluation in evaluations if evaluation["task"] == task
    ])

    df.drop(labels=["predictions", "labels", "metrics", "file"], inplace=True, axis=1)
    df["model"] = df["model"].str.replace("models/", "")

    df["model"] = df["model"].str.replace(f"-{task}-analysis/", "")
    df.columns = [col.replace("test_", "").replace("_", " ").capitalize() for col in df.columns]
    #df.set_index("Model", inplace=True)
    df = df.sort_values(["Lang", "Macro f1"]) 
    dfs[task] = df

df = dfs["sentiment"].merge(dfs["emotion"], on="Model", suffixes=("", "_emotion"))

In [12]:
df.columns

Index(['Lang', 'Model', 'Task', 'Loss', 'Neg f1', 'Neg precision',
       'Neg recall', 'Neu f1', 'Neu precision', 'Neu recall', 'Pos f1',
       'Pos precision', 'Pos recall', 'Micro f1', 'Macro f1',
       'Macro precision', 'Macro recall', 'Acc', 'Runtime',
       'Samples per second', 'Lang_emotion', 'Task_emotion', 'Loss_emotion',
       'Others f1', 'Others precision', 'Others recall', 'Joy f1',
       'Joy precision', 'Joy recall', 'Sadness f1', 'Sadness precision',
       'Sadness recall', 'Anger f1', 'Anger precision', 'Anger recall',
       'Surprise f1', 'Surprise precision', 'Surprise recall', 'Disgust f1',
       'Disgust precision', 'Disgust recall', 'Fear f1', 'Fear precision',
       'Fear recall', 'Micro f1_emotion', 'Macro f1_emotion',
       'Macro precision_emotion', 'Macro recall_emotion', 'Acc_emotion',
       'Runtime_emotion', 'Samples per second_emotion'],
      dtype='object')

In [20]:
base_columns = ["Model", "Lang"]

df.sort_values(["Lang", "Macro f1"], inplace=True) 
sentiment_columns = [
    #"Pos f1", 
    #"Neu f1", 
    #"Neg f1", 
    "Micro f1",
    "Macro f1"
]
emotion_columns = [
    #"Joy f1",
    #"Others f1",
    #"Sadness f1",
    #"Anger f1",
    #"Disgust f1",
    "Micro f1_emotion",
    "Macro f1_emotion"
]

print(df[base_columns + sentiment_columns + emotion_columns].to_latex(index=False, float_format="{0:.3f}".format, col_space=12))

\begin{tabular}{llrrrr}
\toprule
        Model &         Lang &     Micro f1 &     Macro f1 &  Micro f1\_emotion &  Macro f1\_emotion \\
\midrule
distilbert-en &           en &        0.649 &        0.642 &             0.503 &             0.383 \\
     mbert-en &           en &        0.645 &        0.643 &             0.516 &             0.394 \\
 roberta-base &           en &        0.686 &        0.684 &             0.563 &             0.445 \\
    bert-base &           en &        0.686 &        0.684 &             0.559 &             0.439 \\
bertweet-base &           en &        0.697 &        0.696 &             0.584 &             0.476 \\
distilbert-es &           es &        0.602 &        0.599 &             0.600 &             0.463 \\
     mbert-es &           es &        0.609 &        0.604 &             0.610 &             0.474 \\
         beto &           es &        0.672 &        0.667 &             0.688 &             0.548 \\
\bottomrule
\end{tabular}



In [4]:
f1_columns = [col for col in df.columns if "f1" in col and "Macro" not in col]
print(df[["Model", "Lang"] + f1_columns + ["Macro f1"]].to_markdown(index=False))

| Model         | Lang   |   Others f1 |   Joy f1 |   Sadness f1 |   Anger f1 |   Surprise f1 |   Disgust f1 |   Fear f1 |   Macro f1 |
|:--------------|:-------|------------:|---------:|-------------:|-----------:|--------------:|-------------:|----------:|-----------:|
| distilbert-en | en     |    0.509407 | 0.665484 |     0.502165 |   0.30303  |      0.147541 |     0.351111 |  0.201835 |   0.382939 |
| mbert-en      | en     |    0.547718 | 0.650691 |     0.544643 |   0.29703  |      0.102941 |     0.356989 |  0.255319 |   0.393619 |
| bert-base     | en     |    0.59398  | 0.683521 |     0.541485 |   0.355769 |      0.238411 |     0.408867 |  0.252632 |   0.439238 |
| roberta-base  | en     |    0.573888 | 0.689527 |     0.547945 |   0.363636 |      0.212389 |     0.472656 |  0.255814 |   0.445122 |
| bertweet-base | en     |    0.606019 | 0.711069 |     0.608696 |   0.433862 |      0.257669 |     0.452489 |  0.26     |   0.475686 |
| distilbert-es | es     |    0.678962 | 0.60223