# Results

In this notebook we show the results of our experiments for both tasks

In [1]:
%load_ext autoreload
%autoreload 2

base_path = "../evaluations/es"
!ls $base_path

bertin.json	   birnn_twitter.json  mbert_uncased.json  robertuito.json
beto_cased.json    electricidad.json   rnn_cc.json
beto_uncased.json  ffn_cc.json	       rnn_twitter.json
birnn_cc.json	   ffn_twitter.json    roberta.json


In [2]:
import os
import glob
import json

def clean_key(k):
    return k.split("_", 1)[1]


evaluation_paths = glob.glob(f"{base_path}/*.json")

models = {}

for path in evaluation_paths:
    print(path)
    name = os.path.basename(path).split(".")[0]
    with open(path) as f:
        model_evaluation = json.load(f)
        clean_evaluations = []
        for task in model_evaluation["evaluations"].keys():
            task_evaluations = model_evaluation["evaluations"][task]
            clean_evaluations = [
                {clean_key(k): v for k, v in ev.items()} 
                for ev in task_evaluations
            ]

            model_evaluation["evaluations"][task] = clean_evaluations
        models[name] = model_evaluation
        
models.keys()

../evaluations/es/roberta.json
../evaluations/es/birnn_twitter.json
../evaluations/es/mbert_uncased.json
../evaluations/es/rnn_twitter.json
../evaluations/es/rnn_cc.json
../evaluations/es/beto_cased.json
../evaluations/es/robertuito.json
../evaluations/es/ffn_cc.json
../evaluations/es/bertin.json
../evaluations/es/ffn_twitter.json
../evaluations/es/birnn_cc.json
../evaluations/es/electricidad.json
../evaluations/es/beto_uncased.json


dict_keys(['roberta', 'birnn_twitter', 'mbert_uncased', 'rnn_twitter', 'rnn_cc', 'beto_cased', 'robertuito', 'ffn_cc', 'bertin', 'ffn_twitter', 'birnn_cc', 'electricidad', 'beto_uncased'])

In [6]:
metrics = {
    "hate_speech": "macro_f1",
    "sentiment": "macro_f1",
    "emotion": "macro_f1",
    "irony": "macro_f1",
    "ner": "micro_f1",
    "pos": "accuracy",
    "lince_sentiment": "macro_f1"
}

results = []

for model, model_evaluation in models.items():
    print("="*50)
    print(model)
    for task, task_evaluations in model_evaluation["evaluations"].items():
        print(task)
        print(len(task_evaluations))
        for evaluation in task_evaluations:
            metric = metrics[task]

            ## TODO
            if metric not in evaluation:
                metric = "hateful_f1"
            results.append({
                "model": model,
                "task": task,
                "metric": evaluation[metric],
            })

roberta
hate_speech
10
sentiment
10
emotion
10
irony
10
birnn_twitter
hate_speech
10
sentiment
10
emotion
10
irony
10
mbert_uncased
hate_speech
10
sentiment
10
emotion
10
irony
10
rnn_twitter
hate_speech
10
sentiment
10
emotion
10
irony
10
rnn_cc
hate_speech
10
sentiment
10
emotion
10
irony
10
beto_cased
hate_speech
10
sentiment
10
emotion
10
irony
10
robertuito
hate_speech
10
sentiment
10
emotion
10
irony
10
ffn_cc
hate_speech
10
sentiment
10
emotion
10
irony
10
bertin
hate_speech
10
sentiment
10
emotion
10
irony
9
ffn_twitter
hate_speech
10
sentiment
10
emotion
10
irony
10
birnn_cc
hate_speech
10
sentiment
10
emotion
10
irony
10
electricidad
hate_speech
10
sentiment
10
emotion
10
irony
10
ner
10
pos
10
lince_sentiment
10
beto_uncased
hate_speech
10
sentiment
10
emotion
10
irony
10


In [7]:
import pandas as pd

pd.options.display.float_format = '{:.3f}'.format

mean_df = pd.DataFrame(results).groupby(["model", "task"]).mean().stack()
std_df = pd.DataFrame(results).groupby(["model", "task"]).std().stack()
# Magia negra
mean_df.index = mean_df.index.droplevel(-1)
std_df.index = std_df.index.droplevel(-1)

mean_df = mean_df.unstack(1)
std_df = std_df.unstack(1)

mean_df["score"] = mean_df.mean(1)

mean_df.sort_values("score", ascending=False)


task,emotion,hate_speech,irony,lince_sentiment,ner,pos,sentiment,score
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
robertuito,0.56,0.759,0.739,,,,0.705,0.691
roberta,0.527,0.741,0.721,,,,0.67,0.665
bertin,0.524,0.738,0.713,,,,0.666,0.66
beto_uncased,0.532,0.727,0.701,,,,0.651,0.653
beto_cased,0.516,0.724,0.705,,,,0.662,0.652
electricidad,0.455,0.722,0.68,0.508,0.569,0.965,0.655,0.651
mbert_uncased,0.493,0.718,0.681,,,,0.617,0.627
birnn_twitter,0.264,0.592,0.631,,,,0.585,0.518
rnn_twitter,0.269,0.538,0.628,,,,0.602,0.509
birnn_cc,0.231,0.534,0.625,,,,0.553,0.486


In [8]:

result_df = pd.DataFrame()

for col in mean_df:
    if col == "score":
        continue
    result_df[col] = mean_df[col].apply(lambda x: f"{x:.3f}") + " ± " + std_df[col].apply(lambda x: f"{x:.3f}")

result_df["score"] = mean_df["score"]

result_df = result_df.sort_values("score", ascending=False)

result_df

Unnamed: 0_level_0,emotion,hate_speech,irony,lince_sentiment,ner,pos,sentiment,score
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
robertuito,0.560 ± 0.010,0.759 ± 0.007,0.739 ± 0.005,nan ± nan,nan ± nan,nan ± nan,0.705 ± 0.003,0.691
roberta,0.527 ± 0.015,0.741 ± 0.012,0.721 ± 0.008,nan ± nan,nan ± nan,nan ± nan,0.670 ± 0.006,0.665
bertin,0.524 ± 0.007,0.738 ± 0.007,0.713 ± 0.012,nan ± nan,nan ± nan,nan ± nan,0.666 ± 0.005,0.66
beto_uncased,0.532 ± 0.012,0.727 ± 0.016,0.701 ± 0.007,nan ± nan,nan ± nan,nan ± nan,0.651 ± 0.006,0.653
beto_cased,0.516 ± 0.012,0.724 ± 0.012,0.705 ± 0.009,nan ± nan,nan ± nan,nan ± nan,0.662 ± 0.005,0.652
electricidad,0.455 ± 0.030,0.722 ± 0.009,0.680 ± 0.014,0.508 ± 0.009,0.569 ± 0.006,0.965 ± 0.000,0.655 ± 0.004,0.651
mbert_uncased,0.493 ± 0.010,0.718 ± 0.011,0.681 ± 0.010,nan ± nan,nan ± nan,nan ± nan,0.617 ± 0.003,0.627
birnn_twitter,0.264 ± 0.007,0.592 ± 0.018,0.631 ± 0.011,nan ± nan,nan ± nan,nan ± nan,0.585 ± 0.011,0.518
rnn_twitter,0.269 ± 0.003,0.538 ± 0.014,0.628 ± 0.014,nan ± nan,nan ± nan,nan ± nan,0.602 ± 0.004,0.509
birnn_cc,0.231 ± 0.006,0.534 ± 0.022,0.625 ± 0.009,nan ± nan,nan ± nan,nan ± nan,0.553 ± 0.008,0.486


In [8]:


print(result_df.to_markdown())

| model         | emotion       | hate_speech   | irony         | sentiment     |    score |
|:--------------|:--------------|:--------------|:--------------|:--------------|---------:|
| robertuito    | 0.560 ± 0.010 | 0.759 ± 0.007 | 0.739 ± 0.005 | 0.705 ± 0.003 | 0.690734 |
| roberta       | 0.527 ± 0.015 | 0.741 ± 0.012 | 0.721 ± 0.008 | 0.670 ± 0.006 | 0.664632 |
| bertin        | 0.524 ± 0.007 | 0.738 ± 0.007 | 0.713 ± 0.012 | 0.666 ± 0.005 | 0.660141 |
| beto_uncased  | 0.532 ± 0.012 | 0.727 ± 0.016 | 0.701 ± 0.007 | 0.651 ± 0.006 | 0.652608 |
| beto_cased    | 0.516 ± 0.012 | 0.724 ± 0.012 | 0.705 ± 0.009 | 0.662 ± 0.005 | 0.651739 |
| mbert_uncased | 0.493 ± 0.010 | 0.718 ± 0.011 | 0.681 ± 0.010 | 0.617 ± 0.003 | 0.627368 |
| birnn_twitter | 0.264 ± 0.007 | 0.592 ± 0.018 | 0.631 ± 0.011 | 0.585 ± 0.011 | 0.518025 |
| rnn_twitter   | 0.269 ± 0.003 | 0.538 ± 0.014 | 0.628 ± 0.014 | 0.602 ± 0.004 | 0.509427 |
| birnn_cc      | 0.231 ± 0.006 | 0.534 ± 0.022 | 0.625 ± 0.009 | 0.55

In [266]:
import scipy.stats

df = pd.DataFrame(results)
pairs = [
    ('birnn_twitter', 'birnn_cc'),
    ('rnn_twitter', 'rnn_cc'),
    ('ffn_twitter', 'ffn_cc'),
]

tasks = ["irony", "emotion", "sentiment"]

pvals = []

for twitter_model, cc_model in pairs:
    for task in tasks:
        tw_scores = df.loc[(df["model"] == twitter_model) & (df["task"] == task), "metric"]
        cc_scores = df.loc[(df["model"] == cc_model) & (df["task"] == task), "metric"]

        pvals.append({
            "model": twitter_model,
            "task": task,
            "pval": scipy.stats.mannwhitneyu(tw_scores, cc_scores, alternative="greater").pvalue,
        }) 

In [270]:
from statsmodels.stats.multitest import multipletests

df_pvals = pd.DataFrame(pvals)
df_pvals

Unnamed: 0,model,task,pval
0,birnn_twitter,irony,0.038
1,birnn_twitter,emotion,0.0
2,birnn_twitter,sentiment,0.0
3,rnn_twitter,irony,0.0
4,rnn_twitter,emotion,0.0
5,rnn_twitter,sentiment,0.0
6,ffn_twitter,irony,0.0
7,ffn_twitter,emotion,0.0
8,ffn_twitter,sentiment,0.001


In [271]:
multipletests(df_pvals.pval, alpha=0.05, method="fdr_bh")[1]


array([0.03783079, 0.00011743, 0.00011743, 0.00011743, 0.00011743,
       0.00011743, 0.00011743, 0.00011743, 0.00073966])