# Results

In this notebook we explore the results of the experiments, which are stored in Weights & Biases (wandb).

In [1]:
# Get wandb api object
import wandb

api = wandb.Api()

# Handle to the wandb project holding the experiments
project = api.project("pysentimiento")

# Keep the runs whose configured task is "targeted_sa",
# leaving out every run that belongs to a sweep
task_filter = {"config.task": "targeted_sa"}
runs = [
    candidate
    for candidate in api.runs("pysentimiento", task_filter)
    if not candidate.sweep
]



In [2]:
# Number of runs kept after the task filter and the sweep exclusion
len(runs)

275

## Spanish results

In [3]:
# Short display names for the model identifiers stored in the run configs
renames = {
    "PlanTL-GOB-ES/roberta-base-bne": "roberta",
    "bertin-project/bertin-roberta-base-spanish": "bertin",
    "dccuchile/bert-base-spanish-wwm-cased": "beto",
    "mrm8488/electricidad-base-discriminator": "electricidad",
    "pysentimiento/robertuito-base-uncased": "robertuito",
    "pysentimiento/robertuito-sentiment-analysis": "robertuito-sentiment",
    "pysentimiento/roberta-es-sentiment": "roberta-sentiment",
}

# One record per run that reports every test metric
data = []

for run in runs:
    run_config = run.config
    model_name, task = run_config["model"], run_config["task"]

    try:
        # Read the four test F1 scores; the first missing key aborts this run
        scores = {
            metric: run.summary[f"test_{metric}"]
            for metric in ("pos_f1", "neg_f1", "neu_f1", "macro_f1")
        }
    except KeyError as missing_key:
        print(f"Skipping {model_name} {task} because it does not have all metrics")
        print(missing_key)
        continue

    record = {"model": model_name}
    # Column order of the resulting table follows this insertion order
    for metric in ("macro_f1", "pos_f1", "neg_f1", "neu_f1"):
        record[metric] = scores[metric]
    data.append(record)

Skipping pysentimiento/robertuito-sentiment-analysis targeted_sa because it does not have all metrics
'test_pos_f1'
Skipping pysentimiento/roberta-es-sentiment_untargeted targeted_sa because it does not have all metrics
'test_pos_f1'
Skipping PlanTL-GOB-ES/roberta-base-bne_untargeted targeted_sa because it does not have all metrics
'test_pos_f1'
Skipping dccuchile/bert-base-spanish-wwm-cased_untargeted targeted_sa because it does not have all metrics
'test_pos_f1'
Skipping pysentimiento/robertuito-base-uncased_untargeted targeted_sa because it does not have all metrics
'test_pos_f1'
Skipping dccuchile/bert-base-spanish-wwm-cased targeted_sa because it does not have all metrics
'test_pos_f1'
Skipping mrm8488/electricidad-base-discriminator targeted_sa because it does not have all metrics
'test_pos_f1'
Skipping dccuchile/bert-base-spanish-wwm-cased targeted_sa because it does not have all metrics
'test_pos_f1'
Skipping pysentimiento/robertuito-base-uncased targeted_sa because it does not

In [4]:
# Number of runs that reported all four test metrics
len(data)

261

In [5]:
import pandas as pd

# One row per collected run: the model identifier plus its four test F1 scores
df = pd.DataFrame(data)

# Map model identifiers to short names, then group by model

def rename_model(model_name, mapping=None):
    """Translate a raw model identifier into its short display name.

    Identifiers ending in ``"_untargeted"`` are looked up without that suffix
    and rendered as ``"<short name> (untargeted)"``.

    Args:
        model_name: Model identifier as stored in the run config.
        mapping: Optional dict from identifier to short name. Defaults to the
            module-level ``renames`` dict.

    Returns:
        The short display name.

    Raises:
        KeyError: If the (suffix-stripped) identifier is not in the mapping.
    """
    suffix = "_untargeted"
    if mapping is None:
        mapping = renames

    if model_name.endswith(suffix):
        # Strip only the trailing suffix. Splitting on the suffix would break
        # for identifiers that contain "_untargeted" more than once.
        base_name = model_name[: -len(suffix)]
        return f"{mapping[base_name]} (untargeted)"
    return mapping[model_name]

# Swap the raw model identifiers for their short display names
df["model"] = df["model"].map(rename_model)
grouped = df.groupby(["model"])

# Per-model mean and standard deviation, scaled by 100 and rounded to one decimal
mean_text = grouped.mean().mul(100).round(1).astype(str)
std_text = grouped.std().mul(100).round(1).astype(str)

# Every cell reads "<mean> +- <std>"
df_results = mean_text + " +- " + std_text

df_results[["pos_f1", "neg_f1", "neu_f1", "macro_f1"]]

Unnamed: 0_level_0,pos_f1,neg_f1,neu_f1,macro_f1
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
bertin,67.3 +- 5.6,63.6 +- 5.5,58.0 +- 5.5,63.0 +- 4.7
bertin (untargeted),68.6 +- 3.6,65.3 +- 4.5,63.2 +- 4.6,65.7 +- 3.3
beto,73.8 +- 2.1,72.9 +- 3.3,65.9 +- 4.5,70.9 +- 2.3
beto (untargeted),70.7 +- 3.7,66.9 +- 4.4,64.4 +- 3.8,67.3 +- 3.1
electricidad,67.3 +- 1.3,63.1 +- 6.0,55.2 +- 4.9,61.9 +- 2.7
electricidad (untargeted),68.0 +- 2.8,61.4 +- 5.5,59.3 +- 3.5,62.9 +- 3.0
roberta,75.3 +- 3.9,74.2 +- 2.9,68.5 +- 4.7,72.7 +- 2.6
roberta (untargeted),72.3 +- 3.5,68.6 +- 4.5,66.8 +- 4.0,69.3 +- 2.9
roberta-sentiment,75.9 +- 1.5,74.8 +- 3.3,67.0 +- 3.3,72.6 +- 2.0
roberta-sentiment (untargeted),70.4 +- 2.7,68.3 +- 2.7,68.3 +- 3.9,69.0 +- 2.2


In [6]:
# Print the aggregated "mean +- std" table as Markdown text
print(df_results.to_markdown())

| model                             | macro_f1    | pos_f1      | neg_f1      | neu_f1      |
|:----------------------------------|:------------|:------------|:------------|:------------|
| bertin                            | 63.0 +- 4.7 | 67.3 +- 5.6 | 63.6 +- 5.5 | 58.0 +- 5.5 |
| bertin (untargeted)               | 65.7 +- 3.3 | 68.6 +- 3.6 | 65.3 +- 4.5 | 63.2 +- 4.6 |
| beto                              | 70.9 +- 2.3 | 73.8 +- 2.1 | 72.9 +- 3.3 | 65.9 +- 4.5 |
| beto (untargeted)                 | 67.3 +- 3.1 | 70.7 +- 3.7 | 66.9 +- 4.4 | 64.4 +- 3.8 |
| electricidad                      | 61.9 +- 2.7 | 67.3 +- 1.3 | 63.1 +- 6.0 | 55.2 +- 4.9 |
| electricidad (untargeted)         | 62.9 +- 3.0 | 68.0 +- 2.8 | 61.4 +- 5.5 | 59.3 +- 3.5 |
| roberta                           | 72.7 +- 2.6 | 75.3 +- 3.9 | 74.2 +- 2.9 | 68.5 +- 4.7 |
| roberta (untargeted)              | 69.3 +- 2.9 | 72.3 +- 3.5 | 68.6 +- 4.5 | 66.8 +- 4.0 |
| roberta-sentiment                 | 72.6 +- 2.0 | 75.9 +- 

In [6]:
# Print the aggregated "mean +- std" table as LaTeX source
print(df_results.to_latex())

\begin{tabular}{lllll}
\toprule
{} &     macro\_f1 &       pos\_f1 &       neg\_f1 &       neu\_f1 \\
model                             &              &              &              &              \\
\midrule
bertin                            &  63.0 +- 4.7 &  67.3 +- 5.6 &  63.6 +- 5.5 &  58.0 +- 5.5 \\
bertin (untargeted)               &  65.7 +- 3.3 &  68.6 +- 3.6 &  65.3 +- 4.5 &  63.2 +- 4.6 \\
beto                              &  70.9 +- 2.3 &  73.8 +- 2.1 &  72.9 +- 3.3 &  65.9 +- 4.5 \\
beto (untargeted)                 &  67.3 +- 3.1 &  70.7 +- 3.7 &  66.9 +- 4.4 &  64.4 +- 3.8 \\
electricidad                      &  61.9 +- 2.7 &  67.3 +- 1.3 &  63.1 +- 6.0 &  55.2 +- 4.9 \\
electricidad (untargeted)         &  62.9 +- 3.0 &  68.0 +- 2.8 &  61.4 +- 5.5 &  59.3 +- 3.5 \\
roberta                           &  72.7 +- 2.6 &  75.3 +- 3.9 &  74.2 +- 2.9 &  68.5 +- 4.7 \\
roberta (untargeted)              &  69.3 +- 2.9 &  72.3 +- 3.5 &  68.6 +- 4.5 &  66.8 +- 4.0 \\
roberta-sentimen

  print(df_results.to_latex())


## Statistical tests

In [54]:
import scipy.stats


# Metrics on which the targeted and untargeted variants of a model are compared
columns = ["pos_f1", "neg_f1", "neu_f1", "macro_f1"]

# (model, metric, p-value) triples, one per test performed
pvals = []

for model_name in renames.values():
    print("=" * 80)
    print(model_name)

    untargeted_model_name = f"{model_name} (untargeted)"

    targeted_results = df[df["model"] == model_name]
    untargeted_results = df[df["model"] == untargeted_model_name]

    # A model listed in `renames` may have no rows for one of the two variants
    # (runs lacking metrics are dropped when `data` is built). The test cannot
    # give a meaningful p-value for an empty sample, so skip such models.
    if targeted_results.empty or untargeted_results.empty:
        print(f"Skipping {model_name}: missing targeted or untargeted runs")
        continue

    # One-sided Mann-Whitney U test per metric, with the targeted scores as x
    # and the untargeted scores as y
    for col in columns:
        test_result = scipy.stats.mannwhitneyu(
            x=targeted_results[col].values,
            y=untargeted_results[col].values,
            alternative="greater"
        )
        pvals.append((model_name, col, test_result.pvalue))

df_pvals = pd.DataFrame(pvals, columns=["model", "metric", "pvalue"])

# Benjamini-Hochberg correction over all tests at once
from statsmodels.stats import multitest

# Element [1] of the multipletests result holds the corrected p-values
df_pvals["pvalue"] = multitest.multipletests(list(df_pvals["pvalue"]), method="fdr_bh")[1]

# Make the metrics the columns
df_pvals = df_pvals.pivot(index="model", columns="metric", values="pvalue")

df_pvals


roberta
bertin
beto
electricidad
robertuito


metric,macro_f1,neg_f1,neu_f1,pos_f1
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
bertin,0.962951,0.932949,0.962951,0.930536
beto,0.000362,0.000107,0.051242,0.005011
electricidad,0.931564,0.398598,0.962951,0.930536
roberta,0.002362,0.000826,0.113422,0.011662
robertuito,0.003297,0.00033,0.743923,0.019677


array([1.16619809e-02, 8.26346515e-04, 1.13421654e-01, 2.36159383e-03,
       9.30536131e-01, 9.32949404e-01, 9.62951259e-01, 9.62951259e-01,
       5.01079868e-03, 1.06859427e-04, 5.12416000e-02, 3.62359600e-04,
       9.30536131e-01, 3.98598049e-01, 9.62951259e-01, 9.31564322e-01,
       1.96771363e-02, 3.29521701e-04, 7.43922982e-01, 3.29749677e-03])