# Results

In this notebook we explore the results of the experiments, which are stored in Weights & Biases (wandb).

In [1]:
# Get wandb api object
import wandb

api = wandb.Api()

# Project object for the "pysentimiento" wandb project
project = api.project("pysentimiento")


def _standalone_runs(lang):
    """Return the "pysentimiento" runs whose config.lang equals `lang`,
    leaving out every run that is attached to a sweep."""
    candidates = api.runs("pysentimiento", {"config.lang": lang})
    return [run for run in candidates if not run.sweep]


# Spanish and English runs (all tasks), sweep runs excluded
es_runs = _standalone_runs("es")
en_runs = _standalone_runs("en")


## Spanish results

In [2]:
# One record (model, task, macro F1) per Spanish run that reported a test score.
data = []

# Map full model identifiers to the short names used as table rows.
renames = {
    "PlanTL-GOB-ES/roberta-base-bne": "roberta",
    "bertin-project/bertin-roberta-base-spanish": "bertin",
    "dccuchile/bert-base-spanish-wwm-cased": "beto",
    "mrm8488/electricidad-base-discriminator": "electricidad",
    "pysentimiento/robertuito-base-uncased": "robertuito",
}

# Spanish runs whose summary has no "test_macro_f1" entry. Kept under its own
# name so it never interferes with the `odd_runs` list of the English section.
es_odd_runs = []

for run in es_runs:
    # Get model name
    model_name = run.config["model"]
    # Get task
    task = run.config["task"]

    # Get Macro f1 performance. A run without a test score is reported and
    # skipped (same handling as the English section) instead of aborting the
    # whole cell with a KeyError.
    try:
        macro_f1 = run.summary["test_macro_f1"]
    except KeyError:
        print(run.name)
        es_odd_runs.append(run)
        continue

    data.append({
        "model": renames[model_name],
        "task": task,
        "macro_f1": macro_f1,
    })

In [3]:
import pandas as pd

# One row per run: model, task, macro_f1
df = pd.DataFrame(data)

# One group per (task, model) pair
grouped = df.groupby(["task", "model"])

# Mean and standard deviation of macro F1 per group, scaled by 100,
# rounded to one decimal and converted to text
mean_pct = grouped.mean().mul(100).round(1).astype(str)
std_pct = grouped.std().mul(100).round(1).astype(str)

# Join both as "<mean> +- <std>", move the tasks into columns and drop the
# outer column level so that only the task names remain as headers
df_results = (mean_pct + " +- " + std_pct).unstack("task").droplevel(0, axis=1)

df_results

task,emotion,hate_speech,irony,sentiment
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
bertin,50.2 +- 2.9,68.7 +- 1.5,69.3 +- 1.4,65.3 +- 0.5
beto,52.2 +- 1.4,73.3 +- 0.8,71.5 +- 0.5,67.2 +- 0.6
electricidad,46.3 +- 2.3,71.8 +- 1.0,67.1 +- 2.1,65.3 +- 0.5
roberta,53.1 +- 2.2,73.1 +- 2.8,71.9 +- 0.9,67.3 +- 0.3
robertuito,55.3 +- 0.8,76.1 +- 0.5,74.1 +- 0.7,70.2 +- 0.2


In [5]:
# Print the results table as Markdown text
markdown_table = df_results.to_markdown()
print(markdown_table)

| model        | emotion     | hate_speech   | irony       | sentiment   |
|:-------------|:------------|:--------------|:------------|:------------|
| bertin       | 50.2 +- 2.9 | 68.7 +- 1.5   | 69.3 +- 1.4 | 65.3 +- 0.5 |
| beto         | 52.2 +- 1.4 | 73.3 +- 0.8   | 71.5 +- 0.5 | 67.2 +- 0.6 |
| electricidad | 46.3 +- 2.3 | 71.8 +- 1.0   | 67.1 +- 2.1 | 65.3 +- 0.5 |
| roberta      | 53.1 +- 2.2 | 73.1 +- 2.8   | 71.9 +- 0.9 | 67.3 +- 0.3 |
| robertuito   | 55.3 +- 0.8 | 76.1 +- 0.5   | 74.1 +- 0.7 | 70.2 +- 0.2 |


## English results

In [6]:
# One record (model, task, macro F1) per English run that reported a test score.
data = []

# Map full model identifiers to the short names used as table rows.
renames = {
    "vinai/bertweet-base": "bertweet",
    "roberta-base": "roberta",
    "bert-base-uncased": "bert",
    "google/electra-base-discriminator": "electra",
    "pysentimiento/robertuito-base-uncased": "robertuito",
}

# Runs whose summary has no "test_macro_f1" entry.
odd_runs = []

for run in en_runs:
    # Model identifier and task come from the run configuration
    model_name, task = run.config["model"], run.config["task"]

    try:
        macro_f1 = run.summary["test_macro_f1"]
    except KeyError:
        # No test score: show the run name, remember the run, and move on
        print(run.name)
        odd_runs.append(run)
    else:
        data.append(dict(model=renames[model_name], task=task, macro_f1=macro_f1))

electric-sweep-12
glowing-sweep-11
wobbly-sweep-10
clear-sweep-9
woven-sweep-8
vocal-sweep-7
rural-sweep-6
logical-sweep-5
autumn-sweep-4
cerulean-sweep-3
earnest-sweep-2
ethereal-sweep-1


In [7]:
# One row per run: model, task, macro_f1
df = pd.DataFrame(data)

# One group per (task, model) pair
grouped = df.groupby(["task", "model"])

# Mean and standard deviation of macro F1 per group, scaled by 100,
# rounded to one decimal and converted to text
mean_pct = grouped.mean().mul(100).round(1).astype(str)
std_pct = grouped.std().mul(100).round(1).astype(str)

# Join both as "<mean> +- <std>", move the tasks into columns and drop the
# outer column level so that only the task names remain as headers
df_results = (mean_pct + " +- " + std_pct).unstack("task").droplevel(0, axis=1)

df_results

task,emotion,hate_speech,irony,sentiment
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
bert,42.7 +- 0.6,56.0 +- 0.8,68.1 +- 2.2,69.6 +- 0.4
bertweet,43.1 +- 1.8,57.7 +- 0.7,80.8 +- 0.7,72.0 +- 0.4
electra,37.2 +- 2.9,55.6 +- 0.6,71.3 +- 1.8,70.9 +- 0.4
roberta,45.0 +- 0.9,55.1 +- 0.4,70.4 +- 2.9,70.4 +- 0.3
robertuito,43.0 +- 3.3,57.5 +- 0.2,73.9 +- 1.4,69.6 +- 0.5


In [8]:
# Print the results table as Markdown text
markdown_table = df_results.to_markdown()
print(markdown_table)

| model      | emotion     | hate_speech   | irony       | sentiment   |
|:-----------|:------------|:--------------|:------------|:------------|
| bert       | 42.7 +- 0.6 | 56.0 +- 0.8   | 68.1 +- 2.2 | 69.6 +- 0.4 |
| bertweet   | 43.1 +- 1.8 | 57.7 +- 0.7   | 80.8 +- 0.7 | 72.0 +- 0.4 |
| electra    | 37.2 +- 2.9 | 55.6 +- 0.6   | 71.3 +- 1.8 | 70.9 +- 0.4 |
| roberta    | 45.0 +- 0.9 | 55.1 +- 0.4   | 70.4 +- 2.9 | 70.4 +- 0.3 |
| robertuito | 43.0 +- 3.3 | 57.5 +- 0.2   | 73.9 +- 1.4 | 69.6 +- 0.5 |
