# Results

In this notebook we explore the results of the experiments, which are stored in Weights & Biases (wandb).

In [1]:
# Get wandb api object
import wandb

api = wandb.Api()

# Handle to the "pysentimiento" project (not referenced by the cells below)
project = api.project("pysentimiento")


# Runs whose config has lang == "es" (every task), leaving out runs that belong to a sweep
es_runs = []
for candidate in api.runs("pysentimiento", {"config.lang": "es"}):
    if candidate.sweep:
        continue
    es_runs.append(candidate)



## Spanish results

In [3]:
# Short display names for the full model identifiers stored in the run configs
renames = {
    "PlanTL-GOB-ES/roberta-base-bne": "roberta",
    "bertin-project/bertin-roberta-base-spanish": "bertin",
    "dccuchile/bert-base-spanish-wwm-cased": "beto",
    "mrm8488/electricidad-base-discriminator": "electricidad",
    "pysentimiento/robertuito-base-uncased": "robertuito",
}

# One record per run: model identifier and task from the run config,
# plus the test macro F1 read from the run summary
data = [
    {
        "model": run.config["model"],
        "task": run.config["task"],
        "macro_f1": run.summary["test_macro_f1"],
    }
    for run in es_runs
]

In [5]:
import pandas as pd

df = pd.DataFrame(data)

# Swap the long model identifiers for their short display names.
# `Series.map` with a dict turns every identifier missing from `renames` into NaN,
# and `groupby` then drops those rows silently; falling back to the original
# identifier keeps such models visible in the results table.
df["model"] = df["model"].map(lambda name: renames.get(name, name))

# Group by task and model
grouped = df.groupby(["task", "model"])

# Mean and standard deviation of macro F1 per (task, model), scaled to percentages,
# rounded to one decimal and joined into a single "mean +- std" string
mean_pct = (grouped.mean() * 100).round(1).astype(str)
std_pct = (grouped.std() * 100).round(1).astype(str)
df_results = mean_pct + " +- " + std_pct

# Pivot tasks into columns and drop the leftover "macro_f1" column level
df_results = df_results.unstack("task")
df_results.columns = df_results.columns.droplevel(0)

# Display the tasks in a fixed order
df_results[["sentiment", "emotion", "hate_speech", "irony"]]

task,sentiment,emotion,hate_speech,irony
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
bertin,65.3 +- 0.5,50.2 +- 2.9,68.7 +- 1.5,69.3 +- 1.4
beto,67.2 +- 0.6,52.2 +- 1.4,73.3 +- 0.8,71.5 +- 0.5
electricidad,65.3 +- 0.5,46.3 +- 2.3,71.8 +- 1.0,67.1 +- 2.1
roberta,67.3 +- 0.3,53.1 +- 2.2,73.1 +- 2.8,71.9 +- 0.9
robertuito,70.2 +- 0.2,55.3 +- 0.8,76.1 +- 0.5,74.1 +- 0.7


In [6]:
# Render the aggregated results table as a Markdown table and print it
markdown_table = df_results.to_markdown()
print(markdown_table)

| model        | emotion     | hate_speech   | irony       | sentiment   |
|:-------------|:------------|:--------------|:------------|:------------|
| bertin       | 50.2 +- 2.9 | 68.7 +- 1.5   | 69.3 +- 1.4 | 65.3 +- 0.5 |
| beto         | 52.2 +- 1.4 | 73.3 +- 0.8   | 71.5 +- 0.5 | 67.2 +- 0.6 |
| electricidad | 46.3 +- 2.3 | 71.8 +- 1.0   | 67.1 +- 2.1 | 65.3 +- 0.5 |
| roberta      | 53.1 +- 2.2 | 73.1 +- 2.8   | 71.9 +- 0.9 | 67.3 +- 0.3 |
| robertuito   | 55.3 +- 0.8 | 76.1 +- 0.5   | 74.1 +- 0.7 | 70.2 +- 0.2 |


In [7]:
# Group the raw run records (full model identifiers) by model and task
runs_by_model_task = pd.DataFrame(data).groupby(["model", "task"])
mean_df = runs_by_model_task.mean()
std_df = runs_by_model_task.std()

# Put mean and std side by side and express both as percentages
df = pd.concat([mean_df, std_df], axis=1) * 100
df.columns = ["mean macro f1", "std macro f1"]

df

Unnamed: 0_level_0,Unnamed: 1_level_0,mean macro f1,std macro f1
model,task,Unnamed: 2_level_1,Unnamed: 3_level_1
PlanTL-GOB-ES/roberta-base-bne,emotion,53.108638,2.241176
PlanTL-GOB-ES/roberta-base-bne,hate_speech,73.076665,2.752051
PlanTL-GOB-ES/roberta-base-bne,irony,71.900387,0.853217
PlanTL-GOB-ES/roberta-base-bne,sentiment,67.327242,0.259381
bertin-project/bertin-roberta-base-spanish,emotion,50.179562,2.937298
bertin-project/bertin-roberta-base-spanish,hate_speech,68.71949,1.454827
bertin-project/bertin-roberta-base-spanish,irony,69.312528,1.393909
bertin-project/bertin-roberta-base-spanish,sentiment,65.26136,0.533775
dccuchile/bert-base-spanish-wwm-cased,emotion,52.157551,1.362333
dccuchile/bert-base-spanish-wwm-cased,hate_speech,73.295649,0.848453


In [8]:
# Turn the (model, task) index back into ordinary columns, then write the table to CSV
results_flat = df.reset_index()
results_flat.to_csv("../data/results_es.csv", index=False)

In [9]:
# Reload the saved table and index it by (model, task)
results_csv = pd.read_csv("../data/results_es.csv")
df = results_csv.set_index(["model", "task"])

# Look up the stored mean macro f1 for one (model, task) pair
lookup_key = ("pysentimiento/robertuito-base-uncased", "emotion")
df.loc[lookup_key, "mean macro f1"]

55.2943515777588