# Results

In this notebook we explore the results of the experiments, which are stored in Weights & Biases (wandb).

In [1]:
# Open a client for the Weights & Biases public API
import wandb

api = wandb.Api()

# Handle to the project that holds the experiment runs
project = api.project("pysentimiento")


# Fetch the runs whose config has lang == "en" (all tasks) and keep only those
# whose `sweep` attribute is falsy (presumably runs not launched by a sweep)
runs = list(
    filter(
        lambda candidate: not candidate.sweep,
        api.runs("pysentimiento", {"config.lang": "en"}),
    )
)



In [2]:
# Short display labels for the model identifiers stored in the run configs
renames = {
    "vinai/bertweet-base": "bertweet",
    "roberta-base": "roberta",
    "bert-base-uncased": "bert",
    "google/electra-base-discriminator": "electra",
    "pysentimiento/robertuito-base-uncased": "robertuito",
}

# `data` collects one record per run that reports a test macro F1;
# runs lacking that summary entry are set aside in `odd_runs`
data = []
odd_runs = []

for run in runs:
    # Model and task come from the run configuration
    model_name, task = run.config["model"], run.config["task"]

    try:
        macro_f1 = run.summary["test_macro_f1"]
    except KeyError:
        # No test macro F1 in the summary: report the run name and skip it
        print(run.name)
        odd_runs.append(run)
    else:
        data.append(dict(model=model_name, task=task, macro_f1=macro_f1))

electric-sweep-12
glowing-sweep-11
wobbly-sweep-10
clear-sweep-9
woven-sweep-8
vocal-sweep-7
rural-sweep-6
logical-sweep-5
autumn-sweep-4
cerulean-sweep-3
earnest-sweep-2
ethereal-sweep-1


In [3]:
import pandas as pd

df = pd.DataFrame(data)

# Replace model identifiers with their short labels. Series.map yields NaN for
# any identifier missing from `renames`, and groupby would then silently drop
# those rows, so fall back to the original identifier when no label exists.
df["model"] = df["model"].map(renames).fillna(df["model"])

# Aggregate the runs of every (task, model) pair
grouped = df.groupby(["task", "model"])

# Format each cell as "mean +- std" of macro F1, in percent with one decimal
df_results = (
    (grouped.mean() * 100).round(1).astype(str)
    + " +- "
    + (grouped.std() * 100).round(1).astype(str)
)

# Pivot tasks into columns and drop the now-redundant metric-name level
df_results = df_results.unstack("task")
df_results.columns = df_results.columns.droplevel(0)

df_results

task,emotion,hate_speech,irony,sentiment
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
bert,48.0 +- 0.5,56.0 +- 0.8,68.1 +- 2.2,69.6 +- 0.4
bertweet,44.3 +- 1.1,57.7 +- 0.7,80.8 +- 0.7,72.0 +- 0.4
electra,46.4 +- 0.7,55.6 +- 0.6,71.3 +- 1.8,70.9 +- 0.4
roberta,47.9 +- 1.2,55.1 +- 0.4,70.4 +- 2.9,70.4 +- 0.3
robertuito,44.2 +- 6.1,57.5 +- 0.2,73.9 +- 1.4,69.6 +- 0.5


In [4]:
# Group the per-run records once by (model, task); model identifiers are the
# raw values from `data`, not the short labels
runs_by_model_task = pd.DataFrame(data).groupby(["model", "task"])
mean_df = runs_by_model_task.mean()
std_df = runs_by_model_task.std()

# Place mean and std side by side, label the columns, and scale to percent
df = (
    pd.concat([mean_df, std_df], axis=1)
    .set_axis(["mean macro f1", "std macro f1"], axis=1)
    .mul(100)
)

df

Unnamed: 0_level_0,Unnamed: 1_level_0,mean macro f1,std macro f1
model,task,Unnamed: 2_level_1,Unnamed: 3_level_1
bert-base-uncased,emotion,48.026253,0.518994
bert-base-uncased,hate_speech,55.966784,0.776115
bert-base-uncased,irony,68.110517,2.194008
bert-base-uncased,sentiment,69.566153,0.389755
google/electra-base-discriminator,emotion,46.373433,0.652218
google/electra-base-discriminator,hate_speech,55.566145,0.590012
google/electra-base-discriminator,irony,71.265484,1.829708
google/electra-base-discriminator,sentiment,70.948625,0.442205
pysentimiento/robertuito-base-uncased,emotion,44.211502,6.068735
pysentimiento/robertuito-base-uncased,hate_speech,57.524815,0.240745


In [8]:
# Print the summary table as Markdown, with the task columns in this fixed order
task_order = ["sentiment", "emotion", "hate_speech", "irony"]
print(df_results.loc[:, task_order].to_markdown())

| model      | sentiment   | emotion     | hate_speech   | irony       |
|:-----------|:------------|:------------|:--------------|:------------|
| bert       | 69.6 +- 0.4 | 48.0 +- 0.5 | 56.0 +- 0.8   | 68.1 +- 2.2 |
| bertweet   | 72.0 +- 0.4 | 44.3 +- 1.1 | 57.7 +- 0.7   | 80.8 +- 0.7 |
| electra    | 70.9 +- 0.4 | 46.4 +- 0.7 | 55.6 +- 0.6   | 71.3 +- 1.8 |
| roberta    | 70.4 +- 0.3 | 47.9 +- 1.2 | 55.1 +- 0.4   | 70.4 +- 2.9 |
| robertuito | 69.6 +- 0.5 | 44.2 +- 6.1 | 57.5 +- 0.2   | 73.9 +- 1.4 |


In [5]:
# Save the mean/std table; the (model, task) index is flattened into ordinary
# columns first, so no separate index column is written
df.reset_index().to_csv(
    path_or_buf="../data/results_en.csv",
    index=False,
)

In [7]:
# Reload the saved table and restore the (model, task) index
df = pd.read_csv("../data/results_en.csv")
df = df.set_index(["model", "task"])

# Spot-check one value: mean macro F1 of robertuito on the emotion task
df.loc[("pysentimiento/robertuito-base-uncased", "emotion"), "mean macro f1"]

44.21150196682323