# Results

In this notebook we explore the results of the experiments, which are stored in Weights & Biases (wandb).

In [1]:
# Connect to the wandb public API
import wandb

# Project to read from and the language whose runs are analysed in this notebook
WANDB_PROJECT = "pysentimiento"
LANG = "pt"

api = wandb.Api()

# Handle to the project itself (not referenced by the cells visible in this notebook)
project = api.project(WANDB_PROJECT)


# Fetch the runs whose config has lang == LANG (no task filter is applied, so all
# tasks are included) and keep only those whose `sweep` attribute is falsy
runs = [r for r in api.runs(WANDB_PROJECT, {"config.lang": LANG}) if not r.sweep]

In [2]:
# Short display names keyed by the model identifier stored in each run's config
renames = {
    "pysentimiento/robertuito-base-uncased": "RoBERTuito",
    "pablocosta/bertabaporu-base-uncased": "BERTabaporu",
    "neuralmind/bert-base-portuguese-cased": "BERT-pt",
    "melll-uff/bertweetbr": "BERTweet-BR",
}

# One record per run: model identifier, task and test macro F1
data = []

for run in runs:
    run_config = run.config
    model_name, task = run_config["model"], run_config["task"]

    try:
        macro_f1 = run.summary["test_macro_f1"]
    except KeyError:
        # No "test_macro_f1" in the summary: report the run and leave it out
        print(f"Run {run.name} has no macro f1")
        print(run.summary)
    else:
        data.append(dict(model=model_name, task=task, macro_f1=macro_f1))

Run bumbling-hill-722 has no macro f1
{'_wandb': {'runtime': 0}}
Run driven-sweep-7 has no macro f1
{'eval/macro_f1': 0.4783643782138825, 'train/global_step': 1200, 'eval/neu_recall': 0.18312101910828024, 'eval/pos_recall': 0.8392857142857143, 'eval/macro_precision': 0.5230864882469177, 'eval/steps_per_second': 7.078, 'train/learning_rate': 0, 'eval/loss': 0.9725472927093506, 'train/loss': 0.8413, 'eval/pos_f1': 0.6701688555347092, 'eval/neg_precision': 0.6149312377210217, 'eval/neu_precision': 0.39655172413793105, 'train/train_loss': 0.9332966486612956, 'train/train_runtime': 663.1867, 'eval/samples_per_second': 226.495, '_step': 8, '_wandb': {'runtime': 685}, '_runtime': 685.9872334003448, 'eval/neu_f1': 0.25054466230936817, 'eval/pos_precision': 0.5577763897564022, '_timestamp': 1677192014.9587574, 'train/epoch': 4, 'eval/runtime': 10.5963, 'eval/neg_recall': 0.442090395480226, 'eval/acc': 0.5504166666666667, 'eval/neg_f1': 0.514379622021364, 'eval/micro_f1': 0.5504166666666667, 'tr

In [3]:
import pandas as pd

df = pd.DataFrame(data)

# Swap model identifiers for their short display names. Identifiers missing from
# `renames` are kept unchanged so that a newly added model does not raise KeyError.
df["model"] = df["model"].map(lambda name: renames.get(name, name))

# Group by task and model
grouped = df.groupby(["task", "model"])

# Aggregate the macro F1 column explicitly, so that any extra column added to the
# run records later cannot leak into (or break) the mean / std computation.
macro_f1_by_group = grouped["macro_f1"]

# Mean and standard deviation, multiplied by 100 and rounded to one decimal,
# rendered as strings so they can be joined as "mean +- std"
mean_str = (macro_f1_by_group.mean() * 100).round(1).astype(str)
std_str = (macro_f1_by_group.std() * 100).round(1).astype(str)

# One row per model, one column per task
df_results = (mean_str + " +- " + std_str).unstack("task")

df_results

task,emotion,hate_speech,sentiment
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
BERT-pt,44.4 +- 0.6,64.1 +- 1.1,70.0 +- 0.3
BERTabaporu,43.9 +- 0.5,70.3 +- 3.3,73.8 +- 0.4
BERTweet-BR,43.2 +- 1.1,55.6 +- 5.5,75.3 +- 0.5
RoBERTuito,45.2 +- 0.6,70.0 +- 2.4,71.7 +- 0.4


In [7]:
# Emit the summary table as Markdown text, with the task columns in a fixed order
task_order = ["emotion", "sentiment", "hate_speech"]
print(df_results[task_order].to_markdown())

| model       | emotion     | sentiment   | hate_speech   |
|:------------|:------------|:------------|:--------------|
| BERT-pt     | 44.4 +- 0.6 | 70.0 +- 0.3 | 64.1 +- 1.1   |
| BERTabaporu | 43.9 +- 0.5 | 73.8 +- 0.4 | 70.3 +- 3.3   |
| BERTweet-BR | 43.2 +- 1.1 | 75.3 +- 0.5 | 55.6 +- 5.5   |
| RoBERTuito  | 45.2 +- 0.6 | 71.7 +- 0.4 | 70.0 +- 2.4   |


In [8]:
# Aggregate the raw run records (original model identifiers) per (model, task) pair
by_model_task = pd.DataFrame(data).groupby(["model", "task"])

mean_df = by_model_task.mean()
std_df = by_model_task.std()

# Put mean and std side by side, label the two columns, and multiply by 100
df = pd.concat([mean_df, std_df], axis=1)
df.columns = ["mean macro f1", "std macro f1"]
df = df * 100

df

Unnamed: 0_level_0,Unnamed: 1_level_0,mean macro f1,std macro f1
model,task,Unnamed: 2_level_1,Unnamed: 3_level_1
melll-uff/bertweetbr,emotion,43.225703,1.103411
melll-uff/bertweetbr,hate_speech,55.641053,5.467568
melll-uff/bertweetbr,sentiment,75.29541,0.486638
neuralmind/bert-base-portuguese-cased,emotion,44.37163,0.57071
neuralmind/bert-base-portuguese-cased,hate_speech,64.147,1.099126
neuralmind/bert-base-portuguese-cased,sentiment,69.973505,0.337301
pablocosta/bertabaporu-base-uncased,emotion,43.881166,0.515098
pablocosta/bertabaporu-base-uncased,hate_speech,70.3105,3.260516
pablocosta/bertabaporu-base-uncased,sentiment,73.777885,0.404985
pysentimiento/robertuito-base-uncased,emotion,45.168451,0.633867


In [9]:
# Turn the (model, task) index back into ordinary columns and write the table to disk
flat_results = df.reset_index()
flat_results.to_csv("../data/results_pt.csv", index=False)

In [10]:
# Reload the saved table and rebuild the (model, task) index
df = pd.read_csv("../data/results_pt.csv")
df = df.set_index(["model", "task"])

In [11]:
# Look up a single cell: the mean macro F1 for one (model, task) pair
lookup_key = ("pysentimiento/robertuito-base-uncased", "hate_speech")
df.loc[lookup_key, "mean macro f1"]

69.95333909988403