In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display

import json
from glob import glob
from pathlib import Path
import os


In [2]:
list(Path("../").glob("./*"))

[WindowsPath('../.git'),
 WindowsPath('../.gitignore'),
 WindowsPath('../ALL'),
 WindowsPath('../data'),
 WindowsPath('../ENG'),
 WindowsPath('../HIN'),
 WindowsPath('../IBEN'),
 WindowsPath('../LICENSE'),
 WindowsPath('../notebooks'),
 WindowsPath('../README.md'),
 WindowsPath('../src')]

In [3]:
langs = ["./ENG","./HIN","./IBEN","./ALL"]
filepaths = sum([
    [Path(v) for v in Path("../",lang).glob("./**/output/**/*_results.json")]
    for lang in langs
], [])
filepaths

[WindowsPath('../ENG/Sub-task A/output/bert-base-cased/dev_results.json'),
 WindowsPath('../ENG/Sub-task A/output/bert-base-cased/train_results.json'),
 WindowsPath('../ENG/Sub-task A/output/bert-base-uncased/dev_results.json'),
 WindowsPath('../ENG/Sub-task A/output/bert-base-uncased/train_results.json'),
 WindowsPath('../ENG/Sub-task A/output/xlm-roberta-base/dev_results.json'),
 WindowsPath('../ENG/Sub-task A/output/xlm-roberta-base/train_results.json'),
 WindowsPath('../ENG/Sub-task B/output/bert-base-cased/dev_results.json'),
 WindowsPath('../ENG/Sub-task B/output/bert-base-cased/train_results.json'),
 WindowsPath('../ENG/Sub-task B/output/bert-base-uncased/dev_results.json'),
 WindowsPath('../ENG/Sub-task B/output/bert-base-uncased/train_results.json'),
 WindowsPath('../ENG/Sub-task B/output/xlm-roberta-base/dev_results.json'),
 WindowsPath('../ENG/Sub-task B/output/xlm-roberta-base/train_results.json'),
 WindowsPath('../ENG/Sub-task C/output/bert-base-cased/dev_results.json'),
 

In [4]:
def get_filestats(filepath):
    stats = []
    with open(filepath) as fp:
        for line in fp:
            stats.append(json.loads(line))
    return stats
stats = get_filestats(filepaths[0])
len(stats)

67

In [5]:
filepaths[0].parts[1:]

('ENG', 'Sub-task A', 'output', 'bert-base-cased', 'dev_results.json')

In [6]:
TASK_MAPPING={
    "Sub-task A": "A",
    "Sub-task B": "B",
    "Sub-task C": "C",
}

MODEL_DETAILS = {
    ("ENG", "A", "bert-base-cased"): ("1"),
    ("ENG", "B", "bert-base-cased"): ("1"),
    ("ENG", "C", "bert-base-cased"): ("3"),

    ("ENG", "A", "bert-base-uncased"): ("2"),
    ("ENG", "B", "bert-base-uncased"): ("2"),
    ("ENG", "C", "bert-base-uncased"): ("4"),

    ("ENG", "A", "xlm-roberta-base"): ("5"),
    ("ENG", "B", "xlm-roberta-base"): ("5"),
    ("ENG", "C", "xlm-roberta-base"): ("6"),

    ("ENG", "A", "bert-base-multilingual-uncased"): ("7"),
    ("ENG", "B", "bert-base-multilingual-uncased"): ("8"),

    ("HIN", "A", "bert-base-multilingual-uncased"): ("1"),
    ("HIN", "B", "bert-base-multilingual-uncased"): ("1"),
    ("HIN", "C", "bert-base-multilingual-uncased"): ("3"),

    ("HIN", "A", "xlm-roberta-base"): ("2"),
    ("HIN", "B", "xlm-roberta-base"): ("2"),
    ("HIN", "C", "xlm-roberta-base"): ("4"),

    ("HIN", "A", "bert-base-multilingual-uncased"): ("5"),
    ("HIN", "B", "bert-base-multilingual-uncased"): ("6"),

    ("IBEN", "A", "bert-base-multilingual-uncased"): ("1"),
    ("IBEN", "B", "bert-base-multilingual-uncased"): ("1"),
    ("IBEN", "C", "bert-base-multilingual-uncased"): ("3"),

    ("IBEN", "A", "xlm-roberta-base"): ("2"),
    ("IBEN", "B", "xlm-roberta-base"): ("2"),
    ("IBEN", "C", "xlm-roberta-base"): ("4"),

    ("IBEN", "A", "bert-base-multilingual-uncased"): ("5"),
    ("IBEN", "B", "bert-base-multilingual-uncased"): ("6"),
    
}


In [7]:
def get_model_C_stats_old(df, lang, model, task, split, run_id):
    lblsA = ['OAG','NAG','CAG']
    lblsB = ['GEN','NGEN']
    Albls = {i:l for i,l in enumerate(lblsA)}
    Blbls = {i:l for i,l in enumerate(lblsB)}
    for i in lblsA:
        df[i+'_probs'] = df[i+'-'+lblsB[0]+'_probs']+df[i+'-'+lblsB[1]+'_probs']  
    for i in lblsB:
        df[i+'_probs'] = df[lblsA[0]+'-'+i+'_probs']+df[lblsA[1]+'-'+i+'_probs']+df[lblsA[2]+'-'+i+'_probs']  
    a = pd.DataFrame({'val':np.argmax( [df[lblsA[0]+'_probs'] , df[lblsA[1]+'_probs'], df[lblsA[2]+'_probs']],axis=0)})
    b = pd.DataFrame({'val':np.argmax([df[lblsB[0]+'_probs'], df[lblsB[1]+'_probs']],axis=0)})
    df["A_preds"] = a['val'].map(Albls)
    df["B_preds"] = b['val'].map(Blbls)
    df["Sub-task C_mpreds"] = df["A_preds"]+'-'+df["B_preds"]

    tmp = {
        0 : "Sub-task C_preds",
        1 : "Sub-task C_mpreds"
    } 

    tmp1 = {
        0 : "C",
        1 : "M"
    }
    
    for j in range(2):
        for i, task in enumerate(["A", "B"]):
            report = classification_report(
                df["label"].str.split("-", expand=True)[i],
                df[tmp[j]].str.split("-", expand=True)[i],
                output_dict=True
            )
            final_stat = {
                "macro avg/f1-score": report["macro avg"]["f1-score"],
                "weighted avg/f1-score": report["weighted avg"]["f1-score"]
            }
            final_stat.update(dict(
                lang=lang, model=f"{model} ({tmp1[j]})", task=task, split=split, run_id=f"{run_id} ({tmp1[j]})"
            ))
            yield final_stat
            
def add_marginalized_predictions_for_subtask_C(df):
    column_names=df.filter(like='_probs', axis=1).columns.tolist()
    listA=list(set([
        x.split('-')[0] 
        for x in column_names]))
    listB=list(set([
        x[x.find('-')+1:x.find('_')] 
        for x in column_names]))
    for label in listA:
        filter_str=r'^{}-'.format(label)
        df[label+'_probs']=df.filter(regex=filter_str, axis=1).sum(axis=1)
    for label in listB:
        filter_str="-{}".format(label)
        df[label+'_probs']=df.filter(regex=filter_str, axis=1).sum(axis=1)
    prob_listA=[x+'_probs' for x in listA]
    prob_listB=[x+'_probs' for x in listB]
    
    df['Sub-task B_preds']=df[prob_listB].idxmax(axis=1).apply(
        lambda x: pd.Series(str(x).split('_'))[0])
    df['Sub-task A_preds']=df[prob_listA].idxmax(axis=1).apply(
        lambda x: pd.Series(str(x).split('_'))[0])
    df['Sub-task C_mpreds'] = df['Sub-task A_preds'].str.cat(df['Sub-task B_preds'], "-")
    return df
            
            
def get_model_C_stats(df, lang, model, task, split, run_id):
    df = add_marginalized_predictions_for_subtask_C(df)
    tmp = {
        0 : "Sub-task C_preds",
        1 : "Sub-task C_mpreds"
    } 

    tmp1 = {
        0 : "C",
        1 : "M"
    }
    
    for j in range(2):
        for i, task in enumerate(["A", "B"]):
            report = classification_report(
                df["label"].str.split("-", expand=True)[i],
                df[tmp[j]].str.split("-", expand=True)[i],
                output_dict=True
            )
            final_stat = {
                "macro avg/f1-score": report["macro avg"]["f1-score"],
                "weighted avg/f1-score": report["weighted avg"]["f1-score"]
            }
            final_stat.update(dict(
                lang=lang, model=f"{model} ({tmp1[j]})", task=task, split=split, run_id=f"{run_id} ({tmp1[j]})"
            ))
            yield final_stat

def get_model_ALL_stats(df, lang, model, task, split, run_id):
    for i, l in enumerate(["ENG", "HIN", "IBEN"]):
        #run_id = ""
        #    if lang == "DE" and task == "C":
        #        continue
        #    if i+1 in run_id_tasks:
        #        run_id = base_run_id
        pred_label = "Sub-task "+task+"_preds"
        actual_label = "label"
        df_t = df.loc[df["id"].str.split('-',expand=True)[0]==l].copy()
        if task != "C":
            report = classification_report(
                df_t[actual_label],
                df_t[pred_label],
                output_dict=True
            )
            final_stat = {
                "macro avg/f1-score": report["macro avg"]["f1-score"],
                "weighted avg/f1-score": report["weighted avg"]["f1-score"]
            }
            final_stat.update(dict(lang=l, model=model, task=task, split=split, run_id=run_id))
            yield final_stat
        else:
            yield from get_model_C_stats(df_t, l, model, task, split, run_id)

def get_final_stats(filepath):
    lang, task, _, model, basename = filepath.parts[1:]
    stats = get_filestats(filepath)
    split = basename.split("_")[0]
    task = TASK_MAPPING[task]
    default_model_details = ("", )
    run_id, = MODEL_DETAILS.get((lang, task, model), default_model_details)
    
    base_path = filepath.parent
    final_path = base_path / filepath.name.replace("_results.json", ".tsv")
    print(filepath, lang, model, task, split, run_id)
    
    df = pd.read_csv(final_path, sep="\t")
    if lang == "ALL":
        run_id = 9
        return [
            final_stat
            for final_stat in get_model_ALL_stats(
                df, lang=lang, model=f"{model} (ALL)", task=task, split=split, run_id=run_id
            )
        ]
    if task == "C":
        #print("C\n")
        return [
            final_stat
            for final_stat in get_model_C_stats(
                df, lang=lang, model=f"{model} (C)", task=task, split=split, run_id=run_id
            )
        ]
    final_stat = stats[-1]
    final_stat.update(dict(lang=lang, model=model, task=task, split=split, run_id=run_id))
    #print(lang,model,task)
    return [final_stat]


In [12]:
all_stats = %time sum((get_final_stats(filepath) for filepath in filepaths), [])
len(all_stats)
all_stats

..\ENG\Sub-task A\output\bert-base-cased\dev_results.json ENG bert-base-cased A dev 1
..\ENG\Sub-task A\output\bert-base-cased\train_results.json ENG bert-base-cased A train 1
..\ENG\Sub-task A\output\bert-base-uncased\dev_results.json ENG bert-base-uncased A dev 2
..\ENG\Sub-task A\output\bert-base-uncased\train_results.json ENG bert-base-uncased A train 2
..\ENG\Sub-task A\output\xlm-roberta-base\dev_results.json ENG xlm-roberta-base A dev 5
..\ENG\Sub-task A\output\xlm-roberta-base\train_results.json ENG xlm-roberta-base A train 5
..\ENG\Sub-task B\output\bert-base-cased\dev_results.json ENG bert-base-cased B dev 1
..\ENG\Sub-task B\output\bert-base-cased\train_results.json ENG bert-base-cased B train 1
..\ENG\Sub-task B\output\bert-base-uncased\dev_results.json ENG bert-base-uncased B dev 2
..\ENG\Sub-task B\output\bert-base-uncased\train_results.json ENG bert-base-uncased B train 2
..\ENG\Sub-task B\output\xlm-roberta-base\dev_results.json ENG xlm-roberta-base B dev 5
..\ENG\Sub-t

[{'eval_loss': 0.747566761339412,
  'OAG/precision': 0.5679012345679012,
  'OAG/recall': 0.40707964601769914,
  'OAG/f1-score': 0.4742268041237113,
  'OAG/support': 113,
  'NAG/precision': 0.8841463414634146,
  'NAG/recall': 0.8672248803827751,
  'NAG/f1-score': 0.8756038647342996,
  'NAG/support': 836,
  'CAG/precision': 0.2909090909090909,
  'CAG/recall': 0.41025641025641024,
  'CAG/f1-score': 0.3404255319148936,
  'CAG/support': 117,
  'accuracy': 0.7682926829268293,
  'macro avg/precision': 0.5809855556468023,
  'macro avg/recall': 0.5615203122189615,
  'macro avg/f1-score': 0.5634187335909683,
  'macro avg/support': 1066,
  'weighted avg/precision': 0.785511767923031,
  'weighted avg/recall': 0.7682926829268293,
  'weighted avg/f1-score': 0.7743173048948372,
  'weighted avg/support': 1066,
  'global_step': 335,
  'loss': 0.08183887004852294,
  'lang': 'ENG',
  'model': 'bert-base-cased',
  'task': 'A',
  'split': 'dev',
  'run_id': '1'},
 {'eval_loss': 0.05159350417887987,
  'OAG/

In [13]:
df_map = {
    k: pd.DataFrame([stats for stats in all_stats if stats["task"] == k])
    for k in ["A", "B",]
}
df_map["A"].head()

Unnamed: 0,eval_loss,OAG/precision,OAG/recall,OAG/f1-score,OAG/support,NAG/precision,NAG/recall,NAG/f1-score,NAG/support,CAG/precision,...,weighted avg/recall,weighted avg/f1-score,weighted avg/support,global_step,loss,lang,model,task,split,run_id
0,0.747567,0.567901,0.40708,0.474227,113.0,0.884146,0.867225,0.875604,836.0,0.290909,...,0.768293,0.774317,1066.0,335.0,0.081839,ENG,bert-base-cased,A,dev,1
1,0.051594,0.949886,0.958621,0.954233,435.0,0.996443,0.996148,0.996296,3375.0,0.951111,...,0.986864,0.986866,4263.0,335.0,0.081839,ENG,bert-base-cased,A,train,1
2,0.692995,0.666667,0.460177,0.544503,113.0,0.878394,0.889952,0.884135,836.0,0.276596,...,0.783302,0.784276,1066.0,335.0,0.162729,ENG,bert-base-uncased,A,dev,2
3,0.082985,0.908297,0.956322,0.931691,435.0,0.992604,0.994074,0.993338,3375.0,0.948235,...,0.979123,0.979042,4263.0,335.0,0.162729,ENG,bert-base-uncased,A,train,2
4,0.598518,0.428571,0.424779,0.426667,113.0,0.878363,0.898325,0.888232,836.0,0.30303,...,0.777674,0.772303,1066.0,1320.0,0.390615,ENG,xlm-roberta-base,A,dev,5


In [14]:
with pd.option_context("precision", 3):
    for k in ["A", "B"]:
        for l in ["ENG", "HIN", "IBEN"]:
            print(f"subtask={k}, lang={l}")
            df_t = df_map[k][df_map[k].lang==l][["lang", "task", "model", "run_id", "split", "macro avg/f1-score", "weighted avg/f1-score"]].rename(columns={
                "macro avg/f1-score": "macro", "weighted avg/f1-score": "weighted"
            }).pivot_table(index=["lang", "task", "model", "run_id"], columns=["split"])
            df_t["rank"] = df_t[("weighted", "dev")].rank(ascending=False).astype(int)
            display(df_t.sort_values("rank"))

subtask=A, lang=ENG


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,macro,macro,weighted,weighted,rank
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,split,dev,train,dev,train,Unnamed: 8_level_1
lang,task,model,run_id,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
ENG,A,bert-base-multilingual-uncased (ALL),9,0.611,0.903,0.798,0.957,1
ENG,A,bert-base-uncased (C) (C),4 (C),0.596,0.902,0.795,0.956,2
ENG,A,bert-base-uncased (C) (M),4 (M),0.595,0.9,0.795,0.956,3
ENG,A,bert-base-cased (C) (C),3 (C),0.571,0.912,0.786,0.961,4
ENG,A,bert-base-uncased,2,0.577,0.948,0.784,0.979,5
ENG,A,bert-base-cased (C) (M),3 (M),0.568,0.908,0.782,0.96,6
ENG,A,bert-base-multilingual-uncased (ALL) (M),9 (M),0.555,0.865,0.78,0.939,7
ENG,A,bert-base-multilingual-uncased (ALL) (C),9 (C),0.55,0.871,0.778,0.941,8
ENG,A,bert-base-cased,1,0.563,0.966,0.774,0.987,9
ENG,A,xlm-roberta-base,5,0.531,0.676,0.772,0.862,10


subtask=A, lang=HIN


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,macro,macro,weighted,weighted,rank
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,split,dev,train,dev,train,Unnamed: 8_level_1
lang,task,model,run_id,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
HIN,A,bert-base-multilingual-uncased,5,0.637,0.846,0.708,0.881,1
HIN,A,bert-base-multilingual-uncased (ALL) (C),9 (C),0.628,0.903,0.696,0.924,2
HIN,A,bert-base-multilingual-uncased (ALL) (M),9 (M),0.626,0.899,0.695,0.921,3
HIN,A,bert-base-multilingual-uncased (ALL),9,0.626,0.939,0.694,0.952,4
HIN,A,bert-base-multilingual-uncased (C) (C),3 (C),0.616,0.849,0.688,0.884,5
HIN,A,bert-base-multilingual-uncased (C) (M),3 (M),0.611,0.848,0.684,0.884,6
HIN,A,xlm-roberta-base (ALL),9,0.598,0.698,0.672,0.753,7
HIN,A,xlm-roberta-base,2,0.394,0.388,0.527,0.509,8
HIN,A,xlm-roberta-base (C) (C),4 (C),0.245,0.24,0.426,0.406,9
HIN,A,xlm-roberta-base (C) (M),4 (M),0.245,0.24,0.426,0.406,9


subtask=A, lang=IBEN


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,macro,macro,weighted,weighted,rank
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,split,dev,train,dev,train,Unnamed: 8_level_1
lang,task,model,run_id,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
IBEN,A,bert-base-multilingual-uncased (ALL),9,0.698,0.933,0.737,0.945,1
IBEN,A,xlm-roberta-base (C) (M),4 (M),0.694,0.758,0.732,0.796,2
IBEN,A,xlm-roberta-base (C) (C),4 (C),0.692,0.757,0.731,0.796,3
IBEN,A,bert-base-multilingual-uncased (C) (M),3 (M),0.686,0.856,0.729,0.879,4
IBEN,A,bert-base-multilingual-uncased (C) (C),3 (C),0.684,0.86,0.728,0.883,5
IBEN,A,bert-base-multilingual-uncased,5,0.68,0.903,0.726,0.918,6
IBEN,A,bert-base-multilingual-uncased (ALL) (C),9 (C),0.686,0.893,0.726,0.912,7
IBEN,A,bert-base-multilingual-uncased (ALL) (M),9 (M),0.683,0.893,0.723,0.913,8
IBEN,A,xlm-roberta-base (ALL),9,0.663,0.728,0.71,0.767,9
IBEN,A,xlm-roberta-base,2,0.584,0.631,0.646,0.691,10


subtask=B, lang=ENG


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,macro,macro,weighted,weighted,rank
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,split,dev,train,dev,train,Unnamed: 8_level_1
lang,task,model,run_id,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
ENG,B,bert-base-uncased (C) (M),4 (M),0.757,0.92,0.943,0.978,1
ENG,B,xlm-roberta-base (ALL),9,0.765,0.878,0.941,0.968,2
ENG,B,bert-base-multilingual-uncased (ALL) (M),9 (M),0.76,0.939,0.94,0.983,3
ENG,B,bert-base-uncased (C) (C),4 (C),0.734,0.914,0.939,0.977,4
ENG,B,bert-base-cased (C) (C),3 (C),0.729,0.931,0.939,0.982,5
ENG,B,bert-base-multilingual-uncased (ALL) (C),9 (C),0.752,0.936,0.938,0.983,6
ENG,B,bert-base-cased (C) (M),3 (M),0.727,0.935,0.938,0.983,7
ENG,B,bert-base-uncased,2,0.737,0.991,0.938,0.998,8
ENG,B,bert-base-multilingual-uncased (ALL),9,0.751,0.987,0.937,0.996,9
ENG,B,xlm-roberta-base,5,0.734,0.915,0.936,0.978,10


subtask=B, lang=HIN


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,macro,macro,weighted,weighted,rank
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,split,dev,train,dev,train,Unnamed: 8_level_1
lang,task,model,run_id,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
HIN,B,bert-base-multilingual-uncased,6,0.78,0.974,0.891,0.986,1
HIN,B,bert-base-multilingual-uncased (ALL),9,0.778,0.99,0.888,0.994,2
HIN,B,bert-base-multilingual-uncased (ALL) (C),9 (C),0.783,0.932,0.888,0.962,3
HIN,B,bert-base-multilingual-uncased (ALL) (M),9 (M),0.778,0.931,0.886,0.962,4
HIN,B,bert-base-multilingual-uncased (C) (M),3 (M),0.76,0.844,0.882,0.916,5
HIN,B,bert-base-multilingual-uncased (C) (C),3 (C),0.75,0.847,0.874,0.917,6
HIN,B,xlm-roberta-base (ALL),9,0.745,0.831,0.87,0.909,7
HIN,B,xlm-roberta-base,2,0.459,0.455,0.778,0.759,9
HIN,B,xlm-roberta-base (C) (C),4 (C),0.459,0.455,0.778,0.759,9
HIN,B,xlm-roberta-base (C) (M),4 (M),0.459,0.455,0.778,0.759,9


subtask=B, lang=IBEN


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,macro,macro,weighted,weighted,rank
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,split,dev,train,dev,train,Unnamed: 8_level_1
lang,task,model,run_id,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
IBEN,B,bert-base-multilingual-uncased (ALL),9,0.849,0.987,0.905,0.992,1
IBEN,B,bert-base-multilingual-uncased (ALL) (M),9 (M),0.849,0.943,0.904,0.965,2
IBEN,B,bert-base-multilingual-uncased (ALL) (C),9 (C),0.846,0.943,0.902,0.966,3
IBEN,B,bert-base-multilingual-uncased,6,0.83,0.975,0.894,0.985,4
IBEN,B,bert-base-multilingual-uncased (C) (M),3 (M),0.827,0.924,0.892,0.954,5
IBEN,B,bert-base-multilingual-uncased (C) (C),3 (C),0.824,0.923,0.89,0.953,6
IBEN,B,xlm-roberta-base (ALL),9,0.792,0.845,0.873,0.908,7
IBEN,B,xlm-roberta-base (C) (M),4 (M),0.783,0.835,0.869,0.903,8
IBEN,B,xlm-roberta-base (C) (C),4 (C),0.783,0.833,0.868,0.902,9
IBEN,B,xlm-roberta-base,2,0.714,0.743,0.83,0.855,10


In [15]:
with pd.option_context("precision", 3, "max_rows", 100):
    df_t = pd.concat([df_map[k][["lang", "task", "model", "run_id", "split", "macro avg/f1-score", "weighted avg/f1-score"]].rename(columns={
            "macro avg/f1-score": "macro", "weighted avg/f1-score": "weighted"
        }) for k in ["A", "B"]])
    df_t = df_t.pivot_table(index=["task", "lang", "model", "run_id"], columns=["split"])
    display(df_t)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,macro,macro,weighted,weighted
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,split,dev,train,dev,train
task,lang,model,run_id,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
A,ENG,bert-base-cased,1,0.563,0.966,0.774,0.987
A,ENG,bert-base-cased (C) (C),3 (C),0.571,0.912,0.786,0.961
A,ENG,bert-base-cased (C) (M),3 (M),0.568,0.908,0.782,0.96
A,ENG,bert-base-multilingual-uncased (ALL),9,0.611,0.903,0.798,0.957
A,ENG,bert-base-multilingual-uncased (ALL) (C),9 (C),0.55,0.871,0.778,0.941
A,ENG,bert-base-multilingual-uncased (ALL) (M),9 (M),0.555,0.865,0.78,0.939
A,ENG,bert-base-uncased,2,0.577,0.948,0.784,0.979
A,ENG,bert-base-uncased (C) (C),4 (C),0.596,0.902,0.795,0.956
A,ENG,bert-base-uncased (C) (M),4 (M),0.595,0.9,0.795,0.956
A,ENG,xlm-roberta-base,5,0.531,0.676,0.772,0.862


In [16]:
with pd.option_context("precision", 3, "max_rows", 100):
    df_t = pd.concat([df_map[k][["lang", "task", "model", "run_id", "split", "macro avg/f1-score", "weighted avg/f1-score"]].rename(columns={
            "macro avg/f1-score": "macro", "weighted avg/f1-score": "weighted"
        }) for k in ["A", "B"]])
    df_t = df_t.pivot_table(index=["lang", "task", "model", "run_id"], columns=["split"])
    display(df_t)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,macro,macro,weighted,weighted
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,split,dev,train,dev,train
lang,task,model,run_id,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
ENG,A,bert-base-cased,1,0.563,0.966,0.774,0.987
ENG,A,bert-base-cased (C) (C),3 (C),0.571,0.912,0.786,0.961
ENG,A,bert-base-cased (C) (M),3 (M),0.568,0.908,0.782,0.96
ENG,A,bert-base-multilingual-uncased (ALL),9,0.611,0.903,0.798,0.957
ENG,A,bert-base-multilingual-uncased (ALL) (C),9 (C),0.55,0.871,0.778,0.941
ENG,A,bert-base-multilingual-uncased (ALL) (M),9 (M),0.555,0.865,0.78,0.939
ENG,A,bert-base-uncased,2,0.577,0.948,0.784,0.979
ENG,A,bert-base-uncased (C) (C),4 (C),0.596,0.902,0.795,0.956
ENG,A,bert-base-uncased (C) (M),4 (M),0.595,0.9,0.795,0.956
ENG,A,xlm-roberta-base,5,0.531,0.676,0.772,0.862
