In [53]:
from typing import Any
import json
import pandas as pd
# Relative directory that holds the per-model JSON result files. The trailing
# "/" is required: file names are appended to it by plain string concatenation.
PATH_PREFIX = "../scripts/metrics_by_phenomena/results/"

In [54]:
def load_model_metrics(name: str, path: str) -> Any:
    """Read one model's metrics from a JSON file and tag them with a label.

    Args:
        name: Label stored under the "name" key of the result.
        path: Location of the JSON file to read.

    Returns:
        A dict of the form ``{"name": name, "metrics": <parsed JSON content>}``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Pin the encoding: without it, open() falls back to the platform's locale
    # default, which can fail on non-ASCII content in a UTF-8 encoded JSON file.
    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)
    return {"name": name, "metrics": data}

In [55]:
# Display label -> result file (appended to PATH_PREFIX) for each model to compare.
models = [
    load_model_metrics(label, PATH_PREFIX + filename)
    for label, filename in {
        "default": "roberta-base-finetuned-mnli.json",
        "hypothesis-only": "roberta-base-finetuned-mnli-hypothesis-only.json",
        "maximum-two": "roberta-base-finetuned-mnli-maximum_two_6.json",
    }.items()
]

In [60]:
def sort_model(key, model):
    """Sort ``model["metrics"]`` in place, highest ``entry[key]`` first.

    The list stored under "metrics" is reordered in place (the caller's object
    is modified), and the same ``model`` object is returned for convenience.
    """
    entries = model["metrics"]

    def rank(entry):
        return entry[key]

    entries.sort(key=rank, reverse=True)
    return model

def construct_table_for_metric(metric_name, metric_key, models):
    """Build a per-model ranking table for a single metric.

    Every column corresponds to one model. Its cells are
    ``(task_name, value rounded to 3 decimals)`` tuples, ordered from the
    highest to the lowest value of ``metric_key`` for that model.

    Args:
        metric_name: Label attached to the returned frame as ``df.name``.
        metric_key: Key looked up in every metrics entry.
        models: Iterable of dicts, each with a "name" and a "metrics" list whose
            entries provide "task_name" and ``metric_key``.

    Returns:
        A ``pd.DataFrame`` with one column per model.
    """
    columns = {}
    for model in models:
        # Rank a sorted copy so the caller's metric lists keep their order;
        # building a table must not reorder shared notebook state.
        ranked = sorted(
            model["metrics"],
            key=lambda entry: entry[metric_key],
            reverse=True,
        )
        columns[model["name"]] = [
            (entry["task_name"], round(entry[metric_key], 3))
            for entry in ranked
        ]
    df = pd.DataFrame(columns)
    # display_side_by_side reads this attribute to caption the table.
    df.name = metric_name
    return df
    

In [61]:
# One table per metric; each pair is (table caption, key inside a metrics entry).
metrics = [
    construct_table_for_metric(caption, key, models)
    for caption, key in (
        ("accuracy", "accuracy"),
        ("matthews correlation coefficient", "matthews_correlation"),
        ("macro f1", "f1"),
    )
]

In [62]:
# Import from the public IPython.display module; pulling `display` out of the
# internal IPython.core.display path is deprecated.
from IPython.display import display, HTML

def display_side_by_side(dfs):
    """Render several captioned DataFrames in a single HTML output.

    Each frame is styled as an inline table, captioned with its ``name``
    attribute, and followed by two ``<br>`` line breaks.

    Args:
        dfs: Iterable of DataFrames; every frame must carry a ``name`` attribute.
    """
    fragments = [
        df.style.set_table_attributes("style='display:inline'").set_caption(df.name)._repr_html_()
        + "<br><br>"
        for df in dfs
    ]
    display(HTML("".join(fragments)))

In [63]:
# Show the per-metric ranking tables built above.
display_side_by_side(metrics)

Unnamed: 0,default,hypothesis-only,maximum-two
0,"('antonym', 0.927)","('hypernym', 0.598)","('antonym', 0.925)"
1,"('synonym', 0.868)","('quantifiers', 0.591)","('synonym', 0.843)"
2,"('co_hyponym', 0.867)","('hyponym', 0.573)","('co_hyponym', 0.835)"
3,"('hypernym', 0.867)","('synonym', 0.551)","('hypernym', 0.82)"
4,"('hyponym', 0.858)","('numericals', 0.526)","('hyponym', 0.816)"
5,"('quantifiers', 0.841)","('co_hyponym', 0.503)","('quantifiers', 0.795)"
6,"('numericals', 0.823)","('antonym', 0.392)","('numericals', 0.781)"

Unnamed: 0,default,hypothesis-only,maximum-two
0,"('antonym', 0.796)","('quantifiers', 0.414)","('antonym', 0.782)"
1,"('co_hyponym', 0.76)","('co_hyponym', 0.181)","('co_hyponym', 0.716)"
2,"('quantifiers', 0.748)","('synonym', 0.169)","('synonym', 0.697)"
3,"('synonym', 0.733)","('antonym', 0.166)","('quantifiers', 0.689)"
4,"('hyponym', 0.698)","('numericals', 0.161)","('numericals', 0.638)"
5,"('hypernym', 0.69)","('hyponym', 0.158)","('hyponym', 0.63)"
6,"('numericals', 0.69)","('hypernym', 0.155)","('hypernym', 0.619)"

Unnamed: 0,default,hypothesis-only,maximum-two
0,"('quantifiers', 0.775)","('quantifiers', 0.538)","('quantifiers', 0.718)"
1,"('numericals', 0.728)","('numericals', 0.396)","('numericals', 0.686)"
2,"('antonym', 0.62)","('hypernym', 0.367)","('antonym', 0.619)"
3,"('co_hyponym', 0.6)","('synonym', 0.367)","('co_hyponym', 0.588)"
4,"('synonym', 0.592)","('hyponym', 0.364)","('synonym', 0.582)"
5,"('hyponym', 0.58)","('co_hyponym', 0.358)","('hypernym', 0.556)"
6,"('hypernym', 0.576)","('antonym', 0.282)","('hyponym', 0.555)"
