In [1]:
from typing import Any
import json
import pandas as pd
PHENOMENA_PATH_PREFIX = "../scripts/metrics_by_phenomena/results/"
QUANTIFIER_PATH_PREFIX = "../scripts/analyze_quantifiers/results/"

In [2]:
def load_model_metrics(name: str, path: str) -> Any:
    with open(path, "r") as f:
        data = json.loads(f.read())
        return {"name": name, "metrics": data}

In [3]:
models = [
    ("default", "roberta-base-finetuned-mnli.json"),
    ("hypothesis-only", "roberta-base-finetuned-mnli-hypothesis-only.json"),
    ("maximum-two", "roberta-base-finetuned-mnli-maximum_two_6.json")
]
phenomena_models = [load_model_metrics(name, PHENOMENA_PATH_PREFIX + path) for name, path in models]
quantifier_models = [load_model_metrics(name, QUANTIFIER_PATH_PREFIX + path) for name, path in models]

In [9]:
def sort_model(key, model):
    model["metrics"].sort(key=lambda x: x[key], reverse=True)
    return model

def construct_table_for_metric(metric_name, metric_key, models):
    sorted_models = [sort_model(metric_key, model) for model in models]
    pandas_dict = {model["name"]: [
            (
                entry["task_name"],
                round(entry[metric_key], 3)
            )
            for entry in model["metrics"]
        ] for model in models}
    df = pd.DataFrame({k:pd.Series(v) for k,v in pandas_dict.items()})
    df.name = metric_name
    return df
    

In [11]:
def calculate_metrics(models):
    return [ construct_table_for_metric(metric_name, metric_key, models) for metric_name, metric_key in [
        ("accuracy", "accuracy"),
        ("matthews correlation coefficient", "matthews_correlation"),
        ("macro f1", "f1")
    ]]

In [12]:
phenomena_metrics = calculate_metrics(phenomena_models)
quantifier_metrics = calculate_metrics(quantifier_models)

In [13]:
from IPython.core.display import display, HTML

def display_side_by_side(dfs):
    output = ""
    #df_dicts = dict(dfs)
    for df in dfs:
        output += df.style.set_table_attributes("style='display:inline'").set_caption(df.name)._repr_html_()
        output += "<br><br>"
    display(HTML(output))

In [16]:
display_side_by_side(
    phenomena_metrics
)

Unnamed: 0,default,hypothesis-only,maximum-two
0,"('antonym', 0.927)","('hypernym', 0.598)","('antonym', 0.925)"
1,"('synonym', 0.868)","('quantifiers', 0.591)","('synonym', 0.843)"
2,"('co_hyponym', 0.867)","('hyponym', 0.573)","('co_hyponym', 0.835)"
3,"('hypernym', 0.867)","('synonym', 0.551)","('hypernym', 0.82)"
4,"('hyponym', 0.858)","('numericals', 0.526)","('hyponym', 0.816)"
5,"('quantifiers', 0.841)","('co_hyponym', 0.503)","('quantifiers', 0.795)"
6,"('numericals', 0.823)","('antonym', 0.392)","('numericals', 0.781)"

Unnamed: 0,default,hypothesis-only,maximum-two
0,"('antonym', 0.796)","('quantifiers', 0.414)","('antonym', 0.782)"
1,"('co_hyponym', 0.76)","('co_hyponym', 0.181)","('co_hyponym', 0.716)"
2,"('quantifiers', 0.748)","('synonym', 0.169)","('synonym', 0.697)"
3,"('synonym', 0.733)","('antonym', 0.166)","('quantifiers', 0.689)"
4,"('hyponym', 0.698)","('numericals', 0.161)","('numericals', 0.638)"
5,"('hypernym', 0.69)","('hyponym', 0.158)","('hyponym', 0.63)"
6,"('numericals', 0.69)","('hypernym', 0.155)","('hypernym', 0.619)"

Unnamed: 0,default,hypothesis-only,maximum-two
0,"('quantifiers', 0.775)","('quantifiers', 0.538)","('quantifiers', 0.718)"
1,"('numericals', 0.728)","('numericals', 0.396)","('numericals', 0.686)"
2,"('antonym', 0.62)","('hypernym', 0.367)","('antonym', 0.619)"
3,"('co_hyponym', 0.6)","('synonym', 0.367)","('co_hyponym', 0.588)"
4,"('synonym', 0.592)","('hyponym', 0.364)","('synonym', 0.582)"
5,"('hyponym', 0.58)","('co_hyponym', 0.358)","('hypernym', 0.556)"
6,"('hypernym', 0.576)","('antonym', 0.282)","('hyponym', 0.555)"


In [17]:
display_side_by_side(
    quantifier_metrics
)

Unnamed: 0,default,hypothesis-only,maximum-two
0,"('no', 1.0)","('no', 1.0)","('no', 1.0)"
1,"('each', 1.0)","('much', 1.0)","('much', 1.0)"
2,"('much', 1.0)","('each', 1.0)","('each', 1.0)"
3,"('some', 0.9)","('many', 0.75)","('whole', 1.0)"
4,"('all', 0.857)","('some', 0.75)","('some', 0.9)"
5,"('several', 0.833)","('all', 0.75)","('all', 0.857)"
6,"('many', 0.833)","('several', 0.667)","('many', 0.833)"
7,"('few', 0.8)","('few', 0.5)","('few', 0.6)"
8,"('any', 0.5)",,"('several', 0.5)"
9,"('whole', 0.0)",,"('any', 0.5)"

Unnamed: 0,default,hypothesis-only,maximum-two
0,"('each', 1.0)","('all', 0.7)","('each', 1.0)"
1,"('all', 0.802)","('many', 0.577)","('all', 0.802)"
2,"('several', 0.783)","('several', 0.5)","('some', 0.728)"
3,"('some', 0.728)","('no', 0.0)","('many', 0.632)"
4,"('many', 0.671)","('much', 0.0)","('few', 0.53)"
5,"('few', 0.661)","('each', 0.0)","('any', 0.5)"
6,"('any', 0.5)","('some', 0.0)","('several', 0.318)"
7,"('no', 0.0)","('few', -0.167)","('no', 0.0)"
8,"('much', 0.0)",,"('much', 0.0)"
9,"('whole', 0.0)",,"('whole', 0.0)"

Unnamed: 0,default,hypothesis-only,maximum-two
0,"('each', 1.0)","('no', 1.0)","('each', 1.0)"
1,"('no', 1.0)","('much', 1.0)","('no', 1.0)"
2,"('much', 1.0)","('each', 1.0)","('much', 1.0)"
3,"('all', 0.841)","('all', 0.778)","('whole', 1.0)"
4,"('several', 0.822)","('many', 0.733)","('all', 0.841)"
5,"('many', 0.63)","('several', 0.667)","('many', 0.778)"
6,"('few', 0.619)","('some', 0.429)","('some', 0.556)"
7,"('some', 0.556)","('few', 0.222)","('few', 0.556)"
8,"('any', 0.333)",,"('several', 0.433)"
9,"('whole', 0.0)",,"('any', 0.333)"
