In [74]:
from typing import Any
import json
import pandas as pd
# Directories holding the per-model JSON result files. Result file names are
# appended to these prefixes by plain string concatenation, so the trailing
# slash is required.
PHENOMENA_PATH_PREFIX = "../scripts/metrics_by_phenomena/results/"
QUANTIFIER_PATH_PREFIX = "../scripts/analyze_quantifiers/results/"

In [75]:
def load_model_metrics(name: str, path: str) -> Any:
    """Read one model's JSON metrics file and pair it with a label.

    Args:
        name: Label stored under the ``"name"`` key of the result.
        path: Path of the JSON file to read.

    Returns:
        ``{"name": name, "metrics": <parsed JSON content>}``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    # Be explicit about the encoding instead of relying on the locale default,
    # so the same file parses identically on every machine.
    with open(path, "r", encoding="utf-8") as f:
        return {"name": name, "metrics": json.load(f)}

In [76]:
# (label, result file name) for each model variant being compared. The same
# file names are looked up under both result directories below.
models = [
    ("default", "roberta-base-finetuned-mnli.json"),
    ("hypothesis-only", "roberta-base-finetuned-mnli-hypothesis-only.json"),
    ("maximum-two", "roberta-base-finetuned-mnli-maximum_two_6.json")
]
# Each element is the {"name": ..., "metrics": ...} dict returned by
# load_model_metrics, in the same order as `models`.
phenomena_models = [load_model_metrics(name, PHENOMENA_PATH_PREFIX + path) for name, path in models]
quantifier_models = [load_model_metrics(name, QUANTIFIER_PATH_PREFIX + path) for name, path in models]

In [77]:
def sort_model(key, model):
    """Return a copy of ``model`` with its metric entries ordered by ``key``, highest first.

    The input dict and its ``"metrics"`` list are left untouched, so calling
    this repeatedly with different keys never reorders data that other code
    (or an earlier cell) is still holding on to.

    Args:
        key: Name of the field inside each metrics entry to sort by.
        model: Dict with at least a ``"metrics"`` list of dict entries.

    Returns:
        A new dict with the same items as ``model`` but a freshly sorted
        ``"metrics"`` list. Entries with equal values keep their input order.
    """
    ranked = sorted(model["metrics"], key=lambda entry: entry[key], reverse=True)
    return {**model, "metrics": ranked}


def construct_table_for_metric(metric_name, metric_key, models):
    """Build a ranking table for one metric with one column per model.

    Each column lists ``(task_name, value rounded to 3 places)`` tuples for
    that model, ordered from highest to lowest value. Columns of models with
    fewer entries are padded with NaN.

    Args:
        metric_name: Stored on the returned frame as its ``name`` attribute.
        metric_key: Field of each metrics entry that holds the value.
        models: Iterable of ``{"name": ..., "metrics": [...]}`` dicts.

    Returns:
        A ``pandas.DataFrame`` whose columns are the model names.
    """
    # Build the table from the sorted copies; the caller's data is not reordered.
    sorted_models = [sort_model(metric_key, model) for model in models]
    columns = {
        model["name"]: pd.Series(
            [
                (entry["task_name"], round(entry[metric_key], 3))
                for entry in model["metrics"]
            ]
        )
        for model in sorted_models
    }
    df = pd.DataFrame(columns)
    # Plain instance attribute (not a column); read back by the display helper.
    df.name = metric_name
    return df
    

In [78]:
def calculate_metrics(models):
    """Build one table per reported metric for the given models.

    Returns a list of the results of ``construct_table_for_metric`` in the
    order: accuracy, Matthews correlation coefficient, macro F1.
    """
    # (metric name passed to the table, key of the value inside each entry)
    reported = (
        ("accuracy", "accuracy"),
        ("matthews correlation coefficient", "matthews_correlation"),
        ("macro f1", "f1"),
    )
    tables = []
    for metric_name, metric_key in reported:
        tables.append(construct_table_for_metric(metric_name, metric_key, models))
    return tables

In [79]:
# One list of tables (accuracy, Matthews correlation, macro F1) per result set.
phenomena_metrics = calculate_metrics(phenomena_models)
quantifier_metrics = calculate_metrics(quantifier_models)

In [80]:
# `display` and `HTML` are public API of IPython.display; importing them from
# IPython.core.display is deprecated.
from IPython.display import display, HTML

def display_side_by_side(dfs):
    """Render every DataFrame as a captioned, inline-styled HTML table in one output.

    Args:
        dfs: Iterable of DataFrames. Each must carry a ``name`` attribute,
            which is used as the caption of its table.
    """
    rendered = [
        df.style.set_table_attributes("style='display:inline'").set_caption(df.name)._repr_html_()
        for df in dfs
    ]
    # Every table, including the last one, is followed by two line breaks.
    display(HTML("".join(table + "<br><br>" for table in rendered)))

In [81]:
# Show the per-phenomenon tables, one table per metric.
display_side_by_side(
    phenomena_metrics
)

Unnamed: 0,default,hypothesis-only,maximum-two
0,"('antonym', 0.927)","('hypernym', 0.598)","('antonym', 0.925)"
1,"('synonym', 0.868)","('quantifiers', 0.591)","('synonym', 0.843)"
2,"('co_hyponym', 0.867)","('hyponym', 0.573)","('co_hyponym', 0.835)"
3,"('hypernym', 0.867)","('synonym', 0.551)","('hypernym', 0.82)"
4,"('hyponym', 0.858)","('numericals', 0.526)","('hyponym', 0.816)"
5,"('quantifiers', 0.841)","('co_hyponym', 0.503)","('quantifiers', 0.795)"
6,"('numericals', 0.823)","('antonym', 0.392)","('numericals', 0.781)"

Unnamed: 0,default,hypothesis-only,maximum-two
0,"('antonym', 0.796)","('quantifiers', 0.414)","('antonym', 0.782)"
1,"('co_hyponym', 0.76)","('co_hyponym', 0.181)","('co_hyponym', 0.716)"
2,"('quantifiers', 0.748)","('synonym', 0.169)","('synonym', 0.697)"
3,"('synonym', 0.733)","('antonym', 0.166)","('quantifiers', 0.689)"
4,"('hyponym', 0.698)","('numericals', 0.161)","('numericals', 0.638)"
5,"('hypernym', 0.69)","('hyponym', 0.158)","('hyponym', 0.63)"
6,"('numericals', 0.69)","('hypernym', 0.155)","('hypernym', 0.619)"

Unnamed: 0,default,hypothesis-only,maximum-two
0,"('quantifiers', 0.775)","('quantifiers', 0.538)","('quantifiers', 0.718)"
1,"('numericals', 0.728)","('numericals', 0.396)","('numericals', 0.686)"
2,"('antonym', 0.62)","('hypernym', 0.367)","('antonym', 0.619)"
3,"('co_hyponym', 0.6)","('synonym', 0.367)","('co_hyponym', 0.588)"
4,"('synonym', 0.592)","('hyponym', 0.364)","('synonym', 0.582)"
5,"('hyponym', 0.58)","('co_hyponym', 0.358)","('hypernym', 0.556)"
6,"('hypernym', 0.576)","('antonym', 0.282)","('hyponym', 0.555)"


In [82]:
# Show the per-quantifier tables, one table per metric.
display_side_by_side(
    quantifier_metrics
)

Unnamed: 0,default,hypothesis-only,maximum-two
0,"('no', 1.0)","('no', 1.0)","('no', 1.0)"
1,"('each', 1.0)","('much', 1.0)","('much', 1.0)"
2,"('much', 1.0)","('each', 1.0)","('each', 1.0)"
3,"('some', 0.9)","('many', 0.75)","('whole', 1.0)"
4,"('all', 0.857)","('some', 0.75)","('some', 0.9)"
5,"('several', 0.833)","('all', 0.75)","('all', 0.857)"
6,"('many', 0.833)","('several', 0.667)","('many', 0.833)"
7,"('few', 0.8)","('few', 0.5)","('few', 0.6)"
8,"('any', 0.5)",,"('several', 0.5)"
9,"('whole', 0.0)",,"('any', 0.5)"

Unnamed: 0,default,hypothesis-only,maximum-two
0,"('each', 1.0)","('all', 0.7)","('each', 1.0)"
1,"('all', 0.802)","('many', 0.577)","('all', 0.802)"
2,"('several', 0.783)","('several', 0.5)","('some', 0.728)"
3,"('some', 0.728)","('no', 0.0)","('many', 0.632)"
4,"('many', 0.671)","('much', 0.0)","('few', 0.53)"
5,"('few', 0.661)","('each', 0.0)","('any', 0.5)"
6,"('any', 0.5)","('some', 0.0)","('several', 0.318)"
7,"('no', 0.0)","('few', -0.167)","('no', 0.0)"
8,"('much', 0.0)",,"('much', 0.0)"
9,"('whole', 0.0)",,"('whole', 0.0)"

Unnamed: 0,default,hypothesis-only,maximum-two
0,"('each', 1.0)","('no', 1.0)","('each', 1.0)"
1,"('no', 1.0)","('much', 1.0)","('no', 1.0)"
2,"('much', 1.0)","('each', 1.0)","('much', 1.0)"
3,"('all', 0.841)","('all', 0.778)","('whole', 1.0)"
4,"('several', 0.822)","('many', 0.733)","('all', 0.841)"
5,"('many', 0.63)","('several', 0.667)","('many', 0.778)"
6,"('few', 0.619)","('some', 0.429)","('some', 0.556)"
7,"('some', 0.556)","('few', 0.222)","('few', 0.556)"
8,"('any', 0.333)",,"('several', 0.433)"
9,"('whole', 0.0)",,"('any', 0.333)"


# Bias by Ferret metrics

In [83]:
from datasets import load_from_disk
import multiprocessing
from functools import reduce

In [84]:
# Names of the per-record phenomenon columns in the dataset; a record is
# treated as showing a phenomenon when its value in that column is > 0.
PHENOMENA = ["synonym", "antonym", "hypernym", "hyponym", "co_hyponym", "quantifiers", "numericals"]
# Absolute path on the machine the notebook is run on ("last" in the original
# note is presumably that machine's name — confirm).
# NOTE(review): this is the same directory as HYPOTHESIS_ONLY_DATASET_PATH
# further down, yet the tables built from it are headed 'evaluated on "normal"
# model', and both sets of rendered tables are identical. Confirm whether a
# different (non-hypothesis-only) dataset directory was intended here.
DATASET_PATH = "/mnt/semproj/sem_proj22/proj_05/esnli_evaluations_hypothesis_only_42_phenomena"

In [85]:
# Load the dataset stored at DATASET_PATH (datasets.load_from_disk).
dataset = load_from_disk(DATASET_PATH)

In [86]:
def construct_metrics_by_phenomenon(dataset, phenomenon):
    """Return the "evaluations" column of all records that show ``phenomenon``.

    A record is kept when its value in the ``phenomenon`` column is greater
    than zero. The filter is asked to use as many processes as
    ``multiprocessing.cpu_count()`` reports; the filtered dataset is then
    converted with ``to_pandas()`` and its "evaluations" column returned.
    """
    def shows_phenomenon(row):
        return row[phenomenon] > 0

    workers = multiprocessing.cpu_count()
    matching = dataset.filter(shows_phenomenon, num_proc=workers)
    frame = matching.to_pandas()
    return frame["evaluations"]
    

def construct_phenomena_metrics(dataset, phenomena):
    """Map each phenomenon name to the result of ``construct_metrics_by_phenomenon``.

    Keys appear in the order given by ``phenomena``.
    """
    per_phenomenon = {}
    for phenomenon in phenomena:
        per_phenomenon[phenomenon] = construct_metrics_by_phenomenon(dataset, phenomenon)
    return per_phenomenon

def construct_phenomena_dfs(metrics):
    """Turn a ``{phenomenon: Series}`` mapping into a list of Series labelled by phenomenon.

    The phenomenon name is stored as each returned Series' ``name``. (The
    label used to be set on a temporary DataFrame that was thrown away, so it
    never reached the caller, and the column lookup only worked for Series
    that happened to be called "evaluations".)

    Args:
        metrics: Mapping from phenomenon name to a ``pandas.Series``.

    Returns:
        A list of new Series with the same values and index as the inputs, in
        the mapping's iteration order. The input Series are not modified.
    """
    # Series.rename with a scalar returns a renamed copy; it does not touch the index.
    return [series.rename(name) for name, series in metrics.items()]

In [87]:
# NOTE: this rebinds phenomena_metrics, which earlier held the tables returned
# by calculate_metrics; from here on it refers to the per-phenomenon
# "evaluations" data instead.
# First a {phenomenon: Series} mapping ...
phenomena_metrics = construct_phenomena_metrics(dataset, PHENOMENA)
# ... then a list of Series in PHENOMENA order.
phenomena_metrics = construct_phenomena_dfs(phenomena_metrics)

Loading cached processed dataset at /mnt/semproj/sem_proj22/proj_05/esnli_evaluations_hypothesis_only_42_phenomena/cache-cbeaf634725dc3b8_*_of_00040.arrow
Loading cached processed dataset at /mnt/semproj/sem_proj22/proj_05/esnli_evaluations_hypothesis_only_42_phenomena/cache-909255f04b113caa_*_of_00040.arrow
Loading cached processed dataset at /mnt/semproj/sem_proj22/proj_05/esnli_evaluations_hypothesis_only_42_phenomena/cache-9d06c1fb94f66ed4_*_of_00040.arrow
Loading cached processed dataset at /mnt/semproj/sem_proj22/proj_05/esnli_evaluations_hypothesis_only_42_phenomena/cache-9ebba56b915324d6_*_of_00040.arrow
Loading cached processed dataset at /mnt/semproj/sem_proj22/proj_05/esnli_evaluations_hypothesis_only_42_phenomena/cache-68ad2491d38d6f59_*_of_00040.arrow
Loading cached processed dataset at /mnt/semproj/sem_proj22/proj_05/esnli_evaluations_hypothesis_only_42_phenomena/cache-4b409e1145f63001_*_of_00040.arrow
Loading cached processed dataset at /mnt/semproj/sem_proj22/proj_05/es

In [88]:
# Labels attached by position (zip in add_record_labels) to the nested
# evaluation values: outer position -> explainer, inner position -> evaluator.
# NOTE(review): the order is assumed to match the order in which the values
# were stored in the dataset — confirm against the script that produced it.
EXPLAINERS = ["shap", "lime", "integrated gradient", "integrated gradient multiply by inputs"]
# NOTE(review): "sufficency" looks like a misspelling of "sufficiency". It is a
# runtime label that shows up in the rendered tables, so it is left as is here.
EVALUATORS = ["comprehensiveness", "sufficency", "tauloo", "auprc plausibility", "f1 plausibility", "iou plausibility"]

In [89]:
def add_record_labels(records, explainer_labels, evaluator_labels):
    """Attach explainer and evaluator names to positionally ordered values.

    Each record is iterated as a sequence with one entry per explainer, and
    each such entry as a sequence with one value per evaluator. Pairing is
    positional via ``zip``, so surplus labels or values are dropped silently.

    Returns:
        A list with one ``{explainer: {evaluator: value}}`` dict per record.
    """
    labelled = []
    for record in records:
        per_explainer = {}
        for explainer, values in zip(explainer_labels, record):
            per_explainer[explainer] = dict(zip(evaluator_labels, values))
        labelled.append(per_explainer)
    return labelled


# Sum all records element-wise first and divide by the record count once at
# the end, so the mean is computed in a single pass over the records.
def record_mean(records):
    """Element-wise mean over a collection of two-level nested numeric records.

    Every record is iterated as a sequence of rows, each row as a sequence of
    numbers. Rows and values are paired by position with ``zip``, so records
    are expected to share one shape (extra trailing items would be dropped).

    Args:
        records: Sized iterable of records (e.g. a list or a pandas Series).

    Returns:
        A list of lists holding, for each position, the mean over all records.

    Raises:
        ValueError: If ``records`` is empty (a mean is undefined); previously
            this surfaced as an opaque ``TypeError`` from ``reduce``.
    """
    item_count = len(records)
    # `== 0` rather than `not records`: truthiness is ambiguous for a Series.
    if item_count == 0:
        raise ValueError("record_mean() requires at least one record")

    def sum_records(r1, r2):
        return [
            [left + right for left, right in zip(row1, row2)]
            for row1, row2 in zip(r1, r2)
        ]

    totals = reduce(sum_records, records)
    return [[value / item_count for value in row] for row in totals]

In [90]:
# One mean matrix per phenomenon, in the same order as phenomena_metrics.
records_mean = [record_mean(phenomena) for phenomena in phenomena_metrics]
# Despite its name, labeled_df is a list of nested dicts
# ({explainer: {evaluator: mean}}), not a DataFrame.
labeled_df = add_record_labels(records_mean, EXPLAINERS, EVALUATORS)

In [91]:
def phenomena_label(records):
    """Key each record by the phenomenon at the same position in ``PHENOMENA``."""
    # zip stops at the shorter input, so unmatched trailing items are dropped.
    return dict(zip(PHENOMENA, records))

In [92]:
# {phenomenon: {explainer: {evaluator: mean}}}
labeled_dict = phenomena_label(labeled_df)

## Tables for dataset evaluated on "normal" model

In [93]:
def another_df(phenomenon, item):
    """Build a DataFrame from ``item`` and tag it with the phenomenon name.

    Each key of ``item`` becomes a column; its value is wrapped in a
    ``pandas.Series`` first, so a dict value contributes its keys as row
    labels. The phenomenon is stored on the frame as the plain attribute
    ``name``.
    """
    columns = {}
    for label, values in item.items():
        columns[label] = pd.Series(values)
    table = pd.DataFrame(columns)
    table.name = phenomenon
    return table

# One table per phenomenon: rows are evaluators, columns are explainers.
display_side_by_side(
    [another_df(phenomenon, single) for phenomenon, single in labeled_dict.items()]
)

Unnamed: 0,shap,lime,integrated gradient,integrated gradient multiply by inputs
comprehensiveness,0.045385,0.142062,-0.017277,0.098762
sufficency,-0.056844,-0.093129,0.015423,-0.055451
tauloo,0.121798,0.558818,-0.023015,0.289046
auprc plausibility,0.478986,0.520321,0.526057,0.523723
f1 plausibility,0.409754,0.410998,0.410141,0.406507
iou plausibility,0.288822,0.294774,0.292376,0.294283

Unnamed: 0,shap,lime,integrated gradient,integrated gradient multiply by inputs
comprehensiveness,0.01456,0.123344,-0.019886,0.080247
sufficency,-0.091476,-0.142181,-0.056204,-0.036715
tauloo,0.098046,0.525571,-0.011362,0.263269
auprc plausibility,0.43858,0.49464,0.434257,0.412053
f1 plausibility,0.380601,0.375742,0.338224,0.25502
iou plausibility,0.273271,0.276075,0.236991,0.181157

Unnamed: 0,shap,lime,integrated gradient,integrated gradient multiply by inputs
comprehensiveness,0.045406,0.139272,-0.020098,0.092175
sufficency,-0.044437,-0.077941,0.021151,-0.04535
tauloo,0.097473,0.54935,-0.033855,0.270599
auprc plausibility,0.458583,0.493381,0.527101,0.500297
f1 plausibility,0.404937,0.391462,0.412855,0.391862
iou plausibility,0.283506,0.27788,0.292826,0.279647

Unnamed: 0,shap,lime,integrated gradient,integrated gradient multiply by inputs
comprehensiveness,0.042821,0.136969,-0.026162,0.092021
sufficency,-0.057821,-0.085882,0.012683,-0.053428
tauloo,0.110015,0.55547,-0.034106,0.275915
auprc plausibility,0.467752,0.500678,0.515573,0.500818
f1 plausibility,0.405602,0.38789,0.400477,0.390504
iou plausibility,0.283471,0.275118,0.282823,0.281134

Unnamed: 0,shap,lime,integrated gradient,integrated gradient multiply by inputs
comprehensiveness,0.041212,0.140604,-0.017974,0.095573
sufficency,-0.06817,-0.101887,-0.001706,-0.051876
tauloo,0.123407,0.555443,-0.025535,0.279662
auprc plausibility,0.485756,0.527076,0.501264,0.509717
f1 plausibility,0.413985,0.410964,0.392822,0.385621
iou plausibility,0.295409,0.297623,0.278937,0.280322

Unnamed: 0,shap,lime,integrated gradient,integrated gradient multiply by inputs
comprehensiveness,0.042424,0.201789,0.018198,0.1779
sufficency,0.075174,-0.07075,0.053095,-0.051425
tauloo,0.079961,0.562259,0.022459,0.384447
auprc plausibility,0.460482,0.490479,0.439264,0.482057
f1 plausibility,0.306543,0.387419,0.338162,0.411906
iou plausibility,0.204578,0.266609,0.227581,0.288774

Unnamed: 0,shap,lime,integrated gradient,integrated gradient multiply by inputs
comprehensiveness,0.050509,0.154334,-0.007114,0.1143
sufficency,-0.032625,-0.077805,0.033559,-0.042137
tauloo,0.138442,0.570633,-0.024554,0.289484
auprc plausibility,0.507678,0.532074,0.50384,0.520542
f1 plausibility,0.402498,0.419599,0.390652,0.404821
iou plausibility,0.287474,0.303505,0.276689,0.294604


In [94]:
# Dataset directory for the hypothesis-only model (per its name).
# NOTE(review): this is the same string as DATASET_PATH defined earlier, so the
# tables built from it repeat the earlier ones — confirm which of the two
# constants should point somewhere else.
HYPOTHESIS_ONLY_DATASET_PATH = "/mnt/semproj/sem_proj22/proj_05/esnli_evaluations_hypothesis_only_42_phenomena"

In [95]:
# Re-run the same load -> filter -> mean -> label pipeline as above for the
# hypothesis-only dataset. Every name below is rebound, so the values computed
# for the previous dataset are no longer reachable after this cell.
dataset = load_from_disk(HYPOTHESIS_ONLY_DATASET_PATH)
phenomena_metrics = construct_phenomena_metrics(dataset, PHENOMENA)
phenomena_metrics = construct_phenomena_dfs(phenomena_metrics)
records_mean = [record_mean(phenomena) for phenomena in phenomena_metrics]
labeled_df = add_record_labels(records_mean, EXPLAINERS, EVALUATORS)
labeled_dict = phenomena_label(labeled_df)

Loading cached processed dataset at /mnt/semproj/sem_proj22/proj_05/esnli_evaluations_hypothesis_only_42_phenomena/cache-cbeaf634725dc3b8_*_of_00040.arrow
Loading cached processed dataset at /mnt/semproj/sem_proj22/proj_05/esnli_evaluations_hypothesis_only_42_phenomena/cache-909255f04b113caa_*_of_00040.arrow
Loading cached processed dataset at /mnt/semproj/sem_proj22/proj_05/esnli_evaluations_hypothesis_only_42_phenomena/cache-9d06c1fb94f66ed4_*_of_00040.arrow
Loading cached processed dataset at /mnt/semproj/sem_proj22/proj_05/esnli_evaluations_hypothesis_only_42_phenomena/cache-9ebba56b915324d6_*_of_00040.arrow
Loading cached processed dataset at /mnt/semproj/sem_proj22/proj_05/esnli_evaluations_hypothesis_only_42_phenomena/cache-68ad2491d38d6f59_*_of_00040.arrow
Loading cached processed dataset at /mnt/semproj/sem_proj22/proj_05/esnli_evaluations_hypothesis_only_42_phenomena/cache-4b409e1145f63001_*_of_00040.arrow
Loading cached processed dataset at /mnt/semproj/sem_proj22/proj_05/es

## Tables for dataset evaluated on hypothesis only model

In [96]:
# One table per phenomenon: rows are evaluators, columns are explainers.
display_side_by_side(
    [another_df(phenomenon, single) for phenomenon, single in labeled_dict.items()]
)

Unnamed: 0,shap,lime,integrated gradient,integrated gradient multiply by inputs
comprehensiveness,0.045385,0.142062,-0.017277,0.098762
sufficency,-0.056844,-0.093129,0.015423,-0.055451
tauloo,0.121798,0.558818,-0.023015,0.289046
auprc plausibility,0.478986,0.520321,0.526057,0.523723
f1 plausibility,0.409754,0.410998,0.410141,0.406507
iou plausibility,0.288822,0.294774,0.292376,0.294283

Unnamed: 0,shap,lime,integrated gradient,integrated gradient multiply by inputs
comprehensiveness,0.01456,0.123344,-0.019886,0.080247
sufficency,-0.091476,-0.142181,-0.056204,-0.036715
tauloo,0.098046,0.525571,-0.011362,0.263269
auprc plausibility,0.43858,0.49464,0.434257,0.412053
f1 plausibility,0.380601,0.375742,0.338224,0.25502
iou plausibility,0.273271,0.276075,0.236991,0.181157

Unnamed: 0,shap,lime,integrated gradient,integrated gradient multiply by inputs
comprehensiveness,0.045406,0.139272,-0.020098,0.092175
sufficency,-0.044437,-0.077941,0.021151,-0.04535
tauloo,0.097473,0.54935,-0.033855,0.270599
auprc plausibility,0.458583,0.493381,0.527101,0.500297
f1 plausibility,0.404937,0.391462,0.412855,0.391862
iou plausibility,0.283506,0.27788,0.292826,0.279647

Unnamed: 0,shap,lime,integrated gradient,integrated gradient multiply by inputs
comprehensiveness,0.042821,0.136969,-0.026162,0.092021
sufficency,-0.057821,-0.085882,0.012683,-0.053428
tauloo,0.110015,0.55547,-0.034106,0.275915
auprc plausibility,0.467752,0.500678,0.515573,0.500818
f1 plausibility,0.405602,0.38789,0.400477,0.390504
iou plausibility,0.283471,0.275118,0.282823,0.281134

Unnamed: 0,shap,lime,integrated gradient,integrated gradient multiply by inputs
comprehensiveness,0.041212,0.140604,-0.017974,0.095573
sufficency,-0.06817,-0.101887,-0.001706,-0.051876
tauloo,0.123407,0.555443,-0.025535,0.279662
auprc plausibility,0.485756,0.527076,0.501264,0.509717
f1 plausibility,0.413985,0.410964,0.392822,0.385621
iou plausibility,0.295409,0.297623,0.278937,0.280322

Unnamed: 0,shap,lime,integrated gradient,integrated gradient multiply by inputs
comprehensiveness,0.042424,0.201789,0.018198,0.1779
sufficency,0.075174,-0.07075,0.053095,-0.051425
tauloo,0.079961,0.562259,0.022459,0.384447
auprc plausibility,0.460482,0.490479,0.439264,0.482057
f1 plausibility,0.306543,0.387419,0.338162,0.411906
iou plausibility,0.204578,0.266609,0.227581,0.288774

Unnamed: 0,shap,lime,integrated gradient,integrated gradient multiply by inputs
comprehensiveness,0.050509,0.154334,-0.007114,0.1143
sufficency,-0.032625,-0.077805,0.033559,-0.042137
tauloo,0.138442,0.570633,-0.024554,0.289484
auprc plausibility,0.507678,0.532074,0.50384,0.520542
f1 plausibility,0.402498,0.419599,0.390652,0.404821
iou plausibility,0.287474,0.303505,0.276689,0.294604
