# Summarizing all evaluations

In [8]:
from collections import defaultdict
from itertools import product
from glob import glob
import os
import json
import pandas as pd
import numpy as np

combined_metrics = {}
for metrics_file in glob("*/model_metrics.json"):
    experiment_name = os.path.dirname(metrics_file).split(os.path.sep)[0]
    combined_metrics[experiment_name] = {}
    with open(metrics_file, "r", encoding="utf-8") as file:
        metrics = json.load(file)

    # collect all in a proper dict
    for run_id, run in metrics.items():
        for model, scores in run.items():
            if model not in combined_metrics[experiment_name]:
                combined_metrics[experiment_name][model] = defaultdict(list)
            for score_name, score in scores.items():
                combined_metrics[experiment_name][model][score_name].append(score)


# Avg + Med + max. abs. diff for both
flattened_summary = defaultdict(dict)
for experiment_name, experiment in combined_metrics.items():
    for model, scores in experiment.items():
        for score_name in scores:
            score = np.asarray(scores[score_name])
            avg_score = np.mean(score)
            median_score = np.median(score)
            abs_diff_avg_score = np.max(np.abs(score - avg_score))
            abs_diff_median_score = np.max(np.abs(score - median_score))
            flattened_summary[(model, score_name)].update({experiment_name: (avg_score, abs_diff_avg_score, median_score, abs_diff_median_score)})



index = []
columns = list(product(combined_metrics.keys(), ["avg", "avg max diff", "median", "median max diff"]))

data = []

for idx in flattened_summary:
    index.append(idx)
    array = np.asarray([
        item
        for experiment_name in combined_metrics
        for item in flattened_summary[idx][experiment_name]
    ])
    data.append(array)

data = np.asarray(data)

df = pd.DataFrame.from_records(data, index=pd.MultiIndex.from_tuples(index), columns=pd.MultiIndex.from_tuples(columns))
df

Unnamed: 0_level_0,Unnamed: 1_level_0,reranking,reranking,reranking,reranking,bm25,bm25,bm25,bm25,ann,ann,ann,ann
Unnamed: 0_level_1,Unnamed: 1_level_1,avg,avg max diff,median,median max diff,avg,avg max diff,median,median max diff,avg,avg max diff,median,median max diff
baseline,train_f1,0.844073,0.024083,0.835985,0.032171,0.849711,0.018343,0.85869,0.027322,0.848881,0.02445,0.856206,0.031774
baseline,dev_f1,0.564815,0.022588,0.555066,0.032337,0.57285,0.01343,0.573487,0.014067,0.569953,0.023032,0.573285,0.026364
gold,train_f1,0.852019,0.007096,0.853914,0.008991,0.856711,0.015042,0.849459,0.022293,0.843784,0.006803,0.840966,0.009621
gold,dev_f1,0.550547,0.009215,0.554513,0.013181,0.558792,0.007055,0.558346,0.007501,0.55382,0.007959,0.554572,0.008711
error,train_f1,0.846218,0.007123,0.845479,0.007861,0.846877,0.008081,0.843741,0.011216,0.849112,0.00061,0.849398,0.000896
error,dev_f1,0.551406,0.019199,0.549889,0.020716,0.549039,0.004829,0.548844,0.005024,0.555109,0.004299,0.557164,0.006353


In [11]:
df.loc[:, "avg", :, "dev_f1"]

KeyError: 'avg'