In [1]:
import re

from pathlib import Path

import numpy as np
import pandas as pd


from IPython.core.display import HTML
from sklearn.metrics import pairwise_distances

In [8]:
def get_costs(file_paths):
    """Load cost files into a DataFrame with one row per file.

    The dataset name, algorithm name and ``k`` are extracted from each path,
    which must contain a ``/<parent>/<dataset>/<algorithm>-k<k>-`` portion.
    The content of each file is parsed as a single float.

    Parameters
    ----------
    file_paths : iterable of str or pathlib.Path
        Paths of the cost files to read.

    Returns
    -------
    pandas.DataFrame
        Columns ``dataset``, ``algorithm``, ``k`` (the matched digits, kept
        as ``str``), ``cost`` (``float``) and ``file_path`` (``str``).
        The columns are present even when ``file_paths`` is empty, so
        callers can group on them without a ``KeyError``.

    Raises
    ------
    ValueError
        If a path does not contain the expected
        ``/<parent>/<dataset>/<algorithm>-k<k>-`` portion, or if a file's
        content cannot be parsed as a float.
    """
    columns = ["dataset", "algorithm", "k", "cost", "file_path"]
    costs = []
    for cost_path in file_paths:
        # The pattern is written with "/" separators; Path objects are rendered
        # with forward slashes so it also matches on Windows. Plain strings are
        # matched as given.
        if isinstance(cost_path, Path):
            path_text = cost_path.as_posix()
        else:
            path_text = str(cost_path)

        matches = re.findall(r"/.+/(.+)/(.+)-k(\d+)-", path_text)
        if not matches:
            raise ValueError(
                f"Cannot extract dataset/algorithm/k from cost file path: {cost_path}"
            )
        dataset, algorithm, k = matches[0]

        with open(cost_path, "r") as f:
            weighted_cost = float(f.read())

        costs.append({
            "dataset": dataset,
            "algorithm": algorithm,
            "k": k,
            "cost": weighted_cost,
            "file_path": str(cost_path)
        })
    # Passing the column list keeps the schema stable for an empty result.
    return pd.DataFrame(costs, columns=columns)

def _aggregate_costs(cost_paths, prefix: str) -> pd.DataFrame:
    """Read cost files and aggregate the costs per (dataset, algorithm, k).

    Returns one row per group with the columns ``<prefix>_count``,
    ``<prefix>_mean`` and ``<prefix>_std``; ``k`` is converted to ``int``.
    """
    df_costs = get_costs(cost_paths)
    df_aggr = df_costs.groupby(["dataset", "algorithm", "k"], as_index=False).agg(**{
        f"{prefix}_count": ("cost", "count"),
        f"{prefix}_mean": ("cost", "mean"),
        f"{prefix}_std": ("cost", "std"),
    })
    # get_costs yields k as text; cast it so the pivoted columns order numerically.
    df_aggr["k"] = df_aggr["k"].astype(int)
    return df_aggr


def display_distortion_table(dataset: str, show_counts: bool = False) -> None:
    """Display the distortion table (algorithm x k) for one dataset.

    Cost files are collected recursively from
    ``../data/results/experiments/<dataset>/``. For every (algorithm, k)
    the displayed value is the mean of the ``real_cost.txt`` values divided
    by the mean of the ``coreset_cost.txt`` values.

    Parameters
    ----------
    dataset : str
        Name of the dataset sub-directory to read results from.
    show_counts : bool, default False
        When true, also display how many ``real_cost.txt`` files contributed
        to each (algorithm, k) cell.

    Raises
    ------
    FileNotFoundError
        If no ``real_cost.txt`` or no ``coreset_cost.txt`` file is found
        for the dataset.
    """
    data_results_dir = Path(f"../data/results/experiments/{dataset}/")

    real_cost_paths = list(data_results_dir.glob("**/real_cost.txt"))
    coreset_cost_paths = list(data_results_dir.glob("**/coreset_cost.txt"))
    # Fail early with a readable message instead of an opaque pandas error
    # when the directory is missing or holds no results.
    if not real_cost_paths or not coreset_cost_paths:
        raise FileNotFoundError(
            f"No real_cost.txt/coreset_cost.txt files found for dataset "
            f"'{dataset}' under {data_results_dir}"
        )

    df_aggr_real_costs = _aggregate_costs(real_cost_paths, "real_cost")
    df_aggr_coreset_costs = _aggregate_costs(coreset_cost_paths, "coreset_cost")

    df_coreset = pd.pivot_table(df_aggr_coreset_costs, values="coreset_cost_mean", index=["algorithm"], columns=["k"])
    df_real = pd.pivot_table(df_aggr_real_costs, values="real_cost_mean", index=["algorithm"], columns=["k"])
    # Element-wise division aligns on algorithm and k; a cell missing on
    # either side shows up as NaN.
    df_distortions = df_real / df_coreset
    display(df_distortions.style.format(precision=4, thousands=","))

    if show_counts:
        display(HTML("<strong>Number of experiments</strong>"))
        df_real_cost_count = pd.pivot_table(df_aggr_real_costs, values="real_cost_count", index=["algorithm"], columns=["k"])
        display(df_real_cost_count)

In [9]:
# Render a headed distortion table for each benchmark dataset in turn.
for dataset in ("census", "covertype", "tower"):
    display(
        HTML(f"<h3>Distortion for {dataset}</h3>"),
    )
    display_distortion_table(dataset)

k,10,20,30,40,50
algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
bico,1.6473,1.6884,1.7507,1.8183,1.8563
group-sampling,1.0334,1.0338,1.03,1.0296,1.0308
sensitivity-sampling,1.0054,1.004,1.0051,1.0065,1.0062


k,10,20,30,40,50
algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
bico,20,20,20,20,20
group-sampling,20,20,20,20,20
sensitivity-sampling,20,20,20,20,20


k,10,20,30,40,50
algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
bico,1.1041,1.1116,1.102,1.0886,1.0736
group-sampling,1.0472,1.0391,1.0375,1.0398,1.0384
sensitivity-sampling,1.0207,1.0194,1.0184,1.0202,1.0189


k,10,20,30,40,50
algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
bico,20,20,20,20,20
group-sampling,20,20,20,20,20
sensitivity-sampling,20,20,20,20,20


k,20,40,60,80,100
algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
bico,1.0626,1.0635,1.0576,1.0502,1.0422
group-sampling,1.0395,1.0416,1.0449,1.0448,1.0463
sensitivity-sampling,1.0186,1.0169,1.0202,1.0174,1.0182


k,20,40,60,80,100
algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
bico,19,19,19,19,19
group-sampling,20,20,20,20,20
sensitivity-sampling,20,20,20,20,20
