In [1]:
import sys
# Extend the module search path with the sibling '../xrun' directory
# (relative to the notebook's working directory), presumably so that the
# project-local import in the next cell can be resolved.
# NOTE(review): the next cell imports `xrun.data.run_info`, which needs the
# *parent* of the `xrun` package on sys.path -- confirm that '../xrun' is
# that parent (i.e. the layout is ../xrun/xrun/...).
sys.path.append('../xrun')

In [2]:
import re

from pathlib import Path

import numpy as np
import pandas as pd

from IPython.core.display import HTML
from sklearn.metrics import pairwise_distances

from xrun.data.run_info import RunInfo

In [3]:
def load_cost_from_file(file_path: Path):
    """Read a single floating-point cost value from a text file.

    Args:
        file_path: Path of a text file whose whole content is one number.
            Surrounding whitespace (e.g. a trailing newline) is accepted
            because the content is parsed with ``float``.

    Returns:
        The parsed cost as a ``float``, or ``None`` when the file does not
        exist or its content is not a valid number. A warning is printed in
        both of these cases so that the caller can keep processing the
        remaining experiments.
    """
    if not file_path.exists():
        print(f"Warning: {file_path} not found!")
        return None

    with open(file_path, "r") as f:
        raw_value = f.read()

    try:
        return float(raw_value)
    except ValueError:
        # An empty or partially written file must not abort the collection of
        # all other runs; report it and treat the cost as missing.
        print(f"Warning: {file_path} does not contain a valid cost!")
        return None

def get_costs(file_paths):
    """Collect run metadata and cost values for a list of run-info files.

    For every path the run description is loaded with ``RunInfo.load_json``
    and the two cost files ``real_cost.txt`` and ``coreset_cost.txt`` are read
    from the same directory as the run-info file.

    Args:
        file_paths: Iterable of ``pathlib.Path`` objects pointing at run-info
            JSON files.

    Returns:
        A ``pandas.DataFrame`` with one row per run and the columns
        ``dataset``, ``algorithm``, ``k``, ``running_time`` (duration
        truncated to whole seconds), ``real_cost``, ``coreset_cost`` and
        ``run_file_path``. A cost that could not be loaded is stored as
        ``None``. The columns are present even when ``file_paths`` is empty.
    """
    column_names = [
        "dataset",
        "algorithm",
        "k",
        "running_time",
        "real_cost",
        "coreset_cost",
        "run_file_path",
    ]
    costs = []
    for run_file_path in file_paths:
        run_info = RunInfo.load_json(run_file_path)
        experiment_dir = run_file_path.parent
        real_cost = load_cost_from_file(experiment_dir / "real_cost.txt")
        coreset_cost = load_cost_from_file(experiment_dir / "coreset_cost.txt")
        costs.append({
            "dataset": run_info.dataset,
            "algorithm": run_info.algorithm,
            "k": run_info.k,
            "running_time": int(run_info.duration_secs),
            "real_cost": real_cost,
            "coreset_cost": coreset_cost,
            "run_file_path": str(run_file_path),
        })
    # Passing the column names explicitly keeps the schema stable when no run
    # files were found; otherwise the frame would have no columns at all.
    return pd.DataFrame(costs, columns=column_names)

def aggregate_costs(df_costs: pd.DataFrame):
    """Aggregate per-run costs into one row per (dataset, algorithm, k).

    Args:
        df_costs: Frame with the columns ``dataset``, ``algorithm``, ``k``,
            ``running_time`` (seconds), ``real_cost`` and ``coreset_cost``.

    Returns:
        A frame with the group keys as regular columns plus:

        * ``experiment_count`` -- number of runs in the group,
        * ``*_mean`` / ``*_std`` -- mean and standard deviation of the real
          cost, the coreset cost and the running time,
        * ``ratio1`` / ``ratio2`` -- mean real cost divided by mean coreset
          cost and its reciprocal,
        * ``distortion`` -- the larger of the two ratios,
        * ``running_time`` -- mean running time as a timedelta,
        * ``running_time_formatted`` -- the same value as ``"HHh MMm SSs"``.
    """
    def format_duration(duration) -> str:
        parts = duration.components
        # `components.hours` only holds the remainder after whole days, so
        # the days have to be folded back in; otherwise a 25 hour run would
        # be shown as "01h".
        total_hours = parts.days * 24 + parts.hours
        return f"{total_hours:02d}h {parts.minutes:02d}m {parts.seconds:02d}s"

    df_aggr_costs = df_costs.groupby(["dataset", "algorithm", "k"], as_index=False).agg(
        experiment_count=("algorithm", "count"),
        real_cost_mean=("real_cost", "mean"),
        real_cost_std=("real_cost", "std"),
        coreset_cost_mean=("coreset_cost", "mean"),
        coreset_cost_std=("coreset_cost", "std"),
        running_time_mean=("running_time", "mean"),
        running_time_std=("running_time", "std"),
    )
    df_aggr_costs["ratio1"] = df_aggr_costs["real_cost_mean"] / df_aggr_costs["coreset_cost_mean"]
    df_aggr_costs["ratio2"] = df_aggr_costs["coreset_cost_mean"] / df_aggr_costs["real_cost_mean"]
    # The distortion is symmetric: take whichever ratio is at least 1.
    df_aggr_costs["distortion"] = df_aggr_costs[["ratio1", "ratio2"]].max(axis=1)
    df_aggr_costs["running_time"] = pd.to_timedelta(df_aggr_costs.running_time_mean, unit='s')
    df_aggr_costs["running_time_formatted"] = df_aggr_costs["running_time"].map(format_duration)
    return df_aggr_costs


def display_results_for(df_aggr_costs: pd.DataFrame, dataset_name: str, show_counts: bool=True, show_running_times:bool=True, show_costs: bool=True):
    """Render the aggregated results of one dataset in the notebook.

    Every table has one row per algorithm and one column per ``k``. The
    heading and the distortion table are always shown; the other tables are
    optional.

    Args:
        df_aggr_costs: Aggregated frame containing at least the columns
            ``dataset``, ``algorithm``, ``k`` and the value columns that are
            displayed (``experiment_count``, ``running_time_formatted``,
            ``real_cost_mean``, ``coreset_cost_mean``, ``distortion``).
        dataset_name: Value of the ``dataset`` column to report on.
        show_counts: Include the table of experiment counts.
        show_running_times: Include the table of formatted running times.
        show_costs: Include the tables of mean real and coreset costs.

    Note:
        The pandas option ``display.float_format`` is set globally to
        scientific notation with two decimals and is not restored.
    """
    def pivot_algorithm_by_k(value_column, **pivot_options):
        # Rows are algorithms, columns are the k values; both axis names are
        # cleared afterwards.
        pivoted = pd.pivot_table(
            dataset_rows,
            values=value_column,
            index=["algorithm"],
            columns=["k"],
            **pivot_options,
        )
        return pivoted.rename_axis(None, axis=0).rename_axis(None, axis=1)

    def panel(div_style, heading, table_html):
        # A floating <div> with a centred heading above the rendered table.
        return (
            f'<div style="{div_style}">'
            + f"<h4 style='text-align:center;'>{heading}</h4>"
            + table_html
            + "</div>"
        )

    pd.set_option('display.float_format', '{:.2e}'.format)

    display(HTML(f"<h2 style='border-bottom:solid 1px Black;padding-bottom:5px;'>Results for {dataset_name}</h2>"))
    dataset_rows = df_aggr_costs[df_aggr_costs.dataset == dataset_name]

    # First row of panels: counts and running times (emitted even if empty).
    summary_panels = []
    if show_counts:
        counts_table = pivot_algorithm_by_k("experiment_count")
        summary_panels.append(panel(
            "border:solid 1px White; width:300px;float:left;",
            "Experiment Counts",
            counts_table.style.format(precision=0).to_html(),
        ))
    if show_running_times:
        # The values are strings, so each cell is passed through unchanged
        # instead of being averaged.
        times_table = pivot_algorithm_by_k("running_time_formatted", aggfunc=lambda x: x)
        summary_panels.append(panel(
            "border:solid 1px White; float:left;",
            "Average Running Times",
            times_table.to_html(),
        ))
    display(HTML("".join(summary_panels)))

    # Second row of panels: mean real cost next to mean coreset cost.
    if show_costs:
        real_costs_table = pivot_algorithm_by_k("real_cost_mean")
        coreset_costs_table = pivot_algorithm_by_k("coreset_cost_mean")
        cost_panels = [
            panel("border:solid 1px #eee; float:left;", "Real costs", real_costs_table.to_html()),
            panel("border:solid 1px #eee; float:left;", "Coreset costs", coreset_costs_table.to_html()),
        ]
        display(HTML("".join(cost_panels)))

    display(HTML('<h4>Distortions</h4>'))
    display(pivot_algorithm_by_k("distortion"))


In [4]:
# Collect every JSON file found anywhere below the odin results directory
# (the "**/" pattern makes the search recursive). The path is relative to
# the notebook's working directory.
# NOTE(review): every *.json file is treated as a run-info file by the
# later cells; confirm that no other JSON files live in this tree.
data_results_dir = Path("../data/odin-results/")
run_files = list(data_results_dir.glob("**/*.json"))

In [5]:
# Add the JSON files of the two hard-instance result directories to the list
# of run files. Paths that are already in the list are skipped, so running
# this cell more than once does not duplicate runs (duplicates would inflate
# the experiment counts reported further down).
already_collected = set(run_files)
for cary_results_dir in (
    "../data/cary-results/hardinstanceb1",
    "../data/cary-results/hardinstanceb2",
):
    for json_path in Path(cary_results_dir).glob("**/*.json"):
        if json_path not in already_collected:
            already_collected.add(json_path)
            run_files.append(json_path)

In [6]:
# Sanity check: number of JSON files collected by the two cells above.
len(run_files)

2186

In [7]:
# Build one row per run file (run metadata plus real and coreset cost).
# A warning is printed for every cost file that cannot be found.
df_cost_data = get_costs(run_files)



In [8]:
# Aggregate the per-run rows per (dataset, algorithm, k) and derive the
# distortion and the formatted mean running time.
df_aggr_costs = aggregate_costs(df_cost_data)

In [9]:
# Report every dataset in turn: experiment counts plus the distortion table
# (running times and cost tables are switched off here).
dataset_names = df_aggr_costs["dataset"].unique()
for dataset_name in dataset_names:
    display_results_for(
        df_aggr_costs,
        dataset_name=dataset_name,
        show_counts=True,
        show_running_times=False,
        show_costs=False,
    )

Unnamed: 0,10,20,30,40,50
basic-clustering,7,6,5,5,4
bico,20,20,20,20,20
group-sampling,20,20,21,21,22
sensitivity-sampling,20,21,22,22,22


Unnamed: 0,10,20,30,40,50
basic-clustering,1.23,1.25,1.27,1.28,1.29
bico,1.66,1.73,1.76,1.83,1.87
group-sampling,1.02,1.02,1.02,1.02,1.02
sensitivity-sampling,1.01,1.01,1.01,1.0,1.0


Unnamed: 0,10,20,30,40,50
basic-clustering,7,6,5,5,4
bico,20,20,20,20,20
group-sampling,20,20,20,20,20
sensitivity-sampling,20,20,20,20,20


Unnamed: 0,10,20,30,40,50
basic-clustering,1.05,1.14,1.21,1.26,1.29
bico,1.18,1.44,1.63,1.79,1.84
group-sampling,1.01,1.02,1.02,1.02,1.02
sensitivity-sampling,1.01,1.01,1.0,1.0,1.0


Unnamed: 0,10,20,30,40,50
basic-clustering,7,7,6,6,6
bico,20,20,20,20,20
group-sampling,20,20,20,20,20
sensitivity-sampling,20,20,20,20,20


Unnamed: 0,10,20,30,40,50
basic-clustering,1.04,1.04,1.04,1.04,1.04
bico,1.1,1.11,1.1,1.09,1.07
group-sampling,1.02,1.02,1.02,1.02,1.03
sensitivity-sampling,1.0,1.0,1.0,1.0,1.0


Unnamed: 0,10,20,30,40,50
basic-clustering,7,6,6,6,6
bico,10,10,18,20,20
group-sampling,20,20,20,20,20
sensitivity-sampling,20,20,20,20,20


Unnamed: 0,10,20,30,40,50
basic-clustering,1.04,1.04,1.04,1.04,1.04
bico,1.1,1.11,1.1,1.09,1.07
group-sampling,1.02,1.02,1.02,1.02,1.03
sensitivity-sampling,1.01,1.01,1.0,1.0,1.0


Unnamed: 0,20,30,40,50
basic-clustering,2,4,4,4


Unnamed: 0,20,30,40,50
basic-clustering,1.19,1.13,1.09,1.07


Unnamed: 0,10,20,30,40,50
bico,20,20,20,20,20
group-sampling,20,20,20,20,4
sensitivity-sampling,20,19,20,20,4


Unnamed: 0,10,20,30,40,50
bico,3.9,3.77,3.34,4.01,4.31
group-sampling,1.01,1.01,1.01,1.01,1.01
sensitivity-sampling,1.0,1.01,1.0,1.0,1.01


Unnamed: 0,10,20,30,40,50
bico,20,20,20,20,20
group-sampling,20,20,20,20,9
sensitivity-sampling,20,20,20,20,2


Unnamed: 0,10,20,30,40,50
bico,1.1,1.29,1.25,1.24,1.23
group-sampling,1.01,1.01,1.01,1.01,1.01
sensitivity-sampling,1.0,1.0,1.0,1.0,1.0


Unnamed: 0,20,40,60,80,100
basic-clustering,6,6,6,5,5
bico,20,20,20,20,20
group-sampling,20,20,20,20,20
sensitivity-sampling,20,20,20,20,20


Unnamed: 0,20,40,60,80,100
basic-clustering,1.02,1.02,1.01,1.01,1.01
bico,1.06,1.06,1.06,1.05,1.04
group-sampling,1.02,1.02,1.03,1.03,1.03
sensitivity-sampling,1.01,1.01,1.01,1.01,1.0
