In [1]:
import sys
# Add '../xrun' (resolved relative to the current working directory) to the
# module search path. Presumably this is what makes the `xrun.*` import in the
# next cell resolvable -- TODO confirm against the repository layout.
sys.path.append('../xrun')

In [2]:
import re

from pathlib import Path

import numpy as np
import pandas as pd

from IPython.core.display import HTML
from sklearn.metrics import pairwise_distances

from xrun.data.run_info import RunInfo

In [3]:
def load_cost_from_file(file_path: Path):
    """Read a single cost value stored as text in ``file_path``.

    Args:
        file_path: Location of the text file holding the cost.

    Returns:
        The file content converted with ``float``, or ``None`` when the file
        does not exist (a warning is printed in that case).

    Raises:
        ValueError: If the file exists but its content is not a valid float.
    """
    # Guard clause: a missing file is reported but is not an error.
    if not file_path.exists():
        print(f"Warning: {file_path} not found!")
        return None

    with open(file_path, "r") as cost_file:
        raw_value = cost_file.read()
    return float(raw_value)

def get_costs(file_paths):
    """Collect run metadata and cost values for a collection of run files.

    For every run file, the run description is loaded with
    ``RunInfo.load_json`` and the two cost values are read from
    ``real_cost.txt`` and ``coreset_cost.txt`` located in the same directory
    as the run file.

    Args:
        file_paths: Iterable of paths (``Path`` objects or strings) to run
            JSON files.

    Returns:
        A DataFrame with one row per run file and the columns ``dataset``,
        ``algorithm``, ``k``, ``running_time`` (duration truncated to whole
        seconds), ``real_cost``, ``coreset_cost`` and ``run_file_path``.
        A cost is ``None`` when its file is missing. The columns are present
        even when ``file_paths`` is empty.
    """
    # Fixing the schema up front keeps an empty input from producing a
    # column-less frame that would break a later groupby on these columns.
    columns = [
        "dataset",
        "algorithm",
        "k",
        "running_time",
        "real_cost",
        "coreset_cost",
        "run_file_path",
    ]

    costs = []
    for run_file_path in file_paths:
        # Accept plain strings as well as Path objects.
        run_file_path = Path(run_file_path)
        run_dir = run_file_path.parent

        run_info = RunInfo.load_json(run_file_path)
        real_cost = load_cost_from_file(run_dir / "real_cost.txt")
        coreset_cost = load_cost_from_file(run_dir / "coreset_cost.txt")
        costs.append({
            "dataset": run_info.dataset,
            "algorithm": run_info.algorithm,
            "k": run_info.k,
            "running_time": int(run_info.duration_secs),
            "real_cost": real_cost,
            "coreset_cost": coreset_cost,
            "run_file_path": str(run_file_path),
        })
    return pd.DataFrame(costs, columns=columns)

def _format_running_time(duration) -> str:
    """Format a ``pandas.Timedelta`` as ``HHh MMm SSs`` using total hours."""
    # Timedelta.components.hours is only the 0-23 remainder; working from the
    # total number of seconds keeps durations of a day or more from wrapping.
    total_seconds = int(duration.total_seconds())
    hours, remainder = divmod(total_seconds, 3600)
    minutes, seconds = divmod(remainder, 60)
    return f"{hours:02d}h {minutes:02d}m {seconds:02d}s"


def aggregate_costs(df_costs: pd.DataFrame):
    """Aggregate per-run costs per (dataset, algorithm, k) combination.

    Args:
        df_costs: Frame with at least the columns ``dataset``, ``algorithm``,
            ``k``, ``real_cost``, ``coreset_cost`` and ``running_time``
            (seconds).

    Returns:
        A DataFrame with one row per (dataset, algorithm, k) group holding:

        * ``experiment_count``: number of runs in the group,
        * mean and standard deviation of ``real_cost``, ``coreset_cost`` and
          ``running_time``,
        * ``ratio1`` / ``ratio2``: the two quotients of the mean costs,
        * ``distortion``: the larger of ``ratio1`` and ``ratio2``,
        * ``running_time``: mean running time as a timedelta,
        * ``running_time_formatted``: the same value as ``HHh MMm SSs``,
          where the hour field carries the total number of hours.
    """
    df_aggr_costs = df_costs.groupby(["dataset", "algorithm", "k"], as_index=False).agg(
        experiment_count=("algorithm", "count"),
        real_cost_mean=("real_cost", "mean"),
        real_cost_std=("real_cost", "std"),
        coreset_cost_mean=("coreset_cost", "mean"),
        coreset_cost_std=("coreset_cost", "std"),
        running_time_mean=("running_time", "mean"),
        running_time_std=("running_time", "std"),
    )
    # Distortion is taken from the group means: whichever of the two ratios
    # (real/coreset or coreset/real) is larger.
    df_aggr_costs["ratio1"] = df_aggr_costs["real_cost_mean"] / df_aggr_costs["coreset_cost_mean"]
    df_aggr_costs["ratio2"] = df_aggr_costs["coreset_cost_mean"] / df_aggr_costs["real_cost_mean"]
    df_aggr_costs["distortion"] = df_aggr_costs[["ratio1", "ratio2"]].max(axis=1)
    df_aggr_costs["running_time"] = pd.to_timedelta(df_aggr_costs.running_time_mean, unit='s')
    df_aggr_costs["running_time_formatted"] = df_aggr_costs["running_time"].map(_format_running_time)
    return df_aggr_costs


def display_results_for(df_aggr_costs: pd.DataFrame, dataset_name: str, show_counts: bool=True, show_running_times:bool=True):
    """Render the aggregated results of one dataset as notebook output.

    Shows a heading, optionally the experiment counts and the formatted
    average running times side by side, and finally the distortion table.
    Every table has one row per algorithm and one column per ``k``.

    Args:
        df_aggr_costs: Aggregated costs containing the columns ``dataset``,
            ``algorithm``, ``k``, ``experiment_count``,
            ``running_time_formatted`` and ``distortion``.
        dataset_name: Value of the ``dataset`` column to display.
        show_counts: Whether to include the experiment-count table.
        show_running_times: Whether to include the running-time table.
    """
    def pivot_by_algorithm_and_k(value_column: str, **pivot_options):
        # Rows are algorithms, columns are k values; the axis names are
        # cleared so they do not show up as extra header cells.
        pivoted = pd.pivot_table(
            df_filtered,
            values=value_column,
            index=["algorithm"],
            columns=["k"],
            **pivot_options,
        )
        return pivoted.rename_axis(None, axis=0).rename_axis(None, axis=1)

    display(HTML(f"<h2 style='border-bottom:solid 1px Black;padding-bottom:5px;'>Results for {dataset_name}</h2>"))
    df_filtered = df_aggr_costs[df_aggr_costs.dataset == dataset_name]

    # The optional tables are collected as HTML fragments and emitted together
    # so the floated <div> containers end up next to each other.
    html_parts = []

    if show_counts:
        counts_html = pivot_by_algorithm_and_k("experiment_count").style.format(precision=0).to_html()
        html_parts.extend([
            '<div style="border:solid 1px White; width:300px;float:left;">',
            "<h4 style='text-align:center;'>Experiment Counts</h4>",
            counts_html,
            "</div>",
        ])

    if show_running_times:
        # The values are preformatted strings, so the aggregation just passes
        # them through (assumes one row per algorithm/k pair -- TODO confirm).
        run_times_html = pivot_by_algorithm_and_k("running_time_formatted", aggfunc=lambda x: x).to_html()
        html_parts.extend([
            '<div style="border:solid 1px White; float:left;">',
            "<h4 style='text-align:center;'>Average Running Times</h4>",
            run_times_html,
            "</div>",
        ])

    display(HTML("".join(html_parts)))

    display(HTML('<h4>Distortions</h4>'))
    display(pivot_by_algorithm_and_k("distortion"))


In [4]:
# Directory holding the experiment results, relative to the working directory.
data_results_dir = Path("../data/odin-results/")
# Recursively collect every JSON file below the results directory; each one is
# later passed to get_costs, which loads it as a run description.
run_files = list(data_results_dir.glob("**/*.json"))

In [5]:
# Sanity check: number of run files that were discovered.
len(run_files)

1116

In [6]:
# Build one row per run file (run metadata plus real and coreset cost).
df_cost_data = get_costs(run_files)

In [7]:
# Aggregate the per-run rows per (dataset, algorithm, k) combination.
df_aggr_costs = aggregate_costs(df_cost_data)

In [8]:
# Render counts, average running times and distortions for every dataset that
# occurs in the aggregated results.
for dataset_name in df_aggr_costs.dataset.unique():
    display_results_for(df_aggr_costs, dataset_name=dataset_name, show_counts=True, show_running_times=True)

Unnamed: 0,10,20,30,40,50
basic-clustering,3,3,,,
bico,20,20,20.0,20.0,20.0
group-sampling,14,14,15.0,15.0,16.0
sensitivity-sampling,14,15,16.0,16.0,16.0

Unnamed: 0,10,20,30,40,50
basic-clustering,04h 18m 25s,07h 28m 15s,,,
bico,00h 01m 00s,00h 01m 07s,00h 01m 18s,00h 01m 25s,00h 01m 34s
group-sampling,01h 00m 23s,02h 42m 25s,03h 50m 01s,06h 34m 42s,07h 23m 48s
sensitivity-sampling,01h 07m 08s,02h 35m 19s,03h 28m 47s,05h 40m 59s,06h 52m 26s


Unnamed: 0,10,20,30,40,50
basic-clustering,1.225943,1.243184,,,
bico,1.655065,1.728588,1.763595,1.828484,1.86675
group-sampling,1.019881,1.019735,1.021904,1.020314,1.022757
sensitivity-sampling,1.002494,1.006281,1.007697,1.003942,1.002497


Unnamed: 0,10,20,30,40,50
bico,10,10,8,8,8
group-sampling,7,7,6,6,5
sensitivity-sampling,7,6,5,6,4

Unnamed: 0,10,20,30,40,50
bico,00h 01m 21s,00h 01m 52s,00h 01m 56s,00h 01m 58s,00h 02m 15s
group-sampling,01h 05m 49s,02h 48m 17s,03h 23m 11s,04h 59m 12s,08h 34m 21s
sensitivity-sampling,01h 15m 56s,01h 54m 44s,03h 00m 38s,05h 39m 15s,07h 19m 25s


Unnamed: 0,10,20,30,40,50
bico,1.272222,1.540486,1.674887,1.813657,1.880873
group-sampling,1.022497,1.019204,1.021788,1.020931,1.023174
sensitivity-sampling,1.007813,1.007106,1.002867,1.001586,1.0002


Unnamed: 0,10,20,30,40,50
basic-clustering,3,3,3,3,3
bico,20,20,20,20,20
group-sampling,20,20,20,20,20
sensitivity-sampling,20,20,20,20,20

Unnamed: 0,10,20,30,40,50
basic-clustering,00h 49m 44s,01h 42m 42s,02h 33m 25s,03h 06m 07s,03h 54m 28s
bico,00h 00m 09s,00h 00m 10s,00h 00m 12s,00h 00m 14s,00h 00m 16s
group-sampling,00h 37m 17s,01h 26m 33s,02h 25m 15s,03h 25m 51s,04h 29m 59s
sensitivity-sampling,00h 39m 44s,01h 27m 20s,02h 36m 57s,03h 32m 24s,04h 25m 59s


Unnamed: 0,10,20,30,40,50
basic-clustering,1.04207,1.042917,1.041283,1.03958,1.037101
bico,1.103825,1.11096,1.103353,1.088147,1.073087
group-sampling,1.01944,1.024228,1.024338,1.024848,1.025347
sensitivity-sampling,1.003742,1.00051,1.000407,1.002329,1.000478


Unnamed: 0,10,20,30,40,50
bico,8,8,16,17,17
group-sampling,7,6,6,6,6
sensitivity-sampling,7,6,6,6,5

Unnamed: 0,10,20,30,40,50
bico,00h 00m 11s,00h 00m 13s,00h 00m 14s,00h 00m 16s,00h 00m 17s
group-sampling,00h 34m 41s,01h 44m 09s,02h 31m 19s,03h 36m 06s,04h 28m 58s
sensitivity-sampling,00h 37m 27s,01h 39m 41s,02h 43m 35s,03h 23m 57s,04h 35m 35s


Unnamed: 0,10,20,30,40,50
bico,1.104777,1.113556,1.101822,1.087904,1.073051
group-sampling,1.023001,1.023423,1.025587,1.023914,1.026046
sensitivity-sampling,1.008941,1.010332,1.004449,1.005015,1.000607


Unnamed: 0,20,30,40,50
basic-clustering,2,4,4,4

Unnamed: 0,20,30,40,50
basic-clustering,00h 09m 57s,02h 06m 27s,02h 19m 29s,18h 03m 31s


Unnamed: 0,20,30,40,50
basic-clustering,1.192022,1.133528,1.094553,1.070319


Unnamed: 0,20,40,60,80,100
bico,20,20,20,20,20
group-sampling,20,20,20,20,20
sensitivity-sampling,20,20,20,20,20

Unnamed: 0,20,40,60,80,100
bico,00h 00m 09s,00h 00m 09s,00h 00m 09s,00h 00m 10s,00h 00m 12s
group-sampling,00h 47m 36s,01h 36m 34s,02h 13m 30s,02h 54m 07s,03h 42m 40s
sensitivity-sampling,00h 44m 21s,01h 26m 39s,02h 09m 52s,02h 50m 04s,03h 06m 16s


Unnamed: 0,20,40,60,80,100
bico,1.063087,1.062726,1.058409,1.051088,1.04241
group-sampling,1.015564,1.024515,1.029711,1.029909,1.032006
sensitivity-sampling,1.011195,1.008577,1.006065,1.005772,1.002651
