In [1]:
import sys
sys.path.append('../xrun')

In [2]:
import re

from pathlib import Path

import numpy as np
import pandas as pd

from IPython.core.display import HTML
from sklearn.metrics import pairwise_distances

from xrun.data.run_info import RunInfo

In [3]:
def load_cost_from_file(file_path: Path):
    """Read a single numeric cost value from a text file.

    Args:
        file_path: Location of the text file holding one number.

    Returns:
        The file content parsed as a float, or None when the file does not
        exist (a warning is printed in that case).

    Raises:
        ValueError: If the file exists but its content is not a valid float.
    """
    # Guard clause: a missing file is reported and skipped rather than raised.
    if not file_path.exists():
        print(f"Warning: {file_path} not found!")
        return None
    return float(file_path.read_text())

def get_costs(file_paths):
    """Collect per-run metadata and cost values into one DataFrame.

    For every run file the run description is loaded via ``RunInfo.load_json``
    and the two cost values are read from ``real_cost.txt`` and
    ``coreset_cost.txt`` located in the same directory as the run file.

    Args:
        file_paths: Iterable of ``pathlib.Path`` objects pointing at run files.

    Returns:
        A DataFrame with one row per run file and the columns ``dataset``,
        ``algorithm``, ``k``, ``running_time``, ``real_cost``,
        ``coreset_cost`` and ``run_file_path``. The columns are present even
        when ``file_paths`` is empty.
    """
    # Fixed schema: without it an empty input yields a frame with no columns,
    # so downstream code cannot find "dataset", "algorithm" or "k".
    columns = [
        "dataset",
        "algorithm",
        "k",
        "running_time",
        "real_cost",
        "coreset_cost",
        "run_file_path",
    ]
    costs = []
    for run_file_path in file_paths:
        run_info = RunInfo.load_json(run_file_path)
        # Cost files sit next to the run file.
        run_dir = run_file_path.parent
        costs.append({
            "dataset": run_info.dataset,
            "algorithm": run_info.algorithm,
            "k": run_info.k,
            # Truncated to whole seconds.
            "running_time": int(run_info.duration_secs),
            # Either cost is None when its file is missing.
            "real_cost": load_cost_from_file(run_dir / "real_cost.txt"),
            "coreset_cost": load_cost_from_file(run_dir / "coreset_cost.txt"),
            "run_file_path": str(run_file_path),
        })
    return pd.DataFrame(costs, columns=columns)

def _format_duration(duration) -> str:
    """Render a Timedelta as ``HHh MMm SSs``, counting whole days as hours."""
    if pd.isna(duration):
        # A missing mean running time has no components to format.
        return "n/a"
    parts = duration.components
    # components.hours only covers 0-23; fold the days in so durations of
    # 24 hours or more are not silently shortened.
    total_hours = parts.days * 24 + parts.hours
    return f"{total_hours:02d}h {parts.minutes:02d}m {parts.seconds:02d}s"


def aggregate_costs(df_costs: pd.DataFrame):
    """Aggregate per-run costs by dataset, algorithm and k.

    Args:
        df_costs: Frame with the columns ``dataset``, ``algorithm``, ``k``,
            ``real_cost``, ``coreset_cost`` and ``running_time`` (seconds).

    Returns:
        One row per (dataset, algorithm, k) group with:
        - ``experiment_count``: number of runs in the group,
        - mean and standard deviation of ``real_cost``, ``coreset_cost`` and
          ``running_time``,
        - ``ratio1`` / ``ratio2``: mean real cost over mean coreset cost and
          its inverse,
        - ``distortion``: the larger of the two ratios,
        - ``running_time``: mean running time as a timedelta,
        - ``running_time_formatted``: the same value as ``HHh MMm SSs``.
    """
    df_aggr_costs = df_costs.groupby(["dataset", "algorithm", "k"], as_index=False).agg(
        experiment_count=("algorithm", "count"),
        real_cost_mean=("real_cost", "mean"),
        real_cost_std=("real_cost", "std"),
        coreset_cost_mean=("coreset_cost", "mean"),
        coreset_cost_std=("coreset_cost", "std"),
        running_time_mean=("running_time", "mean"),
        running_time_std=("running_time", "std"),
    )
    # Distortion is computed from the group means, not per run.
    df_aggr_costs["ratio1"] = df_aggr_costs["real_cost_mean"] / df_aggr_costs["coreset_cost_mean"]
    df_aggr_costs["ratio2"] = df_aggr_costs["coreset_cost_mean"] / df_aggr_costs["real_cost_mean"]
    df_aggr_costs["distortion"] = df_aggr_costs[["ratio1", "ratio2"]].max(axis=1)
    df_aggr_costs["running_time"] = pd.to_timedelta(df_aggr_costs.running_time_mean, unit='s')
    df_aggr_costs["running_time_formatted"] = df_aggr_costs["running_time"].map(_format_duration)
    return df_aggr_costs


def display_results_for(df_aggr_costs: pd.DataFrame, dataset_name: str, show_counts: bool=True, show_running_times:bool=True, show_costs: bool=True):
    """Render the aggregated results of one dataset as HTML tables.

    Every table has one row per algorithm and one column per k. The
    distortion table is always shown; the others are optional.

    Note: this sets the global pandas ``display.float_format`` option and does
    not restore it. It also relies on ``display`` being available in the
    calling namespace.

    Args:
        df_aggr_costs: Aggregated costs with at least the columns ``dataset``,
            ``algorithm``, ``k``, ``experiment_count``,
            ``running_time_formatted``, ``real_cost_mean``,
            ``coreset_cost_mean`` and ``distortion``.
        dataset_name: Value of the ``dataset`` column to report on.
        show_counts: Include the table of experiment counts.
        show_running_times: Include the table of formatted mean running times.
        show_costs: Include the real-cost and coreset-cost tables.
    """
    def pivot_by_k(value_column: str, **pivot_kwargs) -> pd.DataFrame:
        # Algorithm x k table with the axis names removed.
        table = pd.pivot_table(
            dataset_rows,
            values=value_column,
            index=["algorithm"],
            columns=["k"],
            **pivot_kwargs,
        )
        return table.rename_axis(None, axis=0).rename_axis(None, axis=1)

    def boxed(div_style: str, heading: str, table_html: str) -> str:
        # Wrap a rendered table in a floating div with a centred heading.
        return (
            f'<div style="{div_style}">'
            f"<h4 style='text-align:center;'>{heading}</h4>"
            f"{table_html}"
            "</div>"
        )

    pd.set_option('display.float_format', '{:.2e}'.format)

    display(HTML(f"<h2 style='border-bottom:solid 1px Black;padding-bottom:5px;'>Results for {dataset_name}</h2>"))
    dataset_rows = df_aggr_costs[df_aggr_costs.dataset == dataset_name]

    # First row of boxes: experiment counts and running times.
    top_fragments = []
    if show_counts:
        counts_html = pivot_by_k("experiment_count").style.format(precision=0).to_html()
        top_fragments.append(
            boxed("border:solid 1px White; width:300px;float:left;", "Experiment Counts", counts_html)
        )
    if show_running_times:
        # Values are already strings, so the aggregation passes them through.
        run_times_html = pivot_by_k("running_time_formatted", aggfunc=lambda x: x).to_html()
        top_fragments.append(
            boxed("border:solid 1px White; float:left;", "Average Running Times", run_times_html)
        )
    # Displayed even when both tables are disabled (an empty fragment).
    display(HTML("".join(top_fragments)))

    # Second row of boxes: real costs next to coreset costs.
    if show_costs:
        cost_box_style = "border:solid 1px #eee; float:left;"
        real_costs_html = pivot_by_k("real_cost_mean").to_html()
        coreset_costs_html = pivot_by_k("coreset_cost_mean").to_html()
        display(HTML(
            boxed(cost_box_style, "Real costs", real_costs_html)
            + boxed(cost_box_style, "Coreset costs", coreset_costs_html)
        ))

    display(HTML('<h4>Distortions</h4>'))
    display(pivot_by_k("distortion"))


In [4]:
# Root directory of the experiment results, relative to the notebook.
data_results_dir = Path("../data/odin-results/")
# Recursively collect every JSON file below the results directory.
run_files = list(data_results_dir.glob("**/*.json"))

In [5]:
# Sanity check: number of JSON files found.
len(run_files)

1116

In [6]:
# One row per run file: run metadata plus real and coreset cost.
df_cost_data = get_costs(run_files)

In [7]:
# Aggregate the per-run rows by (dataset, algorithm, k).
df_aggr_costs = aggregate_costs(df_cost_data)

In [8]:
# Render one report section per dataset present in the aggregated results.
for dataset_name in df_aggr_costs.dataset.unique():
    display_results_for(df_aggr_costs, dataset_name=dataset_name, show_counts=True, show_running_times=True, show_costs=True)

Unnamed: 0,10,20,30,40,50
basic-clustering,3,3,,,
bico,20,20,20.0,20.0,20.0
group-sampling,14,14,15.0,15.0,16.0
sensitivity-sampling,14,15,16.0,16.0,16.0

Unnamed: 0,10,20,30,40,50
basic-clustering,04h 18m 25s,07h 28m 15s,,,
bico,00h 01m 00s,00h 01m 07s,00h 01m 18s,00h 01m 25s,00h 01m 34s
group-sampling,01h 00m 23s,02h 42m 25s,03h 50m 01s,06h 34m 42s,07h 23m 48s
sensitivity-sampling,01h 07m 08s,02h 35m 19s,03h 28m 47s,05h 40m 59s,06h 52m 26s


Unnamed: 0,10,20,30,40,50
basic-clustering,246000000.0,189000000.0,,,
bico,249000000.0,189000000.0,159000000.0,141000000.0,129000000.0
group-sampling,250000000.0,190000000.0,161000000.0,142000000.0,129000000.0
sensitivity-sampling,251000000.0,189000000.0,162000000.0,141000000.0,129000000.0

Unnamed: 0,10,20,30,40,50
basic-clustering,201000000.0,152000000.0,,,
bico,150000000.0,110000000.0,90400000.0,77300000.0,68900000.0
group-sampling,245000000.0,186000000.0,157000000.0,139000000.0,126000000.0
sensitivity-sampling,252000000.0,191000000.0,163000000.0,142000000.0,129000000.0


Unnamed: 0,10,20,30,40,50
basic-clustering,1.23,1.24,,,
bico,1.66,1.73,1.76,1.83,1.87
group-sampling,1.02,1.02,1.02,1.02,1.02
sensitivity-sampling,1.0,1.01,1.01,1.0,1.0


Unnamed: 0,10,20,30,40,50
bico,10,10,8,8,8
group-sampling,7,7,6,6,5
sensitivity-sampling,7,6,5,6,4

Unnamed: 0,10,20,30,40,50
bico,00h 01m 21s,00h 01m 52s,00h 01m 56s,00h 01m 58s,00h 02m 15s
group-sampling,01h 05m 49s,02h 48m 17s,03h 23m 11s,04h 59m 12s,08h 34m 21s
sensitivity-sampling,01h 15m 56s,01h 54m 44s,03h 00m 38s,05h 39m 15s,07h 19m 25s


Unnamed: 0,10,20,30,40,50
bico,182000000.0,169000000.0,153000000.0,138000000.0,128000000.0
group-sampling,178000000.0,172000000.0,154000000.0,140000000.0,129000000.0
sensitivity-sampling,179000000.0,171000000.0,154000000.0,139000000.0,127000000.0

Unnamed: 0,10,20,30,40,50
bico,143000000.0,110000000.0,91300000.0,76100000.0,68300000.0
group-sampling,174000000.0,169000000.0,150000000.0,137000000.0,126000000.0
sensitivity-sampling,180000000.0,173000000.0,154000000.0,139000000.0,127000000.0


Unnamed: 0,10,20,30,40,50
bico,1.27,1.54,1.67,1.81,1.88
group-sampling,1.02,1.02,1.02,1.02,1.02
sensitivity-sampling,1.01,1.01,1.0,1.0,1.0


Unnamed: 0,10,20,30,40,50
basic-clustering,3,3,3,3,3
bico,20,20,20,20,20
group-sampling,20,20,20,20,20
sensitivity-sampling,20,20,20,20,20

Unnamed: 0,10,20,30,40,50
basic-clustering,00h 49m 44s,01h 42m 42s,02h 33m 25s,03h 06m 07s,03h 54m 28s
bico,00h 00m 09s,00h 00m 10s,00h 00m 12s,00h 00m 14s,00h 00m 16s
group-sampling,00h 37m 17s,01h 26m 33s,02h 25m 15s,03h 25m 51s,04h 29m 59s
sensitivity-sampling,00h 39m 44s,01h 27m 20s,02h 36m 57s,03h 32m 24s,04h 25m 59s


Unnamed: 0,10,20,30,40,50
basic-clustering,340000000000.0,204000000000.0,155000000000.0,129000000000.0,113000000000.0
bico,341000000000.0,205000000000.0,156000000000.0,130000000000.0,114000000000.0
group-sampling,343000000000.0,206000000000.0,156000000000.0,130000000000.0,114000000000.0
sensitivity-sampling,344000000000.0,206000000000.0,156000000000.0,131000000000.0,115000000000.0

Unnamed: 0,10,20,30,40,50
basic-clustering,326000000000.0,195000000000.0,149000000000.0,124000000000.0,109000000000.0
bico,309000000000.0,184000000000.0,141000000000.0,119000000000.0,106000000000.0
group-sampling,336000000000.0,201000000000.0,153000000000.0,127000000000.0,112000000000.0
sensitivity-sampling,345000000000.0,206000000000.0,156000000000.0,131000000000.0,115000000000.0


Unnamed: 0,10,20,30,40,50
basic-clustering,1.04,1.04,1.04,1.04,1.04
bico,1.1,1.11,1.1,1.09,1.07
group-sampling,1.02,1.02,1.02,1.02,1.03
sensitivity-sampling,1.0,1.0,1.0,1.0,1.0


Unnamed: 0,10,20,30,40,50
bico,8,8,16,17,17
group-sampling,7,6,6,6,6
sensitivity-sampling,7,6,6,6,5

Unnamed: 0,10,20,30,40,50
bico,00h 00m 11s,00h 00m 13s,00h 00m 14s,00h 00m 16s,00h 00m 17s
group-sampling,00h 34m 41s,01h 44m 09s,02h 31m 19s,03h 36m 06s,04h 28m 58s
sensitivity-sampling,00h 37m 27s,01h 39m 41s,02h 43m 35s,03h 23m 57s,04h 35m 35s


Unnamed: 0,10,20,30,40,50
bico,340000000000.0,204000000000.0,156000000000.0,130000000000.0,114000000000.0
group-sampling,342000000000.0,205000000000.0,156000000000.0,130000000000.0,114000000000.0
sensitivity-sampling,347000000000.0,207000000000.0,157000000000.0,131000000000.0,115000000000.0

Unnamed: 0,10,20,30,40,50
bico,308000000000.0,183000000000.0,142000000000.0,120000000000.0,106000000000.0
group-sampling,334000000000.0,201000000000.0,152000000000.0,127000000000.0,111000000000.0
sensitivity-sampling,350000000000.0,209000000000.0,156000000000.0,131000000000.0,115000000000.0


Unnamed: 0,10,20,30,40,50
bico,1.1,1.11,1.1,1.09,1.07
group-sampling,1.02,1.02,1.03,1.02,1.03
sensitivity-sampling,1.01,1.01,1.0,1.01,1.0


Unnamed: 0,20,30,40,50
basic-clustering,2,4,4,4

Unnamed: 0,20,30,40,50
basic-clustering,00h 09m 57s,02h 06m 27s,02h 19m 29s,18h 03m 31s


Unnamed: 0,20,30,40,50
basic-clustering,21200000.0,19900000.0,19500000.0,18500000.0

Unnamed: 0,20,30,40,50
basic-clustering,17800000.0,17500000.0,17800000.0,17200000.0


Unnamed: 0,20,30,40,50
basic-clustering,1.19,1.13,1.09,1.07


Unnamed: 0,20,40,60,80,100
bico,20,20,20,20,20
group-sampling,20,20,20,20,20
sensitivity-sampling,20,20,20,20,20

Unnamed: 0,20,40,60,80,100
bico,00h 00m 09s,00h 00m 09s,00h 00m 09s,00h 00m 10s,00h 00m 12s
group-sampling,00h 47m 36s,01h 36m 34s,02h 13m 30s,02h 54m 07s,03h 42m 40s
sensitivity-sampling,00h 44m 21s,01h 26m 39s,02h 09m 52s,02h 50m 04s,03h 06m 16s


Unnamed: 0,20,40,60,80,100
bico,622000000.0,331000000.0,242000000.0,193000000.0,164000000.0
group-sampling,621000000.0,334000000.0,244000000.0,196000000.0,165000000.0
sensitivity-sampling,629000000.0,335000000.0,245000000.0,195000000.0,166000000.0

Unnamed: 0,20,40,60,80,100
bico,585000000.0,312000000.0,228000000.0,184000000.0,157000000.0
group-sampling,612000000.0,326000000.0,237000000.0,190000000.0,160000000.0
sensitivity-sampling,636000000.0,338000000.0,246000000.0,196000000.0,166000000.0


Unnamed: 0,20,40,60,80,100
bico,1.06,1.06,1.06,1.05,1.04
group-sampling,1.02,1.02,1.03,1.03,1.03
sensitivity-sampling,1.01,1.01,1.01,1.01,1.0
