In [1]:
import sys
sys.path.append('../xrun')

In [2]:
import re

from pathlib import Path

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

sns.set(style="whitegrid")

from IPython.core.display import HTML
from sklearn.metrics import pairwise_distances

from xrun.data.run_info import RunInfo

In [3]:
def load_cost_from_file(file_path: Path):
    """Read a single floating-point cost value from a text file.

    Args:
        file_path: Location of the text file holding the cost.

    Returns:
        The file content parsed with ``float``, or ``None`` when
        ``file_path`` does not exist.
    """
    if not file_path.exists():
        # Missing files are tolerated silently; the caller receives None.
        return None
    with open(file_path, "r") as cost_file:
        raw_value = cost_file.read()
    return float(raw_value)

def get_algorithm_name(run_info):
    """Translate ``run_info.algorithm`` into the name used in tables and plots.

    Returns:
        The display name of the algorithm, or ``"Unknown"`` when the
        identifier is not one of the recognised algorithms.
    """
    display_names = {
        "basic-clustering": "StreamKM++ (buggy)",
        "sensitivity-sampling": "Sensitivity Sampling",
        "group-sampling": "Group Sampling",
        "bico": "BICO",
        "stream-km++": "StreamKM++",
        "ray-maker": "Ray Maker",
    }
    return display_names.get(run_info.algorithm, "Unknown")

def get_dataset_print_name(run_info):
    """Translate ``run_info.dataset`` into the name used in tables and plots.

    Returns:
        The display name of the dataset; identifiers without an entry in the
        lookup table are returned unchanged.
    """
    print_names = {
        "census": "Census",
        "censuslowd": "Census+PCA",
        "covertype": "Covertype",
        "covertypelowd": "Covertype+PCA",
        "tower": "Tower",
        "hardinstanceb1": "Benchmark",
        "hardinstanceb15": "Benchmark-1.5",
        "hardinstanceb2": "Benchmark-2.0",
        "oldhardinstanceb1": "Old Benchmark",
        "oldhardinstanceb2": "Old Benchmark-2.0",
        "caltech101": "Caltech",
        "caltech101lowd": "Caltech+PCA",
        "oldcaltech101": "Old Caltech",
        "nytimes100d": "NYTimes",
        "nytimespcalowd": "NYTimes+PCA",
    }
    dataset_id = run_info.dataset
    # Fall back to the raw identifier for datasets without a display name.
    return print_names.get(dataset_id, dataset_id)
    

def _compute_distortion(real_cost, coreset_cost):
    """Return max(real/coreset, coreset/real), guarding against zero costs.

    Returns ``None`` when either cost is missing, ``1.0`` when both costs are
    exactly zero (they agree), and ``inf`` when exactly one of them is zero.
    """
    if real_cost is None or coreset_cost is None:
        return None
    if real_cost == 0 or coreset_cost == 0:
        # Plain float division would raise ZeroDivisionError here.
        return 1.0 if real_cost == coreset_cost else float("inf")
    return max(real_cost / coreset_cost, coreset_cost / real_cost)


def get_costs(file_paths):
    """Collect one record per experiment run into a DataFrame.

    For every run file the run metadata is loaded via ``RunInfo.load_json``
    and the costs are read from ``real_cost.txt`` and ``coreset_cost.txt``
    located next to the run file. Runs on "hardinstance" datasets with
    ``k > 40`` are skipped.

    Args:
        file_paths: Iterable of ``pathlib.Path`` objects pointing to run files.

    Returns:
        A DataFrame with the columns ``dataset``, ``algorithm``, ``k``,
        ``coreset_size``, ``coreset_size_factor``, ``running_time``,
        ``real_cost``, ``coreset_cost``, ``distortion`` and ``run_file_path``.
        Costs (and therefore the distortion) are ``None`` when the
        corresponding cost file is missing. The columns are present even when
        no run was collected.
    """
    column_names = [
        "dataset",
        "algorithm",
        "k",
        "coreset_size",
        "coreset_size_factor",
        "running_time",
        "real_cost",
        "coreset_cost",
        "distortion",
        "run_file_path",
    ]
    costs = []
    for run_file_path in file_paths:
        run_info = RunInfo.load_json(run_file_path)
        if "hardinstance" in run_info.dataset and run_info.k > 40:
            continue
        run_dir = run_file_path.parent
        real_cost = load_cost_from_file(run_dir / "real_cost.txt")
        coreset_cost = load_cost_from_file(run_dir / "coreset_cost.txt")
        costs.append({
            "dataset": get_dataset_print_name(run_info),
            "algorithm": get_algorithm_name(run_info),
            "k": run_info.k,
            "coreset_size": run_info.m,
            "coreset_size_factor": int(run_info.m / run_info.k),
            "running_time": int(run_info.duration_secs),
            "real_cost": real_cost,
            "coreset_cost": coreset_cost,
            "distortion": _compute_distortion(real_cost, coreset_cost),
            "run_file_path": str(run_file_path),
        })
    # Passing the column names keeps the schema stable for an empty result,
    # so downstream column access does not fail when no run was found.
    return pd.DataFrame(costs, columns=column_names)

def _format_duration(duration) -> str:
    """Format a ``pandas.Timedelta`` as ``"HHh MMm SSs"``; whole days are folded into the hours."""
    parts = duration.components
    # components.hours alone wraps around at 24, so add the days back in.
    total_hours = parts.days * 24 + parts.hours
    return f"{total_hours:02d}h {parts.minutes:02d}m {parts.seconds:02d}s"


def aggregate_costs(df_costs: pd.DataFrame, max_runs: int = 10):
    """Aggregate per-run results for every experiment configuration.

    Runs are grouped by ``dataset``, ``algorithm``, ``k``, ``coreset_size``
    and ``coreset_size_factor``. Only the first ``max_runs`` rows of each
    group (in the row order of ``df_costs``) enter the statistics.

    Args:
        df_costs: One row per run, with the grouping columns plus
            ``real_cost``, ``coreset_cost``, ``running_time`` and
            ``distortion``.
        max_runs: Maximum number of runs per group to aggregate.

    Returns:
        One row per group with the run count, mean/std of the costs and the
        running time, mean/median/std/max of the distortion, the mean running
        time as a timedelta (``running_time``) and as text
        (``running_time_formatted``).

    Raises:
        ValueError: If ``max_runs`` is smaller than 1.
    """
    if max_runs < 1:
        raise ValueError(f"max_runs must be at least 1, got {max_runs}")

    group_columns = ["dataset", "algorithm", "k", "coreset_size", "coreset_size_factor"]
    df_top_k = df_costs.groupby(group_columns, as_index=False).head(max_runs)
    df_aggr_costs = df_top_k.groupby(group_columns, as_index=False).agg(
        experiment_count=("algorithm", "count"),
        real_cost_mean=("real_cost", "mean"),
        real_cost_std=("real_cost", "std"),
        coreset_cost_mean=("coreset_cost", "mean"),
        coreset_cost_std=("coreset_cost", "std"),
        running_time_mean=("running_time", "mean"),
        running_time_std=("running_time", "std"),
        distortion_mean=("distortion", "mean"),
        distortion_median=("distortion", "median"),
        distortion_std=("distortion", "std"),
        distortion_max=("distortion", "max"),
    )
    df_aggr_costs["running_time"] = pd.to_timedelta(df_aggr_costs.running_time_mean, unit='s')
    df_aggr_costs["running_time_formatted"] = df_aggr_costs["running_time"].map(_format_duration)
    return df_aggr_costs


def _pivot_by_algorithm_and_k(df: pd.DataFrame, values: str, **pivot_kwargs) -> pd.DataFrame:
    """Pivot ``values`` into an algorithm (rows) x k (columns) table without axis names."""
    table = pd.pivot_table(df, values=values, index=["algorithm"], columns=["k"], **pivot_kwargs)
    return table.rename_axis(None, axis=0).rename_axis(None, axis=1)


def _html_panel(title: str, table_html: str, div_style: str) -> str:
    """Wrap an HTML table in a styled ``<div>`` with a centred ``<h4>`` heading."""
    return (
        f'<div style="{div_style}">'
        f"<h4 style='text-align:center;'>{title}</h4>"
        f"{table_html}"
        "</div>"
    )


def display_results_for(df_aggr_costs: pd.DataFrame, dataset_name: str, show_counts: bool=True, show_running_times:bool=True, show_costs: bool=True):
    """Render the aggregated results of one dataset as HTML tables.

    All tables have one row per algorithm and one column per ``k``. The
    maximum-distortion table is always displayed; the other tables are
    optional.

    Note: this sets the global pandas option ``display.float_format`` to
    scientific notation with two decimals, and the option stays in effect
    after the call.

    Args:
        df_aggr_costs: Aggregated results; the columns ``dataset``,
            ``algorithm``, ``k``, ``experiment_count``,
            ``running_time_formatted``, ``real_cost_mean``,
            ``coreset_cost_mean`` and ``distortion_max`` are read.
        dataset_name: Value of the ``dataset`` column to display.
        show_counts: Whether to show the number of experiments.
        show_running_times: Whether to show the formatted average running times.
        show_costs: Whether to show the mean real and coreset costs.
    """
    pd.set_option('display.float_format', '{:.2e}'.format)

    display(HTML(f"<h2 style='border-bottom:solid 1px Black;padding-bottom:5px;'>Results for {dataset_name}</h2>"))
    df_filtered = df_aggr_costs[df_aggr_costs.dataset == dataset_name]

    html_str = ""

    if show_counts:
        df_counts = _pivot_by_algorithm_and_k(df_filtered, "experiment_count")
        html_str += _html_panel(
            "Experiment Counts",
            df_counts.style.format(precision=0).to_html(),
            "border:solid 1px White; width:300px;float:left;",
        )

    if show_running_times:
        # The values are strings, so use an identity aggregation instead of
        # pivot_table's default aggregation.
        df_run_times = _pivot_by_algorithm_and_k(
            df_filtered, "running_time_formatted", aggfunc=lambda x: x
        )
        html_str += _html_panel(
            "Average Running Times",
            df_run_times.to_html(),
            "border:solid 1px White; float:left;",
        )

    # Displayed even when both panels above are disabled (empty HTML).
    display(HTML(html_str))

    if show_costs:
        df_real_costs = _pivot_by_algorithm_and_k(df_filtered, "real_cost_mean")
        df_coreset_costs = _pivot_by_algorithm_and_k(df_filtered, "coreset_cost_mean")

        cost_panel_style = "border:solid 1px #eee; float:left;"
        html_str = _html_panel("Real costs", df_real_costs.to_html(), cost_panel_style)
        html_str += _html_panel("Coreset costs", df_coreset_costs.to_html(), cost_panel_style)
        display(HTML(html_str))

    display(HTML('<h4>Distortions</h4>'))
    df_distortions = _pivot_by_algorithm_and_k(df_filtered, "distortion_max")
    display(df_distortions)
    
def add_combined_mean_std(df: pd.DataFrame, attr: str):
    """Add a ``"<attr>_mean_std"`` text column of the form ``"mean (std)"``.

    The mean is formatted with two decimals and the standard deviation with
    three, e.g. ``"1.25 (0.013)"``. A missing standard deviation is rendered
    as ``nan``. ``df`` is modified in place and nothing is returned.

    Args:
        df: Frame containing the columns ``"<attr>_mean"`` and ``"<attr>_std"``.
        attr: Prefix of the mean/std column pair to combine.

    Raises:
        KeyError: If one of the two source columns is missing.
    """
    means = df[f"{attr}_mean"]
    stds = df[f"{attr}_std"]
    # Build the strings positionally instead of via a row-wise apply; this
    # also yields a proper (empty) column when the frame has no rows.
    df[f"{attr}_mean_std"] = [
        f"{mean:0.2f} ({std:0.3f})" for mean, std in zip(means, stds)
    ]

In [4]:
# Directory that is searched recursively for *.json files.
data_results_dir = Path("../data/experiments-odin/")
# Sort the matches so the row order of the loaded data does not depend on the
# order in which the file system lists the files.
run_files = sorted(data_results_dir.glob("**/*.json"))

In [5]:
# Sanity check: number of JSON files found by the recursive glob.
len(run_files)

2025

In [6]:
# Load one record per run file (run metadata, real/coreset cost, distortion).
df_cost_data = get_costs(run_files)

In [7]:
# Display names of the datasets to keep.
include_datasets = [
    'Caltech',
    'Caltech+PCA',
    'Census',
    'Census+PCA',
    'Covertype',
    'Covertype+PCA',
    'Tower',
    'NYTimes',
    'NYTimes+PCA',
    # 'Benchmark-1.0', 'Benchmark-1.5', 'Benchmark-2.0',
    'Benchmark',
]
# Display names of the algorithms to keep.
include_algorithms = [
    'StreamKM++',
    'Group Sampling',
    'Ray Maker',
    'BICO',
    'Sensitivity Sampling',
    # 'StreamKM++ (buggy)'
]
# Keep only the runs whose algorithm AND dataset are both selected above.
df_cost_filtered = df_cost_data.loc[
    lambda runs: runs["algorithm"].isin(include_algorithms) & runs["dataset"].isin(include_datasets)
]

In [8]:
# Aggregate the filtered runs per (dataset, algorithm, k, coreset size) group.
df_aggr_costs = aggregate_costs(df_cost_filtered)

In [9]:
# Give every algorithm in the aggregated results its own colour from seaborn's
# "tab10" palette, in order of first appearance.
# NOTE(review): colors[i] raises IndexError if there are more algorithms than
# palette entries — confirm the palette is large enough.
colors = sns.color_palette("tab10")
algorithm_colors = {algo: colors[i] for i, algo in enumerate(df_aggr_costs.algorithm.unique())}

In [10]:
# Math-text label for each k, e.g. "$k$=10". Adds the column to df_aggr_costs in place.
df_aggr_costs["k_formatted"] = "$k$=" + df_aggr_costs["k"].astype(str)

In [11]:
# Adds the "distortion_mean_std" text column ("mean (std)") to df_aggr_costs.
add_combined_mean_std(df=df_aggr_costs, attr="distortion")

# Distortion per (dataset, k, coreset size), one column per algorithm.
# The cells are strings, so they are joined rather than numerically aggregated.
df_results_table = df_aggr_costs.pivot_table(
    values="distortion_mean_std",
    index=["dataset", "k", "coreset_size"],
    columns=["algorithm"],
    aggfunc=lambda cells: ' '.join(cells),
)

df_results_table

Unnamed: 0_level_0,Unnamed: 1_level_0,algorithm,BICO,Group Sampling,Ray Maker,Sensitivity Sampling,StreamKM++
dataset,k,coreset_size,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Covertype,10,500,1.25 (0.013),1.07 (0.010),1.31 (0.025),1.04 (0.017),1.05 (0.010)
Covertype,10,1000,1.16 (0.015),1.05 (0.008),1.22 (0.005),1.03 (0.011),1.03 (0.003)
Covertype,10,2000,1.11 (0.000),1.03 (0.011),1.16 (0.003),1.02 (nan),1.01 (nan)
Covertype,20,1000,1.26 (0.015),1.10 (0.022),1.34 (0.020),1.04 (0.007),1.04 (0.007)
Covertype,20,2000,1.18 (0.003),1.05 (0.010),1.25 (0.008),1.02 (0.007),1.03 (nan)
Covertype,20,4000,1.11 (0.003),1.03 (0.000),1.22 (0.012),1.01 (nan),1.01 (nan)
Covertype,30,1500,1.29 (0.008),1.08 (0.009),1.38 (0.011),1.03 (0.015),1.05 (0.004)
Covertype,30,3000,1.16 (0.003),1.05 (0.008),1.26 (0.012),1.01 (0.006),1.03 (nan)
Covertype,30,6000,1.10 (0.001),1.02 (0.011),1.21 (0.014),1.02 (nan),1.01 (nan)
Covertype,40,2000,1.27 (0.011),1.09 (0.016),1.38 (0.011),1.03 (0.015),1.05 (0.003)


In [16]:
# Adds the "running_time_mean_std" text column ("mean (std)") to df_aggr_costs.
add_combined_mean_std(df=df_aggr_costs, attr="running_time")

# Running time per (dataset, k, coreset size factor), one column per algorithm.
# The cells are strings, so they are joined rather than numerically aggregated.
df_results_table = df_aggr_costs.pivot_table(
    values="running_time_mean_std",
    index=["dataset", "k", "coreset_size_factor"],
    columns=["algorithm"],
    aggfunc=lambda cells: ' '.join(cells),
)

df_results_table

Unnamed: 0_level_0,Unnamed: 1_level_0,algorithm,BICO,Group Sampling,Ray Maker,Sensitivity Sampling,StreamKM++
dataset,k,coreset_size_factor,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Covertype,10,50,7.25 (1.165),51.00 (2.211),58.25 (1.581),44.71 (3.251),291.29 (4.751)
Covertype,10,100,8.14 (0.900),56.00 (0.816),62.57 (3.867),44.14 (2.116),573.33 (20.116)
Covertype,10,200,9.00 (1.414),53.67 (1.211),75.00 (2.828),47.50 (4.950),1091.00 (15.556)
Covertype,20,50,8.50 (0.926),84.40 (4.648),66.29 (1.704),69.29 (3.094),563.14 (7.515)
Covertype,20,100,8.57 (0.535),87.43 (2.070),71.14 (3.934),69.00 (2.449),1092.00 (18.385)
Covertype,20,200,11.00 (1.414),90.00 (5.657),83.50 (2.121),68.50 (0.707),2196.50 (50.205)
Covertype,30,50,8.50 (0.535),116.75 (3.059),78.57 (2.878),91.57 (1.988),841.43 (10.814)
Covertype,30,100,9.43 (0.535),119.00 (1.000),82.29 (6.800),93.67 (1.966),1681.50 (7.778)
Covertype,30,200,12.00 (0.000),116.50 (0.707),84.00 (8.485),96.50 (0.707),3347.50 (53.033)
Covertype,40,50,8.00 (0.756),148.25 (4.301),89.57 (2.070),122.57 (4.392),1117.71 (28.825)


In [15]:
# Distortion per (dataset, k, coreset size factor), one column per algorithm.
# Requires the "distortion_mean_std" column, which is created by calling
# add_combined_mean_std(df=df_aggr_costs, attr="distortion") beforehand.
df_results_table = df_aggr_costs.pivot_table(
    values="distortion_mean_std",
    index=["dataset", "k", "coreset_size_factor"],
    columns=["algorithm"],
    aggfunc=lambda cells: ' '.join(cells),
)

df_results_table

Unnamed: 0_level_0,Unnamed: 1_level_0,algorithm,BICO,Group Sampling,Ray Maker,Sensitivity Sampling,StreamKM++
dataset,k,coreset_size_factor,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Covertype,10,50,1.25 (0.013),1.07 (0.010),1.31 (0.025),1.04 (0.017),1.05 (0.010)
Covertype,10,100,1.16 (0.015),1.05 (0.008),1.22 (0.005),1.03 (0.011),1.03 (0.003)
Covertype,10,200,1.11 (0.000),1.03 (0.011),1.16 (0.003),1.02 (nan),1.01 (nan)
Covertype,20,50,1.26 (0.015),1.10 (0.022),1.34 (0.020),1.04 (0.007),1.04 (0.007)
Covertype,20,100,1.18 (0.003),1.05 (0.010),1.25 (0.008),1.02 (0.007),1.03 (nan)
Covertype,20,200,1.11 (0.003),1.03 (0.000),1.22 (0.012),1.01 (nan),1.01 (nan)
Covertype,30,50,1.29 (0.008),1.08 (0.009),1.38 (0.011),1.03 (0.015),1.05 (0.004)
Covertype,30,100,1.16 (0.003),1.05 (0.008),1.26 (0.012),1.01 (0.006),1.03 (nan)
Covertype,30,200,1.10 (0.001),1.02 (0.011),1.21 (0.014),1.02 (nan),1.01 (nan)
Covertype,40,50,1.27 (0.011),1.09 (0.016),1.38 (0.011),1.03 (0.015),1.05 (0.003)
