In [1]:
import sys
sys.path.append('../xrun')

In [2]:
import re

from pathlib import Path

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

sns.set(style="whitegrid")

from IPython.core.display import HTML
from sklearn.metrics import pairwise_distances

from xrun.data.run_info import RunInfo

In [3]:
def load_cost_from_file(file_path: Path):
    """Read a single floating-point cost value from a text file.

    Args:
        file_path: Location of the file holding the cost.

    Returns:
        The file content parsed as ``float``, or ``None`` when ``file_path``
        does not exist.
    """
    if not file_path.exists():
        # Missing cost files are tolerated silently; the caller receives None.
        return None
    with open(file_path, "r") as handle:
        contents = handle.read()
    return float(contents)

def get_algorithm_name(run_info):
    """Translate the algorithm identifier of ``run_info`` into a display name.

    Args:
        run_info: Object exposing an ``algorithm`` attribute.

    Returns:
        The display name, or ``"Unknown"`` for an identifier without one.
    """
    display_names = {
        "basic-clustering": "StreamKM++ (buggy)",
        "sensitivity-sampling": "Sensitivity Sampling",
        "group-sampling": "Group Sampling",
        "bico": "BICO",
        "stream-km++": "StreamKM++",
        "ray-maker": "Ray Maker",
    }
    return display_names.get(run_info.algorithm, "Unknown")

def get_dataset_print_name(run_info):
    """Return the human-readable name of the data set used by ``run_info``.

    Args:
        run_info: Object exposing a ``dataset`` attribute.

    Returns:
        The print name of the data set; identifiers without a registered
        print name are returned unchanged.
    """
    print_names = {
        "census": "Census",
        "censuslowd": "Census+PCA",
        "covertype": "Covertype",
        "covertypelowd": "Covertype+PCA",
        "tower": "Tower",
        "hardinstanceb1": "Benchmark",
        "hardinstanceb15": "Benchmark-1.5",
        "hardinstanceb2": "Benchmark-2.0",
        "oldhardinstanceb1": "Old Benchmark",
        "oldhardinstanceb2": "Old Benchmark-2.0",
        "caltech101": "Caltech",
        "caltech101lowd": "Caltech+PCA",
        "oldcaltech101": "Old Caltech",
        "nytimes100d": "NYTimes",
        "nytimespcalowd": "NYTimes+PCA",
    }
    dataset = run_info.dataset
    return print_names.get(dataset, dataset)


def compute_distortion(real_cost, coreset_cost):
    """Return the larger of the two ratios between real and coreset cost.

    Args:
        real_cost: Real cost, or ``None`` if unavailable.
        coreset_cost: Coreset cost, or ``None`` if unavailable.

    Returns:
        ``max(real/coreset, coreset/real)`` as a float, or ``None`` when
        either cost is ``None``.
    """
    if real_cost is None or coreset_cost is None:
        return None
    ratio = float(real_cost / coreset_cost)
    inverse_ratio = float(coreset_cost / real_cost)
    return max(ratio, inverse_ratio)


def get_costs(file_paths):
    """Collect cost and distortion measurements for a set of experiment runs.

    Every entry of ``file_paths`` is loaded with ``RunInfo.load_json``; the
    cost files (``real_cost*.txt`` / ``coreset_cost*.txt``) are read from the
    directory containing that run file.  Runs on "hardinstance" data sets with
    ``k > 40`` and ``stream-km++`` runs with ``m > 200 * k`` are skipped.

    Args:
        file_paths: Iterable of paths to run-info JSON files.

    Returns:
        A DataFrame with one row per retained run, limited to the first 10
        rows of every (dataset, algorithm, k, coreset_size) group.  When no
        run is retained, the result is an empty DataFrame that still carries
        all columns (previously the ``groupby`` raised ``KeyError``).
    """
    # Declared up front so that an empty result still has every column.
    columns = [
        "dataset",
        "algorithm",
        "k",
        "coreset_size",
        "coreset_size_factor",
        "running_time",
        "real_cost",
        "coreset_cost",
        "distortion",
        "real_cost_synthetic",
        "coreset_cost_synthetic",
        "distortion_synthetic",
        "distortion_synthetic2",
        "run_file_path",
    ]
    costs = []
    for run_file_path in file_paths:
        run_info = RunInfo.load_json(run_file_path)
        if "hardinstance" in run_info.dataset and run_info.k > 40:
            continue
        if run_info.algorithm == "stream-km++" and run_info.m > 200*run_info.k:
            continue
        run_dir = run_file_path.parent
        real_cost = load_cost_from_file(run_dir / "real_cost.txt")
        coreset_cost = load_cost_from_file(run_dir / "coreset_cost.txt")
        real_cost_synthetic = load_cost_from_file(run_dir / "real_cost_synthetic.txt")
        coreset_cost_synthetic = load_cost_from_file(run_dir / "coreset_cost_synthetic.txt")
        real_cost_synthetic2 = load_cost_from_file(run_dir / "real_cost_convexsynthetic.txt")
        coreset_cost_synthetic2 = load_cost_from_file(run_dir / "coreset_cost_convexsynthetic.txt")
        costs.append({
            "dataset": get_dataset_print_name(run_info),
            "algorithm": get_algorithm_name(run_info),
            "k": run_info.k,
            "coreset_size": run_info.m,
            "coreset_size_factor": int(run_info.m / run_info.k),
            "running_time": int(run_info.duration_secs),
            "real_cost": real_cost,
            "coreset_cost": coreset_cost,
            "distortion": compute_distortion(real_cost, coreset_cost),
            "real_cost_synthetic": real_cost_synthetic,
            "coreset_cost_synthetic": coreset_cost_synthetic,
            "distortion_synthetic": compute_distortion(real_cost_synthetic, coreset_cost_synthetic),
            "distortion_synthetic2": compute_distortion(real_cost_synthetic2, coreset_cost_synthetic2),
            "run_file_path": str(run_file_path),
        })
    df_data = pd.DataFrame(costs, columns=columns)
    # Keep at most 10 runs per experiment configuration.
    return df_data.groupby(["dataset", "algorithm", "k", "coreset_size"], as_index=False).head(10)

def _format_running_time(duration) -> str:
    """Format a Timedelta as ``HHh MMm SSs``, folding whole days into the hours."""
    parts = duration.components
    # ``components.hours`` alone wraps around at 24h, so add the days back in.
    hours = parts.days * 24 + parts.hours
    return f"{hours:02d}h {parts.minutes:02d}m {parts.seconds:02d}s"


def aggregate_costs(df_costs: pd.DataFrame):
    """Aggregate per-run measurements into one row per experiment configuration.

    Rows are grouped by (dataset, algorithm, k, coreset_size,
    coreset_size_factor); at most the first 10 rows of each group are used.

    Args:
        df_costs: Per-run frame with the grouping columns plus ``real_cost``,
            ``coreset_cost``, ``running_time``, ``distortion``,
            ``distortion_synthetic`` and ``distortion_synthetic2``.

    Returns:
        A DataFrame with the experiment count and mean/std (plus median and
        max for ``distortion``) of the measurements, a ``running_time``
        Timedelta column built from the mean running time in seconds, and a
        ``running_time_formatted`` string column (``HHh MMm SSs``; durations
        of a day or more are shown with hours >= 24).
    """
    group_columns = ["dataset", "algorithm", "k", "coreset_size", "coreset_size_factor"]
    df_top_k = df_costs.groupby(group_columns, as_index=False).head(10)
    df_aggr_costs = df_top_k.groupby(group_columns, as_index=False).agg(
        experiment_count=("algorithm", "count"),
        real_cost_mean=("real_cost", "mean"),
        real_cost_std=("real_cost", "std"),
        coreset_cost_mean=("coreset_cost", "mean"),
        coreset_cost_std=("coreset_cost", "std"),
        running_time_mean=("running_time", "mean"),
        running_time_std=("running_time", "std"),
        distortion_mean=("distortion", "mean"),
        distortion_median=("distortion", "median"),
        distortion_std=("distortion", "std"),
        distortion_max=("distortion", "max"),
        distortion_synthetic_mean=("distortion_synthetic", "mean"),
        distortion_synthetic_std=("distortion_synthetic", "std"),
        distortion_synthetic2_mean=("distortion_synthetic2", "mean"),
        distortion_synthetic2_std=("distortion_synthetic2", "std"),
    )
    df_aggr_costs["running_time"] = pd.to_timedelta(df_aggr_costs.running_time_mean, unit='s')
    df_aggr_costs["running_time_formatted"] = df_aggr_costs["running_time"].map(_format_running_time)
    return df_aggr_costs


def display_results_for(df_aggr_costs: pd.DataFrame, dataset_name: str, show_counts: bool=True, show_running_times:bool=True, show_costs: bool=True):
    """Render the aggregated results of one data set as HTML in the notebook.

    Shows a heading, then (optionally) the experiment counts and the average
    running times, then (optionally) the mean real and coreset costs, and
    finally the maximum distortions.  Every table has one row per algorithm
    and one column per ``k``.

    Note: this sets the global pandas option ``display.float_format``.

    Args:
        df_aggr_costs: Aggregated results (see ``aggregate_costs``).
        dataset_name: Value of the ``dataset`` column to show.
        show_counts: Whether to show the experiment-count table.
        show_running_times: Whether to show the running-time table.
        show_costs: Whether to show the real/coreset cost tables.
    """
    pd.set_option('display.float_format', '{:.2e}'.format)

    display(HTML(f"<h2 style='border-bottom:solid 1px Black;padding-bottom:5px;'>Results for {dataset_name}</h2>"))
    df_filtered = df_aggr_costs[df_aggr_costs.dataset == dataset_name]

    def pivot_by_algorithm(values, **pivot_options):
        # Pivot to an algorithm-by-k table and drop both axis names.
        table = pd.pivot_table(df_filtered, values=values, index=["algorithm"], columns=["k"], **pivot_options)
        return table.rename_axis(None, axis=0).rename_axis(None, axis=1)

    def panel(div_style, title, table_html):
        # Wrap one rendered table in a floating <div> with a centred heading.
        return (
            f'<div style="{div_style}">'
            + f"<h4 style='text-align:center;'>{title}</h4>"
            + table_html
            + "</div>"
        )

    summary_panels = []
    if show_counts:
        counts = pivot_by_algorithm("experiment_count")
        summary_panels.append(panel(
            "border:solid 1px White; width:300px;float:left;",
            "Experiment Counts",
            counts.style.format(precision=0).to_html(),
        ))
    if show_running_times:
        # The formatted running times are strings, so pass them through as-is.
        run_times = pivot_by_algorithm("running_time_formatted", aggfunc=lambda x: x)
        summary_panels.append(panel(
            "border:solid 1px White; float:left;",
            "Average Running Times",
            run_times.to_html(),
        ))
    # Displayed even when both panels are disabled (empty HTML), as before.
    display(HTML("".join(summary_panels)))

    if show_costs:
        real_costs = pivot_by_algorithm("real_cost_mean")
        coreset_costs = pivot_by_algorithm("coreset_cost_mean")
        cost_panels = [
            panel("border:solid 1px #eee; float:left;", "Real costs", real_costs.to_html()),
            panel("border:solid 1px #eee; float:left;", "Coreset costs", coreset_costs.to_html()),
        ]
        display(HTML("".join(cost_panels)))

    display(HTML('<h4>Distortions</h4>'))
    display(pivot_by_algorithm("distortion_max"))
    
def add_combined_mean_std(df: pd.DataFrame, attr: str):
    """Add a ``{attr}_mean_std`` column with "mean (std)" strings, in place.

    Args:
        df: Frame containing the columns ``{attr}_mean`` and ``{attr}_std``;
            it is modified in place.
        attr: Prefix of the mean/std columns to combine.
    """
    mean_column = f"{attr}_mean"
    std_column = f"{attr}_std"

    def format_row(row) -> str:
        # Two decimals for the mean, three for the standard deviation.
        mean = row[mean_column]
        std = row[std_column]
        return f"{mean:0.2f} ({std:0.3f})"

    df[f"{attr}_mean_std"] = df.apply(format_row, axis=1)

In [4]:
# Directory that is searched for experiment results.
data_results_dir = Path("../data/experiments-skadi/")
# Every JSON file found at any depth below the results directory.
run_files = list(data_results_dir.glob("**/*.json"))

In [5]:
# Number of JSON files found.
len(run_files)

9445

In [6]:
# Keep only the run files whose directory contains at least one .txt file.
# any() stops at the first match instead of materialising the whole glob.
run_files_filtered = [f for f in run_files if any(f.parent.glob("*.txt"))]

In [7]:
# Number of run files that remain after filtering.
len(run_files_filtered)

9445

In [8]:
# Load the per-run costs and distortions of all filtered run files.
df_cost_data = get_costs(run_files_filtered)

In [9]:
# Data sets (by print name) that are part of the evaluation.
include_datasets = [
    'Caltech', 'Caltech+PCA', 'Census', 'Census+PCA', 'Covertype', 'Covertype+PCA',
    'Tower', 'NYTimes', 'NYTimes+PCA',
    # 'Benchmark-1.0', 'Benchmark-1.5', 'Benchmark-2.0',
    'Benchmark',
]
# Algorithms (by display name) that are part of the evaluation.
include_algorithms = [
    'StreamKM++', 'Group Sampling', 'Ray Maker', 'BICO', 'Sensitivity Sampling',
    # 'StreamKM++ (buggy)'
]
# A run is kept only if both its algorithm and its data set are included.
is_included_algorithm = df_cost_data.algorithm.isin(include_algorithms)
is_included_dataset = df_cost_data.dataset.isin(include_datasets)
df_cost_filtered = df_cost_data[is_included_algorithm & is_included_dataset]

In [10]:
# Aggregate the filtered runs into one row per experiment configuration.
df_aggr_costs = aggregate_costs(df_cost_filtered)

In [11]:
# Map every algorithm to one colour of seaborn's "tab10" palette, assigned in
# the order in which unique() returns the algorithm names.
colors = sns.color_palette("tab10")
algorithm_colors = {algo: colors[i] for i, algo in enumerate(df_aggr_costs.algorithm.unique())}

In [12]:
# Math-mode label for k, e.g. "$k$=10".
df_aggr_costs["k_formatted"] = "$k$=" + df_aggr_costs["k"].astype(str)

In [13]:
# Experiment counts per (dataset, k, coreset size factor), one column per algorithm.
df_experiment_counts = df_aggr_costs.pivot_table(
    values="experiment_count",
    index=["dataset", "k", "coreset_size_factor"],
    columns=["algorithm"],
)

df_experiment_counts

Unnamed: 0_level_0,Unnamed: 1_level_0,algorithm,BICO,Group Sampling,Ray Maker,Sensitivity Sampling,StreamKM++
dataset,k,coreset_size_factor,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Benchmark,10,50,10.0,10.0,10.0,10.0,10.0
Benchmark,10,100,10.0,10.0,10.0,10.0,10.0
Benchmark,10,200,10.0,10.0,10.0,10.0,10.0
Benchmark,10,500,10.0,10.0,10.0,10.0,
Benchmark,20,50,10.0,10.0,10.0,10.0,10.0
...,...,...,...,...,...,...,...
Tower,80,500,10.0,10.0,10.0,10.0,
Tower,100,50,10.0,10.0,10.0,10.0,10.0
Tower,100,100,10.0,10.0,10.0,10.0,10.0
Tower,100,200,10.0,10.0,10.0,10.0,10.0


In [14]:
# Export the experiment-count table to an Excel file in the working directory.
df_experiment_counts.to_excel("experiment-counts.xlsx")

In [15]:
# Build the "mean (std)" strings for the distortion.
add_combined_mean_std(df=df_aggr_costs, attr="distortion")

# One row per (dataset, k, coreset size factor), one column per algorithm.
# The cells are strings, hence the joining aggregation function.
df_results_table = df_aggr_costs.pivot_table(
    values="distortion_mean_std",
    index=["dataset", "k", "coreset_size_factor"],
    columns=["algorithm"],
    aggfunc=lambda cells: ' '.join(cells),
)
df_results_table.to_excel("distortions.xlsx")
df_results_table

Unnamed: 0_level_0,Unnamed: 1_level_0,algorithm,BICO,Group Sampling,Ray Maker,Sensitivity Sampling,StreamKM++
dataset,k,coreset_size_factor,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Benchmark,10,50,3.40 (0.440),1.02 (0.010),5.05 (0.157),1.02 (0.005),1.07 (0.005)
Benchmark,10,100,3.24 (0.729),1.01 (0.004),3.84 (0.081),1.01 (0.003),1.05 (0.004)
Benchmark,10,200,2.90 (0.153),1.01 (0.002),3.48 (0.052),1.01 (0.002),1.04 (0.002)
Benchmark,10,500,2.62 (0.095),1.01 (0.001),3.40 (0.058),1.00 (0.001),
Benchmark,20,50,3.22 (0.160),1.04 (0.004),5.52 (0.266),1.02 (0.003),1.08 (0.006)
...,...,...,...,...,...,...,...
Tower,80,500,1.02 (0.001),1.02 (0.003),1.02 (0.001),1.01 (0.004),
Tower,100,50,1.19 (0.007),1.10 (0.009),1.45 (0.010),1.03 (0.008),1.05 (0.002)
Tower,100,100,1.10 (0.008),1.06 (0.007),1.12 (0.005),1.02 (0.007),1.03 (0.001)
Tower,100,200,1.04 (0.001),1.03 (0.005),1.05 (0.002),1.01 (0.003),1.02 (0.000)


In [16]:
# Build the "mean (std)" strings for the distortion_synthetic measurements.
add_combined_mean_std(df=df_aggr_costs, attr="distortion_synthetic")

# One row per (dataset, k, coreset size factor), one column per algorithm.
# The cells are strings, hence the joining aggregation function.
df_results_table = df_aggr_costs.pivot_table(
    values="distortion_synthetic_mean_std",
    index=["dataset", "k", "coreset_size_factor"],
    columns=["algorithm"],
    aggfunc=lambda cells: ' '.join(cells),
)
df_results_table.to_excel("distortions-meb-clustering.xlsx")
df_results_table

Unnamed: 0_level_0,Unnamed: 1_level_0,algorithm,BICO,Group Sampling,Ray Maker,Sensitivity Sampling,StreamKM++
dataset,k,coreset_size_factor,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Benchmark,10,50,nan (nan),nan (nan),nan (nan),nan (nan),nan (nan)
Benchmark,10,100,nan (nan),nan (nan),nan (nan),nan (nan),nan (nan)
Benchmark,10,200,nan (nan),nan (nan),nan (nan),nan (nan),nan (nan)
Benchmark,10,500,nan (nan),nan (nan),nan (nan),nan (nan),
Benchmark,20,50,nan (nan),nan (nan),nan (nan),nan (nan),nan (nan)
...,...,...,...,...,...,...,...
Tower,80,500,1.00 (0.000),1.03 (0.001),1.00 (0.000),1.01 (0.006),
Tower,100,50,1.00 (0.000),1.03 (0.007),1.19 (0.015),1.06 (0.012),1.00 (0.000)
Tower,100,100,1.00 (0.000),1.03 (0.006),1.05 (0.004),1.04 (0.011),1.00 (0.000)
Tower,100,200,1.00 (0.000),1.03 (0.003),1.02 (0.002),1.03 (0.007),1.00 (0.000)


In [17]:
# Build the "mean (std)" strings for the distortion_synthetic2 measurements.
add_combined_mean_std(df=df_aggr_costs, attr="distortion_synthetic2")

# One row per (dataset, k, coreset size factor), one column per algorithm.
# The cells are strings, hence the joining aggregation function.
df_results_table = df_aggr_costs.pivot_table(
    values="distortion_synthetic2_mean_std",
    index=["dataset", "k", "coreset_size_factor"],
    columns=["algorithm"],
    aggfunc=lambda cells: ' '.join(cells),
)
df_results_table.to_excel("distortions-convex-clustering.xlsx")
df_results_table

Unnamed: 0_level_0,Unnamed: 1_level_0,algorithm,BICO,Group Sampling,Ray Maker,Sensitivity Sampling,StreamKM++
dataset,k,coreset_size_factor,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Benchmark,10,50,nan (nan),nan (nan),nan (nan),nan (nan),nan (nan)
Benchmark,10,100,nan (nan),nan (nan),nan (nan),nan (nan),nan (nan)
Benchmark,10,200,nan (nan),nan (nan),nan (nan),nan (nan),nan (nan)
Benchmark,10,500,nan (nan),nan (nan),nan (nan),nan (nan),
Benchmark,20,50,nan (nan),nan (nan),nan (nan),nan (nan),nan (nan)
...,...,...,...,...,...,...,...
Tower,80,500,1.00 (0.001),1.02 (0.005),1.01 (0.001),1.02 (0.008),
Tower,100,50,1.04 (0.008),1.03 (0.016),1.26 (0.016),1.08 (0.021),1.01 (0.002)
Tower,100,100,1.02 (0.003),1.02 (0.007),1.07 (0.004),1.05 (0.014),1.01 (0.001)
Tower,100,200,1.01 (0.003),1.02 (0.008),1.02 (0.003),1.03 (0.009),1.00 (0.001)


In [18]:
# Adds the "running_time_mean_std" column to df_aggr_costs; the table below is
# built from the numeric mean only.
add_combined_mean_std(df=df_aggr_costs, attr="running_time")
# Mean running time in seconds per (dataset, k, coreset size factor), one
# column per algorithm. The aggregation is named as a string because passing
# the builtin `max` is deprecated in pandas.
df_results_table = pd.pivot_table(
    data=df_aggr_costs,
    values="running_time_mean",
    index=["dataset", "k", "coreset_size_factor"],
    columns=["algorithm"],
    aggfunc="max"
)
df_results_table.to_excel("running-times.xlsx")
df_results_table

Unnamed: 0_level_0,Unnamed: 1_level_0,algorithm,BICO,Group Sampling,Ray Maker,Sensitivity Sampling,StreamKM++
dataset,k,coreset_size_factor,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Benchmark,10,50,18.6,117.1,138.4,106.1,736.5
Benchmark,10,100,20.5,119.9,142.6,107.2,1460.4
Benchmark,10,200,25.4,118.3,150.8,107.9,2889.3
Benchmark,10,500,31.4,124.7,187.8,104.9,
Benchmark,20,50,104.3,1015.9,904.1,932.9,7233.1
...,...,...,...,...,...,...,...
Tower,80,500,20.1,1122.4,339.7,269.2,
Tower,100,50,6.9,1419.1,174.8,324.5,3402.7
Tower,100,100,7.5,1434.2,197.9,327.0,6859.2
Tower,100,200,10.5,1427.8,240.6,330.5,13530.2


## Generate Distortion Tables in LaTeX

In [19]:
# Print an introductory paragraph followed by one LaTeX longtable per data set
# with the "mean (std)" distortion of every algorithm.
add_combined_mean_std(df=df_aggr_costs, attr="distortion")

df_results_table = pd.pivot_table(
    data=df_aggr_costs,
    values="distortion_mean_std",
    index=["dataset", "k", "coreset_size_factor"],
    columns=["algorithm"],
    aggfunc=lambda x: ' '.join(x)
)

# Distinct data set names: the first level of the (dataset, k, factor) index.
data_sets = sorted({keys[0] for keys in df_results_table.index})

# Comma-separated \cref list that references the label of every table.
data_set_slugs = [name.replace("+", "-").lower() for name in data_sets]
tables_text = ",\n ".join(f"\\cref{{tab:distortions-mean-std-{slug}}}" for slug in data_set_slugs)

# The pivot table has one column per algorithm.
algorithm_count = len(df_results_table.columns)

print(f"The tables \n({tables_text})\n show the distortions of the {algorithm_count} evaluated algorithms on the different data sets.")
print(f"We vary the coreset size $T$ for different $k$ values using the formula: $T=mk$ where $m = \\{{50, 100, 200, 500\\}}$.")
print("The running time for StreamKM++ with coreset size $T=500k$ exceeds the allocated time budget of 12 hours on almost all data sets.")
print("For this reason, the distortions for StreamKM++ with $m=500$ are excluded.")
print("\n\n")

for data_set in data_sets:
    data_set_without_pca = data_set.replace("+PCA", "")
    pca_tag = " (with PCA preprocessing)" if "PCA" in data_set else ""
    data_set_text = f"\\textit{{{data_set_without_pca}}} data set{pca_tag}"

    top_text = r"\multicolumn{7}{c}{\textbf{Distortions on the " + data_set_text + r"}} \\"

    # NOTE: the replacements below match the exact whitespace emitted by
    # DataFrame.to_latex for this table, so they must stay byte-identical.
    output = df_results_table.loc[data_set].to_latex()
    output = output.replace(r"\begin{tabular}{lllllll}", r"\begin{longtable}{lllllll}" + "\n" +top_text)
    output = output.replace(r"& algorithm &", r"&  &")
    output = output.replace(r"k & coreset\_size\_factor &               &                &               &                      &               \\", "")
    output = output.replace(r"k & coreset\_size\_factor &                &                &                &                      &               \\", "")
    output = output.replace(r"   &  &          ", r"k & m &")
    output = output.replace(r"k &", r"\parbox[t]{10mm}{\ \\$k$} &")
    output = output.replace(r"m &", r"\parbox[t]{10mm}{\ \\$m$} &")

    # Break the long algorithm names over two lines in the header.
    output = output.replace(r"Group Sampling", r"\parbox[t]{1cm}{Group\\Sampling}")
    output = output.replace(r"     Ray Maker ", r"\parbox[t]{1cm}{Ray\\Maker}")
    output = output.replace(r" Sensitivity Sampling ", r"\parbox[t]{1cm}{Sensitivity\\Sampling}")

    output = output.replace(r"NaN", r"")
    data_set_slug = data_set.replace("+", "-").lower()
    label_text = f"\\label{{tab:distortions-mean-std-{data_set_slug}}}"

    # "\\caption" instead of "\caption": "\c" is an invalid escape sequence.
    caption_text = f"\\caption{{Distortions of the evaluated algorithms on the {data_set_text}. Each cell specify the mean distortion along with the standard deviation in parenthesis of 10 repetitions of the experiment.}}"
    output = output.replace(r"\end{tabular}", f"{caption_text}\n{label_text}\n\\end{{longtable}}")

    # Insert a \midrule in front of the first row of every k value.
    for n in [10, 20, 30, 40, 50, 60, 80, 100]:
        output = output.replace(f"          \\\\\n{n} ", f"\\\\\n \\midrule\n{n} ")

    # Remove empty lines
    output = output.replace("\n\n", "\n")
    print(output)

The tables 
(\cref{tab:distortions-mean-std-benchmark},
 \cref{tab:distortions-mean-std-caltech},
 \cref{tab:distortions-mean-std-caltech-pca},
 \cref{tab:distortions-mean-std-census},
 \cref{tab:distortions-mean-std-census-pca},
 \cref{tab:distortions-mean-std-covertype},
 \cref{tab:distortions-mean-std-covertype-pca},
 \cref{tab:distortions-mean-std-nytimes},
 \cref{tab:distortions-mean-std-nytimes-pca},
 \cref{tab:distortions-mean-std-tower})
 show the distortions of the 5 evaluated algorithms on the different data sets.
We vary the coreset size $T$ for different $k$ values using the formula: $T=mk$ where $m = \{50, 100, 200, 500\}$.
The running time for StreamKM++ with coreset size $T=500k$ exceeds the allocated time budget of 12 hours on almost all data sets.
For this reason, the distortions for StreamKM++ with $m=500$ are excluded.



\begin{longtable}{lllllll}
\multicolumn{7}{c}{\textbf{Distortions on the \textit{Benchmark} data set}} \\
\toprule
\parbox[t]{10mm}{\ \\$k$} & \par

## Solution Generation Methods

In [20]:
# Print a LaTeX longtable comparing the three distortion columns for a single
# algorithm on a single data set.
data_set = "Caltech"
algorithm_name = "BICO"
df_filtered = df_aggr_costs[(df_aggr_costs.dataset == data_set) & (df_aggr_costs.algorithm == algorithm_name)]
df_filtered = df_filtered.rename(columns={
    "coreset_size_factor": "m",
    "distortion_mean_std": "k-means++",
    "distortion_synthetic2_mean_std": "Random CH",
    "distortion_synthetic_mean_std": "Random MEB",
})
cols = ["k", "m", "k-means++", "Random CH", "Random MEB"]

output = df_filtered[cols].to_latex(index=False)

data_set_without_pca = data_set.replace("+PCA", "")
pca_tag = " (with PCA preprocessing)" if "PCA" in data_set else ""
data_set_text = f"\\textit{{{data_set_without_pca}}} data set{pca_tag}"
top_text = r"\multicolumn{5}{c}{\textbf{Distortions of "+algorithm_name+" on the "+data_set_text+r"}} \\"


output = output.replace(r"\begin{tabular}{rrlll}", r"\begin{longtable}{rrlll}" + "\n" + top_text)

# Re-align rows whose k ends in 0 followed by two spaces, then blank out the
# k value of every remaining row that still starts with a two-digit k.
output = re.sub(r"^(\d0) &  ", r"\1  &  ", output, flags=re.MULTILINE)
output = re.sub(r"^(\d{2}) & ", r"    & ", output, flags=re.MULTILINE)

# Insert a \midrule before every row that starts a new k value.
output = re.sub(r"\\\\$(\n\d{2})", r"\\\\\n\\midrule\1", output, flags=re.MULTILINE)


# Typeset k and m in math mode.
output = output.replace("k ", "$k$ ")
output = output.replace("k-", "$k$-")
output = output.replace(" m ", " $m$ ")

# Backslashes are escaped explicitly ("\c" is an invalid escape sequence), and
# the algorithm and data set come from the variables above instead of being
# hard-coded, so caption, heading and label always agree.
caption_text = (
    "\\caption{The effect of different solution generation approaches on the distortions"
    " (see \\cref{sec:candidate-solution-generation}).\n"
    f"The distortions are obtained by running {algorithm_name} on the {data_set_text}.\n"
    "}"
)


label_postfix = f"{data_set.lower()}-{algorithm_name.lower()}"
end_texts = [
    caption_text,
    r"\label{tab:comparison-solution-generation-"+label_postfix+"}",
    r"\end{longtable}",
]

output = output.replace(r"\end{tabular}", "\n".join(end_texts))
print(output)

\begin{longtable}{rrlll}
\multicolumn{5}{c}{\textbf{Distortions of BICO on the \textit{Caltech} data set}} \\
\toprule
 $k$ &   $m$ &    $k$-means++ &    Random CH &   Random MEB \\
\midrule
10  &  50 & 5.12 (0.307) & 4.30 (0.270) & 2.63 (0.166) \\
    & 100 & 4.48 (0.284) & 3.91 (0.240) & 2.54 (0.125) \\
    & 200 & 4.08 (0.319) & 3.54 (0.236) & 2.46 (0.134) \\
    & 500 & 3.41 (0.215) & 3.06 (0.155) & 2.18 (0.080) \\
\midrule
20  &  50 & 6.35 (1.173) & 4.73 (0.632) & 2.63 (0.147) \\
    & 100 & 4.65 (0.283) & 3.82 (0.245) & 2.41 (0.104) \\
    & 200 & 4.19 (0.384) & 3.49 (0.271) & 2.26 (0.141) \\
    & 500 & 3.50 (0.404) & 3.07 (0.301) & 2.10 (0.128) \\
\midrule
30  &  50 & 6.01 (0.335) & 4.32 (0.204) & 2.57 (0.096) \\
    & 100 & 5.10 (0.628) & 3.89 (0.340) & 2.35 (0.125) \\
    & 200 & 4.29 (0.659) & 3.47 (0.440) & 2.21 (0.170) \\
    & 500 & 3.09 (0.138) & 2.69 (0.122) & 1.87 (0.064) \\
\midrule
40  &  50 & 6.24 (0.524) & 4.33 (0.228) & 2.44 (0.076) \\
    & 100 & 5.23 (0.874) & 3

## Comparing Solution Generation

In [21]:
# Data sets that are left out of the comparison.
excluded_datasets = ["Benchmark", "NYTimes+PCA"]
# `~mask` instead of `mask == False` to negate the boolean Series.
df_filtered = df_aggr_costs[~df_aggr_costs.dataset.isin(excluded_datasets)].copy()
df_filtered = df_filtered.rename(columns={
    "coreset_size_factor": "m",
    "distortion_mean": "KM",
    "distortion_synthetic2_mean": "CH",
    "distortion_synthetic_mean": "MEB",
})
cols = ["dataset", "algorithm", "k", "m", "KM", "CH", "MEB"]
df_filtered = df_filtered[cols]

# Largest mean distortion per (dataset, algorithm) for each of the three columns.
df_max_distortions = df_filtered.groupby(["dataset", "algorithm"])[["KM", "CH", "MEB"]].max()

In [22]:
# Print the maximum-distortion table as a LaTeX longtable.
output = df_max_distortions.to_latex()

output = output.replace(r"\begin{tabular}{llrrr}", r"\begin{longtable}{llrrr}")

# NOTE: these replacements match the exact whitespace emitted by to_latex.
# Merge the two header rows into one and spell out the column names.
output = output.replace(r"      &            &         KM &         CH &        MEB \\", r" Data set  & Algorithm     &         KM &         CH &        MEB \\")
output = output.replace(r"dataset & algorithm &            &            &            \\", r"")
output = output.replace(r" KM ", r"$k$-means++")
output = output.replace(r" CH ", r"Random CH")
output = output.replace(r" MEB ", r"Random MEB")

# Insert a \midrule before every row that starts with an upper-case letter.
output = re.sub(r"( \\\\\n)([A-Z])", r"\1\\midrule\n\2", output, flags=re.MULTILINE)

# Raw string: "\c" in a normal string literal is an invalid escape sequence.
caption_text = r"""\caption{The effect of different solution generation approaches on the distortions (see \cref{sec:evaluation-procedure}).
The distortions are aggregated across different algorithms and data sets.
}"""

end_texts = [
    caption_text,
    r"\label{tab:comparison-solution-generation-all}",
    r"\end{longtable}",
]

output = output.replace(r"\end{tabular}", "\n".join(end_texts))

# Remove empty lines
output = output.replace("\n\n", "\n")

print(output)

\begin{longtable}{llrrr}
\toprule
 Data set  & Algorithm     &        $k$-means++&        Random CH&       Random MEB\\
\midrule
Caltech & BICO &   7.499068 &   4.800188 &   2.629218 \\
      & Group Sampling &   1.068441 &   1.035997 &   1.006828 \\
      & Ray Maker &   7.053856 &   5.442665 &   3.453930 \\
      & Sensitivity Sampling &   1.048977 &   1.016564 &   1.029646 \\
      & StreamKM++ &   1.202426 &   1.111698 &   1.012216 \\
\midrule
Caltech+PCA & BICO &   2.862242 &   2.411732 &   1.825558 \\
      & Group Sampling &   1.058707 &   1.021386 &   1.007735 \\
      & Ray Maker &   2.941248 &   2.563131 &   2.053860 \\
      & Sensitivity Sampling &   1.040106 &   1.012539 &   1.032113 \\
      & StreamKM++ &   1.139141 &   1.073384 &   1.013089 \\
\midrule
Census & BICO &   2.502564 &   1.441889 &   1.040356 \\
      & Group Sampling &   1.077525 &   1.032349 &   1.028025 \\
      & Ray Maker &   2.483973 &   1.633945 &   1.236878 \\
      & Sensitivity Sampling &   1.03297