In [1]:
import sys
sys.path.append('../xrun')

In [2]:
import re

from pathlib import Path

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

sns.set(style="whitegrid")

from IPython.core.display import HTML
from sklearn.metrics import pairwise_distances

from xrun.data.run_info import RunInfo

In [3]:
def aggregate_costs(df_costs: pd.DataFrame, max_runs: int = 10) -> pd.DataFrame:
    """Aggregate per-run results into one row per experiment configuration.

    A configuration is a unique combination of ``dataset``, ``algorithm``,
    ``k``, ``coreset_size`` and ``coreset_size_factor``. Only the first
    ``max_runs`` rows of each configuration (in input order) are aggregated.

    Args:
        df_costs: One row per run. Must contain the grouping columns plus
            ``real_cost``, ``coreset_cost``, ``running_time``, ``distortion``,
            ``distortion_synthetic`` and ``distortion_synthetic2``.
        max_runs: Maximum number of runs per configuration that enter the
            statistics. Defaults to 10.

    Returns:
        A DataFrame with the grouping columns, ``experiment_count``, the
        mean/std (and for ``distortion`` also median/max) statistics,
        ``running_time`` (mean running time as a Timedelta, interpreting the
        input as seconds) and ``running_time_formatted`` ("HHh MMm SSs").
    """
    def format_duration(duration) -> str:
        # An undefined mean (all running times missing) has no components to format.
        if pd.isna(duration):
            return "n/a"
        parts = duration.components
        # `components.hours` wraps at 24; fold whole days back into the hour
        # count so that runs of a day or longer are not silently truncated.
        hours = parts.days * 24 + parts.hours
        return f"{hours:02d}h {parts.minutes:02d}m {parts.seconds:02d}s"

    group_columns = ["dataset", "algorithm", "k", "coreset_size", "coreset_size_factor"]
    df_top_k = df_costs.groupby(group_columns, as_index=False).head(max_runs)
    df_aggr_costs = df_top_k.groupby(group_columns, as_index=False).agg(
        experiment_count=("algorithm", "count"),
        real_cost_mean=("real_cost", "mean"),
        real_cost_std=("real_cost", "std"),
        coreset_cost_mean=("coreset_cost", "mean"),
        coreset_cost_std=("coreset_cost", "std"),
        running_time_mean=("running_time", "mean"),
        running_time_std=("running_time", "std"),
        distortion_mean=("distortion", "mean"),
        distortion_median=("distortion", "median"),
        distortion_std=("distortion", "std"),
        distortion_max=("distortion", "max"),
        distortion_synthetic_mean=("distortion_synthetic", "mean"),
        distortion_synthetic_std=("distortion_synthetic", "std"),
        distortion_synthetic2_mean=("distortion_synthetic2", "mean"),
        distortion_synthetic2_std=("distortion_synthetic2", "std"),
    )
    df_aggr_costs["running_time"] = pd.to_timedelta(df_aggr_costs.running_time_mean, unit='s')
    df_aggr_costs["running_time_formatted"] = df_aggr_costs["running_time"].map(format_duration)
    return df_aggr_costs

def add_combined_mean_std(df: pd.DataFrame, attr: str, g_format: bool=False):
    """Add a ``"<attr>_mean_std"`` text column of the form "mean (std)" to ``df``.

    The values are read from the ``"<attr>_mean"`` and ``"<attr>_std"`` columns
    and the frame is modified in place; nothing is returned.

    Args:
        df: Frame holding the mean and std columns for ``attr``.
        attr: Common prefix of the mean/std columns.
        g_format: If True, use scientific notation with one decimal for both
            numbers; otherwise two decimals for the mean and three for the std.
    """
    mean_key = f"{attr}_mean"
    std_key = f"{attr}_std"

    def scientific(mean, std) -> str:
        return f"{mean:.1e} ({std:.1e})"

    def fixed(mean, std) -> str:
        return f"{mean:0.2f} ({std:0.3f})"

    # The notation is the same for every row, so pick the formatter once.
    render = scientific if g_format else fixed
    df[f"{attr}_mean_std"] = df.apply(lambda row: render(row[mean_key], row[std_key]), axis=1)

In [4]:
# Load the per-run experiment results that every later cell builds on.
results_csv_path = "../data/results-raw.csv"
df_cost_filtered = pd.read_csv(results_csv_path)

In [5]:
# Aggregate the runs per configuration and attach a LaTeX-style label for k (e.g. "$k$=10").
df_aggr_costs = aggregate_costs(df_cost_filtered).assign(
    k_formatted=lambda frame: "$k$=" + frame["k"].astype(str)
)

In [6]:
# Give every algorithm a fixed palette colour, in order of first appearance in the aggregated frame.
colors = sns.color_palette("tab10")
algorithm_colors = dict(
    (algo, colors[position])
    for position, algo in enumerate(df_aggr_costs["algorithm"].unique())
)

## Experiment Counts

In [7]:
# Number of aggregated runs per (data set, k, m) row and algorithm column.
# pivot_table's default aggregation (mean) applies, so the counts display as floats.
df_experiment_counts = df_aggr_costs.pivot_table(
    values="experiment_count",
    index=["dataset", "k", "coreset_size_factor"],
    columns=["algorithm"],
)

df_experiment_counts

Unnamed: 0_level_0,Unnamed: 1_level_0,algorithm,BICO,Group Sampling,Ray Maker,Sensitivity Sampling,StreamKM++
dataset,k,coreset_size_factor,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Benchmark,10,50,10.0,10.0,10.0,10.0,10.0
Benchmark,10,100,10.0,10.0,10.0,10.0,10.0
Benchmark,10,200,10.0,10.0,10.0,10.0,10.0
Benchmark,10,500,10.0,10.0,10.0,10.0,
Benchmark,20,50,10.0,10.0,10.0,10.0,10.0
...,...,...,...,...,...,...,...
Tower,80,500,10.0,10.0,10.0,10.0,
Tower,100,50,10.0,10.0,10.0,10.0,10.0
Tower,100,100,10.0,10.0,10.0,10.0,10.0
Tower,100,200,10.0,10.0,10.0,10.0,10.0


## Distortions (k-means++)

In [8]:
# Build the "mean (std)" text column for the k-means++ distortions ...
add_combined_mean_std(df=df_aggr_costs, attr="distortion")

# ... and lay it out with one row per (data set, k, m) and one column per algorithm.
# The cells are strings, so they are combined by joining rather than by a numeric aggregate.
df_results_table = df_aggr_costs.pivot_table(
    values="distortion_mean_std",
    index=["dataset", "k", "coreset_size_factor"],
    columns=["algorithm"],
    aggfunc=lambda cells: ' '.join(cells),
)
# Optional export:
# df_results_table.to_excel("distortions.xlsx")
df_results_table

Unnamed: 0_level_0,Unnamed: 1_level_0,algorithm,BICO,Group Sampling,Ray Maker,Sensitivity Sampling,StreamKM++
dataset,k,coreset_size_factor,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Benchmark,10,50,3.40 (0.440),1.02 (0.010),5.05 (0.157),1.02 (0.005),1.07 (0.005)
Benchmark,10,100,3.24 (0.729),1.01 (0.004),3.84 (0.081),1.01 (0.003),1.05 (0.004)
Benchmark,10,200,2.90 (0.153),1.01 (0.002),3.48 (0.052),1.01 (0.002),1.04 (0.002)
Benchmark,10,500,2.62 (0.095),1.01 (0.001),3.40 (0.058),1.00 (0.001),
Benchmark,20,50,3.22 (0.160),1.04 (0.004),5.52 (0.266),1.02 (0.003),1.08 (0.006)
...,...,...,...,...,...,...,...
Tower,80,500,1.02 (0.001),1.02 (0.003),1.02 (0.001),1.01 (0.004),
Tower,100,50,1.19 (0.007),1.10 (0.009),1.45 (0.010),1.03 (0.008),1.05 (0.002)
Tower,100,100,1.10 (0.008),1.06 (0.007),1.12 (0.005),1.02 (0.007),1.03 (0.001)
Tower,100,200,1.04 (0.001),1.03 (0.005),1.05 (0.002),1.01 (0.003),1.02 (0.000)


## Distortions (MEB)

## Distortions (Convex Hull)

In [9]:
# Build the "mean (std)" text column for the `distortion_synthetic2` measure ...
add_combined_mean_std(df=df_aggr_costs, attr="distortion_synthetic2")

# ... and lay it out with one row per (data set, k, m) and one column per algorithm.
# The cells are strings, so they are combined by joining rather than by a numeric aggregate.
df_results_table = df_aggr_costs.pivot_table(
    values="distortion_synthetic2_mean_std",
    index=["dataset", "k", "coreset_size_factor"],
    columns=["algorithm"],
    aggfunc=lambda cells: ' '.join(cells),
)
# Optional export:
# df_results_table.to_excel("distortions-convex-clustering.xlsx")
df_results_table

Unnamed: 0_level_0,Unnamed: 1_level_0,algorithm,BICO,Group Sampling,Ray Maker,Sensitivity Sampling,StreamKM++
dataset,k,coreset_size_factor,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Benchmark,10,50,nan (nan),nan (nan),nan (nan),nan (nan),nan (nan)
Benchmark,10,100,nan (nan),nan (nan),nan (nan),nan (nan),nan (nan)
Benchmark,10,200,nan (nan),nan (nan),nan (nan),nan (nan),nan (nan)
Benchmark,10,500,nan (nan),nan (nan),nan (nan),nan (nan),
Benchmark,20,50,nan (nan),nan (nan),nan (nan),nan (nan),nan (nan)
...,...,...,...,...,...,...,...
Tower,80,500,1.00 (0.001),1.02 (0.005),1.01 (0.001),1.02 (0.008),
Tower,100,50,1.04 (0.008),1.03 (0.016),1.26 (0.016),1.08 (0.021),1.01 (0.002)
Tower,100,100,1.02 (0.003),1.02 (0.007),1.07 (0.004),1.05 (0.014),1.01 (0.001)
Tower,100,200,1.01 (0.003),1.02 (0.008),1.02 (0.003),1.03 (0.009),1.00 (0.001)


## Running time

In [10]:
# Round the mean running time to whole seconds in place; any cell that reads
# `running_time_mean` after this one sees the rounded integers.
df_aggr_costs["running_time_mean"] = df_aggr_costs["running_time_mean"].round(0).astype(int)

def runtime_combiner(row) -> str:
    """Format one aggregated row as "<mean> (<std>)" in whole seconds."""
    attr = "running_time"
    mean = row[f"{attr}_mean"]
    std =  row[f"{attr}_std"]
    return f"{mean:0.0f} ({std:0.0f})"
df_aggr_costs["running_time_mean_std"] = df_aggr_costs.apply(runtime_combiner, axis=1)

# The cells are strings; "max" is the named equivalent of passing the built-in
# `max`, which pandas resolves to the same group-wise maximum.
df_results_table = pd.pivot_table(
    data=df_aggr_costs,
    values="running_time_mean_std",
    index=["dataset", "k", "coreset_size_factor"],
    columns=["algorithm"],
    aggfunc="max",
)

# First index level = data set name; one LaTeX table is printed per data set.
data_sets = sorted(set(df_results_table.index.get_level_values(0)))

for data_set in data_sets:
    data_set_without_pca = data_set.replace("+PCA", "")
    pca_tag = " (with PCA preprocessing)" if "PCA" in data_set else ""
    data_set_text = f"\\textit{{{data_set_without_pca}}} data set{pca_tag}"

    top_text = r"\multicolumn{7}{c}{\textbf{Running times on the " + data_set_text + r"}} \\"

    output = df_results_table.loc[data_set].to_latex()
    output = output.replace(r"\begin{tabular}{lllllll}", r"\begin{longtable}{llrrrrr}" + "\n" + top_text)
    output = output.replace(r"& algorithm &", r"&  &")
    # Drop the header row that only carries the index names (k, coreset_size_factor).
    output = re.sub(r"^k & [^&]+[^\\]+\\\\", r"", output, flags=re.MULTILINE)

    output = output.replace(r"   &  & ", r"k & m &")
    output = output.replace(r"k &", r"\parbox[t]{5mm}{\ \\$k$} &")
    output = output.replace(r"m &", r"\parbox[t]{5mm}{\ \\$m$} &")

    # Match the bare column names: the padding around them depends on the
    # column widths, so patterns with a fixed number of spaces can miss
    # (previously "Ray Maker" was left unwrapped in this table).
    output = output.replace(r"Group Sampling", r" \parbox[t]{1.5cm}{Group\\Sampling} ")
    output = output.replace(r"Ray Maker", r" \parbox[t]{1.5cm}{Ray\\Maker} ")
    output = output.replace(r"Sensitivity Sampling", r" \parbox[t]{1.5cm}{Sensitivity\\Sampling} ")

    output = output.replace(r"NaN", r"")
    data_set_slug = data_set.replace("+", "-").lower()
    label_text = f"\\label{{tab:running-time-mean-{data_set_slug}}}"

    caption_text = f"\\caption{{Running times (in seconds) of the evaluated algorithms on the {data_set_text}. Each cell specify the mean along with the standard deviation in parenthesis of 10 repetitions of the experiment.}}"
    output = output.replace(r"\end{tabular}", f"{caption_text}\n{label_text}\n\\end{{longtable}}")

    # Separate the k-groups: a row that starts with a number (the k value) and
    # follows another table row opens a new group. Rows within a group start
    # with blanks, and the first group follows \midrule, so neither matches.
    output = re.sub(r"\\\\\n(\d+ )", r"\\\\\n \\midrule\n\1", output)

    # Remove empty lines
    output = output.replace("\n\n", "\n")
    print(output)


\begin{longtable}{llrrrrr}
\multicolumn{7}{c}{\textbf{Running times on the \textit{Benchmark} data set}} \\
\toprule
\parbox[t]{5mm}{\ \\$k$} & \parbox[t]{5mm}{\ \\$m$} &     BICO &  \parbox[t]{1.5cm}{Group\\Sampling}  &   Ray Maker & \parbox[t]{1.5cm}{Sensitivity\\Sampling} &    StreamKM++ \\
\midrule
10 & 50  &    19 (3) &       117 (10) &    138 (15) &             106 (10) &      736 (60) \\
   & 100 &    20 (4) &        120 (7) &    143 (17) &              107 (8) &     1460 (87) \\
   & 200 &    25 (3) &       118 (10) &    151 (19) &              108 (8) &    2889 (139) \\
   & 500 &    31 (7) &        125 (9) &    188 (10) &             105 (14) &  \\
 \midrule
20 & 50  &   104 (7) &      1016 (74) &    904 (68) &             933 (81) &    7233 (555) \\
   & 100 &  124 (14) &      1021 (77) &    918 (72) &            913 (112) &   14546 (528) \\
   & 200 &  176 (12) &      1004 (73) &    960 (77) &             933 (89) &  28532 (1115) \\
   & 500 &  298 (52) &      1014 (54) &  

  output = df_results_table.loc[data_set].to_latex()


## Cost Ratios

In [11]:
# Alias, not a copy: df_filtered refers to the same DataFrame object as df_aggr_costs.
df_filtered = df_aggr_costs

In [12]:
# For every (data set, k, algorithm): smallest and largest mean real cost over
# the remaining rows of that group, and the ratio between the two.
mean_cost_by_group = df_filtered.groupby(["dataset", "k", "algorithm"])["real_cost_mean"]
df_min_max = mean_cost_by_group.agg(real_cost_min="min", real_cost_max="max")
df_min_max = df_min_max.assign(
    real_cost_ratio=lambda frame: frame["real_cost_max"] / frame["real_cost_min"]
)
df_min_max

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,real_cost_min,real_cost_max,real_cost_ratio
dataset,k,algorithm,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Benchmark,10,BICO,4.580000e+06,4.760000e+06,1.039301
Benchmark,10,Group Sampling,5.220000e+06,5.400000e+06,1.034483
Benchmark,10,Ray Maker,4.940000e+06,5.060000e+06,1.024291
Benchmark,10,Sensitivity Sampling,4.540000e+06,4.720000e+06,1.039648
Benchmark,10,StreamKM++,5.300000e+06,5.300000e+06,1.000000
...,...,...,...,...,...
Tower,100,BICO,1.625935e+08,1.700597e+08,1.045920
Tower,100,Group Sampling,1.650037e+08,1.720751e+08,1.042856
Tower,100,Ray Maker,1.630296e+08,1.702870e+08,1.044516
Tower,100,Sensitivity Sampling,1.657411e+08,1.737201e+08,1.048141


## Real Costs

In [13]:
# Build the fixed-point "mean (std)" text column for the real costs ...
add_combined_mean_std(df=df_aggr_costs, attr="real_cost")

# ... and lay it out with one row per (data set, k, m) and one column per algorithm.
# The cells are strings, so they are combined by joining rather than by a numeric aggregate.
df_results_table = df_aggr_costs.pivot_table(
    values="real_cost_mean_std",
    index=["dataset", "k", "coreset_size_factor"],
    columns=["algorithm"],
    aggfunc=lambda cells: ' '.join(cells),
)
df_results_table

Unnamed: 0_level_0,Unnamed: 1_level_0,algorithm,BICO,Group Sampling,Ray Maker,Sensitivity Sampling,StreamKM++
dataset,k,coreset_size_factor,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Benchmark,10,50,4760000.00 (231900.362),5220000.00 (413118.224),5000000.00 (286744.176),4680000.00 (382390.144),5300000.00 (0.000)
Benchmark,10,100,4580000.00 (168654.809),5280000.00 (113529.242),4940000.00 (279682.360),4540000.00 (126491.106),5300000.00 (0.000)
Benchmark,10,200,4660000.00 (309838.668),5400000.00 (235702.260),5060000.00 (362705.880),4620000.00 (252982.213),5300000.00 (0.000)
Benchmark,10,500,4580000.00 (103279.556),5400000.00 (141421.356),5060000.00 (206559.112),4720000.00 (358391.468),
Benchmark,20,50,13856000.00 (880242.391),15040000.00 (261278.906),13344000.00 (622882.190),12160000.00 (0.000),15616000.00 (294059.707)
...,...,...,...,...,...,...,...
Tower,80,500,192012282.54 (1201485.655),195082487.34 (2445797.309),192751126.49 (1702342.170),194969699.20 (2183184.955),
Tower,100,50,170059736.03 (2066303.750),172075072.46 (2640572.896),170286962.21 (1529146.680),173720053.43 (1463438.260),168025582.70 (1959350.573)
Tower,100,100,167031858.12 (1498830.681),169348242.03 (1962515.302),165742092.42 (1407554.809),170492608.55 (1740176.624),165441005.85 (916201.077)
Tower,100,200,164989412.98 (1187492.162),166943959.68 (1261164.149),163900317.70 (1224269.502),167247598.30 (1772989.545),164073694.57 (775504.865)


## Generating Costs Table

In [14]:

# Rebuild the "mean (std)" column for the real costs in scientific notation
# (this overwrites any fixed-point version of the same column).
add_combined_mean_std(df=df_aggr_costs, attr="real_cost", g_format=True)

df_results_table = pd.pivot_table(
    data=df_aggr_costs,
    values="real_cost_mean_std",
    index=["dataset", "k", "coreset_size_factor"],
    columns=["algorithm"],
    aggfunc=lambda x: ' '.join(x)
)

# First index level = data set name; one LaTeX table is printed per data set.
data_sets = sorted(set(df_results_table.index.get_level_values(0)))

for data_set in data_sets:
    data_set_without_pca = data_set.replace("+PCA", "")
    pca_tag = " (with PCA preprocessing)" if "PCA" in data_set else ""
    data_set_text = f"\\textit{{{data_set_without_pca}}} data set{pca_tag}"

    top_text = r"\multicolumn{7}{c}{\textbf{Costs on the " + data_set_text + r"}} \\"

    output = df_results_table.loc[data_set].to_latex()
    output = output.replace(r"\begin{tabular}{lllllll}", r"\begin{longtable}{lllllll}" + "\n" + top_text)
    output = output.replace(r"& algorithm &", r"&  &")
    # Drop the header row that only carries the index names (k, coreset_size_factor).
    # A pattern is used because the padding of that row depends on the column
    # widths, which differ between data sets.
    output = re.sub(r"^k & [^&]+[^\\]+\\\\", r"", output, flags=re.MULTILINE)

    output = output.replace(r"   &  & ", r"k & m &")
    output = output.replace(r"k &", r"\parbox[t]{5mm}{\ \\$k$} &")
    output = output.replace(r"m &", r"\parbox[t]{5mm}{\ \\$m$} &")

    # Match the bare column names so the replacement does not depend on padding.
    output = output.replace(r"Group Sampling", r"\parbox[t]{1cm}{Group\\Sampling}")
    output = output.replace(r"Ray Maker", r"\parbox[t]{1cm}{Ray\\Maker}")
    output = output.replace(r"Sensitivity Sampling", r"\parbox[t]{1cm}{Sensitivity\\Sampling}")

    output = output.replace(r"NaN", r"")
    data_set_slug = data_set.replace("+", "-").lower()
    label_text = f"\\label{{tab:real-cost-mean-std-{data_set_slug}}}"

    caption_text = f"\\caption{{Costs of the evaluated algorithms on the {data_set_text}. Each cell specify the mean along with the standard deviation in parenthesis of 10 repetitions of the experiment.}}"
    output = output.replace(r"\end{tabular}", f"{caption_text}\n{label_text}\n\\end{{longtable}}")

    # Separate the k-groups: a row that starts with a number (the k value) and
    # follows another table row opens a new group. Rows within a group start
    # with blanks, and the first group follows \midrule, so neither matches.
    output = re.sub(r"\\\\\n(\d+ )", r"\\\\\n \\midrule\n\1", output)

    # Remove empty lines
    output = output.replace("\n\n", "\n")
    print(output)


\begin{longtable}{lllllll}
\multicolumn{7}{c}{\textbf{Costs on the \textit{Benchmark} data set}} \\
\toprule
\parbox[t]{5mm}{\ \\$k$} & \parbox[t]{5mm}{\ \\$m$} &     BICO &     \parbox[t]{1cm}{Group\\Sampling} &     \parbox[t]{1cm}{Ray\\Maker}&\parbox[t]{1cm}{Sensitivity\\Sampling}&         StreamKM++ \\
\midrule
10 & 50  &  4.8e+06 (2.3e+05) &  5.2e+06 (4.1e+05) &  5.0e+06 (2.9e+05) &    4.7e+06 (3.8e+05) &  5.3e+06 (0.0e+00) \\
   & 100 &  4.6e+06 (1.7e+05) &  5.3e+06 (1.1e+05) &  4.9e+06 (2.8e+05) &    4.5e+06 (1.3e+05) &  5.3e+06 (0.0e+00) \\
   & 200 &  4.7e+06 (3.1e+05) &  5.4e+06 (2.4e+05) &  5.1e+06 (3.6e+05) &    4.6e+06 (2.5e+05) &  5.3e+06 (0.0e+00) \\
   & 500 &  4.6e+06 (1.0e+05) &  5.4e+06 (1.4e+05) &  5.1e+06 (2.1e+05) &    4.7e+06 (3.6e+05) &       \\
 \midrule
20 & 50  &  1.4e+07 (8.8e+05) &  1.5e+07 (2.6e+05) &  1.3e+07 (6.2e+05) &    1.2e+07 (1.9e-09) &  1.6e+07 (2.9e+05) \\
   & 100 &  1.3e+07 (9.8e+05) &  1.5e+07 (4.6e+05) &  1.4e+07 (5.9e+05) &    1.3e+07 (1.1e+0

  output = df_results_table.loc[data_set].to_latex()


## Generate Distortion Tables in LaTeX

## Distortion Tables

In [15]:
# Build the fixed-point "mean (std)" column for the k-means++ distortions.
add_combined_mean_std(df=df_aggr_costs, attr="distortion")

df_results_table = pd.pivot_table(
    data=df_aggr_costs,
    values="distortion_mean_std",
    index=["dataset", "k", "coreset_size_factor"],
    columns=["algorithm"],
    aggfunc=lambda x: ' '.join(x)
)

# First index level = data set name; one LaTeX table is printed per data set.
data_sets = sorted(set(df_results_table.index.get_level_values(0)))

# Introductory paragraph with a \cref to every table printed below.
tables_text = ",\n ".join(
    f"\\cref{{tab:distortions-mean-std-{data_set.replace('+', '-').lower()}}}"
    for data_set in data_sets
)
algorithm_count = len(df_results_table.columns)

print(f"The tables \n({tables_text})\n show the distortions of the {algorithm_count} evaluated algorithms on the different data sets.")
print(f"We vary the coreset size $T$ for different $k$ values using the formula: $T=mk$ where $m = \\{{50, 100, 200, 500\\}}$.")
print("The running time for StreamKM++ with coreset size $T=500k$ exceeds the allocated time budget of 12 hours on almost all data sets.")
print("For this reason, the distortions for StreamKM++ with $m=500$ are excluded.")
print("\n\n")

for data_set in data_sets:
    data_set_without_pca = data_set.replace("+PCA", "")
    pca_tag = " (with PCA preprocessing)" if "PCA" in data_set else ""
    data_set_text = f"\\textit{{{data_set_without_pca}}} data set{pca_tag}"

    top_text = r"\multicolumn{7}{c}{\textbf{Distortions on the " + data_set_text + r"}} \\"

    output = df_results_table.loc[data_set].to_latex()
    output = output.replace(r"\begin{tabular}{lllllll}", r"\begin{longtable}{lllllll}" + "\n" + top_text)
    output = output.replace(r"& algorithm &", r"&  &")
    # Drop the header row that only carries the index names (k, coreset_size_factor).
    # A pattern is used because the padding of that row depends on the column
    # widths, which differ between data sets.
    output = re.sub(r"^k & [^&]+[^\\]+\\\\", r"", output, flags=re.MULTILINE)

    output = output.replace(r"   &  & ", r"k & m &")
    output = output.replace(r"k &", r"\parbox[t]{10mm}{\ \\$k$} &")
    output = output.replace(r"m &", r"\parbox[t]{10mm}{\ \\$m$} &")

    # Match the bare column names so the replacement does not depend on padding.
    output = output.replace(r"Group Sampling", r"\parbox[t]{1cm}{Group\\Sampling}")
    output = output.replace(r"Ray Maker", r"\parbox[t]{1cm}{Ray\\Maker}")
    output = output.replace(r"Sensitivity Sampling", r"\parbox[t]{1cm}{Sensitivity\\Sampling}")

    output = output.replace(r"NaN", r"")
    data_set_slug = data_set.replace("+", "-").lower()
    label_text = f"\\label{{tab:distortions-mean-std-{data_set_slug}}}"

    caption_text = f"\\caption{{Distortions of the evaluated algorithms on the {data_set_text}. Each cell specify the mean distortion along with the standard deviation in parenthesis of 10 repetitions of the experiment.}}"
    output = output.replace(r"\end{tabular}", f"{caption_text}\n{label_text}\n\\end{{longtable}}")

    # Separate the k-groups: a row that starts with a number (the k value) and
    # follows another table row opens a new group. Rows within a group start
    # with blanks, and the first group follows \midrule, so neither matches.
    output = re.sub(r"\\\\\n(\d+ )", r"\\\\\n \\midrule\n\1", output)

    # Remove empty lines
    output = output.replace("\n\n", "\n")
    print(output)

The tables 
(\cref{tab:distortions-mean-std-benchmark},
 \cref{tab:distortions-mean-std-caltech},
 \cref{tab:distortions-mean-std-caltech-pca},
 \cref{tab:distortions-mean-std-census},
 \cref{tab:distortions-mean-std-census-pca},
 \cref{tab:distortions-mean-std-covertype},
 \cref{tab:distortions-mean-std-covertype-pca},
 \cref{tab:distortions-mean-std-nytimes},
 \cref{tab:distortions-mean-std-nytimes-pca},
 \cref{tab:distortions-mean-std-tower})
 show the distortions of the 5 evaluated algorithms on the different data sets.
We vary the coreset size $T$ for different $k$ values using the formula: $T=mk$ where $m = \{50, 100, 200, 500\}$.
The running time for StreamKM++ with coreset size $T=500k$ exceeds the allocated time budget of 12 hours on almost all data sets.
For this reason, the distortions for StreamKM++ with $m=500$ are excluded.



\begin{longtable}{lllllll}
\multicolumn{7}{c}{\textbf{Distortions on the \textit{Benchmark} data set}} \\
\toprule
\parbox[t]{10mm}{\ \\$k$} & \par

  output = df_results_table.loc[data_set].to_latex()


## Solution Generation Methods

## Candidate Solution Generators

In [16]:
# Build every "<attr>_mean_std" column this cell reads, so the cell does not
# depend on an earlier cell having created "distortion_mean_std".
add_combined_mean_std(df=df_aggr_costs, attr="distortion")
add_combined_mean_std(df=df_aggr_costs, attr="distortion_synthetic")
add_combined_mean_std(df=df_aggr_costs, attr="distortion_synthetic2")


# The table is produced for one data set / algorithm pair.
data_set = "Caltech"
algorithm_name = "BICO"
df_filtered = df_aggr_costs[(df_aggr_costs.dataset == data_set) & (df_aggr_costs.algorithm == algorithm_name)]
df_filtered = df_filtered.rename(columns={
    "coreset_size_factor": "m",
    "distortion_mean_std": "k-means++",
    "distortion_synthetic2_mean_std": "Random CH",
    "distortion_synthetic_mean_std": "Random MEB",
})
cols = ["k", "m", "k-means++", "Random CH", "Random MEB"]

output = df_filtered[cols].to_latex(index=False)

data_set_without_pca = data_set.replace("+PCA", "")
pca_tag = " (with PCA preprocessing)" if "PCA" in data_set else ""
data_set_text = f"\\textit{{{data_set_without_pca}}} data set{pca_tag}"
top_text = r"\multicolumn{5}{c}{\textbf{Distortions of "+algorithm_name+" on the "+data_set_text+r"}} \\"


output = output.replace(r"\begin{tabular}{rrlll}", r"\begin{longtable}{rrlll}" + "\n" + top_text)

# Group the rows by k: show each k value only on the first row of its group and
# put a \midrule between groups. Working line by line keeps this independent of
# the number of digits in k and of which m value comes first.
grouped_lines = []
previous_k = None
for line in output.split("\n"):
    row_match = re.match(r"^(\s*)(\d+)( &.*)$", line)
    if row_match is None:
        # Not a data row (rules, header, title, ...): keep unchanged.
        grouped_lines.append(line)
        continue
    indent, k_value, rest = row_match.groups()
    if k_value == previous_k:
        line = indent + " " * len(k_value) + rest
    elif previous_k is not None:
        grouped_lines.append(r"\midrule")
    previous_k = k_value
    grouped_lines.append(line)
output = "\n".join(grouped_lines)


# Typeset the k and m column headers as math symbols.
output = output.replace("k ", "$k$ ")
output = output.replace("k-", "$k$-")
output = output.replace(" m ", " $m$ ")

# The caption follows the selected algorithm and data set instead of repeating their names.
caption_text = (
    "\\caption{The effect of different solution generation approaches on the distortions "
    "(see \\cref{sec:candidate-solution-generation}).\n"
    f"The distortions are obtained by running {algorithm_name} on the {data_set_text}.\n"
    "}"
)


# Same slug convention as the other table labels: "+" becomes "-".
label_postfix = f"{data_set.replace('+', '-').lower()}-{algorithm_name.lower()}"
end_texts = [
    caption_text,
    r"\label{tab:comparison-solution-generation-"+label_postfix+"}",
    r"\end{longtable}",
]

output = output.replace(r"\end{tabular}", "\n".join(end_texts))
print(output)

\begin{longtable}{rrlll}
\multicolumn{5}{c}{\textbf{Distortions of BICO on the \textit{Caltech} data set}} \\
\toprule
 $k$ &   $m$ &    $k$-means++ &    Random CH &   Random MEB \\
\midrule
10  &  50 & 5.12 (0.307) & 4.30 (0.270) & 2.63 (0.166) \\
    & 100 & 4.48 (0.284) & 3.91 (0.240) & 2.54 (0.125) \\
    & 200 & 4.08 (0.319) & 3.54 (0.236) & 2.46 (0.134) \\
    & 500 & 3.41 (0.215) & 3.06 (0.155) & 2.18 (0.080) \\
\midrule
20  &  50 & 6.35 (1.173) & 4.73 (0.632) & 2.63 (0.147) \\
    & 100 & 4.65 (0.283) & 3.82 (0.245) & 2.41 (0.104) \\
    & 200 & 4.19 (0.384) & 3.49 (0.271) & 2.26 (0.141) \\
    & 500 & 3.50 (0.404) & 3.07 (0.301) & 2.10 (0.128) \\
\midrule
30  &  50 & 6.01 (0.335) & 4.32 (0.204) & 2.57 (0.096) \\
    & 100 & 5.10 (0.628) & 3.89 (0.340) & 2.35 (0.125) \\
    & 200 & 4.29 (0.659) & 3.47 (0.440) & 2.21 (0.170) \\
    & 500 & 3.09 (0.138) & 2.69 (0.122) & 1.87 (0.064) \\
\midrule
40  &  50 & 6.24 (0.524) & 4.33 (0.228) & 2.44 (0.076) \\
    & 100 & 5.23 (0.874) & 3

  output = df_filtered[cols].to_latex(index=False)


## Comparing Solution Generation

In [17]:
# Largest mean distortion per (data set, algorithm) for each of the three
# distortion measures, leaving out the "Benchmark" and "NYTimes+PCA" data sets.
cols = ["dataset", "algorithm", "k", "m", "KM", "CH", "MEB"]
short_names = {
    "coreset_size_factor": "m",
    "distortion_mean": "KM",
    "distortion_synthetic2_mean": "CH",
    "distortion_synthetic_mean": "MEB",
}
is_excluded = df_aggr_costs.dataset.isin(["Benchmark", "NYTimes+PCA"])
df_filtered = df_aggr_costs[~is_excluded].copy().rename(columns=short_names)[cols]

df_max_distortions = df_filtered.groupby(["dataset", "algorithm"])[["KM", "CH", "MEB"]].max()

In [18]:
# Turn the max-distortion summary into a LaTeX longtable and print it.
output = df_max_distortions.to_latex()

output = output.replace(r"\begin{tabular}{llrrr}", r"\begin{longtable}{llrrr}")

# Replace the two header rows (column names / index names) with a single labelled header row.
output = output.replace(r"      &            &         KM &         CH &        MEB \\", r" Data set  & Algorithm     &         KM &         CH &        MEB \\")
output = output.replace(r"dataset & algorithm &            &            &            \\", r"")
output = output.replace(r" KM ", r"$k$-means++")
output = output.replace(r" CH ", r"Random CH")
output = output.replace(r" MEB ", r"Random MEB")

# A row that starts with a capital letter (a data set name) opens a new group:
# put a \midrule in front of it.
output = re.sub(r"( \\\\\n)([A-Z])", r"\1\\midrule\n\2", output, flags=re.MULTILINE)

# Backslashes are written as "\\" so that "\c" is not read as an (invalid) escape sequence.
caption_text = (
    "\\caption{The effect of different solution generation approaches on the distortions "
    "(see \\cref{sec:evaluation-procedure}).\n"
    "The distortions are aggregated across different algorithms and data sets.\n"
    "}"
)

end_texts = [
    caption_text,
    r"\label{tab:comparison-solution-generation-all}",
    r"\end{longtable}",
]

output = output.replace(r"\end{tabular}", "\n".join(end_texts))

# Remove empty lines
output = output.replace("\n\n", "\n")

print(output)

\begin{longtable}{llrrr}
\toprule
 Data set  & Algorithm     &        $k$-means++&        Random CH&       Random MEB\\
\midrule
Caltech & BICO &   7.499068 &   4.800188 &   2.629218 \\
      & Group Sampling &   1.068441 &   1.035997 &   1.006828 \\
      & Ray Maker &   7.053856 &   5.442665 &   3.453930 \\
      & Sensitivity Sampling &   1.048977 &   1.016564 &   1.029646 \\
      & StreamKM++ &   1.202426 &   1.111698 &   1.012216 \\
\midrule
Caltech+PCA & BICO &   2.862242 &   2.411732 &   1.825558 \\
      & Group Sampling &   1.058707 &   1.021386 &   1.007735 \\
      & Ray Maker &   2.941248 &   2.563131 &   2.053860 \\
      & Sensitivity Sampling &   1.040106 &   1.012539 &   1.032113 \\
      & StreamKM++ &   1.139141 &   1.073384 &   1.013089 \\
\midrule
Census & BICO &   2.502564 &   1.441889 &   1.040356 \\
      & Group Sampling &   1.077525 &   1.032349 &   1.028025 \\
      & Ray Maker &   2.483973 &   1.633945 &   1.236878 \\
      & Sensitivity Sampling &   1.03297

  output = df_max_distortions.to_latex()
