In [22]:
import re

from pathlib import Path

import numpy as np
import pandas as pd

from sklearn.metrics import pairwise_distances

In [23]:
def get_weighted_costs(file_paths):
    """Load one cost value per file and label it with its experiment settings.

    Every path is expected to contain ``.../<dataset>/<algorithm>-k<k>-...``;
    the dataset, algorithm and ``k`` are parsed from the path, and the whole
    content of the file is parsed as a single float.

    Parameters
    ----------
    file_paths : iterable of str or pathlib.Path
        Paths of the cost files to read.

    Returns
    -------
    pandas.DataFrame
        One row per file with the columns ``dataset``, ``algorithm``, ``k``,
        ``cost`` and ``file_path``. ``k`` is kept as the digit string captured
        from the path. The columns are present even when ``file_paths`` is
        empty.

    Raises
    ------
    ValueError
        If a path does not match the expected layout, or (from ``float``) if
        the file content is not a valid number.
    """
    columns = ["dataset", "algorithm", "k", "cost", "file_path"]
    path_pattern = re.compile(r"/.+/(.+)/(.+)-k(\d+)-")

    costs = []
    for cost_path in file_paths:
        # Match against the forward-slash form of the path so the pattern also
        # works where the native separator is a backslash.
        match = path_pattern.search(Path(cost_path).as_posix())
        if match is None:
            raise ValueError(
                f"Cannot parse dataset/algorithm/k from cost file path: {cost_path}"
            )
        dataset, algorithm, k = match.groups()
        with open(cost_path, "r") as f:
            weighted_cost = float(f.read())
        costs.append({
            "dataset": dataset,
            "algorithm": algorithm,
            "k": k,
            "cost": weighted_cost,
            "file_path": str(cost_path)
        })
    # Explicit columns keep the schema stable when no files were found, so a
    # later groupby on dataset/algorithm/k does not fail with a KeyError.
    return pd.DataFrame(costs, columns=columns)

In [24]:
# Root directory of the hardinstance results; the cost files are searched for
# recursively below this directory.
data_results_dir = Path("../data/results/hardinstance/")

## Real Costs

In [25]:
# Recursively collect every real_cost.txt below the results directory.
real_cost_paths = list(data_results_dir.glob("**/real_cost.txt"))

In [26]:
# Sanity check: number of real-cost files that were found.
len(real_cost_paths)

111

In [27]:
# Read each real-cost file and label it with the dataset, algorithm and k
# parsed from its path (one row per file).
df_real_costs = get_weighted_costs(real_cost_paths)

In [28]:
# Collapse the individual files of every (dataset, algorithm, k) configuration
# into the number of cost values, their mean and their standard deviation.
# The standard deviation is NaN for configurations with a single cost value.
df_aggr_real_cost_data = (
    df_real_costs
    .groupby(["dataset", "algorithm", "k"], as_index=False)
    .agg(
        cost_count=pd.NamedAgg(column="cost", aggfunc="count"),
        cost_mean=pd.NamedAgg(column="cost", aggfunc="mean"),
        cost_std=pd.NamedAgg(column="cost", aggfunc="std"),
    )
)
df_aggr_real_cost_data

Unnamed: 0,dataset,algorithm,k,cost_count,cost_mean,cost_std
0,hardinstance,bico,10,10,4852765.0,49864.529677
1,hardinstance,bico,20,10,13317750.0,107411.321838
2,hardinstance,bico,30,10,2596250.0,15255.984079
3,hardinstance,bico,40,10,8240570.0,67835.103465
4,hardinstance,bico,50,10,20245250.0,149615.975928
5,hardinstance,group-sampling,10,8,4639371.0,59324.589029
6,hardinstance,group-sampling,20,8,12856670.0,242183.267664
7,hardinstance,group-sampling,30,8,2542662.0,33421.397631
8,hardinstance,group-sampling,40,7,8143744.0,58082.61383
9,hardinstance,group-sampling,50,1,19870750.0,


### Real cost summary

In [29]:
# Wide view of the mean real cost: one row per algorithm, one column per k.
df_real_cost_summary = df_aggr_real_cost_data.pivot_table(
    values="cost_mean",
    index=["algorithm"],
    columns=["k"],
)
# Optional styled rendering, kept disabled:
#   df_real_cost_summary.style.format(precision=0, thousands=",").highlight_min(color = 'lightgreen', axis = 0)
df_real_cost_summary

k,10,20,30,40,50
algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
bico,4852765.0,13317750.0,2596250.0,8240570.0,20245250.0
group-sampling,4639371.0,12856670.0,2542662.0,8143744.0,19870750.0
sensitivity-sampling,4672991.0,12885240.0,2564847.0,8263263.0,20136850.0


### Real cost standard deviations

In [30]:
# Wide view of the standard deviation of the real cost: one row per algorithm,
# one column per k.
# NOTE(review): pivot_table averages duplicate (algorithm, k) entries by
# default, so with more than one dataset these cells would be averaged
# standard deviations - confirm before reusing on several datasets.
df_real_cost_summary_std = df_aggr_real_cost_data.pivot_table(
    values="cost_std",
    index=["algorithm"],
    columns=["k"],
)
# Optional styled rendering, kept disabled:
#   df_real_cost_summary_std.style.format(precision=0, thousands=",").highlight_min(color = 'lightgreen', axis = 0)
df_real_cost_summary_std

k,10,20,30,40,50
algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
bico,49864.529677,107411.321838,15255.984079,67835.103465,149615.975928
group-sampling,59324.589029,242183.267664,33421.397631,58082.61383,
sensitivity-sampling,34843.119526,142236.564482,31395.38262,54791.375166,


## Coreset Costs

In [15]:
# Recursively collect every coreset_cost.txt below the results directory.
coreset_cost_paths = list(data_results_dir.glob("**/coreset_cost.txt"))

In [16]:
# Sanity check: number of coreset-cost files that were found.
# NOTE(review): the recorded output (112) is one more than the recorded number
# of real-cost files (111) - presumably one run has no real_cost.txt; confirm.
len(coreset_cost_paths)

112

In [17]:
# Read each coreset-cost file and label it with the dataset, algorithm and k
# parsed from its path (one row per file).
df_coreset_costs = get_weighted_costs(coreset_cost_paths)

In [18]:
# Collapse the individual files of every (dataset, algorithm, k) configuration
# into the number of cost values, their mean and their standard deviation.
# The standard deviation is NaN for configurations with a single cost value.
df_aggr_coreset_cost_data = (
    df_coreset_costs
    .groupby(["dataset", "algorithm", "k"], as_index=False)
    .agg(
        cost_count=pd.NamedAgg(column="cost", aggfunc="count"),
        cost_mean=pd.NamedAgg(column="cost", aggfunc="mean"),
        cost_std=pd.NamedAgg(column="cost", aggfunc="std"),
    )
)
df_aggr_coreset_cost_data

Unnamed: 0,dataset,algorithm,k,cost_count,cost_mean,cost_std
0,hardinstance,bico,10,11,1246969.0,75630.705419
1,hardinstance,bico,20,10,3487690.0,158875.60243
2,hardinstance,bico,30,10,795591.2,30478.274584
3,hardinstance,bico,40,10,2038664.0,123529.183592
4,hardinstance,bico,50,10,4722689.0,334296.891985
5,hardinstance,group-sampling,10,8,4587714.0,56435.294508
6,hardinstance,group-sampling,20,8,12690580.0,225976.772754
7,hardinstance,group-sampling,30,8,2511511.0,31687.153632
8,hardinstance,group-sampling,40,7,8044408.0,53043.887031
9,hardinstance,group-sampling,50,1,19636070.0,


In [20]:
# Wide view of the mean coreset cost: one row per algorithm, one column per k.
df_coreset_cost_summary = df_aggr_coreset_cost_data.pivot_table(
    values="cost_mean",
    index=["algorithm"],
    columns=["k"],
)
df_coreset_cost_summary

k,10,20,30,40,50
algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
bico,1246969.0,3487690.0,795591.2,2038664.0,4722689.0
group-sampling,4587714.0,12690580.0,2511511.0,8044408.0,19636070.0
sensitivity-sampling,4653270.0,12821700.0,2552491.0,8230189.0,20049030.0


In [31]:
# Wide view of the standard deviation of the coreset cost: one row per
# algorithm, one column per k.
# NOTE(review): pivot_table averages duplicate (algorithm, k) entries by
# default, so with more than one dataset these cells would be averaged
# standard deviations - confirm before reusing on several datasets.
df_coreset_cost_summary_std = df_aggr_coreset_cost_data.pivot_table(
    values="cost_std",
    index=["algorithm"],
    columns=["k"],
)
# Optional styled rendering, kept disabled:
#   df_coreset_cost_summary_std.style.format(precision=0, thousands=",").highlight_min(color = 'lightgreen', axis = 0)
df_coreset_cost_summary_std

k,10,20,30,40,50
algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
bico,75630.705419,158875.60243,30478.274584,123529.183592,334296.891985
group-sampling,56435.294508,225976.772754,31687.153632,53043.887031,
sensitivity-sampling,31990.582913,136810.584088,30003.380505,54088.540931,


## Distortion

In [32]:
# Element-wise ratio of the mean real cost to the mean coreset cost for every
# (algorithm, k) cell; the two frames are aligned on their labels.
# NOTE(review): this is a ratio of means, not a mean of per-run ratios, and in
# the recorded outputs the two sides aggregate different numbers of files
# (e.g. bico, k=10: 10 real costs vs 11 coreset costs) - confirm this is the
# intended definition of distortion.
df_real_cost_summary.div(df_coreset_cost_summary)

k,10,20,30,40,50
algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
bico,3.891648,3.818501,3.263296,4.042143,4.286806
group-sampling,1.01126,1.013088,1.012403,1.012349,1.011952
sensitivity-sampling,1.004238,1.004956,1.004841,1.004019,1.00438
