In [1]:
import re

from pathlib import Path

import numpy as np
import pandas as pd

from sklearn.metrics import pairwise_distances

In [2]:
def get_weighted_costs(file_paths):
    """Load the weighted cost stored in each result file into a DataFrame.

    Every path must contain a fragment of the form
    ``/<anything>/<dataset>/<algorithm>-k<digits>-``; the dataset name, the
    algorithm name and ``k`` are parsed from that fragment. The file itself
    must contain a single number, which is read as the cost.

    Parameters
    ----------
    file_paths : iterable of str or pathlib.Path
        Paths of the cost files to read.

    Returns
    -------
    pandas.DataFrame
        One row per input path with the columns ``dataset``, ``algorithm``,
        ``k`` (int), ``cost`` (float) and ``file_path`` (str). The columns are
        present even when ``file_paths`` is empty.

    Raises
    ------
    ValueError
        If a path does not match the expected layout, or if the content of a
        file cannot be converted to ``float``.
    """
    columns = ["dataset", "algorithm", "k", "cost", "file_path"]
    path_pattern = re.compile(r"/.+/(.+)/(.+)-k(\d+)-")

    costs = []
    for cost_path in file_paths:
        # Normalise separators so the pattern also matches Windows-style paths.
        match = path_pattern.search(Path(cost_path).as_posix())
        if match is None:
            raise ValueError(
                f"Cannot parse dataset/algorithm/k from path: {cost_path}"
            )
        dataset, algorithm, k = match.groups()
        with open(cost_path, "r") as f:
            weighted_cost = float(f.read())
        costs.append({
            "dataset": dataset,
            "algorithm": algorithm,
            # Store k numerically; as a string it would sort as "100" < "20".
            "k": int(k),
            "cost": weighted_cost,
            "file_path": str(cost_path)
        })
    # Passing the columns explicitly keeps the schema stable for empty input.
    return pd.DataFrame(costs, columns=columns)

In [3]:
# Results directory to analyse. Switch datasets by leaving exactly one of the
# assignments below uncommented.
# data_results_dir = Path("../data/results-lowd/towerlowd/")
# data_results_dir = Path("../data/results-lowd/covertypelowd/")
data_results_dir = Path("../data/results-lowd/censuslowd/")

In [4]:
# Recursively collect every real_cost.txt file below the selected results directory.
cost_paths = list(data_results_dir.glob("**/real_cost.txt"))

In [5]:
# Sanity check: how many cost files were found.
len(cost_paths)

70

In [6]:
# Parse each cost file into one row (dataset, algorithm, k, cost, file_path).
df_cost_data = get_weighted_costs(cost_paths)

In [7]:
# Summarise the cost per (dataset, algorithm, k) group: number of cost files,
# mean cost and standard deviation of the cost.
df_aggr_cost_data = (
    df_cost_data
    .groupby(["dataset", "algorithm", "k"], as_index=False)
    .agg(
        cost_count=pd.NamedAgg(column="cost", aggfunc="count"),
        cost_mean=pd.NamedAgg(column="cost", aggfunc="mean"),
        cost_std=pd.NamedAgg(column="cost", aggfunc="std"),
    )
)
df_aggr_cost_data

Unnamed: 0,dataset,algorithm,k,cost_count,cost_mean,cost_std
0,censuslowd,basic-clustering,10,6,244585500.0,3800538.0
1,censuslowd,basic-clustering,20,5,190738600.0,2409336.0
2,censuslowd,basic-clustering,30,4,159381900.0,2560551.0
3,censuslowd,basic-clustering,40,3,138619500.0,543573.0
4,censuslowd,basic-clustering,50,1,128190800.0,
5,censuslowd,group-sampling,10,6,247746400.0,1855137.0
6,censuslowd,group-sampling,20,6,193072000.0,3306341.0
7,censuslowd,group-sampling,30,5,160976700.0,2235474.0
8,censuslowd,group-sampling,40,5,142092400.0,2177736.0
9,censuslowd,group-sampling,50,4,129772700.0,1330896.0


In [8]:
# Wide view of the mean cost: one row per algorithm, one column per k.
df = df_aggr_cost_data.pivot_table(
    values="cost_mean",
    index=["algorithm"],
    columns=["k"],
)
# Optional styling, currently disabled:
#   .style.format(precision=0, thousands=",").highlight_min(color = 'lightgreen', axis = 0)
df

k,10,20,30,40,50
algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
basic-clustering,244585500.0,190738600.0,159381900.0,138619500.0,128190800.0
group-sampling,247746400.0,193072000.0,160976700.0,142092400.0,129772700.0
sensitivity-sampling,252523800.0,189021200.0,159730100.0,142898900.0,129797500.0


In [9]:
# Wide view of the cost standard deviation: one row per algorithm, one column per k.
# NOTE(review): pivot_table is called with its default dropna setting, so an
# algorithm whose std is NaN for every k (e.g. only single runs) presumably
# drops out of this table — confirm whether that is intended.
df = df_aggr_cost_data.pivot_table(
    values="cost_std",
    index=["algorithm"],
    columns=["k"],
)
# Optional styling, currently disabled:
#   .style.format(precision=0, thousands=",").highlight_min(color = 'lightgreen', axis = 0)
df

k,10,20,30,40,50
algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
group-sampling,7018345.0,2811315.0,2273345.0,2515538.0,1102451.0
sensitivity-sampling,4799339.0,2965414.0,2732427.0,2226087.0,1075695.0
