In [1]:
import re

from pathlib import Path

import numpy as np
import pandas as pd

from sklearn.metrics import pairwise_distances

In [2]:
def get_weighted_costs(file_paths):
    """Collect the weighted costs stored in result files into a DataFrame.

    Every path must contain a ``/<dataset>/<algorithm>-k<k>-`` portion
    (preceded by at least one more directory level); the dataset name, the
    algorithm name and ``k`` are parsed from it. The content of each file is
    parsed as a single float.

    Parameters
    ----------
    file_paths : iterable of str or pathlib.Path
        Paths of the cost files to read.

    Returns
    -------
    pandas.DataFrame
        One row per file with the columns ``dataset``, ``algorithm``,
        ``k`` (int), ``cost`` (float) and ``file_path`` (str). The columns
        are present even when ``file_paths`` is empty.

    Raises
    ------
    ValueError
        If a path does not match the expected layout, or if the content of
        a file cannot be parsed as a float.
    """
    path_pattern = re.compile(r"/.+/(.+)/(.+)-k(\d+)-")
    columns = ["dataset", "algorithm", "k", "cost", "file_path"]

    costs = []
    for cost_path in file_paths:
        # Match against the forward-slash form of the path so that the
        # "/"-based pattern also works for Windows-style paths.
        match = path_pattern.search(Path(cost_path).as_posix())
        if match is None:
            raise ValueError(
                f"Cannot parse dataset, algorithm and k from path: {cost_path}"
            )
        dataset, algorithm, k = match.groups()

        with open(cost_path, "r") as f:
            weighted_cost = float(f.read())

        costs.append({
            "dataset": dataset,
            "algorithm": algorithm,
            # Store k as a number: as a string, "100" would sort before "20"
            # when grouping or pivoting on this column.
            "k": int(k),
            "cost": weighted_cost,
            "file_path": str(cost_path)
        })

    # Passing the column names keeps the schema stable for empty input.
    return pd.DataFrame(costs, columns=columns)

In [3]:
# Results directory of the dataset under analysis.
# Other available datasets: "tower" and "census" (replace the last component).
data_results_dir = Path("..", "data", "results", "covertype")

In [4]:
# Recursively collect every weighted-cost file below the results directory.
# The glob order follows the filesystem's directory listing and is not
# guaranteed, so sort the paths to keep the row order reproducible across runs.
cost_paths = sorted(data_results_dir.glob("**/weighted_cost.txt"))

In [5]:
# Sanity check: number of cost files that were found.
len(cost_paths)

246

In [6]:
# Read every cost file into one row of (dataset, algorithm, k, cost, file_path).
df_cost_data = get_weighted_costs(file_paths=cost_paths)

In [7]:
# Summarise the cost over all files that share the same
# (dataset, algorithm, k) combination: number of files, mean and standard deviation.
df_aggr_cost_data = (
    df_cost_data
    .groupby(by=["dataset", "algorithm", "k"], as_index=False)
    .agg(**{
        "cost_count": pd.NamedAgg(column="cost", aggfunc="count"),
        "cost_mean": pd.NamedAgg(column="cost", aggfunc="mean"),
        "cost_std": pd.NamedAgg(column="cost", aggfunc="std"),
    })
)
df_aggr_cost_data

Unnamed: 0,dataset,algorithm,k,cost_count,cost_mean,cost_std
0,covertype,bico,10,17,308436700000.0,1185021000.0
1,covertype,bico,20,17,184147900000.0,1089146000.0
2,covertype,bico,30,17,141107900000.0,878743600.0
3,covertype,bico,40,17,119219100000.0,613017400.0
4,covertype,bico,50,17,105992800000.0,474772500.0
5,covertype,group-sampling,10,17,325812600000.0,2529468000.0
6,covertype,group-sampling,20,16,197350900000.0,1932491000.0
7,covertype,group-sampling,30,16,149788500000.0,906708900.0
8,covertype,group-sampling,40,16,125232200000.0,822097100.0
9,covertype,group-sampling,50,16,109906200000.0,517849300.0


In [8]:
# Mean cost with one row per algorithm and one column per k; the smallest
# value of every column is highlighted.
# NOTE(review): pivot_table averages duplicate (algorithm, k) entries by
# default, so this assumes the aggregated frame holds a single dataset — confirm.
df = df_aggr_cost_data.pivot_table(
    values="cost_mean",
    index=["algorithm"],
    columns=["k"],
)
df.style.format(thousands=",", precision=0).highlight_min(axis=0, color="lightgreen")

k,10,20,30,40,50
algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
bico,308436725384,184147913356,141107926238,119219073957,105992823416
group-sampling,325812604646,197350906928,149788536849,125232163925,109906218454
sensitivity-sampling,334405891480,201704900579,153814257452,128182175570,112422821982


In [9]:
# Standard deviation of the cost with one row per algorithm and one column
# per k; the smallest value of every column is highlighted.
# NOTE(review): pivot_table averages duplicate (algorithm, k) entries by
# default, so this assumes the aggregated frame holds a single dataset — confirm.
df = df_aggr_cost_data.pivot_table(
    values="cost_std",
    index=["algorithm"],
    columns=["k"],
)
df.style.format(thousands=",", precision=0).highlight_min(axis=0, color="lightgreen")

k,10,20,30,40,50
algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
bico,1185021232,1089145752,878743582,613017419,474772510
group-sampling,2529467843,1932491142,906708919,822097053,517849325
sensitivity-sampling,1981484582,1837283055,1621327072,925418593,728693915
