In [1]:
import json
import os
import re

import pandas as pd
from pathlib import Path

In [2]:
# Root directory of the experiment results; passed to read_experiment() below.
experiment_path = Path("../data/experiments/cluster/fully_observed")

In [3]:
def recursive_split(path):
    """Split ``path`` into a tuple of its components.

    Repeatedly applies ``os.path.split`` and collects the tails, e.g.
    ``"../data/x"`` -> ``("..", "data", "x")``. A leading root is not
    included in the result (``"/a/b"`` -> ``("a", "b")``).

    Parameters
    ----------
    path : str or os.PathLike
        The path to split.

    Returns
    -------
    tuple of str
        The path components, outermost first.
    """
    components = []
    current = os.fspath(path)
    while True:
        rest, tail = os.path.split(current)
        reached_start = rest in ('', os.path.sep)
        # os.path.split returns some roots unchanged (e.g. '//' on POSIX or a
        # drive root on Windows). Stop there instead of looping forever; such
        # a root is dropped just like a plain leading separator.
        if not reached_start and rest == current:
            break
        components.append(tail)
        if reached_start:
            break
        current = rest
    return tuple(reversed(components))

# Show the components that experiment_path splits into.
recursive_split(experiment_path)

('..', 'data', 'experiments', 'cluster', 'fully_observed')

In [4]:
def read_experiment(path):
    """Build an index of the result CSV files below an experiment directory.

    The directory is expected to be laid out as
    ``<path>/<imputer>/<task>/<missing_type>/<missing_fraction>/<strategy>/<file>.csv``.
    Only CSV files located exactly at that depth are indexed; CSV files that
    are nested deeper (in sub-directories of a strategy directory) or that
    sit higher up in the tree are skipped.

    Parameters
    ----------
    path : pathlib.Path or str
        Root directory of the experiment. Its last component is used as the
        value of the ``experiment`` column.

    Returns
    -------
    pandas.DataFrame
        One row per CSV file with the columns ``experiment``, ``imputer``,
        ``task``, ``missing_type``, ``missing_fraction``, ``strategy``,
        ``file`` and ``path`` (the ``pathlib.Path`` of the file). The frame
        is empty, but still has these columns, when no file is found.
    """
    path = Path(path)
    level_names = ["imputer", "task", "missing_type", "missing_fraction", "strategy", "file"]
    columns = ["experiment"] + level_names + ["path"]

    records = []
    for csv_path in path.rglob("*.csv"):
        # Derive the levels from the location relative to the `path` argument
        # itself rather than from any notebook-level variable.
        levels = csv_path.relative_to(path).parts
        if len(levels) != len(level_names):
            continue  # not at the expected depth (e.g. inside a sub-directory)
        records.append((path.name, *levels, csv_path))

    return pd.DataFrame(records, columns=columns)

# Preview the first rows of the CSV file index built for experiment_path.
read_experiment(experiment_path).head()

Unnamed: 0,experiment,imputer,task,missing_type,missing_fraction,strategy,file,path
0,fully_observed,AutoKerasImputer,32,MCAR,0.01,single_all,impute_performance_std_input4.csv,../data/experiments/cluster/fully_observed/Aut...
1,fully_observed,AutoKerasImputer,32,MCAR,0.01,single_all,downstream_performance_std_input4.csv,../data/experiments/cluster/fully_observed/Aut...
2,fully_observed,AutoKerasImputer,32,MCAR,0.01,single_all,impute_performance_mean_input4.csv,../data/experiments/cluster/fully_observed/Aut...
3,fully_observed,AutoKerasImputer,32,MCAR,0.01,single_all,downstream_performance_mean_input4.csv,../data/experiments/cluster/fully_observed/Aut...
4,fully_observed,AutoKerasImputer,32,MCAR,0.3,single_all,impute_performance_mean_input8.csv,../data/experiments/cluster/fully_observed/Aut...


In [5]:
def read_prefixed_csv_files(df_in, file_prefix):
    """Load every indexed CSV whose file name starts with ``file_prefix``.

    Each file is read with ``pandas.read_csv``; its unnamed first column is
    renamed to ``metric`` and the row's ``experiment``, ``imputer``,
    ``task``, ``missing_type``, ``missing_fraction`` and ``strategy`` values
    are attached as constant columns. The part of the file name between
    ``"<file_prefix>_"`` and ``".csv"`` is stored in ``column`` (the column
    is left out for a file whose name does not follow that pattern), and
    ``file_prefix`` itself is stored in ``result_type``.

    Parameters
    ----------
    df_in : pandas.DataFrame
        File index with at least the columns ``file``, ``path`` and the six
        metadata columns listed above.
    file_prefix : str
        Literal file-name prefix that selects the files to read.

    Returns
    -------
    pandas.DataFrame
        The concatenated contents of all selected files, or an empty
        DataFrame when no file name starts with ``file_prefix``.
    """
    meta_columns = ["experiment", "imputer", "task", "missing_type", "missing_fraction", "strategy"]
    # Prefix and the ".csv" suffix are matched literally, so regex
    # metacharacters in either cannot produce accidental matches.
    name_pattern = re.compile(rf"{re.escape(file_prefix)}_(\S*)\.csv")

    selected = df_in[df_in["file"].str.startswith(file_prefix, na=False)]
    frames = []
    for _, row in selected.iterrows():
        frame = pd.read_csv(row["path"]).rename(columns={"Unnamed: 0": "metric"})
        for name in meta_columns:
            frame[name] = row[name]
        match = name_pattern.match(row["file"])
        if match is not None:
            frame["column"] = match.group(1)
        frame["result_type"] = file_prefix
        frames.append(frame)

    if not frames:
        # pd.concat raises on an empty list; report "nothing found" instead.
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)

In [6]:
def read_csv_files(df):
    """Read all four result types listed in ``df`` into one DataFrame.

    Calls ``read_prefixed_csv_files`` once per result-type prefix (impute and
    downstream performance, each as std and mean) and concatenates the
    resulting frames with a fresh index.
    """
    result_types = (
        "impute_performance_std",
        "impute_performance_mean",
        "downstream_performance_std",
        "downstream_performance_mean",
    )
    frames_by_type = []
    for result_type in result_types:
        frames_by_type.append(read_prefixed_csv_files(df, result_type))
    return pd.concat(frames_by_type, ignore_index=True)

# Load the contents of all indexed result CSVs into a single DataFrame.
read_csv_files(read_experiment(experiment_path))

Unnamed: 0,metric,train,test,experiment,imputer,task,missing_type,missing_fraction,strategy,column,result_type,baseline,corrupted,imputed
0,MAE,2.423165,3.564116,fully_observed,AutoKerasImputer,32,MCAR,0.01,single_all,input4,impute_performance_std,,,
1,MSE,30.923196,32.844677,fully_observed,AutoKerasImputer,32,MCAR,0.01,single_all,input4,impute_performance_std,,,
2,RMSE,2.642595,3.564116,fully_observed,AutoKerasImputer,32,MCAR,0.01,single_all,input4,impute_performance_std,,,
3,MAE,0.441542,1.079771,fully_observed,AutoKerasImputer,32,MCAR,0.3,single_all,input8,impute_performance_std,,,
4,MSE,17.692893,44.049110,fully_observed,AutoKerasImputer,32,MCAR,0.3,single_all,input8,impute_performance_std,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40639,F1_macro,,,fully_observed,KNNImputer,1220,MNAR,0.3,single_all,ad_id,downstream_performance_mean,0.476541,0.477089,0.477089
40640,F1_weighted,,,fully_observed,KNNImputer,1220,MNAR,0.3,single_all,ad_id,downstream_performance_mean,0.761524,0.761731,0.761731
40641,F1_micro,,,fully_observed,KNNImputer,1220,MNAR,0.1,single_all,advertiser_id,downstream_performance_mean,0.827034,0.827059,0.827034
40642,F1_macro,,,fully_observed,KNNImputer,1220,MNAR,0.1,single_all,advertiser_id,downstream_performance_mean,0.476541,0.476162,0.476151
