In [1]:
import json
import os
import re

import pandas as pd
from pathlib import Path

In [2]:
EXPERIMENT = "0815"

In [3]:
base_path = Path("../data/experiments")
experiment_path = base_path / EXPERIMENT

In [4]:
def recursive_split(path):
    rest, tail = os.path.split(path)
    if rest in  ('', os.path.sep):
        return tail,
    return recursive_split(rest) + (tail,)

recursive_split(experiment_path)

('..', 'data', 'experiments', '0815')

In [5]:
def read_experiment(path):
    objects = path.rglob('*.csv')
    data = []
    for obj in objects:
        data.append(recursive_split(obj)[0:10] + (obj,))
    columns = ["col1", "col2", "col3", "experiment", "imputer", "task", "missing_type", "missing_fraction", "strategy", "file_or_dir", "path"]
    df = pd.DataFrame(data=data, columns=columns)
    df.drop(["col1", "col2", "col3"], axis=1, inplace=True)
    df = df[df["file_or_dir"].str.endswith(".csv")] # avoid reading sub-directories
    df.rename(columns={"file_or_dir": "file"}, inplace=True)
    return df.reset_index(drop=True)

read_experiment(experiment_path).head()

Unnamed: 0,experiment,imputer,task,missing_type,missing_fraction,strategy,file,path
0,815,ModeImputer,1459,MAR,0.5,single_all,impute_performance_std_V4.csv,../data/experiments/0815/ModeImputer/1459/MAR/...
1,815,ModeImputer,1459,MAR,0.5,single_all,downstream_performance_std_V4.csv,../data/experiments/0815/ModeImputer/1459/MAR/...
2,815,ModeImputer,1459,MAR,0.5,single_all,impute_performance_mean_V4.csv,../data/experiments/0815/ModeImputer/1459/MAR/...
3,815,ModeImputer,1459,MAR,0.5,single_all,downstream_performance_mean_V4.csv,../data/experiments/0815/ModeImputer/1459/MAR/...
4,815,ModeImputer,1459,MAR,0.5,single_single,impute_performance_std_V5.csv,../data/experiments/0815/ModeImputer/1459/MAR/...


In [6]:
def read_prefixed_csv_files(df_in, file_prefix):
    col_pattern = f"({file_prefix}_)(\\S*)(.csv)"
    dfs = []
    for row in df_in[df_in["file"].str.startswith(file_prefix)].iterrows():
        df_new = pd.read_csv(row[1]["path"])
        df_new.rename({"Unnamed: 0": "metric"}, inplace=True, axis=1)
        df_new["experiment"] = row[1]["experiment"]
        df_new["imputer"] = row[1]["imputer"]
        df_new["task"] = row[1]["task"]    
        df_new["missing_type"] = row[1]["missing_type"]
        df_new["missing_fraction"] = row[1]["missing_fraction"]
        df_new["strategy"] = row[1]["strategy"]
        try:
            df_new["column"] = re.findall(col_pattern, row[1]["file"])[0][1]
        except:
            pass
        df_new["result_type"] = file_prefix
        dfs.append(df_new)
    return pd.concat(dfs, ignore_index=True)

In [7]:
def read_csv_files(df):
    prefixes = [
        "impute_performance_std", 
        "impute_performance_mean", 
        "downstream_performance_std", 
        "downstream_performance_mean"
    ]
    return pd.concat([read_prefixed_csv_files(df, prefix) for prefix in prefixes], ignore_index=True)

read_csv_files(read_experiment(experiment_path))

Unnamed: 0,metric,train,test,experiment,imputer,task,missing_type,missing_fraction,strategy,column,result_type,baseline,corrupted,imputed
0,MAE,2.669271,1.027342,0815,ModeImputer,1459,MAR,0.5,single_all,V4,impute_performance_std,,,
1,MSE,69.292870,22.993417,0815,ModeImputer,1459,MAR,0.5,single_all,V4,impute_performance_std,,,
2,RMSE,3.174474,1.211478,0815,ModeImputer,1459,MAR,0.5,single_all,V4,impute_performance_std,,,
3,MAE,0.642248,0.629532,0815,ModeImputer,1459,MAR,0.5,single_single,V5,impute_performance_std,,,
4,MSE,19.828447,24.355699,0815,ModeImputer,1459,MAR,0.5,single_single,V5,impute_performance_std,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
571,F1_macro,,,0815,KNNImputer,4552,MNAR,0.2,single_all,V5,downstream_performance_mean,0.422599,0.402966,0.403619
572,F1_weighted,,,0815,KNNImputer,4552,MNAR,0.2,single_all,V5,downstream_performance_mean,0.723594,0.710078,0.711006
573,F1_micro,,,0815,KNNImputer,4552,MNAR,0.2,single_single,V8,downstream_performance_mean,0.726390,0.726390,0.725949
574,F1_macro,,,0815,KNNImputer,4552,MNAR,0.2,single_single,V8,downstream_performance_mean,0.422599,0.422599,0.423624
