In [1]:
import glob
import json

import pandas as pd

from pathlib import Path

from data_imputation_paper.experiment import _recursive_split

In [2]:
# Training-time measurements are collected from two experiment groups
# ("fully_observed" and "corrupted") that share the same pattern below the
# group directory.
experiments_root = Path("../data/experiments")
measurement_pattern = Path("**") / "MCAR" / "**" / "single_all" / "elapsed_train_time_*.json"

glob_a = str(experiments_root / "fully_observed" / measurement_pattern)
glob_b = str(experiments_root / "corrupted" / measurement_pattern)

# recursive=True lets each "**" span any number of directory levels.
training_time_files = glob.glob(glob_a, recursive=True) + glob.glob(glob_b, recursive=True)

In [3]:
# Labels for the path components that identify a single measurement file.
column_names = ["imputer", "task", "type", "fraction"]

dfs = []

for path in training_time_files:
    # orient="index" reads the top-level JSON keys as row labels; transposing
    # turns them into columns (the next cell relies on "mean" and "std").
    df = pd.read_json(path, orient="index").T.reset_index(drop=True)

    # Drop the first four and the last two path components; the remainder is
    # assigned to `column_names`.
    # NOTE(review): assumes the layout
    # ../data/experiments/<group>/<imputer>/<task>/<type>/<fraction>/<strategy>/<file>
    # -- confirm against the experiment writer.
    path_parts = _recursive_split(path)
    df[column_names] = path_parts[4:-2]

    dfs.append(df)

# Stack the per-file frames; the original per-file index is kept as is.
training_time_all = pd.concat(dfs)

In [4]:
# Human-readable names used in the paper for each imputer class.
imputer_labels = {
    "ModeImputer": "Mean/Mode",
    "KNNImputer": "$k$-NN",
    "ForestImputer": "Random Forest",
    "AutoKerasImputer": "Discriminative DL",
    "VAEImputer": "VAE",
    "GAINImputer": "GAIN",
}

# Keep only the per-file mean (renamed to "training_time") and relabel the
# imputers. Note that replace() is applied to the whole frame, so any cell
# equal to one of the keys above is renamed.
training_time = (
    training_time_all
    .drop(columns="std")
    .rename(columns={"mean": "training_time"})
    .replace(imputer_labels)
)

In [5]:
# First calculate the mean and the relative standard deviation for each
# imputer and task.
# The measurement column is selected explicitly: `training_time` also carries
# the string columns "type" and "fraction", which cannot be averaged. Older
# pandas versions silently dropped such columns during aggregation, newer ones
# raise, so aggregating the whole frame is not portable.
training_time_grouped = (
    training_time
    .groupby(["imputer", "task"])["training_time"]
    .agg(["mean", "std"])
)
training_time_grouped["rel std"] = training_time_grouped["std"] / training_time_grouped["mean"]

# Then average over all data sets. This yields a relative SD that is less
# dependent on the data set size.
training_time_grouped = training_time_grouped.groupby("imputer").agg(["mean", "std"])

# Keep only the averaged mean duration and the averaged relative SD.
training_time_grouped = training_time_grouped.loc[:, [("mean", "mean"), ("rel std", "mean")]]

# Predict Time

In [6]:
# Prediction-time measurements live in their own experiment group.
# NOTE(review): the files are matched as "elapsed_train_time_*.json" even
# though they hold prediction timings -- confirm this is the name the
# experiment actually writes.
predict_time_pattern = (
    Path("../data/experiments/time_measure_predict")
    / "**" / "MCAR" / "**" / "single_single" / "elapsed_train_time_*.json"
)
predict_time_files = glob.glob(str(predict_time_pattern), recursive=True)

In [7]:
# Labels for the path components that identify a single measurement file.
column_names = ["imputer", "task", "type", "fraction"]

dfs = []

for path in predict_time_files:
    # orient="index" reads the top-level JSON keys as row labels; transposing
    # turns them into columns (the next cell relies on "mean" and "std").
    df = pd.read_json(path, orient="index").T.reset_index(drop=True)

    # Drop the first four and the last two path components; the remainder is
    # assigned to `column_names`.
    # NOTE(review): assumes the layout
    # ../data/experiments/<group>/<imputer>/<task>/<type>/<fraction>/<strategy>/<file>
    # -- confirm against the experiment writer.
    path_parts = _recursive_split(path)
    df[column_names] = path_parts[4:-2]

    dfs.append(df)

# Stack the per-file frames; the original per-file index is kept as is.
predict_time_all = pd.concat(dfs)

In [8]:
# Human-readable names used in the paper for each imputer class.
imputer_labels = {
    "ModeImputer": "Mean/Mode",
    "KNNImputer": "$k$-NN",
    "ForestImputer": "Random Forest",
    "AutoKerasImputer": "Discriminative DL",
    "VAEImputer": "VAE",
    "GAINImputer": "GAIN",
}

# Keep only the per-file mean (renamed to "predict_time") and relabel the
# imputers. Note that replace() is applied to the whole frame, so any cell
# equal to one of the keys above is renamed.
predict_time = (
    predict_time_all
    .drop(columns="std")
    .rename(columns={"mean": "predict_time"})
    .replace(imputer_labels)
)

In [9]:
# First calculate the mean and the relative standard deviation for each
# imputer and task.
# The measurement column is selected explicitly: `predict_time` also carries
# the string columns "type" and "fraction", which cannot be averaged. Older
# pandas versions silently dropped such columns during aggregation, newer ones
# raise, so aggregating the whole frame is not portable.
predict_time_grouped = (
    predict_time
    .groupby(["imputer", "task"])["predict_time"]
    .agg(["mean", "std"])
)
predict_time_grouped["rel std"] = predict_time_grouped["std"] / predict_time_grouped["mean"]

# Then average over all data sets. This yields a relative SD that is less
# dependent on the data set size.
predict_time_grouped = predict_time_grouped.groupby("imputer").agg(["mean", "std"])

# Keep only the averaged mean duration and the averaged relative SD.
predict_time_grouped = predict_time_grouped.loc[:, [("mean", "mean"), ("rel std", "mean")]]

# Put the imputers into the presentation order and turn the "imputer" index
# into a regular column (which also gives the frame a 0..n-1 index).
predict_time_grouped = predict_time_grouped.loc[["Mean/Mode", "$k$-NN", "Random Forest", "Discriminative DL", "VAE", "GAIN"],:].reset_index()

In [10]:
# Display the per-imputer prediction-time summary (mean duration and mean relative SD).
predict_time_grouped

Unnamed: 0_level_0,imputer,mean,rel std
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,mean
0,Mean/Mode,0.029195,0.171139
1,$k$-NN,7.01833,0.602026
2,Random Forest,44.047614,0.236052
3,Discriminative DL,440.388738,0.210621
4,VAE,11.21483,0.084604
5,GAIN,137.96578,0.083426


# LaTeX Table

In [11]:
# Row order of the imputation methods in the final table.
imputer_order = ["Mean/Mode", "$k$-NN", "Random Forest", "Discriminative DL", "VAE", "GAIN"]

# Two-level header: one method column followed by (phase, statistic) pairs.
table_header = pd.MultiIndex.from_tuples(
    [("Imputation Method", "")]
    + [
        (phase, statistic)
        for phase in ("Training", "Inference")
        for statistic in ("Mean Duration", "Rel. SD")
    ]
)

table_latex = training_time_grouped.loc[imputer_order, :].reset_index()

# The ("a", ...) keys are placeholders; the header is replaced below.
# The assignment aligns on the integer index, so it relies on
# `predict_time_grouped` having been reset to the same imputer order.
table_latex[("a", "mean")] = predict_time_grouped[("mean", "mean")]
table_latex[("a", "std")] = predict_time_grouped[("rel std", "mean")]

# Rebuild the frame from the raw values so the final header can be attached.
table_latex = pd.DataFrame(table_latex.values, columns=table_header)
table_latex

Unnamed: 0_level_0,Imputation Method,Training,Training,Inference,Inference
Unnamed: 0_level_1,Unnamed: 1_level_1,Mean Duration,Rel. SD,Mean Duration,Rel. SD
0,Mean/Mode,0.005277,0.550878,0.029195,0.171139
1,$k$-NN,41.204365,0.253716,7.01833,0.602026
2,Random Forest,226.076551,0.119295,44.047614,0.236052
3,Discriminative DL,6275.019244,0.40505,440.388738,0.210621
4,VAE,71.095282,0.098795,11.21483,0.084604
5,GAIN,878.058286,0.311553,137.96578,0.083426


In [12]:
# Caption of the exported table. The wording was corrected ("standard
# deviations relative to their mean durations", "deviations"); the
# "Table TODO" reference still has to be filled in by the authors.
caption = (
    "Training and inference duration for each imputation method in seconds. "
    "We use the wall-time to measure the durations for training including "
    "hyperparameter optimization and inference for MCAR missingness pattern "
    "and all missingness fractions shown in Table TODO. "
    "Because training and inference durations depend heavily on the data set, "
    "we first average all measurements for imputation method and data set "
    "combinations and calculate the standard deviations relative to their "
    "mean durations. "
    "Second, we average both mean durations and relative standard deviations "
    "for the imputation methods. "
    "Abbreviations: Rel. SD means Relative Standard Deviation."
)

# print() is used on purpose so the LaTeX source can be copied verbatim.
# escape=False keeps the "$k$-NN" math markup intact in the output.
print(
    table_latex.to_latex(
        caption=caption,
        label="tab:time",
        index=False,
        escape=False
    )
)

\begin{table}
\centering
\caption{Training and inference duration for each imputation method in seconds. We use the wall-time to measure the durations for training including hyperparameter optimization and inference for MCAR missingness pattern and all missingness fractions shown in Table TODO. Because training and inference durations depend heavily on the data set, we first average all measurements for imputation method and data set combinations and calculate the standard deviation relatives relative to there mean durations. Second, we average both mean durations and relative standard devaition for the imputation methods. Abbreviations: Rel. SD means Relative Standard Deviation.}
\label{tab:time}
\begin{tabular}{lllll}
\toprule
Imputation Method & \multicolumn{2}{l}{Training} & \multicolumn{2}{l}{Inference} \\
                  & Mean Duration &   Rel. SD & Mean Duration &   Rel. SD \\
\midrule
        Mean/Mode &      0.005277 &  0.550878 &      0.029195 &  0.171139 \\
           $k$