In [1]:
import glob
import json

import pandas as pd

from pathlib import Path

from data_imputation_paper.experiment import _recursive_split

In [2]:
# Build one recursive glob pattern per experiment tree; both trees share the
# same "<...>/single_all/elapsed_train_time_*.json" layout below their root.
experiments_root = Path("../data/experiments")
glob_a, glob_b = (
    str(experiments_root / scenario / "**" / "single_all" / "elapsed_train_time_*.json")
    for scenario in ("fully_observed", "corrupted")
)

# Matches from the fully observed tree come first, followed by the corrupted tree.
training_time_files = glob.glob(glob_a, recursive=True) + glob.glob(glob_b, recursive=True)

In [3]:
# Metadata columns attached to every loaded file; their values come from the
# file's path rather than from the JSON payload.
column_names = ["imputer", "task", "type", "fraction"]

dfs = []
for path in training_time_files:
    # Read the JSON keyed by index, then transpose so the keys become columns
    # of a single-row frame with a fresh 0-based index.
    timing = pd.read_json(path, orient="index").transpose().reset_index(drop=True)

    # NOTE(review): `_recursive_split` presumably breaks the path into its
    # components; the [4:-2] slice must yield exactly one value per entry in
    # `column_names` -- confirm against the directory layout.
    path_parts = _recursive_split(path)[4:-2]
    timing[column_names] = path_parts

    dfs.append(timing)

# Stack all per-file frames; the original (repeated) row indices are kept.
training_time_all = pd.concat(dfs)

In [33]:
# Display labels for the imputer class names. They are written as LaTeX
# fragments (e.g. "$k$-NN") and are used verbatim in the exported table.
imputer_labels = {
    "ModeImputer": "Mean/Mode",
    "KNNImputer": "$k$-NN",
    "ForestImputer": "Random Forest",
    "AutoKerasImputer": "Discriminate DL",
    "VAEImputer": "VAE",
    "GAINImputer": "GAIN",
}

# Keep only the per-file mean (renamed to "training_time") and swap the class
# names for their display labels. Note that `replace` is applied frame-wide,
# i.e. to matching values in any column, not only to "imputer".
training_time = (
    training_time_all
    .drop(columns="std")
    .rename(columns={"mean": "training_time"})
    .replace(imputer_labels)
)

In [133]:
# Stage 1: mean and standard deviation of the training time for every
# (imputer, task) pair, plus the relative standard deviation (std / mean).
# Only the "training_time" column is aggregated. The other remaining columns
# ("type", "fraction") hold path-derived labels; aggregating them relied on
# pandas silently dropping non-numeric columns, which raises a TypeError in
# pandas >= 2.0.
per_task = training_time.groupby(["imputer", "task"])["training_time"].agg(["mean", "std"])
per_task["rel std"] = per_task["std"] / per_task["mean"]

# Stage 2: average over all data sets (tasks) per imputer. Averaging the
# per-task relative standard deviation makes it less dependent on the data
# set size. Using agg(["mean"]) keeps the two-level result columns
# ("mean", "mean") and ("rel std", "mean").
training_time_grouped = per_task.groupby("imputer")[["mean", "rel std"]].agg(["mean"])

In [134]:
# Row order of the final table, using the imputer display labels.
imputer_order = ["Mean/Mode", "$k$-NN", "Random Forest", "Discriminate DL", "VAE", "GAIN"]

# Select the rows in that order and turn the "imputer" index into a regular
# column, then give the three columns their presentation headers.
table_latex = training_time_grouped.loc[imputer_order].reset_index()
table_latex.columns = ["Imputation Method", "Training Time", "Relative Standard Deviation"]
table_latex

Unnamed: 0,Imputation Method,Training Time,Relative Standard Deviation
0,Mean/Mode,0.005009,0.656056
1,$k$-NN,40.961577,0.24302
2,Random Forest,225.513999,0.118707
3,Discriminate DL,6285.017741,0.424011
4,VAE,71.685278,0.107189
5,GAIN,874.657293,0.299608


In [137]:
# Caption of the exported table, split across lines for readability only;
# the concatenated text is passed to LaTeX unchanged.
latex_caption = (
    "Training time for each imputation method in seconds. "
    "Training time is the mean overall experimental settings, experiments, and scenarios. "
    "The data set's size skews the standard deviation heavily, which is why we first compute "
    "the relative standard deviation for each imputation method on each data set separately "
    "and then average over the data sets."
)

# escape=False keeps the LaTeX markup in the method labels (e.g. "$k$-NN")
# intact; index=False omits the row index from the table.
latex_source = table_latex.to_latex(
    caption=latex_caption,
    label="tab:training_time",
    index=False,
    escape=False,
)
print(latex_source)

\begin{table}
\centering
\caption{Training time for each imputation method in seconds. Training time is the mean overall experimental settings, experiments, and scenarios. The data set's size skews the standard deviation heavily, which is why we first compute the relative standard deviation for each imputation method on each data set separately and then average over the data sets.}
\label{tab:training_time}
\begin{tabular}{lrr}
\toprule
Imputation Method &  Training Time &  Relative Standard Deviation \\
\midrule
        Mean/Mode &       0.005009 &                     0.656056 \\
           $k$-NN &      40.961577 &                     0.243020 \\
    Random Forest &     225.513999 &                     0.118707 \\
  Discriminate DL &    6285.017741 &                     0.424011 \\
              VAE &      71.685278 &                     0.107189 \\
             GAIN &     874.657293 &                     0.299608 \\
\bottomrule
\end{tabular}
\end{table}

