# Binary Classification - Comparisons Imputed - Baseline - Subset

This notebook compares the performance of the imputation experiments with the baseline (training on fully observed data). It also compares the performance on the complete, perturbed dataset with the performance on the subset of the dataset.
  
    
Split into Binary Classification, Multiclass Classification and Regression  


In [181]:
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import os
import pandas as pd
import re
import seaborn as sns

from pathlib import Path

import plotly as py
import plotly.express as px
import plotly.graph_objects as go
import xarray as xr

## Binary Classification

In [182]:
# import required datasets and collect the experiments whose imputation failed

# Display names used for the two groups of downstream metrics
CLF_METRIC = "Classification Tasks"
REG_METRIC = "Regression Tasks"

# Values of the `result_type` column used to select rows
DOWNSTREAM_RESULT_TYPE = "downstream_performance_mean"
IMPUTE_RESULT_TYPE = "impute_performance_mean"

FIGURES_PATH = Path("../paper/figures/")

# dataset for Baseline performance (full info)
# Create new column for all datasets -> Data_Constellation_full (Task, Pattern, Fraction) -> only for Baseline required
baseline = pd.read_csv('binary_classification_fixed_seed.csv')

# Imputation-performance rows for the metrics of interest, without the three score columns.
# `drop` returns a new frame here: dropping in place on a filtered selection of
# `baseline` is what triggers pandas' SettingWithCopyWarning.
is_impute_metric_row = (
    (baseline["result_type"] == IMPUTE_RESULT_TYPE)
    & (baseline["metric"].isin(["F1_macro", "RMSE"]))
)
na_impute_results = baseline[is_impute_metric_row].drop(
    columns=["baseline", "corrupted", "imputed"]
)

# Keep only the rows that still contain at least one missing value
na_impute_results = na_impute_results[na_impute_results.isna().any(axis=1)]
na_impute_results.shape






A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



(119, 11)

In [183]:
STRATEGY_TYPE = "single_single"

# Keep the downstream-performance rows for the metrics of interest and the chosen strategy.
# The three conditions are combined at the same level (the original nested the
# strategy test inside the metric term, which evaluates the same but reads wrongly).
baseline = baseline[
    (baseline["result_type"] == DOWNSTREAM_RESULT_TYPE)
    & (baseline["metric"].isin(["F1_macro", "RMSE"]))
    & (baseline["strategy"] == STRATEGY_TYPE)
]

# remove experiments where imputation failed:
# left-join against the failed imputations and keep only rows without a match
baseline = baseline.merge(
    na_impute_results,
    how="left",
    validate="one_to_one",
    indicator=True,
    suffixes=("", "_imp"),
    on=["experiment", "imputer", "task", "missing_type", "missing_fraction", "strategy", "column"],
)
baseline = baseline[baseline["_merge"] == "left_only"]

assert len(baseline["strategy"].unique()) == 1

# Rebind instead of dropping in place: `baseline` is a filtered selection at this
# point, and in-place mutation of such a selection is the pattern pandas warns about.
baseline = baseline.drop(
    columns=[
        "experiment", "strategy", "result_type_imp", "metric_imp",
        "train", "test", "train_imp", "test_imp", "_merge",
    ]
)


In [184]:
# Human-readable column names
column_names = {
    "imputer": "Imputation_Method",
    "task": "Task",
    "missing_type": "Missing Type",
    "missing_fraction": "Missing Fraction",
    "column": "Column",
    "baseline": "Baseline",
    "imputed": "Imputed",
    "corrupted": "Corrupted",
}

# Display names for the imputer classes
rename_imputer_dict = {
    "ModeImputer": "Mean/Mode",
    "KNNImputer": "KNN",
    "ForestImputer": "Random Forest",
    "AutoKerasImputer": "Discriminative DL",
    "VAEImputer": "VAE",
    "GAINImputer": "GAIN",
}

# Display names for the metrics
rename_metric_dict = {
    "F1_macro": CLF_METRIC,
    "RMSE": REG_METRIC,
}

# Rename, substitute the display names, then cast the three key parts to strings
baseline = (
    baseline
    .rename(columns=column_names)
    .replace(rename_imputer_dict)
    .replace(rename_metric_dict)
    .astype({"Missing Type": str, "Missing Fraction": str, "Task": str})
)

# Key identifying one data constellation: "<Missing Type> - <Missing Fraction> - <Task>"
baseline['Data_Constellation_full'] = baseline['Missing Type'].str.cat(
    [baseline['Missing Fraction'], baseline['Task']], sep=' - '
)
baseline

Unnamed: 0,Imputation_Method,Task,Missing Type,Missing Fraction,Column,result_type,metric,Baseline,Corrupted,Imputed,Data_Constellation_full
0,Discriminative DL,1046,MAR,0.01,end,downstream_performance_mean,Classification Tasks,0.817860,0.817590,0.818400,MAR - 0.01 - 1046
1,Discriminative DL,1046,MAR,0.1,end,downstream_performance_mean,Classification Tasks,0.817860,0.816424,0.818487,MAR - 0.1 - 1046
2,Discriminative DL,1046,MAR,0.3,end,downstream_performance_mean,Classification Tasks,0.817860,0.815478,0.816526,MAR - 0.3 - 1046
3,Discriminative DL,1046,MAR,0.5,end,downstream_performance_mean,Classification Tasks,0.817860,0.821053,0.816150,MAR - 0.5 - 1046
4,Discriminative DL,1046,MCAR,0.01,end,downstream_performance_mean,Classification Tasks,0.817860,0.818284,0.817860,MCAR - 0.01 - 1046
...,...,...,...,...,...,...,...,...,...,...,...
2215,VAE,923,MCAR,0.5,isns,downstream_performance_mean,Classification Tasks,0.998249,0.836979,0.939554,MCAR - 0.5 - 923
2216,VAE,923,MNAR,0.01,isns,downstream_performance_mean,Classification Tasks,0.998249,0.994937,0.998249,MNAR - 0.01 - 923
2217,VAE,923,MNAR,0.1,isns,downstream_performance_mean,Classification Tasks,0.998249,0.963120,0.997859,MNAR - 0.1 - 923
2218,VAE,923,MNAR,0.3,isns,downstream_performance_mean,Classification Tasks,0.998249,0.900426,0.980860,MNAR - 0.3 - 923


In [185]:
# Only the merge keys and the baseline score are needed from here on
baseline_columns = ['Imputation_Method', 'Baseline', 'Data_Constellation_full']
baseline = baseline.loc[:, baseline_columns]


In [186]:
# dataset for Imputation Performance (full info)
imputation = pd.read_csv('../Binary/binary_imputed_full_info.csv')

# Subset results: only the columns used below, with the "difference to average best"
# column renamed so that it can be told apart from a same-named column after merging
subset_columns = [
    'Imputation_Method',
    'Imputed_Subset',
    'Downstream Performance Rank Subset',
    'Performance Difference to Average Best Subset',
    'Data_Constellation_full',
]
subset = (
    pd.read_csv('../Subset - Binary/binary_subset_full_info.csv')
    .rename(columns={"Performance Difference to Average Best": "Performance Difference to Average Best Subset"})
    .loc[:, subset_columns]
)

#print(len(imputation))
#print(len(subset))


2125
2222


In [187]:
# Merge Datasets to one dataframe via data constellation (Task, Pattern, Fraction)
# and imputation method: imputation results, then subset results, then baseline scores
merge_keys = ['Data_Constellation_full', 'Imputation_Method']

data = (
    imputation
    .merge(subset, on=merge_keys)
    .merge(baseline, on=merge_keys)
    # the score column coming from `baseline` gets the "_y" suffix in the last merge
    .rename(columns={"Baseline_y": "Baseline"})
)
data


Unnamed: 0,Imputation_Method,Task,Missing Type,Missing Fraction,Column,result_type,metric,Baseline_x,Corrupted,Imputed,...,NumberOfCategoricalFeatures,NumberOfClasses,Downstream Performance Rank,Data_Constellation,Data_Constellation_full,Performance Difference to Average Best,Imputed_Subset,Downstream Performance Rank Subset,Performance Difference to Average Best Subset,Baseline
0,Random Forest,137,MAR,0.01,top-middle-square,downstream_performance_mean,Classification Tasks,0.674909,0.0,0.674760,...,10.0,,1.0,MAR - 0.01,MAR - 0.01 - 137,0.000000e+00,0.698074,5.0,-0.001143,0.667244
1,KNN,137,MAR,0.01,top-middle-square,downstream_performance_mean,Classification Tasks,0.674729,0.0,0.674760,...,10.0,,2.0,MAR - 0.01,MAR - 0.01 - 137,-2.949553e-08,0.698074,4.0,-0.001143,0.667244
2,Mean/Mode,137,MAR,0.01,top-middle-square,downstream_performance_mean,Classification Tasks,0.674145,0.0,0.674116,...,10.0,,5.0,MAR - 0.01,MAR - 0.01 - 137,-6.439471e-04,0.699217,2.0,0.000000,0.667244
3,VAE,137,MAR,0.01,top-middle-square,downstream_performance_mean,Classification Tasks,0.674729,0.0,0.674488,...,10.0,,3.0,MAR - 0.01,MAR - 0.01 - 137,-2.716528e-04,0.699699,1.0,0.000482,0.667244
4,Discriminative DL,137,MAR,0.01,top-middle-square,downstream_performance_mean,Classification Tasks,0.674505,0.0,0.674251,...,10.0,,4.0,MAR - 0.01,MAR - 0.01 - 137,-5.093850e-04,0.694867,6.0,-0.004350,0.667244
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2031,KNN,42493,MNAR,0.50,Length,downstream_performance_mean,Classification Tasks,0.619680,0.0,0.620028,...,6.0,,6.0,MNAR - 0.5,MNAR - 0.5 - 42493,-2.447951e-03,0.606716,2.0,0.014281,0.623939
2032,Mean/Mode,42493,MNAR,0.50,Length,downstream_performance_mean,Classification Tasks,0.627750,0.0,0.627879,...,6.0,,1.0,MNAR - 0.5,MNAR - 0.5 - 42493,5.402509e-03,0.592436,5.0,0.000000,0.620623
2033,VAE,42493,MNAR,0.50,Length,downstream_performance_mean,Classification Tasks,0.625687,0.0,0.626914,...,6.0,,2.0,MNAR - 0.5,MNAR - 0.5 - 42493,4.437433e-03,0.611129,1.0,0.018693,0.620623
2034,GAIN,42493,MNAR,0.50,Length,downstream_performance_mean,Classification Tasks,0.625497,0.0,0.626178,...,6.0,,3.0,MNAR - 0.5,MNAR - 0.5 - 42493,3.701885e-03,0.592837,4.0,0.000401,0.620623


## Baseline Comparisons

In [190]:
# Difference in score between Baseline and Imputed for each row -> save in new column
# (computed as Baseline minus Imputed)
data['Performance Difference Baseline to Imputed'] = data['Baseline'].sub(data['Imputed'])
data

Unnamed: 0,Imputation_Method,Task,Missing Type,Missing Fraction,Column,result_type,metric,Baseline_x,Corrupted,Imputed,...,NumberOfClasses,Downstream Performance Rank,Data_Constellation,Data_Constellation_full,Performance Difference to Average Best,Imputed_Subset,Downstream Performance Rank Subset,Performance Difference to Average Best Subset,Baseline,Performance Difference Baseline to Imputed
0,Random Forest,137,MAR,0.01,top-middle-square,downstream_performance_mean,Classification Tasks,0.674909,0.0,0.674760,...,,1.0,MAR - 0.01,MAR - 0.01 - 137,0.000000e+00,0.698074,5.0,-0.001143,0.667244,-0.007516
1,KNN,137,MAR,0.01,top-middle-square,downstream_performance_mean,Classification Tasks,0.674729,0.0,0.674760,...,,2.0,MAR - 0.01,MAR - 0.01 - 137,-2.949553e-08,0.698074,4.0,-0.001143,0.667244,-0.007516
2,Mean/Mode,137,MAR,0.01,top-middle-square,downstream_performance_mean,Classification Tasks,0.674145,0.0,0.674116,...,,5.0,MAR - 0.01,MAR - 0.01 - 137,-6.439471e-04,0.699217,2.0,0.000000,0.667244,-0.006872
3,VAE,137,MAR,0.01,top-middle-square,downstream_performance_mean,Classification Tasks,0.674729,0.0,0.674488,...,,3.0,MAR - 0.01,MAR - 0.01 - 137,-2.716528e-04,0.699699,1.0,0.000482,0.667244,-0.007244
4,Discriminative DL,137,MAR,0.01,top-middle-square,downstream_performance_mean,Classification Tasks,0.674505,0.0,0.674251,...,,4.0,MAR - 0.01,MAR - 0.01 - 137,-5.093850e-04,0.694867,6.0,-0.004350,0.667244,-0.007006
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2031,KNN,42493,MNAR,0.50,Length,downstream_performance_mean,Classification Tasks,0.619680,0.0,0.620028,...,,6.0,MNAR - 0.5,MNAR - 0.5 - 42493,-2.447951e-03,0.606716,2.0,0.014281,0.623939,0.003911
2032,Mean/Mode,42493,MNAR,0.50,Length,downstream_performance_mean,Classification Tasks,0.627750,0.0,0.627879,...,,1.0,MNAR - 0.5,MNAR - 0.5 - 42493,5.402509e-03,0.592436,5.0,0.000000,0.620623,-0.007256
2033,VAE,42493,MNAR,0.50,Length,downstream_performance_mean,Classification Tasks,0.625687,0.0,0.626914,...,,2.0,MNAR - 0.5,MNAR - 0.5 - 42493,4.437433e-03,0.611129,1.0,0.018693,0.620623,-0.006291
2034,GAIN,42493,MNAR,0.50,Length,downstream_performance_mean,Classification Tasks,0.625497,0.0,0.626178,...,,3.0,MNAR - 0.5,MNAR - 0.5 - 42493,3.701885e-03,0.592837,4.0,0.000401,0.620623,-0.005555


In [191]:
# Mean signed difference between Baseline and Imputed over all rows
baseline_gap = data['Performance Difference Baseline to Imputed']
Average_Difference_Score = baseline_gap.mean()
print("Average Difference in Predicitve Performance between baseline and full, perturbed dataset", Average_Difference_Score)

# Same mean over the absolute differences, computed on a copy so `data` keeps the signed values
data_temp_abs = data.assign(**{'Performance Difference Baseline to Imputed': baseline_gap.abs()})

Average_Difference_Score_abs = data_temp_abs['Performance Difference Baseline to Imputed'].mean()
print("Average Difference in Predicitve Performance between baseline and full, perturbed dataset as absolute Values", Average_Difference_Score_abs)


Average Difference in Predicitve Performance between baseline and full, perturbed dataset 0.009205996725568403
Average Difference in Predicitve Performance between baseline and full, perturbed dataset as absolute Values 0.025689745054964585
____________________




In [192]:
# Mean Baseline-to-Imputed difference over the rows whose method name contains "Random Forest"
is_random_forest = data["Imputation_Method"].str.contains("Random Forest")
improv_to_av_best = data[is_random_forest].copy()
improv_to_av_best_mean = improv_to_av_best['Performance Difference Baseline to Imputed'].mean()
print(improv_to_av_best_mean)
#improv_to_av_best


0.002379274362747037


Unnamed: 0,Imputation_Method,Task,Missing Type,Missing Fraction,Column,result_type,metric,Baseline_x,Corrupted,Imputed,...,NumberOfClasses,Downstream Performance Rank,Data_Constellation,Data_Constellation_full,Performance Difference to Average Best,Imputed_Subset,Downstream Performance Rank Subset,Performance Difference to Average Best Subset,Baseline,Performance Difference Baseline to Imputed
0,Random Forest,137,MAR,0.01,top-middle-square,downstream_performance_mean,Classification Tasks,0.674909,0.0,0.674760,...,,1.0,MAR - 0.01,MAR - 0.01 - 137,0.0,0.698074,5.0,-0.001143,0.667244,-0.007516
5,Random Forest,137,MAR,0.10,top-middle-square,downstream_performance_mean,Classification Tasks,0.673767,0.0,0.673526,...,,4.0,MAR - 0.1,MAR - 0.1 - 137,0.0,0.695749,6.0,-0.001619,0.667244,-0.006282
10,Random Forest,137,MAR,0.30,top-middle-square,downstream_performance_mean,Classification Tasks,0.674084,0.0,0.674124,...,,1.0,MAR - 0.3,MAR - 0.3 - 137,0.0,0.698277,4.0,0.000751,0.667244,-0.006879
15,Random Forest,137,MAR,0.50,top-middle-square,downstream_performance_mean,Classification Tasks,0.665708,0.0,0.665258,...,,4.0,MAR - 0.5,MAR - 0.5 - 137,0.0,0.698917,2.0,0.001442,0.667244,0.001987
20,Random Forest,137,MCAR,0.01,top-middle-square,downstream_performance_mean,Classification Tasks,0.674273,0.0,0.674334,...,,2.0,MCAR - 0.01,MCAR - 0.01 - 137,0.0,0.698074,3.0,0.000000,0.667244,-0.007089
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2008,Random Forest,42493,MCAR,0.50,Length,downstream_performance_mean,Classification Tasks,0.625924,0.0,0.624029,...,,1.0,MCAR - 0.5,MCAR - 0.5 - 42493,0.0,0.606759,1.0,0.006597,0.620623,-0.003406
2014,Random Forest,42493,MNAR,0.01,Length,downstream_performance_mean,Classification Tasks,0.627164,0.0,0.626956,...,,5.0,MNAR - 0.01,MNAR - 0.01 - 42493,0.0,0.595962,6.0,-0.018274,0.620623,-0.006333
2019,Random Forest,42493,MNAR,0.10,Length,downstream_performance_mean,Classification Tasks,0.625641,0.0,0.625463,...,,5.0,MNAR - 0.1,MNAR - 0.1 - 42493,0.0,0.611310,4.0,-0.002925,0.620623,-0.004840
2025,Random Forest,42493,MNAR,0.30,Length,downstream_performance_mean,Classification Tasks,0.624441,0.0,0.623823,...,,3.0,MNAR - 0.3,MNAR - 0.3 - 42493,0.0,0.597225,4.0,-0.002901,0.620623,-0.003201


In [193]:
# Untouched copy of the merged frame, restored from later in the notebook
data_backup = data.copy()

# Copy with the missingness fraction and type as strings (used for string matching below)
data_heatmaps = data.astype({'Missing Fraction': str, 'Missing Type': str})

In [194]:
# Heatmap with differences per Dataconstellation Baseline to full, perturbed dataset
# Positive value indicates, that Baseline is better than full, corrupted dataset score
data_heat = data.copy()
# presumably cast to string so the task ids are drawn as categories, not numbers -- TODO confirm
data_heat = data_heat.astype({"Task":"string"})
data_constellations = ['MAR - 0.01', 'MAR - 0.1', 'MAR - 0.3', 'MAR - 0.5', 'MCAR - 0.01', 'MCAR - 0.1', 'MCAR - 0.3', 'MCAR - 0.5', 'MNAR - 0.01', 'MNAR - 0.1', 'MNAR - 0.3', 'MNAR - 0.5']

# One figure per data constellation, shown inline and written to a PDF
for constellation in data_constellations:
    data_constel = data_heat.loc[data_heat['Data_Constellation'] == constellation]

    ### uncomment whatever you want to investigate

    ## sort by amount datapoints (ascending)
    data_constel = data_constel.sort_values(by=['NumberOfInstances'])

    ## sort by amount of features (ascending)
    #data_constel = data_constel.sort_values(by=['NumberOfFeatures'])

    ## sort by amount of datapoints and features (ascending)
    #data_constel = data_constel.sort_values(by=['NumberOfInstances', 'NumberOfFeatures'])

    ## sort by amount of categorical features and datapoints (ascending)
    #data_constel = data_constel.sort_values(by=['NumberOfCategoricalFeatures', 'NumberOfInstances'])

    ## sort by amount of numerical features and datapoints (ascending)
    #data_constel = data_constel.sort_values(by=['NumberOfNumericFeatures', 'NumberOfInstances'])

    trace = go.Heatmap(
        z=data_constel["Performance Difference Baseline to Imputed"],
        x=data_constel["Task"],
        y=data_constel["Imputation_Method"],
        type='heatmap',
        autocolorscale=False,
        colorscale='RdBu_r',
        # fixed, symmetric colour range so that all figures are comparable
        zmid=0,
        zmin=(-0.11),
        zmax=0.11,
    )
    # Pass the trace list directly: binding it to the name `data` would replace
    # the merged DataFrame with a list and break a re-run of this cell.
    fig = go.Figure(data=[trace])
    fig.update_layout(
        title=constellation,
        xaxis_nticks=36)
    fig.show()
    fig.write_image("binary_heatmap_f1_score_improvement_Baseline_to_Imputed%s.pdf" % constellation)

In [195]:
# sorting data by total improvement: split the rows into eleven bins of the
# Baseline-to-Imputed difference (each bin excludes its lower and includes its upper bound)
df_quantiles = data_heatmaps.copy()
#df_quantiles = df_quantiles.drop(df_quantiles[df_quantiles["Imputation_Method"] == AVERAGE_BEST_IMPUTATION_METHOD].index)


def _rows_in_bin(frame, column, lower=None, upper=None):
    """Return the rows of `frame` that are not outside the bin (lower, upper].

    A row is removed when its value is <= `lower` or > `upper`; a bound of None
    disables that test. Rows with a missing value fail both tests and are kept.
    """
    values = frame[column]
    outside = pd.Series(False, index=frame.index)
    if lower is not None:
        outside = outside | (values <= lower)
    if upper is not None:
        outside = outside | (values > upper)
    return frame[~outside]


_gap = "Performance Difference Baseline to Imputed"
df_10 = _rows_in_bin(df_quantiles, _gap, upper=-0.09)
df_09 = _rows_in_bin(df_quantiles, _gap, lower=-0.09, upper=-0.07)
df_07 = _rows_in_bin(df_quantiles, _gap, lower=-0.07, upper=-0.05)
df_05 = _rows_in_bin(df_quantiles, _gap, lower=-0.05, upper=-0.03)
df_03 = _rows_in_bin(df_quantiles, _gap, lower=-0.03, upper=-0.01)
df_01 = _rows_in_bin(df_quantiles, _gap, lower=-0.01, upper=0.01)
df01 = _rows_in_bin(df_quantiles, _gap, lower=0.01, upper=0.03)
df03 = _rows_in_bin(df_quantiles, _gap, lower=0.03, upper=0.05)
df05 = _rows_in_bin(df_quantiles, _gap, lower=0.05, upper=0.07)
df07 = _rows_in_bin(df_quantiles, _gap, lower=0.07, upper=0.09)
df09 = _rows_in_bin(df_quantiles, _gap, lower=0.09)



In [196]:
# Number of rows per bin, in ascending order of the difference
quantile_freq = [
    len(frame.index)
    for frame in (df_10, df_09, df_07, df_05, df_03, df_01, df01, df03, df05, df07, df09)
]
(len_df_10, len_df_09, len_df_07, len_df_05, len_df_03, len_df_01,
 len_df01, len_df03, len_df05, len_df07, len_df09) = quantile_freq
print(quantile_freq)


# Bin labels, in the same order as the counts
quantiles = ['less than -0.09', '-0.09 to -0.07', '-0.07 to -0.05', '-0.05 to -0.03','-0.03 to -0.01', '-0.01 to 0.01', '0.01 to 0.03', '0.03 to 0.05', '0.05 to 0.07', '0.07 to 0.09', 'more than 0.09']
print(quantiles)

# One row per bin: label and number of rows
improvement_quantiles = pd.DataFrame({
    'Performance Difference Baseline to Imputed': quantiles,
    'Amount': quantile_freq,
})


[4, 26, 61, 181, 83, 1086, 275, 118, 55, 39, 108]
['less than -0.09', '-0.09 to -0.07', '-0.07 to -0.05', '-0.05 to -0.03', '-0.03 to -0.01', '-0.01 to 0.01', '0.01 to 0.03', '0.03 to 0.05', '0.05 to 0.07', '0.07 to 0.09', 'more than 0.09']


In [197]:
# Bar chart of the number of rows per difference bin, shown inline and written to a PDF
fig = px.bar(
    improvement_quantiles,
    x='Performance Difference Baseline to Imputed',
    y='Amount',
)
fig.show()
fig.write_image("binary_performance_difference_baseline_to_imputed.pdf")


In [198]:
# split barchart stacks into methods

quantile_datasets = [df_10, df_09, df_07, df_05, df_03, df_01, df01, df03, df05, df07, df09]

methods = ['Random Forest', 'KNN', 'Mean/Mode', 'VAE', 'GAIN', 'Discriminative DL']
#methods.remove(AVERAGE_BEST_IMPUTATION_METHOD)
print(methods)

# For every method: number of rows per bin whose method name contains the method string
counts_by_method = {
    method: [
        len(frame[frame['Imputation_Method'].str.contains(method)].index)
        for frame in quantile_datasets
    ]
    for method in methods
}

forest_freq = counts_by_method['Random Forest']
knn_freq = counts_by_method['KNN']
mode_freq = counts_by_method['Mean/Mode']
dl_freq = counts_by_method['Discriminative DL']
vae_freq = counts_by_method['VAE']
gain_freq = counts_by_method['GAIN']

for method_counts in (forest_freq, knn_freq, mode_freq, dl_freq, vae_freq, gain_freq):
    print(method_counts)

['Random Forest', 'KNN', 'Mean/Mode', 'VAE', 'GAIN', 'Discriminative DL']
[1, 6, 14, 33, 14, 209, 40, 17, 7, 4, 15]
[3, 7, 5, 34, 20, 191, 43, 23, 14, 9, 11]
[0, 0, 5, 42, 11, 189, 60, 17, 11, 7, 18]
[0, 1, 8, 31, 6, 185, 53, 29, 10, 6, 14]
[0, 2, 12, 24, 18, 190, 52, 23, 8, 6, 25]
[0, 10, 17, 17, 14, 122, 27, 9, 5, 7, 25]


In [199]:
quantiles = ['less than -0.09', '-0.09 to -0.07', '-0.07 to -0.05', '-0.05 to -0.03','-0.03 to -0.01', '-0.01 to 0.01', '0.01 to 0.03', '0.03 to 0.05', '0.05 to 0.07', '0.07 to 0.09', 'more than 0.09']

# One bar series per imputation method, listed in stacking order
stack_order = [
    ('Random Forest', forest_freq),
    ('KNN', knn_freq),
    ('Mean/Mode', mode_freq),
    ('Discriminative DL', dl_freq),
    ('VAE', vae_freq),
    ('GAIN', gain_freq),
]
fig = go.Figure(data=[
    go.Bar(name=label, x=quantiles, y=counts)
    for label, counts in stack_order
])
# Change the bar mode
fig.update_layout(barmode='stack')
fig.show()
fig.write_image("binary_performance_difference_baseline_to_imputed_per_method.pdf")

In [200]:
# split barchart stacks into missingness fractions

quantile_datasets = [df_10, df_09, df_07, df_05, df_03, df_01, df01, df03, df05, df07, df09]

fractions = ['0.01', '0.1', '0.3', '0.5']
print(fractions)

# For every fraction: number of rows per bin whose 'Missing Fraction' string matches it.
# NOTE(review): str.contains treats the value as a regular expression by default,
# so the "." matches any character -- confirm this is intended.
counts_by_fraction = {
    fraction: [
        len(frame[frame['Missing Fraction'].str.contains(fraction)].index)
        for frame in quantile_datasets
    ]
    for fraction in fractions
}

freq_001 = counts_by_fraction['0.01']
freq_01 = counts_by_fraction['0.1']
freq_03 = counts_by_fraction['0.3']
freq_05 = counts_by_fraction['0.5']

for fraction_counts in (freq_001, freq_01, freq_03, freq_05):
    print(fraction_counts)

['0.01', '0.1', '0.3', '0.5']
[0, 8, 13, 59, 13, 338, 55, 12, 5, 0, 5]
[0, 7, 10, 41, 28, 272, 99, 25, 5, 5, 17]
[1, 6, 17, 40, 21, 239, 76, 41, 14, 16, 37]
[3, 5, 21, 41, 21, 237, 45, 40, 31, 18, 49]


In [201]:
quantiles = ['less than -0.09', '-0.09 to -0.07', '-0.07 to -0.05', '-0.05 to -0.03','-0.03 to -0.01', '-0.01 to 0.01', '0.01 to 0.03', '0.03 to 0.05', '0.05 to 0.07', '0.07 to 0.09', 'more than 0.09']

# One bar series per missingness fraction: (legend label, counts per bin, colour)
fraction_series = [
    ('1% Missing Data', freq_001, '#FD3216'),
    ('10% Missing Data', freq_01, '#00FE35'),
    ('30% Missing Data', freq_03, '#511CFB'),
    ('50% Missing Data', freq_05, '#FF7F0E'),
]
fig = go.Figure(data=[
    go.Bar(name=label, x=quantiles, y=counts, marker_color=colour)
    for label, counts, colour in fraction_series
])
# Change the bar mode
fig.update_layout(barmode='stack')
fig.show()
fig.write_image("binary_performance_difference_baseline_to_imputed_per_frac.pdf")

In [202]:
# split barchart stacks into missingness patterns

quantile_datasets = [df_10, df_09, df_07, df_05, df_03, df_01, df01, df03, df05, df07, df09]

# The names below are reused from the per-fraction cell; here they hold patterns:
# freq_001 -> MCAR, freq_01 -> MAR, freq_03 -> MNAR (freq_05 stays empty)
fractions = ['MCAR', 'MAR', 'MNAR']
print(fractions)

# For every pattern: number of rows per bin whose 'Missing Type' contains the pattern string
counts_by_pattern = {
    pattern: [
        len(frame[frame['Missing Type'].str.contains(pattern)].index)
        for frame in quantile_datasets
    ]
    for pattern in fractions
}

freq_001 = counts_by_pattern['MCAR']
freq_01 = counts_by_pattern['MAR']
freq_03 = counts_by_pattern['MNAR']
freq_05 = []

for pattern_counts in (freq_001, freq_01, freq_03):
    print(pattern_counts)

['MCAR', 'MAR', 'MNAR']
[0, 11, 25, 55, 27, 367, 95, 37, 17, 10, 33]
[2, 5, 17, 60, 32, 354, 95, 43, 16, 13, 44]
[2, 10, 19, 66, 24, 365, 85, 38, 22, 16, 31]


In [203]:
quantiles = ['less than -0.09', '-0.09 to -0.07', '-0.07 to -0.05', '-0.05 to -0.03','-0.03 to -0.01', '-0.01 to 0.01', '0.01 to 0.03', '0.03 to 0.05', '0.05 to 0.07', '0.07 to 0.09', 'more than 0.09']

# One bar series per missingness pattern: (legend label, counts per bin, colour)
pattern_series = [
    ('MCAR', freq_001, '#222A2A'),
    ('MAR', freq_01, '#B68100'),
    ('MNAR', freq_03, '#750D86'),
]
fig = go.Figure(data=[
    go.Bar(name=label, x=quantiles, y=counts, marker_color=colour)
    for label, counts, colour in pattern_series
])
# Change the bar mode
fig.update_layout(barmode='stack')
fig.show()
fig.write_image("binary_performance_difference_baseline_to_imputed_per_patt.pdf")

## Subset Comparisons

In [204]:
# Subset Comparison Analysis here
# Restore the merged frame from the backup taken before the heatmap cells
data = data_backup.copy()
# Calculate Difference in F1 Score/RMSE between Imputed and Subset for each data constellation -> save in new column
# (computed as Imputed minus Imputed_Subset)
data['Performance Difference Imputed to Subset'] = (data['Imputed']) - (data['Imputed_Subset'])

# Calculate Average Difference in F1 Score/RMSE between Imputed and Subset
Average_Difference_Score = data['Performance Difference Imputed to Subset'].mean()
print("Average Difference in Predicitve Performance between full, perturbed dataset and subset", Average_Difference_Score)

# Calculate Average Difference in F1 Score/RMSE between Imputed and Subset -> absolut value
data_temp_abs = data.copy()
data_temp_abs['Performance Difference Imputed to Subset'] = data_temp_abs['Performance Difference Imputed to Subset'].abs()

Average_Difference_Score_abs = data_temp_abs['Performance Difference Imputed to Subset'].mean()
print("Average Difference in Predicitve Performance between full, perturbed dataset and subset as absolute Values", Average_Difference_Score_abs)
print('____________________')
print('\n')


def _print_group_differences(frame, group_column, group_values):
    """Print the mean signed and mean absolute Imputed-to-Subset difference per group.

    For every value in `group_values`, the rows of `frame` whose `group_column`
    equals that value are selected and two lines plus a separator are printed.
    The absolute values are taken on the selected Series itself, never written
    back into a slice of `frame`, so pandas raises no SettingWithCopyWarning.
    """
    for group_value in group_values:
        group_diff = frame.loc[
            frame[group_column] == group_value,
            'Performance Difference Imputed to Subset',
        ]
        print(group_value, "Average Difference in Predicitve Performance between full, perturbed dataset and subset", group_diff.mean())
        print(group_value, "Average Difference in Predicitve Performance between full, perturbed dataset and subset as absolute Values", group_diff.abs().mean())
        print('____________________')
        print('\n')


# Filter full dataset for Imputation Method
    # Calculate Average Difference in F1 Score/RMSE Imputed and Subset per Imputation Method
methods = ['Random Forest', 'KNN', 'Mean/Mode', 'VAE', 'GAIN', 'Discriminative DL']
_print_group_differences(data, 'Imputation_Method', methods)

# Filter full dataset for Missingness Pattern
    # Calculate Average Difference in F1 Score/RMSE Imputed and Subset per Missingness Pattern
patterns = ['MCAR', 'MAR', 'MNAR']
_print_group_differences(data, 'Missing Type', patterns)

# Filter full dataset for Missingness Fraction
    # Calculate Average Difference in F1 Score/RMSE Imputed and Subset per Missingness Fraction
print("_______________________________________________________________________________")
# compared against the column values with ==, so these are numbers, not strings
fractions = [0.01, 0.1, 0.3, 0.5]
_print_group_differences(data, 'Missing Fraction', fractions)





Average Difference in Predicitve Performance between full, perturbed dataset and subset 0.008234004080140972
Average Difference in Predicitve Performance between full, perturbed dataset and subset as absolute Values 0.0384429084759585
____________________


Random Forest Average Difference in Predicitve Performance between full, perturbed dataset and subset 0.007353208710982208
Random Forest Average Difference in Predicitve Performance between full, perturbed dataset and subset as absolute Values 0.03913281059292054
____________________


KNN Average Difference in Predicitve Performance between full, perturbed dataset and subset 0.009901649283874882
KNN Average Difference in Predicitve Performance between full, perturbed dataset and subset as absolute Values 0.03803782154563733
____________________


Mean/Mode Average Difference in Predicitve Performance between full, perturbed dataset and subset 0.003789291411343316
Mean/Mode Average Difference in Predicitve Performance between full, 



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [205]:
# Copy of the current frame with the missingness fraction and type as strings
data_heatmaps = data.astype({'Missing Fraction': str, 'Missing Type': str})

In [207]:
# Heatmap with differences per Dataconstellation relative to full, perturbed dataset
# Positive value indicates, that full, corrupted dataset score is better than the score for the subset
data_heat = data.copy()
# presumably cast to string so the task ids are drawn as categories, not numbers -- TODO confirm
data_heat = data_heat.astype({"Task":"string"})
data_constellations = ['MAR - 0.01', 'MAR - 0.1', 'MAR - 0.3', 'MAR - 0.5', 'MCAR - 0.01', 'MCAR - 0.1', 'MCAR - 0.3', 'MCAR - 0.5', 'MNAR - 0.01', 'MNAR - 0.1', 'MNAR - 0.3', 'MNAR - 0.5']

# One figure per data constellation, shown inline and written to a PDF
for constellation in data_constellations:
    data_constel = data_heat.loc[data_heat['Data_Constellation'] == constellation]

    ### uncomment whatever you want to investigate

    ## sort by amount datapoints (ascending)
    data_constel = data_constel.sort_values(by=['NumberOfInstances'])

    ## sort by amount of features (ascending)
    #data_constel = data_constel.sort_values(by=['NumberOfFeatures'])

    ## sort by amount of datapoints and features (ascending)
    #data_constel = data_constel.sort_values(by=['NumberOfInstances', 'NumberOfFeatures'])

    ## sort by amount of categorical features and datapoints (ascending)
    #data_constel = data_constel.sort_values(by=['NumberOfCategoricalFeatures', 'NumberOfInstances'])

    ## sort by amount of numerical features and datapoints (ascending)
    #data_constel = data_constel.sort_values(by=['NumberOfNumericFeatures', 'NumberOfInstances'])

    trace = go.Heatmap(
        z=data_constel["Performance Difference Imputed to Subset"],
        x=data_constel["Task"],
        y=data_constel["Imputation_Method"],
        type='heatmap',
        autocolorscale=False,
        colorscale='RdBu_r',
        # fixed, symmetric colour range so that all figures are comparable
        zmid=0,
        zmin=(-0.11),
        zmax=0.11,
    )
    # Pass the trace list directly: binding it to the name `data` would replace
    # the merged DataFrame with a list and break a re-run of this cell.
    fig = go.Figure(data=[trace])
    fig.update_layout(
        title=constellation,
        xaxis_nticks=36)
    fig.show()
    fig.write_image("binary_heatmap_f1_score_improvement_Imputed_to_Subset%s.pdf" % constellation)

In [208]:
# Bin every experiment by its "Performance Difference Imputed to Subset" value.
#
# Eleven bins are produced, each a half-open interval (lower, upper]:
#   df_10: <= -0.09, df_09: (-0.09, -0.07], ..., df_01: (-0.01, 0.01], ...,
#   df07: (0.07, 0.09], df09: > 0.09
#
# Rows are selected positively with boolean masks instead of dropping the index
# labels of everything outside the interval. This keeps rows whose difference is
# NaN out of the bins (they matched no drop condition and therefore used to
# survive in *every* bin) and stays correct if the index contains duplicated labels.
df_quantiles = data_heatmaps.copy()

#df_quantiles = df_quantiles.drop(df_quantiles[df_quantiles["Imputation_Method"] == AVERAGE_BEST_IMPUTATION_METHOD].index)

performance_diff = df_quantiles["Performance Difference Imputed to Subset"]

# Interior bin boundaries; the two outermost bins are open-ended.
inner_edges = [-0.09, -0.07, -0.05, -0.03, -0.01, 0.01, 0.03, 0.05, 0.07, 0.09]

# Lowest bin: everything up to and including the first boundary.
df_10 = df_quantiles[performance_diff <= inner_edges[0]]

# The nine interior bins, in ascending order of their boundaries.
(df_09, df_07, df_05, df_03, df_01, df01, df03, df05, df07) = [
    df_quantiles[(performance_diff > lower) & (performance_diff <= upper)]
    for lower, upper in zip(inner_edges[:-1], inner_edges[1:])
]

# Highest bin: everything strictly above the last boundary.
df09 = df_quantiles[performance_diff > inner_edges[-1]]



In [209]:
# Count the experiments in each performance-difference bin and collect the
# counts, together with a readable label per bin, in one small summary frame.
binned_frames = (df_10, df_09, df_07, df_05, df_03, df_01, df01, df03, df05, df07, df09)

quantile_freq = [len(frame.index) for frame in binned_frames]

# Keep the individual counters available under their original names.
(
    len_df_10, len_df_09, len_df_07, len_df_05, len_df_03, len_df_01,
    len_df01, len_df03, len_df05, len_df07, len_df09,
) = quantile_freq
print(quantile_freq)


# Bin labels, in the same order as `binned_frames`.
quantiles = [
    'less than -0.09',
    '-0.09 to -0.07',
    '-0.07 to -0.05',
    '-0.05 to -0.03',
    '-0.03 to -0.01',
    '-0.01 to 0.01',
    '0.01 to 0.03',
    '0.03 to 0.05',
    '0.05 to 0.07',
    '0.07 to 0.09',
    'more than 0.09',
]
print(quantiles)

improvement_quantiles = pd.DataFrame({
    'Performance Difference Imputed to Subset': quantiles,
    'Amount': quantile_freq,
})


[70, 63, 85, 117, 257, 552, 376, 181, 137, 72, 126]
['less than -0.09', '-0.09 to -0.07', '-0.07 to -0.05', '-0.05 to -0.03', '-0.03 to -0.01', '-0.01 to 0.01', '0.01 to 0.03', '0.03 to 0.05', '0.05 to 0.07', '0.07 to 0.09', 'more than 0.09']


In [210]:
# Bar chart: number of experiments per performance-difference bin.
fig = px.bar(
    data_frame=improvement_quantiles,
    x='Performance Difference Imputed to Subset',
    y='Amount',
)
fig.show()
fig.write_image("binary_performance_difference_imputed_to_subset.pdf")


In [211]:
# Break the per-bin counts down by imputation method:
# for every method, count its experiments in each performance-difference bin.

quantile_datasets = [df_10, df_09, df_07, df_05, df_03, df_01, df01, df03, df05, df07, df09]

methods = ['Random Forest', 'KNN', 'Mean/Mode', 'VAE', 'GAIN', 'Discriminative DL']
#methods.remove(AVERAGE_BEST_IMPUTATION_METHOD)
print(methods)

# method name -> list with one count per bin (same order as `quantile_datasets`).
# Note: rows are matched with `str.contains`, i.e. the method name is used as a (regex) substring pattern.
counts_per_method = {
    method: [
        len(frame[frame['Imputation_Method'].str.contains(method)].index)
        for frame in quantile_datasets
    ]
    for method in methods
}

forest_freq = counts_per_method['Random Forest']
knn_freq = counts_per_method['KNN']
mode_freq = counts_per_method['Mean/Mode']
dl_freq = counts_per_method['Discriminative DL']
vae_freq = counts_per_method['VAE']
gain_freq = counts_per_method['GAIN']

for method_counts in (forest_freq, knn_freq, mode_freq, dl_freq, vae_freq, gain_freq):
    print(method_counts)

['Random Forest', 'KNN', 'Mean/Mode', 'VAE', 'GAIN', 'Discriminative DL']
[15, 11, 15, 20, 37, 96, 72, 36, 25, 17, 16]
[12, 10, 13, 20, 43, 102, 69, 27, 27, 11, 26]
[17, 14, 21, 17, 44, 90, 78, 28, 17, 14, 20]
[10, 7, 13, 18, 46, 98, 55, 38, 22, 11, 25]
[7, 12, 16, 21, 55, 100, 69, 27, 23, 7, 23]
[9, 9, 7, 21, 32, 66, 33, 25, 23, 12, 16]


In [212]:
# Stacked bar chart: experiments per performance-difference bin, one stack layer per imputation method.
quantiles = [
    'less than -0.09',
    '-0.09 to -0.07',
    '-0.07 to -0.05',
    '-0.05 to -0.03',
    '-0.03 to -0.01',
    '-0.01 to 0.01',
    '0.01 to 0.03',
    '0.03 to 0.05',
    '0.05 to 0.07',
    '0.07 to 0.09',
    'more than 0.09',
]

# (legend label, counts per bin), listed in stacking order.
method_layers = [
    ('Random Forest', forest_freq),
    ('KNN', knn_freq),
    ('Mean/Mode', mode_freq),
    ('Discriminative DL', dl_freq),
    ('VAE', vae_freq),
    ('GAIN', gain_freq),
]

fig = go.Figure(data=[
    go.Bar(name=label, x=quantiles, y=counts)
    for label, counts in method_layers
])
# Stack the layers on top of each other instead of grouping them side by side.
fig.update_layout(barmode='stack')
fig.show()
fig.write_image("binary_performance_difference_imputed_to_subset_per_method.pdf")

In [213]:
# Break the per-bin counts down by missing fraction:
# for every fraction, count its experiments in each performance-difference bin.

quantile_datasets = [df_10, df_09, df_07, df_05, df_03, df_01, df01, df03, df05, df07, df09]

fractions = ['0.01', '0.1', '0.3', '0.5']
#print(fractions)

# fraction label -> list with one count per bin (same order as `quantile_datasets`).
# Note: `str.contains` treats the label as a regex substring pattern and requires
# the 'Missing Fraction' column to hold strings.
counts_per_fraction = {
    fraction: [
        len(frame[frame['Missing Fraction'].str.contains(fraction)].index)
        for frame in quantile_datasets
    ]
    for fraction in fractions
}

freq_001 = counts_per_fraction['0.01']
freq_01 = counts_per_fraction['0.1']
freq_03 = counts_per_fraction['0.3']
freq_05 = counts_per_fraction['0.5']

for fraction_counts in (freq_001, freq_01, freq_03, freq_05):
    print(fraction_counts)

['0.01', '0.1', '0.3', '0.5']
[7, 19, 15, 31, 75, 136, 96, 37, 29, 25, 38]
[20, 17, 25, 22, 64, 135, 104, 44, 30, 18, 30]
[28, 12, 25, 28, 61, 134, 87, 51, 37, 14, 31]
[15, 15, 20, 36, 57, 147, 89, 49, 41, 15, 27]


In [214]:
# Stacked bar chart: experiments per performance-difference bin, one stack layer per missing fraction.
quantiles = [
    'less than -0.09',
    '-0.09 to -0.07',
    '-0.07 to -0.05',
    '-0.05 to -0.03',
    '-0.03 to -0.01',
    '-0.01 to 0.01',
    '0.01 to 0.03',
    '0.03 to 0.05',
    '0.05 to 0.07',
    '0.07 to 0.09',
    'more than 0.09',
]

# (legend label, counts per bin, bar colour), listed in stacking order.
fraction_layers = [
    ('1% Missing Data', freq_001, '#FD3216'),
    ('10% Missing Data', freq_01, '#00FE35'),
    ('30% Missing Data', freq_03, '#511CFB'),
    ('50% Missing Data', freq_05, '#FF7F0E'),
]

fig = go.Figure(data=[
    go.Bar(name=label, x=quantiles, y=counts, marker_color=colour)
    for label, counts, colour in fraction_layers
])
# Stack the layers on top of each other instead of grouping them side by side.
fig.update_layout(barmode='stack')
fig.show()
fig.write_image("binary_performance_difference_imputed_to_subset_per_frac.pdf")

In [215]:
# Break the per-bin counts down by missingness pattern (MCAR / MAR / MNAR):
# for every pattern, count its experiments in each performance-difference bin.
#
# NOTE: the variable names are inherited from the per-fraction cell and do not
# describe their content here: `fractions` holds the pattern names and
# freq_001 / freq_01 / freq_03 hold the MCAR / MAR / MNAR counts. They are kept
# because the plotting cell that follows reads them under these names.

quantile_datasets = [df_10, df_09, df_07, df_05, df_03, df_01, df01, df03, df05, df07, df09]

fractions = ['MCAR', 'MAR', 'MNAR']
#print(fractions)

# pattern name -> list with one count per bin (same order as `quantile_datasets`).
# Note: `str.contains` treats the pattern name as a regex substring pattern.
counts_per_pattern = {
    pattern: [
        len(frame[frame['Missing Type'].str.contains(pattern)].index)
        for frame in quantile_datasets
    ]
    for pattern in fractions
}

freq_001 = counts_per_pattern['MCAR']
freq_01 = counts_per_pattern['MAR']
freq_03 = counts_per_pattern['MNAR']
# Not filled in this cell; it is only reset to an empty list.
freq_05 = []

for pattern_counts in (freq_001, freq_01, freq_03):
    print(pattern_counts)

['MCAR', 'MAR', 'MNAR']
[22, 22, 25, 42, 79, 184, 133, 55, 48, 23, 44]
[27, 21, 33, 43, 86, 188, 116, 63, 44, 22, 38]
[21, 20, 27, 32, 92, 180, 127, 63, 45, 27, 44]


In [216]:
# Stacked bar chart: experiments per performance-difference bin, one stack layer per missingness pattern.
quantiles = [
    'less than -0.09',
    '-0.09 to -0.07',
    '-0.07 to -0.05',
    '-0.05 to -0.03',
    '-0.03 to -0.01',
    '-0.01 to 0.01',
    '0.01 to 0.03',
    '0.03 to 0.05',
    '0.05 to 0.07',
    '0.07 to 0.09',
    'more than 0.09',
]

# (legend label, counts per bin, bar colour), listed in stacking order.
# freq_001 / freq_01 / freq_03 are plotted as the MCAR / MAR / MNAR layers.
pattern_layers = [
    ('MCAR', freq_001, '#222A2A'),
    ('MAR', freq_01, '#B68100'),
    ('MNAR', freq_03, '#750D86'),
]

fig = go.Figure(data=[
    go.Bar(name=label, x=quantiles, y=counts, marker_color=colour)
    for label, counts, colour in pattern_layers
])
# Stack the layers on top of each other instead of grouping them side by side.
fig.update_layout(barmode='stack')
fig.show()
fig.write_image("binary_performance_difference_imputed_to_subset_per_patt.pdf")

## Comparison based on Rank

In [217]:

# Work on an independent (deep) copy so the rank comparison cannot modify `data_heatmaps`.
subset_rank_acc = data_heatmaps.copy(deep=True)


Unnamed: 0,Imputation_Method,Task,Missing Type,Missing Fraction,Column,result_type,metric,Baseline_x,Corrupted,Imputed,...,Downstream Performance Rank,Data_Constellation,Data_Constellation_full,Performance Difference to Average Best,Imputed_Subset,Downstream Performance Rank Subset,Performance Difference to Average Best Subset,Baseline,Performance Difference Baseline to Imputed,Performance Difference Imputed to Subset
0,Random Forest,137,MAR,0.01,top-middle-square,downstream_performance_mean,Classification Tasks,0.674909,0.0,0.674760,...,1.0,MAR - 0.01,MAR - 0.01 - 137,0.000000e+00,0.698074,5.0,-0.001143,0.667244,-0.007516,-0.023314
1,KNN,137,MAR,0.01,top-middle-square,downstream_performance_mean,Classification Tasks,0.674729,0.0,0.674760,...,2.0,MAR - 0.01,MAR - 0.01 - 137,-2.949553e-08,0.698074,4.0,-0.001143,0.667244,-0.007516,-0.023314
2,Mean/Mode,137,MAR,0.01,top-middle-square,downstream_performance_mean,Classification Tasks,0.674145,0.0,0.674116,...,5.0,MAR - 0.01,MAR - 0.01 - 137,-6.439471e-04,0.699217,2.0,0.000000,0.667244,-0.006872,-0.025101
3,VAE,137,MAR,0.01,top-middle-square,downstream_performance_mean,Classification Tasks,0.674729,0.0,0.674488,...,3.0,MAR - 0.01,MAR - 0.01 - 137,-2.716528e-04,0.699699,1.0,0.000482,0.667244,-0.007244,-0.025211
4,Discriminative DL,137,MAR,0.01,top-middle-square,downstream_performance_mean,Classification Tasks,0.674505,0.0,0.674251,...,4.0,MAR - 0.01,MAR - 0.01 - 137,-5.093850e-04,0.694867,6.0,-0.004350,0.667244,-0.007006,-0.020616
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2031,KNN,42493,MNAR,0.5,Length,downstream_performance_mean,Classification Tasks,0.619680,0.0,0.620028,...,6.0,MNAR - 0.5,MNAR - 0.5 - 42493,-2.447951e-03,0.606716,2.0,0.014281,0.623939,0.003911,0.013312
2032,Mean/Mode,42493,MNAR,0.5,Length,downstream_performance_mean,Classification Tasks,0.627750,0.0,0.627879,...,1.0,MNAR - 0.5,MNAR - 0.5 - 42493,5.402509e-03,0.592436,5.0,0.000000,0.620623,-0.007256,0.035443
2033,VAE,42493,MNAR,0.5,Length,downstream_performance_mean,Classification Tasks,0.625687,0.0,0.626914,...,2.0,MNAR - 0.5,MNAR - 0.5 - 42493,4.437433e-03,0.611129,1.0,0.018693,0.620623,-0.006291,0.015785
2034,GAIN,42493,MNAR,0.5,Length,downstream_performance_mean,Classification Tasks,0.625497,0.0,0.626178,...,3.0,MNAR - 0.5,MNAR - 0.5 - 42493,3.701885e-03,0.592837,4.0,0.000401,0.620623,-0.005555,0.033342


In [218]:
# Keep only the rows ranked first on the full (corrupted) data, then split them
# by whether the same row is also ranked first on the subset.
ranked_first_on_full = subset_rank_acc['Downstream Performance Rank'] == 1.0
subset_rank_acc = subset_rank_acc[ranked_first_on_full]
print(len(subset_rank_acc))

ranked_first_on_subset = subset_rank_acc['Downstream Performance Rank Subset'] == 1.0

# Rank 1 on both the full data and the subset.
subset_rank_acc_right = subset_rank_acc[ranked_first_on_subset]
print(len(subset_rank_acc_right))
#subset_rank_acc_right.to_csv('subset_rank_acc_right.csv')

# Rank 1 on the full data only.
subset_rank_acc_wrong = subset_rank_acc[~ranked_first_on_subset]
print(len(subset_rank_acc_wrong))
#subset_rank_acc_wrong.to_csv('subset_rank_acc_wrong.csv')

352
86
266


In [219]:
# Number of rows in `subset_rank_acc` and the mean of their
# 'Performance Difference to Average Best Subset' values.
print(subset_rank_acc.shape[0])
subset_rank_acc_mean_diff = subset_rank_acc.loc[:, 'Performance Difference to Average Best Subset'].mean()
print(subset_rank_acc_mean_diff)

352
0.00024179474886399074


In [220]:
#subset_rank_acc_right

Unnamed: 0,Imputation_Method,Task,Missing Type,Missing Fraction,Column,result_type,metric,Baseline_x,Corrupted,Imputed,...,Downstream Performance Rank,Data_Constellation,Data_Constellation_full,Performance Difference to Average Best,Imputed_Subset,Downstream Performance Rank Subset,Performance Difference to Average Best Subset,Baseline,Performance Difference Baseline to Imputed,Performance Difference Imputed to Subset
16,KNN,137,MAR,0.5,top-middle-square,downstream_performance_mean,Classification Tasks,0.677718,0.0,0.678583,...,1.0,MAR - 0.5,MAR - 0.5 - 137,0.013326,0.706240,1.0,0.008765,0.667244,-0.011339,-0.027656
35,Random Forest,137,MCAR,0.5,top-middle-square,downstream_performance_mean,Classification Tasks,0.673273,0.0,0.673426,...,1.0,MCAR - 0.5,MCAR - 0.5 - 137,0.000000,0.705479,1.0,0.004575,0.667244,-0.006182,-0.032053
75,Random Forest,151,MAR,0.5,nswprice,downstream_performance_mean,Classification Tasks,0.683725,0.0,0.707864,...,1.0,MAR - 0.5,MAR - 0.5 - 151,0.000000,0.716061,1.0,0.033839,0.747959,0.040096,-0.008197
90,Random Forest,151,MCAR,0.3,nswprice,downstream_performance_mean,Classification Tasks,0.735629,0.0,0.735467,...,1.0,MCAR - 0.3,MCAR - 0.3 - 151,0.000000,0.747903,1.0,0.029276,0.747959,0.012493,-0.012437
95,Random Forest,151,MCAR,0.5,nswprice,downstream_performance_mean,Classification Tasks,0.729388,0.0,0.728413,...,1.0,MCAR - 0.5,MCAR - 0.5 - 151,0.000000,0.730254,1.0,0.026433,0.747959,0.019546,-0.001840
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1849,GAIN,42192,MAR,0.1,age,downstream_performance_mean,Classification Tasks,0.664905,0.0,0.664171,...,1.0,MAR - 0.1,MAR - 0.1 - 42192,0.000780,0.709289,1.0,0.002464,0.655712,-0.008459,-0.045118
1893,Random Forest,42192,MNAR,0.1,age,downstream_performance_mean,Classification Tasks,0.659903,0.0,0.661885,...,1.0,MNAR - 0.1,MNAR - 0.1 - 42192,0.000000,0.708710,1.0,0.009748,0.655712,-0.006173,-0.046825
1954,Discriminative DL,42477,MNAR,0.01,x1,downstream_performance_mean,Classification Tasks,0.627422,0.0,0.627920,...,1.0,MNAR - 0.01,MNAR - 0.01 - 42477,0.000602,0.626449,1.0,0.023694,0.662695,0.034775,0.001471
1993,Random Forest,42493,MCAR,0.01,Length,downstream_performance_mean,Classification Tasks,0.626305,0.0,0.626669,...,1.0,MCAR - 0.01,MCAR - 0.01 - 42493,0.000000,0.614237,1.0,0.001446,0.620623,-0.006046,0.012433


In [222]:
# Per imputation method: number of rows in `subset_rank_acc_right`
# (rank 1 on the full data and rank 1 on the subset).
methods = ['Random Forest', 'KNN', 'Mean/Mode', 'VAE', 'GAIN', 'Discriminative DL']
#methods.remove(AVERAGE_BEST_IMPUTATION_METHOD)
print(methods)

# method name -> single-element list holding the row count for that method.
right_counts = {
    method: [len(subset_rank_acc_right[subset_rank_acc_right['Imputation_Method'] == method].index)]
    for method in methods
}

forest_freq = right_counts['Random Forest']
knn_freq = right_counts['KNN']
mode_freq = right_counts['Mean/Mode']
dl_freq = right_counts['Discriminative DL']
vae_freq = right_counts['VAE']
gain_freq = right_counts['GAIN']

print("Subset with same predicition as full, corrupted")
for method_count, short_label in (
    (forest_freq, 'Random Forest'),
    (knn_freq, 'KNN'),
    (mode_freq, 'Mode'),
    (dl_freq, 'DL'),
    (vae_freq, 'VAE'),
    (gain_freq, 'GAIN'),
):
    print(method_count, short_label)

['Random Forest', 'KNN', 'Mean/Mode', 'VAE', 'GAIN', 'Discriminative DL']
Subset with same predicition as full, corrupted
[21] Random Forest
[11] KNN
[31] Mode
[8] DL
[6] VAE
[9] GAIN


In [223]:
# Per imputation method: number of rows in `subset_rank_acc_wrong`
# (rank 1 on the full data but not rank 1 on the subset).
methods = ['Random Forest', 'KNN', 'Mean/Mode', 'VAE', 'GAIN', 'Discriminative DL']
#methods.remove(AVERAGE_BEST_IMPUTATION_METHOD)
print(methods)

# method name -> single-element list holding the row count for that method.
wrong_counts = {
    method: [len(subset_rank_acc_wrong[subset_rank_acc_wrong['Imputation_Method'] == method].index)]
    for method in methods
}

forest_freq = wrong_counts['Random Forest']
knn_freq = wrong_counts['KNN']
mode_freq = wrong_counts['Mean/Mode']
dl_freq = wrong_counts['Discriminative DL']
vae_freq = wrong_counts['VAE']
gain_freq = wrong_counts['GAIN']

print("Subset with different predicition than full, corrupted")
for method_count, short_label in (
    (forest_freq, 'Random Forest'),
    (knn_freq, 'KNN'),
    (mode_freq, 'Mode'),
    (dl_freq, 'DL'),
    (vae_freq, 'VAE'),
    (gain_freq, 'GAIN'),
):
    print(method_count, short_label)

['Random Forest', 'KNN', 'Mean/Mode', 'VAE', 'GAIN', 'Discriminative DL']
Subset with different predicition than full, corrupted
[68] Random Forest
[49] KNN
[34] Mode
[40] DL
[46] VAE
[29] GAIN
