# Multiclass Classification - Comparisons Imputed - Baseline - Subset

This notebook compares the performance of the imputation experiments with the baseline (training on fully observed data). It also compares the imputation performance on the complete, perturbed dataset with the performance on the subset of the dataset.
  
    
Split into Binary Classification, Multiclass Classification and Regression  


In [124]:
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import os
import pandas as pd
import re
import seaborn as sns

from pathlib import Path

import plotly as py
import plotly.express as px
import plotly.graph_objects as go
import xarray as xr

In [125]:
# Configuration constants and raw experiment results for the baseline comparison.

# Display labels that later replace the raw metric names "F1_macro" / "RMSE".
CLF_METRIC = "Classification Tasks"
REG_METRIC = "Regression Tasks"

# Values of the "result_type" column that the notebook filters on.
DOWNSTREAM_RESULT_TYPE = "downstream_performance_mean"
IMPUTE_RESULT_TYPE = "impute_performance_mean"

FIGURES_PATH = Path("../paper/figures/")

# Raw results; the Data_Constellation_full key (Missing Type, Fraction, Task) is added in a later cell.
baseline = pd.read_csv('multiclass_classification_fixed_seed.csv')

# Imputation-performance rows for the tracked metrics, without the downstream score columns.
# drop() returns a new frame here instead of mutating a filtered slice in place,
# which is what triggered pandas' SettingWithCopyWarning.
is_impute_result = baseline["result_type"] == IMPUTE_RESULT_TYPE
has_tracked_metric = baseline["metric"].isin(["F1_macro", "RMSE"])
na_impute_results = baseline[is_impute_result & has_tracked_metric].drop(
    columns=["baseline", "corrupted", "imputed"]
)

# Keep only the rows that still contain at least one missing value;
# the next cell treats these as experiments where imputation failed.
na_impute_results = na_impute_results[na_impute_results.isna().any(axis=1)]
na_impute_results.shape






A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



(45, 11)

In [126]:
STRATEGY_TYPE = "single_single"

# Keep downstream-performance rows of the tracked metrics for the chosen strategy.
baseline = baseline[
    (baseline["result_type"] == DOWNSTREAM_RESULT_TYPE)
    & baseline["metric"].isin(["F1_macro", "RMSE"])
    & (baseline["strategy"] == STRATEGY_TYPE)
]

# remove experiments where imputation failed:
# a left merge with indicator=True marks rows that also occur in na_impute_results
# as "both"; only the "left_only" rows are kept.
baseline = baseline.merge(
    na_impute_results,
    how="left",
    validate="one_to_one",
    indicator=True,
    suffixes=("", "_imp"),
    on=["experiment", "imputer", "task", "missing_type", "missing_fraction", "strategy", "column"],
)
baseline = baseline[baseline["_merge"] == "left_only"]

assert baseline["strategy"].nunique(dropna=False) == 1

# Drop the bookkeeping columns. drop() returns a new frame rather than
# mutating the filtered slice in place (avoids SettingWithCopyWarning).
baseline = baseline.drop(
    columns=["experiment", "strategy", "result_type_imp", "metric_imp", "train", "test", "train_imp", "test_imp", "_merge"]
)


In [127]:
# Raw column names -> labels used in the rest of the notebook.
column_labels = {
    "imputer": "Imputation_Method",
    "task": "Task",
    "missing_type": "Missing Type",
    "missing_fraction": "Missing Fraction",
    "column": "Column",
    "baseline": "Baseline",
    "imputed": "Imputed",
    "corrupted": "Corrupted",
}

# Imputer class names -> display names.
rename_imputer_dict = {
    "ModeImputer": "Mean/Mode",
    "KNNImputer": "KNN",
    "ForestImputer": "Random Forest",
    "AutoKerasImputer": "Discriminative DL",
    "VAEImputer": "VAE",
    "GAINImputer": "GAIN",
}

# Metric names -> task-type labels.
rename_metric_dict = {"F1_macro": CLF_METRIC, "RMSE": REG_METRIC}

baseline = (
    baseline
    .rename(columns=column_labels)
    .replace(rename_imputer_dict)
    .replace(rename_metric_dict)
)

# The three key parts are cast to str so they can be joined into one key below.
for key_part in ['Missing Type', 'Missing Fraction', 'Task']:
    baseline[key_part] = baseline[key_part].astype(str)

# Key format: "<Missing Type> - <Missing Fraction> - <Task>"
baseline['Data_Constellation_full'] = baseline['Missing Type'].str.cat(
    [baseline['Missing Fraction'], baseline['Task']], sep=' - '
)

baseline

Unnamed: 0,Imputation_Method,Task,Missing Type,Missing Fraction,Column,result_type,metric,Baseline,Corrupted,Imputed,Data_Constellation_full
0,Discriminative DL,1459,MAR,0.01,V7,downstream_performance_mean,Classification Tasks,0.316824,0.313480,0.316924,MAR - 0.01 - 1459
1,Discriminative DL,1459,MAR,0.1,V7,downstream_performance_mean,Classification Tasks,0.316824,0.284694,0.316393,MAR - 0.1 - 1459
2,Discriminative DL,1459,MAR,0.3,V7,downstream_performance_mean,Classification Tasks,0.316824,0.241874,0.314436,MAR - 0.3 - 1459
3,Discriminative DL,1459,MAR,0.5,V7,downstream_performance_mean,Classification Tasks,0.316824,0.193047,0.315074,MAR - 0.5 - 1459
4,Discriminative DL,1459,MCAR,0.01,V7,downstream_performance_mean,Classification Tasks,0.316824,0.313858,0.316344,MCAR - 0.01 - 1459
...,...,...,...,...,...,...,...,...,...,...,...
1219,VAE,6,MCAR,0.5,x-box,downstream_performance_mean,Classification Tasks,0.722137,0.670112,0.713092,MCAR - 0.5 - 6
1220,VAE,6,MNAR,0.01,x-box,downstream_performance_mean,Classification Tasks,0.722137,0.720739,0.722054,MNAR - 0.01 - 6
1221,VAE,6,MNAR,0.1,x-box,downstream_performance_mean,Classification Tasks,0.722137,0.713637,0.721427,MNAR - 0.1 - 6
1222,VAE,6,MNAR,0.3,x-box,downstream_performance_mean,Classification Tasks,0.722137,0.689145,0.721398,MNAR - 0.3 - 6


In [128]:
# Only the method, its baseline score and the merge key are needed from here on.
baseline_columns = ['Imputation_Method', 'Baseline', 'Data_Constellation_full']
baseline = baseline.loc[:, baseline_columns]


In [129]:
# Load the result tables for the comparison.

# Imputation performance (full info)
imputation = pd.read_csv('../Multi/multi_imputed_full_info.csv')

# Subset performance (full info). The generic difference column gets a "Subset" suffix
# before the table is reduced to the columns listed below.
subset_columns = [
    'Imputation_Method',
    'Imputed_Subset',
    'Downstream Performance Rank Subset',
    'Performance Difference to Average Best Subset',
    'Data_Constellation_full',
]
subset = (
    pd.read_csv('../Subset - Multi/multi_subset_full_info.csv')
    .rename(columns={"Performance Difference to Average Best": "Performance Difference to Average Best Subset"})
    .loc[:, subset_columns]
)

subset

Unnamed: 0,Imputation_Method,Imputed_Subset,Downstream Performance Rank Subset,Performance Difference to Average Best Subset,Data_Constellation_full
0,Random Forest,0.640391,4.0,-0.005210,MAR - 0.01 - 6
1,KNN,0.645601,1.0,0.000000,MAR - 0.01 - 6
2,Mean/Mode,0.639284,5.0,-0.006317,MAR - 0.01 - 6
3,VAE,0.642368,2.0,-0.003233,MAR - 0.01 - 6
4,GAIN,0.640532,3.0,-0.005069,MAR - 0.01 - 6
...,...,...,...,...,...
1217,KNN,0.236748,3.0,0.000000,MNAR - 0.5 - 41671
1218,Mean/Mode,0.212846,4.0,-0.023902,MNAR - 0.5 - 41671
1219,VAE,0.264038,1.0,0.027290,MNAR - 0.5 - 41671
1220,GAIN,0.254694,2.0,0.017946,MNAR - 0.5 - 41671


In [130]:
# Combine imputation (full set), subset and baseline results into one frame.
# Rows are matched on the data constellation (Missing Type, Fraction, Task) and the imputation method.
merge_keys = ['Data_Constellation_full', 'Imputation_Method']

data = (
    imputation
    .merge(subset, on=merge_keys)
    .merge(baseline, on=merge_keys)
    # The baseline table's score arrives as "Baseline_y" after the second merge; restore its name.
    .rename(columns={"Baseline_y": "Baseline"})
)

data


Unnamed: 0,Imputation_Method,Task,Missing Type,Missing Fraction,Column,result_type,metric,Baseline_x,Corrupted,Imputed,...,NumberOfCategoricalFeatures,NumberOfClasses,Downstream Performance Rank,Data_Constellation,Data_Constellation_full,Performance Difference to Average Best,Imputed_Subset,Downstream Performance Rank Subset,Performance Difference to Average Best Subset,Baseline
0,Random Forest,6,MAR,0.01,x-box,downstream_performance_mean,F1_macro,0.727330,0.0,0.727075,...,1.0,26.0,3.0,MAR - 0.01,MAR - 0.01 - 6,0.000000,0.640391,4.0,-0.005210,0.722137
1,KNN,6,MAR,0.01,x-box,downstream_performance_mean,F1_macro,0.727724,0.0,0.727724,...,1.0,26.0,2.0,MAR - 0.01,MAR - 0.01 - 6,0.000649,0.645601,1.0,0.000000,0.722137
2,Mean/Mode,6,MAR,0.01,x-box,downstream_performance_mean,F1_macro,0.725643,0.0,0.725766,...,1.0,26.0,5.0,MAR - 0.01,MAR - 0.01 - 6,-0.001309,0.639284,5.0,-0.006317,0.722137
3,VAE,6,MAR,0.01,x-box,downstream_performance_mean,F1_macro,0.725821,0.0,0.725778,...,1.0,26.0,4.0,MAR - 0.01,MAR - 0.01 - 6,-0.001296,0.642368,2.0,-0.003233,0.722137
4,Discriminative DL,6,MAR,0.01,x-box,downstream_performance_mean,F1_macro,0.727469,0.0,0.727828,...,1.0,26.0,1.0,MAR - 0.01,MAR - 0.01 - 6,0.000753,0.637839,6.0,-0.007762,0.722137
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1166,Random Forest,41671,MNAR,0.50,a9,downstream_performance_mean,F1_macro,0.240619,0.0,0.240619,...,1.0,5.0,6.0,MNAR - 0.5,MNAR - 0.5 - 41671,0.000000,0.188638,6.0,-0.048110,0.220637
1167,KNN,41671,MNAR,0.50,a9,downstream_performance_mean,F1_macro,0.264992,0.0,0.265395,...,1.0,5.0,1.0,MNAR - 0.5,MNAR - 0.5 - 41671,0.024776,0.236748,3.0,0.000000,0.220637
1168,Mean/Mode,41671,MNAR,0.50,a9,downstream_performance_mean,F1_macro,0.239275,0.0,0.240780,...,1.0,5.0,4.0,MNAR - 0.5,MNAR - 0.5 - 41671,0.000160,0.212846,4.0,-0.023902,0.220637
1169,VAE,41671,MNAR,0.50,a9,downstream_performance_mean,F1_macro,0.241995,0.0,0.242421,...,1.0,5.0,3.0,MNAR - 0.5,MNAR - 0.5 - 41671,0.001801,0.264038,1.0,0.027290,0.220637


In [131]:
#data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1171 entries, 0 to 1170
Data columns (total 27 columns):
 #   Column                                         Non-Null Count  Dtype  
---  ------                                         --------------  -----  
 0   Imputation_Method                              1171 non-null   object 
 1   Task                                           1171 non-null   int64  
 2   Missing Type                                   1171 non-null   object 
 3   Missing Fraction                               1171 non-null   float64
 4   Column                                         1171 non-null   object 
 5   result_type                                    1171 non-null   object 
 6   metric                                         1171 non-null   object 
 7   Baseline_x                                     1171 non-null   float64
 8   Corrupted                                      1171 non-null   float64
 9   Imputed                                        1171 

## Baseline Comparisons

In [133]:
# Row-wise difference in F1 Score/RMSE between Baseline and Imputed (Baseline minus Imputed),
# stored in a new column for each data constellation.
data['Performance Difference Baseline to Imputed'] = data['Baseline'].sub(data['Imputed'])
data

Unnamed: 0,Imputation_Method,Task,Missing Type,Missing Fraction,Column,result_type,metric,Baseline_x,Corrupted,Imputed,...,NumberOfClasses,Downstream Performance Rank,Data_Constellation,Data_Constellation_full,Performance Difference to Average Best,Imputed_Subset,Downstream Performance Rank Subset,Performance Difference to Average Best Subset,Baseline,Performance Difference Baseline to Imputed
0,Random Forest,6,MAR,0.01,x-box,downstream_performance_mean,F1_macro,0.727330,0.0,0.727075,...,26.0,3.0,MAR - 0.01,MAR - 0.01 - 6,0.000000,0.640391,4.0,-0.005210,0.722137,-0.004937
1,KNN,6,MAR,0.01,x-box,downstream_performance_mean,F1_macro,0.727724,0.0,0.727724,...,26.0,2.0,MAR - 0.01,MAR - 0.01 - 6,0.000649,0.645601,1.0,0.000000,0.722137,-0.005587
2,Mean/Mode,6,MAR,0.01,x-box,downstream_performance_mean,F1_macro,0.725643,0.0,0.725766,...,26.0,5.0,MAR - 0.01,MAR - 0.01 - 6,-0.001309,0.639284,5.0,-0.006317,0.722137,-0.003628
3,VAE,6,MAR,0.01,x-box,downstream_performance_mean,F1_macro,0.725821,0.0,0.725778,...,26.0,4.0,MAR - 0.01,MAR - 0.01 - 6,-0.001296,0.642368,2.0,-0.003233,0.722137,-0.003641
4,Discriminative DL,6,MAR,0.01,x-box,downstream_performance_mean,F1_macro,0.727469,0.0,0.727828,...,26.0,1.0,MAR - 0.01,MAR - 0.01 - 6,0.000753,0.637839,6.0,-0.007762,0.722137,-0.005690
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1166,Random Forest,41671,MNAR,0.50,a9,downstream_performance_mean,F1_macro,0.240619,0.0,0.240619,...,5.0,6.0,MNAR - 0.5,MNAR - 0.5 - 41671,0.000000,0.188638,6.0,-0.048110,0.220637,-0.019982
1167,KNN,41671,MNAR,0.50,a9,downstream_performance_mean,F1_macro,0.264992,0.0,0.265395,...,5.0,1.0,MNAR - 0.5,MNAR - 0.5 - 41671,0.024776,0.236748,3.0,0.000000,0.220637,-0.044758
1168,Mean/Mode,41671,MNAR,0.50,a9,downstream_performance_mean,F1_macro,0.239275,0.0,0.240780,...,5.0,4.0,MNAR - 0.5,MNAR - 0.5 - 41671,0.000160,0.212846,4.0,-0.023902,0.220637,-0.020142
1169,VAE,41671,MNAR,0.50,a9,downstream_performance_mean,F1_macro,0.241995,0.0,0.242421,...,5.0,3.0,MNAR - 0.5,MNAR - 0.5 - 41671,0.001801,0.264038,1.0,0.027290,0.220637,-0.021784


In [134]:
# Mean signed difference in F1 Score/RMSE between Baseline and Imputed
Average_Difference_Score = data['Performance Difference Baseline to Imputed'].mean()
print("Average Difference in Predicitve Performance between baseline and full, perturbed dataset", Average_Difference_Score)

# Mean of the absolute differences, computed on a copy so `data` keeps the signed values
data_temp_abs = data.assign(
    **{'Performance Difference Baseline to Imputed': data['Performance Difference Baseline to Imputed'].abs()}
)
Average_Difference_Score_abs = data_temp_abs['Performance Difference Baseline to Imputed'].mean()
print("Average Difference in Predicitve Performance between baseline and full, perturbed dataset as absolute Values", Average_Difference_Score_abs)


Average Difference in Predicitve Performance between baseline and full, perturbed dataset 0.011830800658146961
Average Difference in Predicitve Performance between baseline and full, perturbed dataset as absolute Values 0.022082752485504432
____________________




In [135]:
# Mean Baseline-to-Imputed difference over the rows whose method name contains "Random Forest".
is_random_forest = data["Imputation_Method"].str.contains("Random Forest")
improv_to_av_best = data[is_random_forest].copy()
improv_to_av_best_mean = improv_to_av_best['Performance Difference Baseline to Imputed'].mean()
print(improv_to_av_best_mean)

0.011562031358450044


Unnamed: 0,Imputation_Method,Task,Missing Type,Missing Fraction,Column,result_type,metric,Baseline_x,Corrupted,Imputed,...,NumberOfClasses,Downstream Performance Rank,Data_Constellation,Data_Constellation_full,Performance Difference to Average Best,Imputed_Subset,Downstream Performance Rank Subset,Performance Difference to Average Best Subset,Baseline,Performance Difference Baseline to Imputed
0,Random Forest,6,MAR,0.01,x-box,downstream_performance_mean,F1_macro,0.727330,0.0,0.727075,...,26.0,3.0,MAR - 0.01,MAR - 0.01 - 6,0.0,0.640391,4.0,-0.005210,0.722137,-0.004937
5,Random Forest,6,MAR,0.10,x-box,downstream_performance_mean,F1_macro,0.724750,0.0,0.725023,...,26.0,3.0,MAR - 0.1,MAR - 0.1 - 6,0.0,0.633935,2.0,0.004739,0.722137,-0.002886
9,Random Forest,6,MAR,0.30,x-box,downstream_performance_mean,F1_macro,0.722413,0.0,0.722508,...,26.0,3.0,MAR - 0.3,MAR - 0.3 - 6,0.0,0.640362,2.0,0.011533,0.722137,-0.000370
14,Random Forest,6,MAR,0.50,x-box,downstream_performance_mean,F1_macro,0.725631,0.0,0.726487,...,26.0,1.0,MAR - 0.5,MAR - 0.5 - 6,0.0,0.634458,5.0,-0.010654,0.722137,-0.004350
19,Random Forest,6,MCAR,0.01,x-box,downstream_performance_mean,F1_macro,0.726720,0.0,0.726623,...,26.0,2.0,MCAR - 0.01,MCAR - 0.01 - 6,0.0,0.636546,6.0,-0.001227,0.722137,-0.004486
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1146,Random Forest,41671,MCAR,0.50,a9,downstream_performance_mean,F1_macro,0.240619,0.0,0.240411,...,5.0,3.0,MCAR - 0.5,MCAR - 0.5 - 41671,0.0,0.196150,6.0,-0.042483,0.220637,-0.019774
1151,Random Forest,41671,MNAR,0.01,a9,downstream_performance_mean,F1_macro,0.240619,0.0,0.240619,...,5.0,2.0,MNAR - 0.01,MNAR - 0.01 - 41671,0.0,0.174025,5.0,-0.079189,0.220637,-0.019982
1156,Random Forest,41671,MNAR,0.10,a9,downstream_performance_mean,F1_macro,0.240619,0.0,0.240619,...,5.0,2.0,MNAR - 0.1,MNAR - 0.1 - 41671,0.0,0.174025,6.0,-0.042550,0.220637,-0.019982
1161,Random Forest,41671,MNAR,0.30,a9,downstream_performance_mean,F1_macro,0.240619,0.0,0.240619,...,5.0,3.0,MNAR - 0.3,MNAR - 0.3 - 41671,0.0,0.174025,6.0,-0.058748,0.220637,-0.019982


In [136]:
# Snapshot of the merged frame; the subset section restores `data` from it.
data_backup = data.copy()

# Copy with the missingness columns as strings, used for the quantile analysis below.
data_heatmaps = data.astype({'Missing Fraction': str, 'Missing Type': str})

In [137]:
# Heatmap with differences per Dataconstellation Baseline to full, perturbed dataset.
# One figure per data constellation: x = task, y = imputation method,
# colour = "Performance Difference Baseline to Imputed".
data_heat = data.copy()
data_heat = data_heat.astype({"Task":"string"})
data_constellations = ['MAR - 0.01', 'MAR - 0.1', 'MAR - 0.3', 'MAR - 0.5', 'MCAR - 0.01', 'MCAR - 0.1', 'MCAR - 0.3', 'MCAR - 0.5', 'MNAR - 0.01', 'MNAR - 0.1', 'MNAR - 0.3', 'MNAR - 0.5']

for i in data_constellations:
    data_constel = data_heat.loc[data_heat['Data_Constellation'] == i]

    ### uncomment whatever you want to investigate

    ## sort by amount datapoints (ascending)
    data_constel = data_constel.sort_values(by=['NumberOfInstances'])

    ## sort by amount of features (ascending)
    #data_constel = data_constel.sort_values(by=['NumberOfFeatures'])

    ## sort by amount of datapoints and features (ascending)
    #data_constel = data_constel.sort_values(by=['NumberOfInstances', 'NumberOfFeatures'])

    ## sort by amount of categorical features and datapoints (ascending)
    #data_constel = data_constel.sort_values(by=['NumberOfCategoricalFeatures', 'NumberOfInstances'])

    ## sort by amount of numerical features and datapoints (ascending)
    #data_constel = data_constel.sort_values(by=['NumberOfNumericFeatures', 'NumberOfInstances'])

    Dataset_number = data_constel["Task"]
    Imputation_Method = data_constel["Imputation_Method"]
    Improvement = data_constel["Performance Difference Baseline to Imputed"]

    # Diverging colour scale centred on 0 with a fixed range so all figures are comparable.
    trace = go.Heatmap(
        z=Improvement,
        x=Dataset_number,
        y=Imputation_Method,
        type='heatmap',
        autocolorscale=False,
        colorscale='RdBu_r',
        zmid=0,
        zmin=(-0.11),
        zmax=0.11,
    )
    # The trace list is passed directly: binding it to the name `data` would replace
    # the merged DataFrame `data` with a list and break this cell on re-run.
    fig = go.Figure(data=[trace])
    fig.update_layout(
        title=i,
        xaxis_nticks=36)
    fig.show()
    fig.write_image("multi_heatmap_f1_score_improvement_Baseline_to_Imputed%s.pdf" %i)

    # Positive value indicates, that Baseline is better than full, corrupted dataset score 

In [138]:
# Bin the rows by "Performance Difference Baseline to Imputed".
df_quantiles = data_heatmaps.copy()
score = df_quantiles["Performance Difference Baseline to Imputed"]

# Lowest bin: rows with a score above -0.09 are removed.
df_10 = df_quantiles.drop(index=score[score > (-0.09)].index)

# Inner bins: for each (lower, upper) pair, rows with score <= lower or score > upper are removed.
inner_edges = [-0.09, -0.07, -0.05, -0.03, -0.01, 0.01, 0.03, 0.05, 0.07, 0.09]
df_09, df_07, df_05, df_03, df_01, df01, df03, df05, df07 = [
    df_quantiles.drop(index=score[(score <= lower) | (score > upper)].index)
    for lower, upper in zip(inner_edges[:-1], inner_edges[1:])
]

# Highest bin: rows with a score of 0.09 or below are removed.
df09 = df_quantiles.drop(index=score[score <= (0.09)].index)

In [139]:
# Row count of every bin, ordered from the lowest to the highest bin.
(
    len_df_10, len_df_09, len_df_07, len_df_05, len_df_03, len_df_01,
    len_df01, len_df03, len_df05, len_df07, len_df09,
) = (
    len(bin_frame.index)
    for bin_frame in (df_10, df_09, df_07, df_05, df_03, df_01, df01, df03, df05, df07, df09)
)

quantile_freq = [
    len_df_10, len_df_09, len_df_07, len_df_05, len_df_03, len_df_01,
    len_df01, len_df03, len_df05, len_df07, len_df09,
]
print(quantile_freq)

# Bin labels in the same order as the counts.
quantiles = [
    'less than -0.09', '-0.09 to -0.07', '-0.07 to -0.05', '-0.05 to -0.03', '-0.03 to -0.01',
    '-0.01 to 0.01',
    '0.01 to 0.03', '0.03 to 0.05', '0.05 to 0.07', '0.07 to 0.09', 'more than 0.09',
]
print(quantiles)

# Two-column table (bin label, row count) consumed by the bar chart.
improvement_quantiles = pd.DataFrame(
    {
        'Performance Difference Baseline to Imputed': quantiles,
        'Amount': quantile_freq,
    }
)


[0, 2, 14, 44, 136, 434, 265, 167, 80, 26, 3]
['less than -0.09', '-0.09 to -0.07', '-0.07 to -0.05', '-0.05 to -0.03', '-0.03 to -0.01', '-0.01 to 0.01', '0.01 to 0.03', '0.03 to 0.05', '0.05 to 0.07', '0.07 to 0.09', 'more than 0.09']


In [140]:
# Bar chart: number of rows per difference bin; shown inline and exported as PDF.
fig = px.bar(
    improvement_quantiles,
    x='Performance Difference Baseline to Imputed',
    y='Amount',
)
fig.show()
fig.write_image("multi_performance_difference_baseline_to_imputed.pdf")

In [142]:
# split barchart stacks into methods:
# for every imputation method, count its rows in each difference bin.

quantile_datasets = [df_10, df_09, df_07, df_05, df_03, df_01, df01, df03, df05, df07, df09]

methods = ['Random Forest', 'KNN', 'Mean/Mode', 'VAE', 'GAIN', 'Discriminative DL']
#methods.remove(AVERAGE_BEST_IMPUTATION_METHOD)
print(methods)

# Exact label comparison is used instead of str.contains, which treats the label
# as a regular expression and also matches it as a substring of other labels.
method_freqs = {
    method: [
        int((bin_frame['Imputation_Method'] == method).sum())
        for bin_frame in quantile_datasets
    ]
    for method in methods
}

forest_freq = method_freqs['Random Forest']
knn_freq = method_freqs['KNN']
mode_freq = method_freqs['Mean/Mode']
dl_freq = method_freqs['Discriminative DL']
vae_freq = method_freqs['VAE']
gain_freq = method_freqs['GAIN']

print(forest_freq)
print(knn_freq)
print(mode_freq)
print(dl_freq)
print(vae_freq)
print(gain_freq)

['Random Forest', 'KNN', 'Mean/Mode', 'VAE', 'GAIN', 'Discriminative DL']
[0, 1, 3, 5, 29, 71, 48, 33, 11, 3, 0]
[0, 0, 2, 9, 28, 80, 37, 29, 14, 4, 1]
[0, 0, 4, 12, 24, 78, 47, 23, 11, 5, 0]
[0, 0, 0, 3, 25, 90, 43, 18, 15, 3, 0]
[0, 1, 2, 8, 22, 65, 46, 35, 18, 5, 2]
[0, 0, 3, 7, 8, 50, 44, 29, 11, 6, 0]


In [143]:
# Bin labels for the x-axis, ordered from the lowest to the highest bin.
quantiles = [
    'less than -0.09', '-0.09 to -0.07', '-0.07 to -0.05', '-0.05 to -0.03', '-0.03 to -0.01',
    '-0.01 to 0.01',
    '0.01 to 0.03', '0.03 to 0.05', '0.05 to 0.07', '0.07 to 0.09', 'more than 0.09',
]

# (legend name, per-bin counts) in stacking order.
method_series = [
    ('Random Forest', forest_freq),
    ('KNN', knn_freq),
    ('Mean/Mode', mode_freq),
    ('Discriminative DL', dl_freq),
    ('VAE', vae_freq),
    ('GAIN', gain_freq),
]

fig = go.Figure(
    data=[go.Bar(name=label, x=quantiles, y=counts) for label, counts in method_series]
)
# Stack the methods on top of each other within every bin.
fig.update_layout(barmode='stack')
fig.show()
fig.write_image("multi_performance_difference_baseline_to_imputed_per_method.pdf")

In [144]:
# split barchart stacks into missing fractions:
# for every missing fraction, count its rows in each difference bin.

quantile_datasets = [df_10, df_09, df_07, df_05, df_03, df_01, df01, df03, df05, df07, df09]

fractions = ['0.01', '0.1', '0.3', '0.5']
print(fractions)

# Exact comparison on the string form of the fraction. str.contains would treat
# '0.1' as a regular expression ('.' matches any character) and as a substring,
# so it only counts correctly while no other fraction happens to match.
fraction_freqs = {
    fraction: [
        int(bin_frame['Missing Fraction'].astype(str).eq(fraction).sum())
        for bin_frame in quantile_datasets
    ]
    for fraction in fractions
}

freq_001 = fraction_freqs['0.01']
freq_01 = fraction_freqs['0.1']
freq_03 = fraction_freqs['0.3']
freq_05 = fraction_freqs['0.5']

print(freq_001)
print(freq_01)
print(freq_03)
print(freq_05)

['0.01', '0.1', '0.3', '0.5']
[0, 0, 2, 5, 37, 155, 49, 38, 4, 0, 1]
[0, 1, 5, 13, 31, 113, 72, 32, 18, 7, 0]
[0, 1, 5, 6, 34, 88, 81, 50, 23, 5, 1]
[0, 0, 2, 20, 34, 78, 63, 47, 35, 14, 1]


In [145]:
# Bin labels for the x-axis, ordered from the lowest to the highest bin.
quantiles = [
    'less than -0.09', '-0.09 to -0.07', '-0.07 to -0.05', '-0.05 to -0.03', '-0.03 to -0.01',
    '-0.01 to 0.01',
    '0.01 to 0.03', '0.03 to 0.05', '0.05 to 0.07', '0.07 to 0.09', 'more than 0.09',
]

# (legend name, per-bin counts, bar colour) in stacking order.
fraction_series = [
    ('1% Missing Data', freq_001, '#FD3216'),
    ('10% Missing Data', freq_01, '#00FE35'),
    ('30% Missing Data', freq_03, '#511CFB'),
    ('50% Missing Data', freq_05, '#FF7F0E'),
]

fig = go.Figure(
    data=[
        go.Bar(name=label, x=quantiles, y=counts, marker_color=colour)
        for label, counts, colour in fraction_series
    ]
)
# Stack the fractions on top of each other within every bin.
fig.update_layout(barmode='stack')
fig.show()
fig.write_image("multi_performance_difference_baseline_to_imputed_per_frac.pdf")

In [146]:
# split barchart stacks into missingness patterns:
# for every missingness pattern, count its rows in each difference bin.

quantile_datasets = [df_10, df_09, df_07, df_05, df_03, df_01, df01, df03, df05, df07, df09]

# NOTE: the names `fractions` and freq_001 / freq_01 / freq_03 are reused here for
# missingness patterns; they are kept because the plotting cell after this one reads them.
fractions = ['MCAR', 'MAR', 'MNAR']
print(fractions)

# Exact label comparison instead of str.contains (regex + substring matching).
pattern_freqs = {
    pattern: [
        int(bin_frame['Missing Type'].astype(str).eq(pattern).sum())
        for bin_frame in quantile_datasets
    ]
    for pattern in fractions
}

freq_001 = pattern_freqs['MCAR']  # MCAR counts
freq_01 = pattern_freqs['MAR']    # MAR counts
freq_03 = pattern_freqs['MNAR']   # MNAR counts
freq_05 = []                      # reset as in the original cell; not filled for patterns

print(freq_001)
print(freq_01)
print(freq_03)


['MCAR', 'MAR', 'MNAR']
[0, 1, 6, 13, 46, 145, 83, 63, 22, 9, 1]
[0, 0, 4, 18, 44, 138, 92, 56, 27, 11, 0]
[0, 1, 4, 13, 46, 151, 90, 48, 31, 6, 2]


In [147]:
# Bin labels for the x-axis, ordered from the lowest to the highest bin.
quantiles = [
    'less than -0.09', '-0.09 to -0.07', '-0.07 to -0.05', '-0.05 to -0.03', '-0.03 to -0.01',
    '-0.01 to 0.01',
    '0.01 to 0.03', '0.03 to 0.05', '0.05 to 0.07', '0.07 to 0.09', 'more than 0.09',
]

# (legend name, per-bin counts, bar colour) in stacking order.
# freq_001 / freq_01 / freq_03 are plotted as the MCAR / MAR / MNAR series.
pattern_series = [
    ('MCAR', freq_001, '#222A2A'),
    ('MAR', freq_01, '#B68100'),
    ('MNAR', freq_03, '#750D86'),
]

fig = go.Figure(
    data=[
        go.Bar(name=label, x=quantiles, y=counts, marker_color=colour)
        for label, counts, colour in pattern_series
    ]
)
# Stack the patterns on top of each other within every bin.
fig.update_layout(barmode='stack')
fig.show()
fig.write_image("multi_performance_difference_baseline_to_imputed_per_patt.pdf")

## Subset Comparison

In [148]:
# Subset Comparison Analysis here
# Start again from the snapshot of the merged frame.
data = data_backup.copy()
# Difference in F1 Score/RMSE between Imputed (full, perturbed dataset) and Imputed_Subset
# for each data constellation -> saved in a new column
data['Performance Difference Imputed to Subset'] = (data['Imputed']) - (data['Imputed_Subset'])

# Average signed difference over all rows
Average_Difference_Score = data['Performance Difference Imputed to Subset'].mean()
print("Average Difference in Predicitve Performance between full, perturbed dataset and subset", Average_Difference_Score)
# Average of the absolute differences, computed on a copy so `data` keeps the signed values
data_temp_abs = data.copy()
data_temp_abs['Performance Difference Imputed to Subset'] = data_temp_abs['Performance Difference Imputed to Subset'].abs()

Average_Difference_Score_abs = data_temp_abs['Performance Difference Imputed to Subset'].mean()
print("Average Difference in Predicitve Performance between full, perturbed dataset and subset as absolute Values", Average_Difference_Score_abs)
print('____________________')
print('\n')

# In the three loops below every group is taken as an explicit .copy() of the
# selection. Writing the absolute values into a bare .loc slice is what raised
# pandas' SettingWithCopyWarning.

# Per imputation method: average signed and absolute difference
methods = ['Random Forest', 'KNN', 'Mean/Mode', 'VAE', 'GAIN', 'Discriminative DL'] 
for i in methods:
    data_method = data.loc[data['Imputation_Method'] == i].copy()
    Average_Difference_Score = data_method['Performance Difference Imputed to Subset'].mean()
    print(i, "Average Difference in Predicitve Performance between full, perturbed dataset and subset", Average_Difference_Score)

    data_method['Performance Difference Imputed to Subset'] = data_method['Performance Difference Imputed to Subset'].abs()
    Average_Difference_Score_abs = data_method['Performance Difference Imputed to Subset'].mean()
    print(i, "Average Difference in Predicitve Performance between full, perturbed dataset and subset as absolute Values", Average_Difference_Score_abs)
    print('____________________')
    print('\n')

# Per missingness pattern: average signed and absolute difference
patterns = ['MCAR', 'MAR', 'MNAR']
for i in patterns:
    data_patterns = data.loc[data['Missing Type'] == i].copy()
    Average_Difference_Score = data_patterns['Performance Difference Imputed to Subset'].mean()
    print(i, "Average Difference in Predicitve Performance between full, perturbed dataset and subset", Average_Difference_Score)

    data_patterns['Performance Difference Imputed to Subset'] = data_patterns['Performance Difference Imputed to Subset'].abs()
    Average_Difference_Score_abs = data_patterns['Performance Difference Imputed to Subset'].mean()
    print(i, "Average Difference in Predicitve Performance between full, perturbed dataset and subset as absolute Values", Average_Difference_Score_abs)
    print('____________________')
    print('\n')

# Per missingness fraction: average signed and absolute difference
# ('Missing Fraction' is compared against float values here.)
print("_______________________________________________________________________________")
fractions = [0.01, 0.1, 0.3, 0.5]
for i in fractions:
    data_fractions = data.loc[data['Missing Fraction'] == i].copy()
    Average_Difference_Score = data_fractions['Performance Difference Imputed to Subset'].mean()
    print(i, "Average Difference in Predicitve Performance between full, perturbed dataset and subset", Average_Difference_Score)

    data_fractions['Performance Difference Imputed to Subset'] = data_fractions['Performance Difference Imputed to Subset'].abs()
    Average_Difference_Score_abs = data_fractions['Performance Difference Imputed to Subset'].mean()
    print(i, "Average Difference in Predicitve Performance between full, perturbed dataset and subset as absolute Values", Average_Difference_Score_abs)
    print('____________________')
    print('\n')





Average Difference in Predicitve Performance between full, perturbed dataset and subset 0.0028001692615625996
Average Difference in Predicitve Performance between full, perturbed dataset and subset as absolute Values 0.0808616578796111
____________________


Random Forest Average Difference in Predicitve Performance between full, perturbed dataset and subset -0.00024172282591256958
Random Forest Average Difference in Predicitve Performance between full, perturbed dataset and subset as absolute Values 0.08264515702272031
____________________


KNN Average Difference in Predicitve Performance between full, perturbed dataset and subset -0.003991010716785635
KNN Average Difference in Predicitve Performance between full, perturbed dataset and subset as absolute Values 0.08447891064320223
____________________


Mean/Mode Average Difference in Predicitve Performance between full, perturbed dataset and subset 0.0012313370742273455
Mean/Mode Average Difference in Predicitve Performance between 



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [149]:
# Copy of `data` with the missingness columns as strings, used for the quantile analysis below.
data_heatmaps = data.astype({'Missing Fraction': str, 'Missing Type': str})

In [151]:
# Heatmap with differences per Dataconstellation relative to full, perturbed dataset.
# One figure per data constellation: x = task, y = imputation method,
# colour = "Performance Difference Imputed to Subset".
data_heat = data.copy()
data_heat = data_heat.astype({"Task":"string"})
data_constellations = ['MAR - 0.01', 'MAR - 0.1', 'MAR - 0.3', 'MAR - 0.5', 'MCAR - 0.01', 'MCAR - 0.1', 'MCAR - 0.3', 'MCAR - 0.5', 'MNAR - 0.01', 'MNAR - 0.1', 'MNAR - 0.3', 'MNAR - 0.5']

for i in data_constellations:
    data_constel = data_heat.loc[data_heat['Data_Constellation'] == i]

    ### uncomment whatever you want to investigate

    ## sort by amount datapoints (ascending)
    data_constel = data_constel.sort_values(by=['NumberOfInstances'])

    ## sort by amount of features (ascending)
    #data_constel = data_constel.sort_values(by=['NumberOfFeatures'])

    ## sort by amount of datapoints and features (ascending)
    #data_constel = data_constel.sort_values(by=['NumberOfInstances', 'NumberOfFeatures'])

    ## sort by amount of categorical features and datapoints (ascending)
    #data_constel = data_constel.sort_values(by=['NumberOfCategoricalFeatures', 'NumberOfInstances'])

    ## sort by amount of numerical features and datapoints (ascending)
    #data_constel = data_constel.sort_values(by=['NumberOfNumericFeatures', 'NumberOfInstances'])

    Dataset_number = data_constel["Task"]
    Imputation_Method = data_constel["Imputation_Method"]
    Improvement = data_constel["Performance Difference Imputed to Subset"]

    # Diverging colour scale centred on 0 with a fixed range so all figures are comparable.
    trace = go.Heatmap(
        z=Improvement,
        x=Dataset_number,
        y=Imputation_Method,
        type='heatmap',
        autocolorscale=False,
        colorscale='RdBu_r',
        zmid=0,
        zmin=(-0.11),
        zmax=0.11,
    )
    # The trace list is passed directly: binding it to the name `data` would replace
    # the merged DataFrame `data` with a list and break this cell on re-run.
    fig = go.Figure(data=[trace])
    fig.update_layout(
        title=i,
        xaxis_nticks=36)
    fig.show()
    fig.write_image("multi_heatmap_f1_score_improvement_Imputed_to_Subset%s.pdf" %i)
    # Positive value indicates, that full, corrupted dataset score is better than the score for the subset

In [152]:
# Bin every experiment by its "Performance Difference Imputed to Subset" value.
#
# Bins are right-closed intervals (lower, upper] with open-ended outer bins:
#   df_10: <= -0.09      df_09: (-0.09, -0.07]   df_07: (-0.07, -0.05]
#   df_05: (-0.05, -0.03] df_03: (-0.03, -0.01]  df_01: (-0.01, 0.01]
#   df01:  (0.01, 0.03]  df03:  (0.03, 0.05]     df05:  (0.05, 0.07]
#   df07:  (0.07, 0.09]  df09:  > 0.09
df_quantiles = data_heatmaps.copy()

#df_quantiles = df_quantiles.drop(df_quantiles[df_quantiles["Imputation_Method"] == AVERAGE_BEST_IMPUTATION_METHOD].index)


def _rows_in_bin(frame, lower=None, upper=None, column="Performance Difference Imputed to Subset"):
    """Return the rows of `frame` whose `column` value lies in (lower, upper].

    `lower=None` / `upper=None` leave that side of the interval unbounded.

    Rows are selected with a positive boolean mask rather than by dropping
    the index labels of the non-matching rows. This way
    * rows whose value is NaN belong to no bin (with the drop-based approach
      they fail every comparison, are never dropped and end up in *all* bins),
    * duplicate index labels cannot remove rows that are actually in the bin.
    The original row order is preserved.
    """
    values = frame[column]
    in_bin = values.notna()
    if lower is not None:
        in_bin = in_bin & (values > lower)
    if upper is not None:
        in_bin = in_bin & (values <= upper)
    return frame.loc[in_bin].copy()


df_10 = _rows_in_bin(df_quantiles, upper=-0.09)
df_09 = _rows_in_bin(df_quantiles, -0.09, -0.07)
df_07 = _rows_in_bin(df_quantiles, -0.07, -0.05)
df_05 = _rows_in_bin(df_quantiles, -0.05, -0.03)
df_03 = _rows_in_bin(df_quantiles, -0.03, -0.01)
df_01 = _rows_in_bin(df_quantiles, -0.01, 0.01)
df01 = _rows_in_bin(df_quantiles, 0.01, 0.03)
df03 = _rows_in_bin(df_quantiles, 0.03, 0.05)
df05 = _rows_in_bin(df_quantiles, 0.05, 0.07)
df07 = _rows_in_bin(df_quantiles, 0.07, 0.09)
df09 = _rows_in_bin(df_quantiles, lower=0.09)

In [153]:
# Row count of every performance-difference bin, in ascending bin order.
(
    len_df_10, len_df_09, len_df_07, len_df_05, len_df_03, len_df_01,
    len_df01, len_df03, len_df05, len_df07, len_df09,
) = (
    len(bin_frame.index)
    for bin_frame in (df_10, df_09, df_07, df_05, df_03, df_01, df01, df03, df05, df07, df09)
)

quantile_freq = [
    len_df_10, len_df_09, len_df_07, len_df_05, len_df_03, len_df_01,
    len_df01, len_df03, len_df05, len_df07, len_df09,
]
print(quantile_freq)

# Human-readable label of each bin, aligned position-by-position with quantile_freq.
quantiles = [
    'less than -0.09',
    '-0.09 to -0.07',
    '-0.07 to -0.05',
    '-0.05 to -0.03',
    '-0.03 to -0.01',
    '-0.01 to 0.01',
    '0.01 to 0.03',
    '0.03 to 0.05',
    '0.05 to 0.07',
    '0.07 to 0.09',
    'more than 0.09',
]
print(quantiles)

# Two-column summary table: bin label and number of experiments in that bin.
improvement_quantiles = pd.DataFrame({
    'Performance Difference Imputed to Subset': quantiles,
    'Amount': quantile_freq,
})


[212, 19, 22, 67, 87, 156, 148, 161, 97, 79, 123]
['less than -0.09', '-0.09 to -0.07', '-0.07 to -0.05', '-0.05 to -0.03', '-0.03 to -0.01', '-0.01 to 0.01', '0.01 to 0.03', '0.03 to 0.05', '0.05 to 0.07', '0.07 to 0.09', 'more than 0.09']


In [154]:
# Bar chart of the number of experiments per performance-difference bin,
# shown inline and exported as PDF.
fig = px.bar(
    data_frame=improvement_quantiles,
    x='Performance Difference Imputed to Subset',
    y='Amount',
)
fig.show()
fig.write_image("multi_performance_difference_imputed_to_subset.pdf")
# 1171 -- presumably the total number of experiments over all bins in the recorded run

In [155]:
# Split the bar chart stacks into methods: for every imputation method, count
# how many experiments fall into each performance-difference bin.

# Bins in ascending order of "Performance Difference Imputed to Subset".
quantile_datasets = [df_10, df_09, df_07, df_05, df_03, df_01, df01, df03, df05, df07, df09]

methods = ['Random Forest', 'KNN', 'Mean/Mode', 'VAE', 'GAIN', 'Discriminative DL']
#methods.remove(AVERAGE_BEST_IMPUTATION_METHOD)
print(methods)

# Compare the method label exactly (as the rank-comparison cells do) instead of
# using `.str.contains`, which interprets the label as a regular expression and
# matches substrings. `int(...)` keeps the counts plain Python ints so the
# printed lists look the same regardless of the numpy version.
method_bin_counts = {
    method: [
        int((bin_frame['Imputation_Method'] == method).sum())
        for bin_frame in quantile_datasets
    ]
    for method in methods
}

# One list per method, aligned position-by-position with quantile_datasets.
forest_freq = method_bin_counts['Random Forest']
knn_freq = method_bin_counts['KNN']
mode_freq = method_bin_counts['Mean/Mode']
dl_freq = method_bin_counts['Discriminative DL']
vae_freq = method_bin_counts['VAE']
gain_freq = method_bin_counts['GAIN']

print(forest_freq)
print(knn_freq)
print(mode_freq)
print(dl_freq)
print(vae_freq)
print(gain_freq)

['Random Forest', 'KNN', 'Mean/Mode', 'VAE', 'GAIN', 'Discriminative DL']
[37, 5, 7, 14, 12, 25, 24, 27, 19, 15, 19]
[39, 3, 2, 15, 15, 31, 23, 25, 15, 17, 19]
[38, 4, 5, 11, 15, 26, 28, 27, 17, 12, 21]
[37, 2, 3, 13, 9, 30, 26, 31, 14, 10, 22]
[34, 2, 2, 8, 22, 22, 26, 31, 16, 18, 23]
[27, 3, 3, 6, 14, 22, 21, 20, 16, 7, 19]


In [156]:
quantiles = ['less than -0.09', '-0.09 to -0.07', '-0.07 to -0.05', '-0.05 to -0.03','-0.03 to -0.01', '-0.01 to 0.01', '0.01 to 0.03', '0.03 to 0.05', '0.05 to 0.07', '0.07 to 0.09', 'more than 0.09']

# (legend label, per-bin counts) for every imputation method, in trace order.
method_series = [
    ('Random Forest', forest_freq),
    ('KNN', knn_freq),
    ('Mean/Mode', mode_freq),
    ('Discriminative DL', dl_freq),
    ('VAE', vae_freq),
    ('GAIN', gain_freq),
]

fig = go.Figure(data=[
    go.Bar(name=label, x=quantiles, y=counts)
    for label, counts in method_series
])
# Stack the per-method bars on top of each other instead of grouping them.
fig.update_layout(barmode='stack')
fig.show()
fig.write_image("multi_performance_difference_imputed_to_subset_per_method.pdf")

In [157]:
# Split the bar chart stacks by missing-data fraction: for every fraction,
# count how many experiments fall into each performance-difference bin.

# Bins in ascending order of "Performance Difference Imputed to Subset".
quantile_datasets = [df_10, df_09, df_07, df_05, df_03, df_01, df01, df03, df05, df07, df09]

# Fraction labels as strings -- assumes "Missing Fraction" holds string labels
# (the previous `.str` based filter required that as well).
fractions = ['0.01', '0.1', '0.3', '0.5']

print(fractions)

# Exact label comparison. `.str.contains('0.1')` treats the pattern as a regular
# expression, so the '.' matches any character and the label is matched as a
# substring; both can silently count rows of a different fraction.
# `int(...)` keeps the counts plain Python ints for stable printing.
fraction_bin_counts = {
    fraction: [
        int((bin_frame['Missing Fraction'] == fraction).sum())
        for bin_frame in quantile_datasets
    ]
    for fraction in fractions
}

# One list per fraction, aligned position-by-position with quantile_datasets.
freq_001 = fraction_bin_counts['0.01']
freq_01 = fraction_bin_counts['0.1']
freq_03 = fraction_bin_counts['0.3']
freq_05 = fraction_bin_counts['0.5']

print(freq_001)
print(freq_01)
print(freq_03)
print(freq_05)

['0.01', '0.1', '0.3', '0.5']
[58, 4, 2, 15, 21, 48, 31, 34, 24, 24, 30]
[56, 4, 6, 22, 17, 36, 42, 43, 19, 19, 28]
[53, 8, 7, 12, 22, 36, 43, 46, 25, 13, 29]
[45, 3, 7, 18, 27, 36, 32, 38, 29, 23, 36]


In [158]:
quantiles = ['less than -0.09', '-0.09 to -0.07', '-0.07 to -0.05', '-0.05 to -0.03','-0.03 to -0.01', '-0.01 to 0.01', '0.01 to 0.03', '0.03 to 0.05', '0.05 to 0.07', '0.07 to 0.09', 'more than 0.09']

# (legend label, per-bin counts, bar colour) for every missing-data fraction, in trace order.
fraction_series = [
    ('1% Missing Data', freq_001, '#FD3216'),
    ('10% Missing Data', freq_01, '#00FE35'),
    ('30% Missing Data', freq_03, '#511CFB'),
    ('50% Missing Data', freq_05, '#FF7F0E'),
]

fig = go.Figure(data=[
    go.Bar(name=label, x=quantiles, y=counts, marker_color=colour)
    for label, counts, colour in fraction_series
])
# Stack the per-fraction bars on top of each other instead of grouping them.
fig.update_layout(barmode='stack')
fig.show()
fig.write_image("multi_performance_difference_imputed_to_subset_per_frac.pdf")

In [159]:
# Split the bar chart stacks by missingness pattern: for every pattern, count
# how many experiments fall into each performance-difference bin.
#
# NOTE: the variable names below are inherited from the per-fraction cell and
# are kept because the plotting cell that follows reads them:
#   fractions -> list of missingness patterns
#   freq_001  -> counts for MCAR
#   freq_01   -> counts for MAR
#   freq_03   -> counts for MNAR
#   freq_05   -> reset to an empty list, not used for patterns

# Bins in ascending order of "Performance Difference Imputed to Subset".
quantile_datasets = [df_10, df_09, df_07, df_05, df_03, df_01, df01, df03, df05, df07, df09]

fractions = ['MCAR', 'MAR', 'MNAR']
print(fractions)

# Exact label comparison instead of `.str.contains`, which interprets the label
# as a regular expression and matches substrings. `int(...)` keeps the counts
# plain Python ints for stable printing.
pattern_bin_counts = {
    pattern: [
        int((bin_frame['Missing Type'] == pattern).sum())
        for bin_frame in quantile_datasets
    ]
    for pattern in fractions
}

# One list per pattern, aligned position-by-position with quantile_datasets.
freq_001 = pattern_bin_counts['MCAR']
freq_01 = pattern_bin_counts['MAR']
freq_03 = pattern_bin_counts['MNAR']
freq_05 = []

print(freq_001)
print(freq_01)
print(freq_03)


['MCAR', 'MAR', 'MNAR']
[72, 5, 9, 26, 31, 51, 46, 53, 30, 24, 42]
[70, 8, 9, 21, 27, 53, 50, 56, 25, 34, 37]
[70, 6, 4, 20, 29, 52, 52, 52, 42, 21, 44]


In [160]:
quantiles = ['less than -0.09', '-0.09 to -0.07', '-0.07 to -0.05', '-0.05 to -0.03','-0.03 to -0.01', '-0.01 to 0.01', '0.01 to 0.03', '0.03 to 0.05', '0.05 to 0.07', '0.07 to 0.09', 'more than 0.09']

# (legend label, per-bin counts, bar colour) for every missingness pattern, in trace order.
# The count lists reuse the freq_* names filled by the per-pattern counting cell.
pattern_series = [
    ('MCAR', freq_001, '#222A2A'),
    ('MAR', freq_01, '#B68100'),
    ('MNAR', freq_03, '#750D86'),
]

fig = go.Figure(data=[
    go.Bar(name=label, x=quantiles, y=counts, marker_color=colour)
    for label, counts, colour in pattern_series
])
# Stack the per-pattern bars on top of each other instead of grouping them.
fig.update_layout(barmode='stack')
fig.show()
fig.write_image("multi_performance_difference_imputed_to_subset_per_patt.pdf")

## Comparison based on Rank

In [161]:

# Independent (deep) copy of the heatmap data as the starting point for the
# rank-based comparison; the bare expression below displays it for inspection.
subset_rank_acc = data_heatmaps.copy(deep=True)
subset_rank_acc


Unnamed: 0,Imputation_Method,Task,Missing Type,Missing Fraction,Column,result_type,metric,Baseline_x,Corrupted,Imputed,...,Downstream Performance Rank,Data_Constellation,Data_Constellation_full,Performance Difference to Average Best,Imputed_Subset,Downstream Performance Rank Subset,Performance Difference to Average Best Subset,Baseline,Performance Difference Baseline to Imputed,Performance Difference Imputed to Subset
0,Random Forest,6,MAR,0.01,x-box,downstream_performance_mean,F1_macro,0.727330,0.0,0.727075,...,3.0,MAR - 0.01,MAR - 0.01 - 6,0.000000,0.640391,4.0,-0.005210,0.722137,-0.004937,0.086684
1,KNN,6,MAR,0.01,x-box,downstream_performance_mean,F1_macro,0.727724,0.0,0.727724,...,2.0,MAR - 0.01,MAR - 0.01 - 6,0.000649,0.645601,1.0,0.000000,0.722137,-0.005587,0.082123
2,Mean/Mode,6,MAR,0.01,x-box,downstream_performance_mean,F1_macro,0.725643,0.0,0.725766,...,5.0,MAR - 0.01,MAR - 0.01 - 6,-0.001309,0.639284,5.0,-0.006317,0.722137,-0.003628,0.086482
3,VAE,6,MAR,0.01,x-box,downstream_performance_mean,F1_macro,0.725821,0.0,0.725778,...,4.0,MAR - 0.01,MAR - 0.01 - 6,-0.001296,0.642368,2.0,-0.003233,0.722137,-0.003641,0.083410
4,Discriminative DL,6,MAR,0.01,x-box,downstream_performance_mean,F1_macro,0.727469,0.0,0.727828,...,1.0,MAR - 0.01,MAR - 0.01 - 6,0.000753,0.637839,6.0,-0.007762,0.722137,-0.005690,0.089989
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1166,Random Forest,41671,MNAR,0.5,a9,downstream_performance_mean,F1_macro,0.240619,0.0,0.240619,...,6.0,MNAR - 0.5,MNAR - 0.5 - 41671,0.000000,0.188638,6.0,-0.048110,0.220637,-0.019982,0.051981
1167,KNN,41671,MNAR,0.5,a9,downstream_performance_mean,F1_macro,0.264992,0.0,0.265395,...,1.0,MNAR - 0.5,MNAR - 0.5 - 41671,0.024776,0.236748,3.0,0.000000,0.220637,-0.044758,0.028647
1168,Mean/Mode,41671,MNAR,0.5,a9,downstream_performance_mean,F1_macro,0.239275,0.0,0.240780,...,4.0,MNAR - 0.5,MNAR - 0.5 - 41671,0.000160,0.212846,4.0,-0.023902,0.220637,-0.020142,0.027934
1169,VAE,41671,MNAR,0.5,a9,downstream_performance_mean,F1_macro,0.241995,0.0,0.242421,...,3.0,MNAR - 0.5,MNAR - 0.5 - 41671,0.001801,0.264038,1.0,0.027290,0.220637,-0.021784,-0.021617


In [162]:
# Keep only the rows with "Downstream Performance Rank" == 1.
is_rank_one = subset_rank_acc['Downstream Performance Rank'] == 1.0
subset_rank_acc = subset_rank_acc.loc[is_rank_one]
print(len(subset_rank_acc))

# Of those, separate the rows that are also rank 1 on the subset ("right")
# from all remaining rows ("wrong"; this includes rows without a subset rank).
is_rank_one_on_subset = subset_rank_acc['Downstream Performance Rank Subset'] == 1.0

subset_rank_acc_right = subset_rank_acc.loc[is_rank_one_on_subset]
print(len(subset_rank_acc_right))
#subset_rank_acc_right.to_csv('subset_rank_acc_right.csv')

subset_rank_acc_wrong = subset_rank_acc.loc[~is_rank_one_on_subset]
print(len(subset_rank_acc_wrong))
#subset_rank_acc_wrong.to_csv('subset_rank_acc_wrong.csv')

199
39
160


In [163]:
# Number of rows in subset_rank_acc, followed by the mean of their
# "Performance Difference to Average Best Subset" values.
rank_one_count = len(subset_rank_acc)
print(rank_one_count)

subset_diff_to_best = subset_rank_acc['Performance Difference to Average Best Subset']
subset_rank_acc_mean_diff = subset_diff_to_best.mean()
print(subset_rank_acc_mean_diff)

199
-0.002475135852845068


In [164]:
methods = ['Random Forest', 'KNN', 'Mean/Mode', 'VAE', 'GAIN', 'Discriminative DL']
#methods.remove(AVERAGE_BEST_IMPUTATION_METHOD)
print(methods)

# Per method: number of rows in subset_rank_acc_right, stored as a
# one-element list.
right_method_column = subset_rank_acc_right['Imputation_Method']
right_counts = {
    method: [len(subset_rank_acc_right.loc[right_method_column == method].index)]
    for method in methods
}

forest_freq = right_counts['Random Forest']
knn_freq = right_counts['KNN']
mode_freq = right_counts['Mean/Mode']
dl_freq = right_counts['Discriminative DL']
vae_freq = right_counts['VAE']
gain_freq = right_counts['GAIN']

print("Subset with same predicition as full, corrupted")
for counts, label in (
    (forest_freq, 'Random Forest'),
    (knn_freq, 'KNN'),
    (mode_freq, 'Mode'),
    (dl_freq, 'DL'),
    (vae_freq, 'VAE'),
    (gain_freq, 'GAIN'),
):
    print(counts, label)

['Random Forest', 'KNN', 'Mean/Mode', 'VAE', 'GAIN', 'Discriminative DL']
Subset with same predicition as full, corrupted
[6] Random Forest
[9] KNN
[7] Mode
[5] DL
[7] VAE
[5] GAIN


In [165]:
methods = ['Random Forest', 'KNN', 'Mean/Mode', 'VAE', 'GAIN', 'Discriminative DL']
#methods.remove(AVERAGE_BEST_IMPUTATION_METHOD)
print(methods)

# Per method: number of rows in subset_rank_acc_wrong, stored as a
# one-element list.
wrong_method_column = subset_rank_acc_wrong['Imputation_Method']
wrong_counts = {
    method: [len(subset_rank_acc_wrong.loc[wrong_method_column == method].index)]
    for method in methods
}

forest_freq = wrong_counts['Random Forest']
knn_freq = wrong_counts['KNN']
mode_freq = wrong_counts['Mean/Mode']
dl_freq = wrong_counts['Discriminative DL']
vae_freq = wrong_counts['VAE']
gain_freq = wrong_counts['GAIN']

print("Subset with different predicition than full, corrupted")
for counts, label in (
    (forest_freq, 'Random Forest'),
    (knn_freq, 'KNN'),
    (mode_freq, 'Mode'),
    (dl_freq, 'DL'),
    (vae_freq, 'VAE'),
    (gain_freq, 'GAIN'),
):
    print(counts, label)

['Random Forest', 'KNN', 'Mean/Mode', 'VAE', 'GAIN', 'Discriminative DL']
Subset with different predicition than full, corrupted
[28] Random Forest
[27] KNN
[28] Mode
[18] DL
[34] VAE
[25] GAIN
