# Regression - Comparisons Imputed - Baseline - Subset

This notebook aims to compare the performance of the imputation experiments with the baseline (training on fully observed data) as well as the imputation performance on the complete, perturbed dataset compared to the performance on the subset of the dataset.


In [174]:
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import os
import pandas as pd
import re
import seaborn as sns

from pathlib import Path

import plotly as py
import plotly.express as px
import plotly.graph_objects as go
import xarray as xr

In [175]:
# import required datasets

# dataset for Baseline performance (full info)

CLF_METRIC = "Classification Tasks"
REG_METRIC = "Regression Tasks"

DOWNSTREAM_RESULT_TYPE = "downstream_performance_mean"
IMPUTE_RESULT_TYPE = "impute_performance_mean"

FIGURES_PATH = Path(f"../paper/figures/")

# Create new column for all datasets -> Data_Constellation_full (Task, Pattern, Fraction) -> only for Baseline required
baseline = pd.read_csv('regression_fixed_seed.csv')

na_impute_results = baseline[
    (baseline["result_type"] == IMPUTE_RESULT_TYPE) & 
    (baseline["metric"].isin(["F1_macro", "RMSE"]))
]
na_impute_results.drop(["baseline", "corrupted", "imputed"], axis=1, inplace=True)
na_impute_results = na_impute_results[na_impute_results.isna().any(axis=1)]
na_impute_results.shape




A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



(64, 11)

In [176]:
STRATEGY_TYPE = "single_single"

baseline = baseline[
    (baseline["result_type"] == DOWNSTREAM_RESULT_TYPE) & 
    (baseline["metric"].isin(["F1_macro", "RMSE"]) &
    (baseline["strategy"] == STRATEGY_TYPE))
]

# remove experiments where imputation failed
baseline = baseline.merge(
    na_impute_results,
    how = "left",
    validate = "one_to_one",
    indicator = True,
    suffixes=("", "_imp"),
    on = ["experiment", "imputer", "task", "missing_type", "missing_fraction", "strategy", "column"]
)
baseline = baseline[baseline["_merge"]=="left_only"]

assert len(baseline["strategy"].unique()) == 1
baseline.drop(["experiment", "strategy", "result_type_imp", "metric_imp", "train", "test", "train_imp", "test_imp", "_merge"], axis=1, inplace=True)


In [177]:
baseline = baseline.rename(
    {
        "imputer": "Imputation_Method",
        "task": "Task",
        "missing_type": "Missing Type",
        "missing_fraction": "Missing Fraction",
        "column": "Column",
        "baseline": "Baseline",
        "imputed": "Imputed",
        "corrupted": "Corrupted"
    },
    axis = 1
)
rename_imputer_dict = {
    "ModeImputer": "Mean/Mode",
    "KNNImputer": "KNN",
    "ForestImputer": "Random Forest",
    "AutoKerasImputer": "Discriminative DL",
    "VAEImputer": "VAE",
    "GAINImputer": "GAIN"    
}

rename_metric_dict = {
    "F1_macro": CLF_METRIC,
    "RMSE": REG_METRIC
}
baseline = baseline.replace(rename_imputer_dict)
baseline = baseline.replace(rename_metric_dict)

baseline['Missing Type'] = baseline['Missing Type'].astype(str)
baseline['Missing Fraction'] = baseline['Missing Fraction'].astype(str)
baseline['Task'] = baseline['Task'].astype(str)


baseline['Data_Constellation_full'] = baseline['Missing Type'] + ' - ' + baseline['Missing Fraction'] + ' - ' + baseline['Task']

#baseline

Unnamed: 0,Imputation_Method,Task,Missing Type,Missing Fraction,Column,result_type,metric,Baseline,Corrupted,Imputed,Data_Constellation_full
0,Discriminative DL,1193,MAR,0.01,UI,downstream_performance_mean,Regression Tasks,949.295595,951.575449,948.902260,MAR - 0.01 - 1193
1,Discriminative DL,1193,MAR,0.1,UI,downstream_performance_mean,Regression Tasks,949.295595,973.066467,947.250208,MAR - 0.1 - 1193
2,Discriminative DL,1193,MAR,0.3,UI,downstream_performance_mean,Regression Tasks,949.295595,1027.539870,942.516397,MAR - 0.3 - 1193
3,Discriminative DL,1193,MAR,0.5,UI,downstream_performance_mean,Regression Tasks,949.295595,1086.215423,938.286502,MAR - 0.5 - 1193
4,Discriminative DL,1193,MCAR,0.01,UI,downstream_performance_mean,Regression Tasks,949.295595,952.165727,949.007645,MCAR - 0.01 - 1193
...,...,...,...,...,...,...,...,...,...,...,...
1327,VAE,42712,MCAR,0.5,humidity,downstream_performance_mean,Regression Tasks,149.495696,151.006204,151.652660,MCAR - 0.5 - 42712
1328,VAE,42712,MNAR,0.01,humidity,downstream_performance_mean,Regression Tasks,149.495696,149.353262,149.735954,MNAR - 0.01 - 42712
1329,VAE,42712,MNAR,0.1,humidity,downstream_performance_mean,Regression Tasks,149.495696,148.346460,150.970100,MNAR - 0.1 - 42712
1330,VAE,42712,MNAR,0.3,humidity,downstream_performance_mean,Regression Tasks,149.495696,152.929558,150.113720,MNAR - 0.3 - 42712


In [178]:
baseline = baseline[['Imputation_Method','Baseline', 'Data_Constellation_full']]
baseline

In [180]:
# import required datasets

# dataset for Imputation Performance (full info)
imputation = pd.read_csv('../Regression/regression_imputed_full_info.csv')


#imputation


Unnamed: 0,Imputation_Method,Task,Missing Type,Missing Fraction,Column,result_type,metric,Baseline,Corrupted,Imputed,...,MinorityClassSize,NumberOfFeatures,NumberOfInstances,NumberOfNumericFeatures,NumberOfCategoricalFeatures,NumberOfClasses,Downstream Performance Rank,Data_Constellation,Data_Constellation_full,Performance Difference to Average Best in Percent
0,Random Forest,189,MAR,0.01,theta8,downstream_performance_mean,Regression Tasks,0.204919,0.0,0.204999,...,,9.0,8192.0,9.0,0.0,,2.0,MAR - 0.01,MAR - 0.01 - 189,0.000000
1,KNN,189,MAR,0.01,theta8,downstream_performance_mean,Regression Tasks,0.204892,0.0,0.205038,...,,9.0,8192.0,9.0,0.0,,6.0,MAR - 0.01,MAR - 0.01 - 189,-0.000189
2,Mean/Mode,189,MAR,0.01,theta8,downstream_performance_mean,Regression Tasks,0.204909,0.0,0.205023,...,,9.0,8192.0,9.0,0.0,,3.0,MAR - 0.01,MAR - 0.01 - 189,-0.000117
3,VAE,189,MAR,0.01,theta8,downstream_performance_mean,Regression Tasks,0.204895,0.0,0.205026,...,,9.0,8192.0,9.0,0.0,,4.0,MAR - 0.01,MAR - 0.01 - 189,-0.000131
4,GAIN,189,MAR,0.01,theta8,downstream_performance_mean,Regression Tasks,0.204873,0.0,0.204926,...,,9.0,8192.0,9.0,0.0,,1.0,MAR - 0.01,MAR - 0.01 - 189,0.000356
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1337,KNN,42712,MNAR,0.50,humidity,downstream_performance_mean,Regression Tasks,149.553233,0.0,148.474038,...,,13.0,17379.0,9.0,4.0,,3.0,MNAR - 0.5,MNAR - 0.5 - 42712,-0.005164
1338,Mean/Mode,42712,MNAR,0.50,humidity,downstream_performance_mean,Regression Tasks,147.017846,0.0,146.311138,...,,13.0,17379.0,9.0,4.0,,1.0,MNAR - 0.5,MNAR - 0.5 - 42712,0.009543
1339,VAE,42712,MNAR,0.50,humidity,downstream_performance_mean,Regression Tasks,150.016976,0.0,149.268209,...,,13.0,17379.0,9.0,4.0,,5.0,MNAR - 0.5,MNAR - 0.5 - 42712,-0.010457
1340,GAIN,42712,MNAR,0.50,humidity,downstream_performance_mean,Regression Tasks,148.397969,0.0,149.936478,...,,13.0,17379.0,9.0,4.0,,6.0,MNAR - 0.5,MNAR - 0.5 - 42712,-0.014867


In [181]:
# dataset for Subset performance (full info)
    # Rename "Performance Difference to Average Best" to "Performance Difference to Average Best Subset"
subset = pd.read_csv('../Subset - Regression/regression_subset_full_info.csv')
subset = subset.rename(columns={"Performance Difference to Average Best in Percent": "Performance Difference to Average Best Subset in Percent"}) 
subset = subset[['Imputation_Method','Imputed_Subset', 'Downstream Performance Rank Subset', 'Performance Difference to Average Best Subset in Percent', 'Data_Constellation_full']]
subset



Unnamed: 0,Imputation_Method,Imputed_Subset,Downstream Performance Rank Subset,Performance Difference to Average Best Subset in Percent,Data_Constellation_full
0,Random Forest,0.225470,6.0,0.000000,MAR - 0.01 - 189
1,KNN,0.224708,2.0,0.003392,MAR - 0.01 - 189
2,Mean/Mode,0.224662,1.0,0.003597,MAR - 0.01 - 189
3,VAE,0.225196,3.0,0.001218,MAR - 0.01 - 189
4,GAIN,0.225407,5.0,0.000280,MAR - 0.01 - 189
...,...,...,...,...,...
1358,KNN,205.719979,6.0,-0.015257,MNAR - 0.5 - 42712
1359,Mean/Mode,205.088208,4.0,-0.012223,MNAR - 0.5 - 42712
1360,VAE,205.193612,5.0,-0.012731,MNAR - 0.5 - 42712
1361,GAIN,203.723620,3.0,-0.005607,MNAR - 0.5 - 42712


In [182]:
# Merge Datasets to one dataframe via data constellation (Task, Pattern, Fraction)
# Imputation (full set), Baseline (Baseline Performance), Subset (Imputed_Subset, Downstream Performance Rank Subset, Data_Constellation_full)
data = pd.merge(imputation, subset, on=['Data_Constellation_full', 'Imputation_Method'])

data = pd.merge(data, baseline, on=['Data_Constellation_full', 'Imputation_Method'])

data = data.rename(columns={"Baseline_y": "Baseline"}) 
data


Unnamed: 0,Imputation_Method,Task,Missing Type,Missing Fraction,Column,result_type,metric,Baseline_x,Corrupted,Imputed,...,NumberOfCategoricalFeatures,NumberOfClasses,Downstream Performance Rank,Data_Constellation,Data_Constellation_full,Performance Difference to Average Best in Percent,Imputed_Subset,Downstream Performance Rank Subset,Performance Difference to Average Best Subset in Percent,Baseline
0,Random Forest,189,MAR,0.01,theta8,downstream_performance_mean,Regression Tasks,0.204919,0.0,0.204999,...,0.0,,2.0,MAR - 0.01,MAR - 0.01 - 189,0.000000,0.225470,6.0,0.000000,0.205116
1,KNN,189,MAR,0.01,theta8,downstream_performance_mean,Regression Tasks,0.204892,0.0,0.205038,...,0.0,,6.0,MAR - 0.01,MAR - 0.01 - 189,-0.000189,0.224708,2.0,0.003392,0.205116
2,Mean/Mode,189,MAR,0.01,theta8,downstream_performance_mean,Regression Tasks,0.204909,0.0,0.205023,...,0.0,,3.0,MAR - 0.01,MAR - 0.01 - 189,-0.000117,0.224662,1.0,0.003597,0.205116
3,VAE,189,MAR,0.01,theta8,downstream_performance_mean,Regression Tasks,0.204895,0.0,0.205026,...,0.0,,4.0,MAR - 0.01,MAR - 0.01 - 189,-0.000131,0.225196,3.0,0.001218,0.205116
4,GAIN,189,MAR,0.01,theta8,downstream_performance_mean,Regression Tasks,0.204873,0.0,0.204926,...,0.0,,1.0,MAR - 0.01,MAR - 0.01 - 189,0.000356,0.225407,5.0,0.000280,0.205116
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1257,KNN,42712,MNAR,0.50,humidity,downstream_performance_mean,Regression Tasks,149.553233,0.0,148.474038,...,4.0,,3.0,MNAR - 0.5,MNAR - 0.5 - 42712,-0.005164,205.719979,6.0,-0.015257,149.495696
1258,Mean/Mode,42712,MNAR,0.50,humidity,downstream_performance_mean,Regression Tasks,147.017846,0.0,146.311138,...,4.0,,1.0,MNAR - 0.5,MNAR - 0.5 - 42712,0.009543,205.088208,4.0,-0.012223,149.495696
1259,VAE,42712,MNAR,0.50,humidity,downstream_performance_mean,Regression Tasks,150.016976,0.0,149.268209,...,4.0,,5.0,MNAR - 0.5,MNAR - 0.5 - 42712,-0.010457,205.193612,5.0,-0.012731,149.495696
1260,GAIN,42712,MNAR,0.50,humidity,downstream_performance_mean,Regression Tasks,148.397969,0.0,149.936478,...,4.0,,6.0,MNAR - 0.5,MNAR - 0.5 - 42712,-0.014867,203.723620,3.0,-0.005607,149.495696


## Baseline Comparisons

In [185]:
# Calculate Difference in F1 Score/RMSE between Imputed and Subset for each data constellation -> save in new column
data['Performance Difference Baseline to Imputed'] = (((data['Baseline']) - (data['Imputed'])) / (data['Baseline']))
data['Performance Difference Baseline to Imputed'] = (data['Performance Difference Baseline to Imputed']*(-1))
data

Unnamed: 0,Imputation_Method,Task,Missing Type,Missing Fraction,Column,result_type,metric,Baseline_x,Corrupted,Imputed,...,NumberOfClasses,Downstream Performance Rank,Data_Constellation,Data_Constellation_full,Performance Difference to Average Best in Percent,Imputed_Subset,Downstream Performance Rank Subset,Performance Difference to Average Best Subset in Percent,Baseline,Performance Difference Baseline to Imputed
0,Random Forest,189,MAR,0.01,theta8,downstream_performance_mean,Regression Tasks,0.204919,0.0,0.204999,...,,2.0,MAR - 0.01,MAR - 0.01 - 189,0.000000,0.225470,6.0,0.000000,0.205116,-0.000569
1,KNN,189,MAR,0.01,theta8,downstream_performance_mean,Regression Tasks,0.204892,0.0,0.205038,...,,6.0,MAR - 0.01,MAR - 0.01 - 189,-0.000189,0.224708,2.0,0.003392,0.205116,-0.000380
2,Mean/Mode,189,MAR,0.01,theta8,downstream_performance_mean,Regression Tasks,0.204909,0.0,0.205023,...,,3.0,MAR - 0.01,MAR - 0.01 - 189,-0.000117,0.224662,1.0,0.003597,0.205116,-0.000452
3,VAE,189,MAR,0.01,theta8,downstream_performance_mean,Regression Tasks,0.204895,0.0,0.205026,...,,4.0,MAR - 0.01,MAR - 0.01 - 189,-0.000131,0.225196,3.0,0.001218,0.205116,-0.000438
4,GAIN,189,MAR,0.01,theta8,downstream_performance_mean,Regression Tasks,0.204873,0.0,0.204926,...,,1.0,MAR - 0.01,MAR - 0.01 - 189,0.000356,0.225407,5.0,0.000280,0.205116,-0.000925
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1257,KNN,42712,MNAR,0.50,humidity,downstream_performance_mean,Regression Tasks,149.553233,0.0,148.474038,...,,3.0,MNAR - 0.5,MNAR - 0.5 - 42712,-0.005164,205.719979,6.0,-0.015257,149.495696,-0.006834
1258,Mean/Mode,42712,MNAR,0.50,humidity,downstream_performance_mean,Regression Tasks,147.017846,0.0,146.311138,...,,1.0,MNAR - 0.5,MNAR - 0.5 - 42712,0.009543,205.088208,4.0,-0.012223,149.495696,-0.021302
1259,VAE,42712,MNAR,0.50,humidity,downstream_performance_mean,Regression Tasks,150.016976,0.0,149.268209,...,,5.0,MNAR - 0.5,MNAR - 0.5 - 42712,-0.010457,205.193612,5.0,-0.012731,149.495696,-0.001522
1260,GAIN,42712,MNAR,0.50,humidity,downstream_performance_mean,Regression Tasks,148.397969,0.0,149.936478,...,,6.0,MNAR - 0.5,MNAR - 0.5 - 42712,-0.014867,203.723620,3.0,-0.005607,149.495696,0.002948


In [186]:
# Calculate Average Difference in F1 Score/RMSE between Imputed and Subset
Average_Difference_Score = data['Performance Difference Baseline to Imputed'].mean()
print("Average Difference in Predicitve Performance between baseline and full, perturbed dataset", Average_Difference_Score)
# Calculate Average Difference in F1 Score/RMSE between Imputed and Subset -> absolut value
data_temp_abs = data.copy()
data_temp_abs['Performance Difference Baseline to Imputed'] = data_temp_abs['Performance Difference Baseline to Imputed'].abs()

Average_Difference_Score_abs = data_temp_abs['Performance Difference Baseline to Imputed'].mean()
print("Average Difference in Predicitve Performance between baseline and full, perturbed dataset as absolute Values", Average_Difference_Score_abs)


Average Difference in Predicitve Performance between baseline and full, perturbed dataset 0.005989274485661973
Average Difference in Predicitve Performance between baseline and full, perturbed dataset as absolute Values 0.007245207953167799
____________________




In [187]:
improv_to_av_best = data.copy()
improv_to_av_best = improv_to_av_best[improv_to_av_best["Imputation_Method"].str.contains("Random Forest")]
improv_to_av_best_mean = improv_to_av_best['Performance Difference Baseline to Imputed'].mean()
print(improv_to_av_best_mean)
#improv_to_av_best

0.004214019671502785


Unnamed: 0,Imputation_Method,Task,Missing Type,Missing Fraction,Column,result_type,metric,Baseline_x,Corrupted,Imputed,...,NumberOfClasses,Downstream Performance Rank,Data_Constellation,Data_Constellation_full,Performance Difference to Average Best in Percent,Imputed_Subset,Downstream Performance Rank Subset,Performance Difference to Average Best Subset in Percent,Baseline,Performance Difference Baseline to Imputed
0,Random Forest,189,MAR,0.01,theta8,downstream_performance_mean,Regression Tasks,0.204919,0.0,0.204999,...,,2.0,MAR - 0.01,MAR - 0.01 - 189,0.0,0.225470,6.0,0.0,0.205116,-0.000569
6,Random Forest,189,MAR,0.10,theta8,downstream_performance_mean,Regression Tasks,0.204910,0.0,0.205008,...,,3.0,MAR - 0.1,MAR - 0.1 - 189,0.0,0.225864,5.0,0.0,0.205116,-0.000522
12,Random Forest,189,MAR,0.30,theta8,downstream_performance_mean,Regression Tasks,0.204800,0.0,0.205515,...,,3.0,MAR - 0.3,MAR - 0.3 - 189,0.0,0.226752,6.0,0.0,0.205116,0.001950
18,Random Forest,189,MAR,0.50,theta8,downstream_performance_mean,Regression Tasks,0.206002,0.0,0.205979,...,,4.0,MAR - 0.5,MAR - 0.5 - 189,0.0,0.223393,3.0,0.0,0.205116,0.004209
24,Random Forest,189,MCAR,0.01,theta8,downstream_performance_mean,Regression Tasks,0.205079,0.0,0.205077,...,,5.0,MCAR - 0.01,MCAR - 0.01 - 189,0.0,0.225389,5.0,0.0,0.205116,-0.000188
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1233,Random Forest,42712,MCAR,0.50,humidity,downstream_performance_mean,Regression Tasks,148.991778,0.0,148.880924,...,,1.0,MCAR - 0.5,MCAR - 0.5 - 42712,0.0,204.639271,1.0,0.0,149.495696,-0.004112
1238,Random Forest,42712,MNAR,0.01,humidity,downstream_performance_mean,Regression Tasks,149.513804,0.0,149.548996,...,,4.0,MNAR - 0.01,MNAR - 0.01 - 42712,0.0,205.211177,1.0,0.0,149.495696,0.000357
1244,Random Forest,42712,MNAR,0.10,humidity,downstream_performance_mean,Regression Tasks,149.557789,0.0,149.782113,...,,4.0,MNAR - 0.1,MNAR - 0.1 - 42712,0.0,205.400159,1.0,0.0,149.495696,0.001916
1250,Random Forest,42712,MNAR,0.30,humidity,downstream_performance_mean,Regression Tasks,149.430117,0.0,150.008179,...,,3.0,MNAR - 0.3,MNAR - 0.3 - 42712,0.0,202.969127,2.0,0.0,149.495696,0.003428


In [188]:
data_heatmaps = data.copy()
data_backup = data.copy()

data_heatmaps['Missing Fraction'] = data_heatmaps['Missing Fraction'].astype(str)
data_heatmaps['Missing Type'] = data_heatmaps['Missing Type'].astype(str)

In [189]:
# Heatmap with differences per Dataconstellation Baseline to full, perturbed dataset
data_heat = data.copy()
data_heat = data_heat.astype({"Task":"string"})
data_constellations = ['MAR - 0.01', 'MAR - 0.1', 'MAR - 0.3', 'MAR - 0.5', 'MCAR - 0.01', 'MCAR - 0.1', 'MCAR - 0.3', 'MCAR - 0.5', 'MNAR - 0.01', 'MNAR - 0.1', 'MNAR - 0.3', 'MNAR - 0.5']

for i in data_constellations:
    data_constel = data_heat.loc[data_heat['Data_Constellation'] == i]

    ### uncomment whatever you want to investigate

    ## sort by amount datapoints (ascending)
    data_constel = data_constel.sort_values(by=['NumberOfInstances'])

    ## sort by amount of features (ascending)
    #data_constel = data_constel.sort_values(by=['NumberOfFeatures'])

    ## sort by amount of datapoints and features (ascending)
    #data_constel = data_constel.sort_values(by=['NumberOfInstances', 'NumberOfFeatures'])

    ## sort by amount of categorical features and datapoints (ascending)
    #data_constel = data_constel.sort_values(by=['NumberOfCategoricalFeatures', 'NumberOfInstances'])

    ## sort by amount of numerical features and datapoints (ascending)
    #data_constel = data_constel.sort_values(by=['NumberOfNumericFeatures', 'NumberOfInstances'])
    
    Dataset_number = data_constel["Task"]
    Imputation_Method = data_constel["Imputation_Method"]
    Improvement = data_constel["Performance Difference Baseline to Imputed"]
    

    trace = go.Heatmap(
                   z=Improvement,
                   x=Dataset_number,
                   y=Imputation_Method,
                   type = 'heatmap',
                    autocolorscale= False,
                    colorscale = 'RdBu_r',
                    zmid=0,
                    zmin=(-0.11),
                    zmax=0.11,
                    #hoverinfo='text',
                    #text=hovertext
                    )
    data = [trace]
    fig = go.Figure(data=data)
    fig.update_layout(
        title=i,
        xaxis_nticks=36)
    fig.show()
    fig.write_image("regression_heatmap_f1_score_improvement_Baseline_to_Imputed%s.pdf" %i)

    # Positive value indicates, that Baseline is better than full, corrupted dataset score 

In [190]:
# sorting data by total improvement
df_quantiles = data_heatmaps.copy()

df_quantiles
#df_quantiles = df_quantiles.drop(df_quantiles[df_quantiles["Imputation_Method"] == AVERAGE_BEST_IMPUTATION_METHOD].index)
df_10 = df_quantiles.drop(df_quantiles[(df_quantiles["Performance Difference Baseline to Imputed"] > (-0.09))].index)
df_09 = df_quantiles.drop(df_quantiles[(df_quantiles["Performance Difference Baseline to Imputed"] <= (-0.09)) | (df_quantiles["Performance Difference Baseline to Imputed"] > (-0.07))].index)
df_07 = df_quantiles.drop(df_quantiles[(df_quantiles["Performance Difference Baseline to Imputed"] <= (-0.07)) | (df_quantiles["Performance Difference Baseline to Imputed"] > (-0.05))].index)
df_05 = df_quantiles.drop(df_quantiles[(df_quantiles["Performance Difference Baseline to Imputed"] <= (-0.05)) | (df_quantiles["Performance Difference Baseline to Imputed"] > (-0.03))].index)
df_03 = df_quantiles.drop(df_quantiles[(df_quantiles["Performance Difference Baseline to Imputed"] <= (-0.03)) | (df_quantiles["Performance Difference Baseline to Imputed"] > (-0.01))].index)
df_01 = df_quantiles.drop(df_quantiles[(df_quantiles["Performance Difference Baseline to Imputed"] <= (-0.01)) | (df_quantiles["Performance Difference Baseline to Imputed"] > (0.01))].index)
df01 = df_quantiles.drop(df_quantiles[(df_quantiles["Performance Difference Baseline to Imputed"] <= (0.01)) | (df_quantiles["Performance Difference Baseline to Imputed"] > (0.03))].index)
df03 = df_quantiles.drop(df_quantiles[(df_quantiles["Performance Difference Baseline to Imputed"] <= (0.03)) | (df_quantiles["Performance Difference Baseline to Imputed"] > (0.05))].index)
df05 = df_quantiles.drop(df_quantiles[(df_quantiles["Performance Difference Baseline to Imputed"] <= (0.05)) | (df_quantiles["Performance Difference Baseline to Imputed"] > (0.07))].index)
df07 = df_quantiles.drop(df_quantiles[(df_quantiles["Performance Difference Baseline to Imputed"] <= (0.07)) | (df_quantiles["Performance Difference Baseline to Imputed"] > (0.09))].index)
df09 = df_quantiles.drop(df_quantiles[(df_quantiles["Performance Difference Baseline to Imputed"] <= (0.09))].index)

#df_quantiles
#df_quantiles.dtypes

In [191]:
len_df_10 = len(df_10.index)
len_df_09 = len(df_09.index)
len_df_07 = len(df_07.index)
len_df_05 = len(df_05.index)
len_df_03 = len(df_03.index)
len_df_01 = len(df_01.index)
len_df01 = len(df01.index)
len_df03 = len(df03.index)
len_df05 = len(df05.index)
len_df07 = len(df07.index)
len_df09 = len(df09.index)

quantile_freq = []

quantile_freq.extend((len_df_10, len_df_09, len_df_07, len_df_05, len_df_03, len_df_01, len_df01, len_df03, len_df05, len_df07, len_df09))
print(quantile_freq)


quantiles = []
quantiles.extend(['less than -0.09', '-0.09 to -0.07', '-0.07 to -0.05', '-0.05 to -0.03','-0.03 to -0.01', '-0.01 to 0.01', '0.01 to 0.03', '0.03 to 0.05', '0.05 to 0.07', '0.07 to 0.09', 'more than 0.09'])
print(quantiles)

improvement_quantiles = pd.DataFrame(
    {'Performance Difference Baseline to Imputed': quantiles,
     'Amount': quantile_freq,
    })


[1, 0, 0, 0, 23, 1059, 94, 48, 10, 8, 19]
['less than -0.09', '-0.09 to -0.07', '-0.07 to -0.05', '-0.05 to -0.03', '-0.03 to -0.01', '-0.01 to 0.01', '0.01 to 0.03', '0.03 to 0.05', '0.05 to 0.07', '0.07 to 0.09', 'more than 0.09']


In [192]:
fig = px.bar(improvement_quantiles, x='Performance Difference Baseline to Imputed', y='Amount')
fig.show()
fig.write_image("regression_performance_difference_baseline_to_imputed.pdf")
# Negative values indicate a better performance of the model that was trained on fully observed data (Baseline)


In [193]:
# split barchart stacks into methods

quantile_datasets = [df_10, df_09, df_07, df_05, df_03, df_01, df01, df03, df05, df07, df09]

methods = ['Random Forest', 'KNN', 'Mean/Mode', 'VAE', 'GAIN', 'Discriminative DL']
#methods.remove(AVERAGE_BEST_IMPUTATION_METHOD)
print(methods)

forest_freq = []
knn_freq = []
mode_freq = []
dl_freq = []
vae_freq = []
gain_freq = []
#print(quantile_datasets)


for i in methods:
    for j in quantile_datasets:
        df_temp = j.copy()
        df_temp = df_temp[df_temp['Imputation_Method'].str.contains(i)]

        df_temp_len = len(df_temp.index)
        if (i == 'Random Forest'):
            forest_freq.append(df_temp_len)
        elif (i == 'KNN'):
            knn_freq.append(df_temp_len)                                       
        elif (i == 'Mean/Mode'):
            mode_freq.append(df_temp_len)                                                 
        elif (i == 'Discriminative DL'):
            dl_freq.append(df_temp_len)                                       
        elif (i == 'VAE'):
            vae_freq.append(df_temp_len)                                         
        elif (i == 'GAIN'):
            gain_freq.append(df_temp_len)                                          
                                       
print(forest_freq)
print(knn_freq)
print(mode_freq)
print(dl_freq)
print(vae_freq)
print(gain_freq)

['Random Forest', 'KNN', 'Mean/Mode', 'VAE', 'GAIN', 'Discriminative DL']
[0, 0, 0, 0, 4, 192, 20, 11, 0, 0, 1]
[0, 0, 0, 0, 4, 192, 21, 7, 3, 0, 1]
[0, 0, 0, 0, 6, 186, 16, 6, 5, 6, 3]
[0, 0, 0, 0, 5, 180, 16, 9, 1, 0, 1]
[1, 0, 0, 0, 4, 172, 15, 13, 1, 1, 9]
[0, 0, 0, 0, 0, 137, 6, 2, 0, 1, 4]


In [194]:
quantiles = ['less than -0.09', '-0.09 to -0.07', '-0.07 to -0.05', '-0.05 to -0.03','-0.03 to -0.01', '-0.01 to 0.01', '0.01 to 0.03', '0.03 to 0.05', '0.05 to 0.07', '0.07 to 0.09', 'more than 0.09']

fig = go.Figure(data=[
    go.Bar(name='Random Forest', x=quantiles, y=forest_freq),
    go.Bar(name='KNN', x=quantiles, y=knn_freq),
    go.Bar(name='Mean/Mode', x=quantiles, y=mode_freq),
    go.Bar(name='Discriminative DL', x=quantiles, y=dl_freq),
    go.Bar(name='VAE', x=quantiles, y=vae_freq),
    go.Bar(name='GAIN', x=quantiles, y=gain_freq)
])
# Change the bar mode
fig.update_layout(barmode='stack')
fig.show()
fig.write_image("regression_performance_difference_baseline_to_imputed_per_method.pdf")

In [195]:
# split barchart stacks into methods

quantile_datasets = [df_10, df_09, df_07, df_05, df_03, df_01, df01, df03, df05, df07, df09]

fractions = ['0.01', '0.1', '0.3', '0.5']
#print(fractions)


freq_001 = []
freq_01 = []
freq_03 = []
freq_05 = []

for i in fractions:
    for j in quantile_datasets:
        df_temp = j.copy()
               
        df_temp = df_temp[df_temp['Missing Fraction'].str.contains(i)]

        df_temp_len = len(df_temp.index)
        if (i == '0.01'):
            freq_001.append(df_temp_len)
        elif (i == '0.1'):
            freq_01.append(df_temp_len)                                       
        elif (i == '0.3'):
            freq_03.append(df_temp_len)                                                 
        elif (i == '0.5'):
            freq_05.append(df_temp_len)                                       
                                        
                                       
print(freq_001)
print(freq_01)
print(freq_03)
print(freq_05)

['0.01', '0.1', '0.3', '0.5']
[0, 0, 0, 0, 0, 314, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 287, 20, 9, 0, 1, 1]
[1, 0, 0, 0, 11, 237, 44, 9, 6, 2, 6]
[0, 0, 0, 0, 12, 221, 30, 30, 4, 5, 12]


In [196]:
quantiles = ['less than -0.09', '-0.09 to -0.07', '-0.07 to -0.05', '-0.05 to -0.03','-0.03 to -0.01', '-0.01 to 0.01', '0.01 to 0.03', '0.03 to 0.05', '0.05 to 0.07', '0.07 to 0.09', 'more than 0.09']


fig = go.Figure(data=[
    go.Bar(name='1% Missing Data', x=quantiles, y=freq_001, marker_color='#FD3216'),
    go.Bar(name='10% Missing Data', x=quantiles, y=freq_01, marker_color='#00FE35'),
    go.Bar(name='30% Missing Data', x=quantiles, y=freq_03, marker_color='#511CFB'),
    go.Bar(name='50% Missing Data', x=quantiles, y=freq_05, marker_color='#FF7F0E'),
])
# Change the bar mode
fig.update_layout(barmode='stack')
fig.show()
fig.write_image("regression_performance_difference_baseline_to_imputed_per_frac.pdf")

In [197]:
# split barchart stacks into methods

quantile_datasets = [df_10, df_09, df_07, df_05, df_03, df_01, df01, df03, df05, df07, df09]

fractions = ['MCAR', 'MAR', 'MNAR']
#print(fractions)

freq_001 = []
freq_01 = []
freq_03 = []
freq_05 = []


for i in fractions:
    for j in quantile_datasets:
        df_temp = j.copy()

        df_temp = df_temp[df_temp['Missing Type'].str.contains(i)]
        df_temp_len = len(df_temp.index)
        if (i == 'MCAR'):
            freq_001.append(df_temp_len)
        elif (i == 'MAR'):
            freq_01.append(df_temp_len)                                       
        elif (i == 'MNAR'):
            freq_03.append(df_temp_len)                                                 
                                                                          
print(freq_001)
print(freq_01)
print(freq_03)


['MCAR', 'MAR', 'MNAR']
[0, 0, 0, 0, 9, 357, 29, 14, 3, 4, 4]
[0, 0, 0, 0, 10, 354, 33, 15, 3, 2, 5]
[1, 0, 0, 0, 4, 348, 32, 19, 4, 2, 10]


In [198]:
quantiles = ['less than -0.09', '-0.09 to -0.07', '-0.07 to -0.05', '-0.05 to -0.03','-0.03 to -0.01', '-0.01 to 0.01', '0.01 to 0.03', '0.03 to 0.05', '0.05 to 0.07', '0.07 to 0.09', 'more than 0.09']


fig = go.Figure(data=[
    go.Bar(name='MCAR', x=quantiles, y=freq_001, marker_color='#222A2A'),
    go.Bar(name='MAR', x=quantiles, y=freq_01, marker_color='#B68100'),
    go.Bar(name='MNAR', x=quantiles, y=freq_03, marker_color='#750D86'),
])
# Change the bar mode
fig.update_layout(barmode='stack')
fig.show()
fig.write_image("regression_performance_difference_baseline_to_imputed_per_patt.pdf")

## Subset Comparisons

In [199]:
# Subset Comparison Analysis here
data = data_backup.copy()
# Calculate Difference in F1 Score/RMSE between Imputed and Subset for each data constellation -> save in new column
data['Performance Difference Imputed to Subset in Percent'] = (((data['Imputed']) - (data['Imputed_Subset']))/(data['Imputed']))
data['Performance Difference Imputed to Subset in Percent'] = (data['Performance Difference Imputed to Subset in Percent']*(-1))

# Calculate Average Difference in F1 Score/RMSE between Imputed and Subset
Average_Difference_Score = data['Performance Difference Imputed to Subset in Percent'].mean()
print("Average Difference in Predicitve Performance between full, perturbed dataset and subset", Average_Difference_Score)
# Calculate Average Difference in F1 Score/RMSE between Imputed and Subset -> absolut value
data_temp_abs = data.copy()
data_temp_abs['Performance Difference Imputed to Subset in Percent'] = data_temp_abs['Performance Difference Imputed to Subset in Percent'].abs()

Average_Difference_Score_abs = data_temp_abs['Performance Difference Imputed to Subset in Percent'].mean()
print("Average Difference in Predicitve Performance between full, perturbed dataset and subset as absolute Values", Average_Difference_Score_abs)
print('____________________')
print('\n')
# Filter full dataset for Imputation Method
    # Calculate Average Difference in F1 Score/RMSE Imputed and Subset per Imputation Method
methods = ['Random Forest', 'KNN', 'Mean/Mode', 'VAE', 'GAIN', 'Discriminative DL'] 
for i in methods:
    data_method = data.loc[data['Imputation_Method'] == i]
    Average_Difference_Score = data_method['Performance Difference Imputed to Subset in Percent'].mean()
    print(i, "Average Difference in Predicitve Performance between full, perturbed dataset and subset", Average_Difference_Score)
    
    data_method['Performance Difference Imputed to Subset'] = data_method['Performance Difference Imputed to Subset in Percent'].abs()
    Average_Difference_Score_abs = data_method['Performance Difference Imputed to Subset in Percent'].mean()
    print(i, "Average Difference in Predicitve Performance between full, perturbed dataset and subset as absolute Values", Average_Difference_Score_abs)
    print('____________________')
    print('\n')
    
# Filter full dataset for Missingness Pattern
    # Calculate Average Difference in F1 Score/RMSE Imputed and Subset per Missingness Pattern
patterns = ['MCAR', 'MAR', 'MNAR']
for i in patterns:
    data_patterns = data.loc[data['Missing Type'] == i]
    Average_Difference_Score = data_patterns['Performance Difference Imputed to Subset in Percent'].mean()
    print(i, "Average Difference in Predicitve Performance between full, perturbed dataset and subset", Average_Difference_Score)
    
    data_patterns['Performance Difference Imputed to Subset in Percent'] = data_patterns['Performance Difference Imputed to Subset in Percent'].abs()
    Average_Difference_Score_abs = data_patterns['Performance Difference Imputed to Subset in Percent'].mean()
    print(i, "Average Difference in Predicitve Performance between full, perturbed dataset and subset as absolute Values", Average_Difference_Score_abs)
    print('____________________')
    print('\n')    
    
# Filter full dataset for Missingness Fraction
    # Calculate Average Difference in F1 Score/RMSE Imputed and Subset per Missingness Fraction
print("_______________________________________________________________________________")
fractions = [0.01, 0.1, 0.3, 0.5]
for i in fractions:
#    print(i)
    data_fractions = data.loc[data['Missing Fraction'] == i]
#    print(data_fractions)
    Average_Difference_Score = data_fractions['Performance Difference Imputed to Subset in Percent'].mean()
    print(i, "Average Difference in Predicitve Performance between full, perturbed dataset and subset", Average_Difference_Score)
    
    data_fractions['Performance Difference Imputed to Subset in Percent'] = data_fractions['Performance Difference Imputed to Subset in Percent'].abs()
    Average_Difference_Score_abs = data_fractions['Performance Difference Imputed to Subset in Percent'].mean()
    print(i, "Average Difference in Predicitve Performance between full, perturbed dataset and subset as absolute Values", Average_Difference_Score_abs)
   





Average Difference in Predicitve Performance between full, perturbed dataset and subset 0.4850237509158049
Average Difference in Predicitve Performance between full, perturbed dataset and subset as absolute Values 0.54743475977814
____________________


Random Forest Average Difference in Predicitve Performance between full, perturbed dataset and subset 0.46683298703684584
Random Forest Average Difference in Predicitve Performance between full, perturbed dataset and subset as absolute Values 0.46683298703684584
____________________


KNN Average Difference in Predicitve Performance between full, perturbed dataset and subset 0.46711436254606
KNN Average Difference in Predicitve Performance between full, perturbed dataset and subset as absolute Values 0.46711436254606
____________________


Mean/Mode Average Difference in Predicitve Performance between full, perturbed dataset and subset 0.466021956490183
Mean/Mode Average Difference in Predicitve Performance between full, perturbed datas



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/

In [200]:
#data

Unnamed: 0,Imputation_Method,Task,Missing Type,Missing Fraction,Column,result_type,metric,Baseline_x,Corrupted,Imputed,...,Downstream Performance Rank,Data_Constellation,Data_Constellation_full,Performance Difference to Average Best in Percent,Imputed_Subset,Downstream Performance Rank Subset,Performance Difference to Average Best Subset in Percent,Baseline,Performance Difference Baseline to Imputed,Performance Difference Imputed to Subset in Percent
0,Random Forest,189,MAR,0.01,theta8,downstream_performance_mean,Regression Tasks,0.204919,0.0,0.204999,...,2.0,MAR - 0.01,MAR - 0.01 - 189,0.000000,0.225470,6.0,0.000000,0.205116,-0.000569,0.099861
1,KNN,189,MAR,0.01,theta8,downstream_performance_mean,Regression Tasks,0.204892,0.0,0.205038,...,6.0,MAR - 0.01,MAR - 0.01 - 189,-0.000189,0.224708,2.0,0.003392,0.205116,-0.000380,0.095936
2,Mean/Mode,189,MAR,0.01,theta8,downstream_performance_mean,Regression Tasks,0.204909,0.0,0.205023,...,3.0,MAR - 0.01,MAR - 0.01 - 189,-0.000117,0.224662,1.0,0.003597,0.205116,-0.000452,0.095792
3,VAE,189,MAR,0.01,theta8,downstream_performance_mean,Regression Tasks,0.204895,0.0,0.205026,...,4.0,MAR - 0.01,MAR - 0.01 - 189,-0.000131,0.225196,3.0,0.001218,0.205116,-0.000438,0.098379
4,GAIN,189,MAR,0.01,theta8,downstream_performance_mean,Regression Tasks,0.204873,0.0,0.204926,...,1.0,MAR - 0.01,MAR - 0.01 - 189,0.000356,0.225407,5.0,0.000280,0.205116,-0.000925,0.099945
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1257,KNN,42712,MNAR,0.50,humidity,downstream_performance_mean,Regression Tasks,149.553233,0.0,148.474038,...,3.0,MNAR - 0.5,MNAR - 0.5 - 42712,-0.005164,205.719979,6.0,-0.015257,149.495696,-0.006834,0.385562
1258,Mean/Mode,42712,MNAR,0.50,humidity,downstream_performance_mean,Regression Tasks,147.017846,0.0,146.311138,...,1.0,MNAR - 0.5,MNAR - 0.5 - 42712,0.009543,205.088208,4.0,-0.012223,149.495696,-0.021302,0.401727
1259,VAE,42712,MNAR,0.50,humidity,downstream_performance_mean,Regression Tasks,150.016976,0.0,149.268209,...,5.0,MNAR - 0.5,MNAR - 0.5 - 42712,-0.010457,205.193612,5.0,-0.012731,149.495696,-0.001522,0.374664
1260,GAIN,42712,MNAR,0.50,humidity,downstream_performance_mean,Regression Tasks,148.397969,0.0,149.936478,...,6.0,MNAR - 0.5,MNAR - 0.5 - 42712,-0.014867,203.723620,3.0,-0.005607,149.495696,0.002948,0.358733


In [201]:
data_heatmaps = data.copy()

data_heatmaps['Missing Fraction'] = data_heatmaps['Missing Fraction'].astype(str)
data_heatmaps['Missing Type'] = data_heatmaps['Missing Type'].astype(str)

In [203]:
# Heatmap with differences per Dataconstellation relative to full, perturbed dataset
data_heat = data.copy()
data_heat = data_heat.astype({"Task":"string"})
data_constellations = ['MAR - 0.01', 'MAR - 0.1', 'MAR - 0.3', 'MAR - 0.5', 'MCAR - 0.01', 'MCAR - 0.1', 'MCAR - 0.3', 'MCAR - 0.5', 'MNAR - 0.01', 'MNAR - 0.1', 'MNAR - 0.3', 'MNAR - 0.5']

for i in data_constellations:
    data_constel = data_heat.loc[data_heat['Data_Constellation'] == i]

    ### uncomment whatever you want to investigate

    ## sort by amount datapoints (ascending)
    data_constel = data_constel.sort_values(by=['NumberOfInstances'])

    ## sort by amount of features (ascending)
    #data_constel = data_constel.sort_values(by=['NumberOfFeatures'])

    ## sort by amount of datapoints and features (ascending)
    #data_constel = data_constel.sort_values(by=['NumberOfInstances', 'NumberOfFeatures'])

    ## sort by amount of categorical features and datapoints (ascending)
    #data_constel = data_constel.sort_values(by=['NumberOfCategoricalFeatures', 'NumberOfInstances'])

    ## sort by amount of numerical features and datapoints (ascending)
    #data_constel = data_constel.sort_values(by=['NumberOfNumericFeatures', 'NumberOfInstances'])
    
    Dataset_number = data_constel["Task"]
    Imputation_Method = data_constel["Imputation_Method"]
    Improvement = data_constel["Performance Difference Imputed to Subset in Percent"]
    

    trace = go.Heatmap(
                   z=Improvement,
                   x=Dataset_number,
                   y=Imputation_Method,
                   type = 'heatmap',
                    autocolorscale= False,
                    colorscale = 'RdBu_r',
                    zmid=0,
                    zmin=(-0.3),
                    zmax=0.3,
                    )
    data = [trace]
    fig = go.Figure(data=data)
    fig.update_layout(
        title=i,
        xaxis_nticks=36)
    fig.show()
    fig.write_image("regression_heatmap_f1_score_improvement_Imputed_to_Subset%s.pdf" %i)
    # Positive value indicates, that full, corrupted dataset score is better than the score for the subset

In [204]:
# sorting data by total improvement
df_quantiles = data_heatmaps.copy()

df_quantiles
#df_quantiles = df_quantiles.drop(df_quantiles[df_quantiles["Imputation_Method"] == AVERAGE_BEST_IMPUTATION_METHOD].index)
df_10 = df_quantiles.drop(df_quantiles[(df_quantiles["Performance Difference Imputed to Subset in Percent"] > (-0.09))].index)
df_09 = df_quantiles.drop(df_quantiles[(df_quantiles["Performance Difference Imputed to Subset in Percent"] <= (-0.09)) | (df_quantiles["Performance Difference Imputed to Subset in Percent"] > (-0.07))].index)
df_07 = df_quantiles.drop(df_quantiles[(df_quantiles["Performance Difference Imputed to Subset in Percent"] <= (-0.07)) | (df_quantiles["Performance Difference Imputed to Subset in Percent"] > (-0.05))].index)
df_05 = df_quantiles.drop(df_quantiles[(df_quantiles["Performance Difference Imputed to Subset in Percent"] <= (-0.05)) | (df_quantiles["Performance Difference Imputed to Subset in Percent"] > (-0.03))].index)
df_03 = df_quantiles.drop(df_quantiles[(df_quantiles["Performance Difference Imputed to Subset in Percent"] <= (-0.03)) | (df_quantiles["Performance Difference Imputed to Subset in Percent"] > (-0.01))].index)
df_01 = df_quantiles.drop(df_quantiles[(df_quantiles["Performance Difference Imputed to Subset in Percent"] <= (-0.01)) | (df_quantiles["Performance Difference Imputed to Subset in Percent"] > (0.01))].index)
df01 = df_quantiles.drop(df_quantiles[(df_quantiles["Performance Difference Imputed to Subset in Percent"] <= (0.01)) | (df_quantiles["Performance Difference Imputed to Subset in Percent"] > (0.03))].index)
df03 = df_quantiles.drop(df_quantiles[(df_quantiles["Performance Difference Imputed to Subset in Percent"] <= (0.03)) | (df_quantiles["Performance Difference Imputed to Subset in Percent"] > (0.05))].index)
df05 = df_quantiles.drop(df_quantiles[(df_quantiles["Performance Difference Imputed to Subset in Percent"] <= (0.05)) | (df_quantiles["Performance Difference Imputed to Subset in Percent"] > (0.07))].index)
df07 = df_quantiles.drop(df_quantiles[(df_quantiles["Performance Difference Imputed to Subset in Percent"] <= (0.07)) | (df_quantiles["Performance Difference Imputed to Subset in Percent"] > (0.09))].index)
df09 = df_quantiles.drop(df_quantiles[(df_quantiles["Performance Difference Imputed to Subset in Percent"] <= (0.09))].index)

#df_quantiles
#df_quantiles.dtypes

In [205]:
len_df_10 = len(df_10.index)
len_df_09 = len(df_09.index)
len_df_07 = len(df_07.index)
len_df_05 = len(df_05.index)
len_df_03 = len(df_03.index)
len_df_01 = len(df_01.index)
len_df01 = len(df01.index)
len_df03 = len(df03.index)
len_df05 = len(df05.index)
len_df07 = len(df07.index)
len_df09 = len(df09.index)

quantile_freq = []

quantile_freq.extend((len_df_10, len_df_09, len_df_07, len_df_05, len_df_03, len_df_01, len_df01, len_df03, len_df05, len_df07, len_df09))
print(quantile_freq)


quantiles = []
quantiles.extend(['less than -0.09', '-0.09 to -0.07', '-0.07 to -0.05', '-0.05 to -0.03','-0.03 to -0.01', '-0.01 to 0.01', '0.01 to 0.03', '0.03 to 0.05', '0.05 to 0.07', '0.07 to 0.09', 'more than 0.09'])
print(quantiles)

improvement_quantiles = pd.DataFrame(
    {'Performance Difference Imputed to Subset in Percent': quantiles,
     'Amount': quantile_freq,
    })


[132, 0, 0, 0, 12, 118, 5, 182, 2, 14, 797]
['less than -0.09', '-0.09 to -0.07', '-0.07 to -0.05', '-0.05 to -0.03', '-0.03 to -0.01', '-0.01 to 0.01', '0.01 to 0.03', '0.03 to 0.05', '0.05 to 0.07', '0.07 to 0.09', 'more than 0.09']


In [206]:
fig = px.bar(improvement_quantiles, x='Performance Difference Imputed to Subset in Percent', y='Amount')
fig.show()
fig.write_image("regression_performance_difference_imputed_to_subset.pdf")


In [207]:
# split barchart stacks into methods

quantile_datasets = [df_10, df_09, df_07, df_05, df_03, df_01, df01, df03, df05, df07, df09]

methods = ['Random Forest', 'KNN', 'Mean/Mode', 'VAE', 'GAIN', 'Discriminative DL']
#methods.remove(AVERAGE_BEST_IMPUTATION_METHOD)
print(methods)

forest_freq = []
knn_freq = []
mode_freq = []
dl_freq = []
vae_freq = []
gain_freq = []


for i in methods:
    for j in quantile_datasets:
        df_temp = j.copy()
        df_temp = df_temp[df_temp['Imputation_Method'].str.contains(i)]
        df_temp_len = len(df_temp.index)
        if (i == 'Random Forest'):
            forest_freq.append(df_temp_len)
        elif (i == 'KNN'):
            knn_freq.append(df_temp_len)                                       
        elif (i == 'Mean/Mode'):
            mode_freq.append(df_temp_len)                                                 
        elif (i == 'Discriminative DL'):
            dl_freq.append(df_temp_len)                                       
        elif (i == 'VAE'):
            vae_freq.append(df_temp_len)                                         
        elif (i == 'GAIN'):
            gain_freq.append(df_temp_len)                                          
                                       
print(forest_freq)
print(knn_freq)
print(mode_freq)
print(dl_freq)
print(vae_freq)
print(gain_freq)

['Random Forest', 'KNN', 'Mean/Mode', 'VAE', 'GAIN', 'Discriminative DL']
[24, 0, 0, 0, 3, 21, 1, 35, 0, 4, 140]
[24, 0, 0, 0, 3, 21, 1, 35, 1, 3, 140]
[24, 0, 0, 0, 2, 22, 1, 36, 0, 2, 141]
[12, 0, 0, 0, 3, 19, 2, 32, 1, 2, 141]
[24, 0, 0, 0, 1, 23, 0, 24, 0, 1, 143]
[24, 0, 0, 0, 0, 12, 0, 20, 0, 2, 92]


In [208]:
quantiles = ['less than -0.09', '-0.09 to -0.07', '-0.07 to -0.05', '-0.05 to -0.03','-0.03 to -0.01', '-0.01 to 0.01', '0.01 to 0.03', '0.03 to 0.05', '0.05 to 0.07', '0.07 to 0.09', 'more than 0.09']

fig = go.Figure(data=[
    go.Bar(name='Random Forest', x=quantiles, y=forest_freq),
    go.Bar(name='KNN', x=quantiles, y=knn_freq),
    go.Bar(name='Mean/Mode', x=quantiles, y=mode_freq),
    go.Bar(name='Discriminative DL', x=quantiles, y=dl_freq),
    go.Bar(name='VAE', x=quantiles, y=vae_freq),
    go.Bar(name='GAIN', x=quantiles, y=gain_freq)
])
# Change the bar mode
fig.update_layout(barmode='stack')
fig.show()
fig.write_image("regression_performance_difference_imputed_to_subset_per_method.pdf")

In [209]:
# split barchart stacks into methods

quantile_datasets = [df_10, df_09, df_07, df_05, df_03, df_01, df01, df03, df05, df07, df09]

fractions = ['0.01', '0.1', '0.3', '0.5']
print(fractions)

freq_001 = []
freq_01 = []
freq_03 = []
freq_05 = []
#print(quantile_datasets)

#print(df_10.info())

for i in fractions:
    for j in quantile_datasets:
        df_temp = j.copy()
               
        df_temp = df_temp[df_temp['Missing Fraction'].str.contains(i)]

        df_temp_len = len(df_temp.index)
        if (i == '0.01'):
            freq_001.append(df_temp_len)
        elif (i == '0.1'):
            freq_01.append(df_temp_len)                                       
        elif (i == '0.3'):
            freq_03.append(df_temp_len)                                                 
        elif (i == '0.5'):
            freq_05.append(df_temp_len)                                       
                                        
                                       
print(freq_001)
print(freq_01)
print(freq_03)
print(freq_05)

['0.01', '0.1', '0.3', '0.5']
[33, 0, 0, 0, 0, 32, 0, 46, 0, 0, 203]
[33, 0, 0, 0, 0, 33, 0, 48, 0, 0, 204]
[33, 0, 0, 0, 5, 27, 3, 45, 1, 5, 197]
[33, 0, 0, 0, 7, 26, 2, 43, 1, 9, 193]


In [210]:
quantiles = ['less than -0.09', '-0.09 to -0.07', '-0.07 to -0.05', '-0.05 to -0.03','-0.03 to -0.01', '-0.01 to 0.01', '0.01 to 0.03', '0.03 to 0.05', '0.05 to 0.07', '0.07 to 0.09', 'more than 0.09']


fig = go.Figure(data=[
    go.Bar(name='1% Missing Data', x=quantiles, y=freq_001, marker_color='#FD3216'),
    go.Bar(name='10% Missing Data', x=quantiles, y=freq_01, marker_color='#00FE35'),
    go.Bar(name='30% Missing Data', x=quantiles, y=freq_03, marker_color='#511CFB'),
    go.Bar(name='50% Missing Data', x=quantiles, y=freq_05, marker_color='#FF7F0E'),
])
# Change the bar mode
fig.update_layout(barmode='stack')
fig.show()
fig.write_image("regression_performance_difference_imputed_to_subset_per_frac.pdf")

In [211]:
# split barchart stacks into methods

quantile_datasets = [df_10, df_09, df_07, df_05, df_03, df_01, df01, df03, df05, df07, df09]

fractions = ['MCAR', 'MAR', 'MNAR']

print(fractions)


freq_001 = []
freq_01 = []
freq_03 = []
freq_05 = []


for i in fractions:
    for j in quantile_datasets:
        df_temp = j.copy()

        df_temp = df_temp[df_temp['Missing Type'].str.contains(i)]
        df_temp_len = len(df_temp.index)
        if (i == 'MCAR'):
            freq_001.append(df_temp_len)
        elif (i == 'MAR'):
            freq_01.append(df_temp_len)                                       
        elif (i == 'MNAR'):
            freq_03.append(df_temp_len)                                                 
                                     
                                        
                                       
print(freq_001)
print(freq_01)
print(freq_03)

['MCAR', 'MAR', 'MNAR']
[44, 0, 0, 0, 1, 42, 0, 63, 0, 0, 270]
[44, 0, 0, 0, 3, 40, 2, 61, 0, 6, 266]
[44, 0, 0, 0, 8, 36, 3, 58, 2, 8, 261]


In [212]:
quantiles = ['less than -0.09', '-0.09 to -0.07', '-0.07 to -0.05', '-0.05 to -0.03','-0.03 to -0.01', '-0.01 to 0.01', '0.01 to 0.03', '0.03 to 0.05', '0.05 to 0.07', '0.07 to 0.09', 'more than 0.09']


fig = go.Figure(data=[
    go.Bar(name='MCAR', x=quantiles, y=freq_001, marker_color='#222A2A'),
    go.Bar(name='MAR', x=quantiles, y=freq_01, marker_color='#B68100'),
    go.Bar(name='MNAR', x=quantiles, y=freq_03, marker_color='#750D86'),
])
# Change the bar mode
fig.update_layout(barmode='stack')
fig.show()
fig.write_image("regression_performance_difference_imputed_to_subset_per_patt.pdf")

## Comparison based on Rank

In [213]:
#data_heatmaps
subset_rank_acc = data_heatmaps.copy()
subset_rank_acc


Unnamed: 0,Imputation_Method,Task,Missing Type,Missing Fraction,Column,result_type,metric,Baseline_x,Corrupted,Imputed,...,Downstream Performance Rank,Data_Constellation,Data_Constellation_full,Performance Difference to Average Best in Percent,Imputed_Subset,Downstream Performance Rank Subset,Performance Difference to Average Best Subset in Percent,Baseline,Performance Difference Baseline to Imputed,Performance Difference Imputed to Subset in Percent
0,Random Forest,189,MAR,0.01,theta8,downstream_performance_mean,Regression Tasks,0.204919,0.0,0.204999,...,2.0,MAR - 0.01,MAR - 0.01 - 189,0.000000,0.225470,6.0,0.000000,0.205116,-0.000569,0.099861
1,KNN,189,MAR,0.01,theta8,downstream_performance_mean,Regression Tasks,0.204892,0.0,0.205038,...,6.0,MAR - 0.01,MAR - 0.01 - 189,-0.000189,0.224708,2.0,0.003392,0.205116,-0.000380,0.095936
2,Mean/Mode,189,MAR,0.01,theta8,downstream_performance_mean,Regression Tasks,0.204909,0.0,0.205023,...,3.0,MAR - 0.01,MAR - 0.01 - 189,-0.000117,0.224662,1.0,0.003597,0.205116,-0.000452,0.095792
3,VAE,189,MAR,0.01,theta8,downstream_performance_mean,Regression Tasks,0.204895,0.0,0.205026,...,4.0,MAR - 0.01,MAR - 0.01 - 189,-0.000131,0.225196,3.0,0.001218,0.205116,-0.000438,0.098379
4,GAIN,189,MAR,0.01,theta8,downstream_performance_mean,Regression Tasks,0.204873,0.0,0.204926,...,1.0,MAR - 0.01,MAR - 0.01 - 189,0.000356,0.225407,5.0,0.000280,0.205116,-0.000925,0.099945
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1257,KNN,42712,MNAR,0.5,humidity,downstream_performance_mean,Regression Tasks,149.553233,0.0,148.474038,...,3.0,MNAR - 0.5,MNAR - 0.5 - 42712,-0.005164,205.719979,6.0,-0.015257,149.495696,-0.006834,0.385562
1258,Mean/Mode,42712,MNAR,0.5,humidity,downstream_performance_mean,Regression Tasks,147.017846,0.0,146.311138,...,1.0,MNAR - 0.5,MNAR - 0.5 - 42712,0.009543,205.088208,4.0,-0.012223,149.495696,-0.021302,0.401727
1259,VAE,42712,MNAR,0.5,humidity,downstream_performance_mean,Regression Tasks,150.016976,0.0,149.268209,...,5.0,MNAR - 0.5,MNAR - 0.5 - 42712,-0.010457,205.193612,5.0,-0.012731,149.495696,-0.001522,0.374664
1260,GAIN,42712,MNAR,0.5,humidity,downstream_performance_mean,Regression Tasks,148.397969,0.0,149.936478,...,6.0,MNAR - 0.5,MNAR - 0.5 - 42712,-0.014867,203.723620,3.0,-0.005607,149.495696,0.002948,0.358733


In [214]:
subset_rank_acc = subset_rank_acc.loc[subset_rank_acc['Downstream Performance Rank'] == 1.0]
print(len(subset_rank_acc))
subset_rank_acc_right = subset_rank_acc.loc[subset_rank_acc['Downstream Performance Rank Subset'] == 1.0]
print(len(subset_rank_acc_right))
#subset_rank_acc_right.to_csv('subset_rank_acc_right.csv')
subset_rank_acc_wrong = subset_rank_acc.loc[subset_rank_acc['Downstream Performance Rank Subset'] != 1.0]
print(len(subset_rank_acc_wrong))
#subset_rank_acc_wrong.to_csv('subset_rank_acc_wrong.csv')

217
56
161


In [215]:
print(len(subset_rank_acc))
subset_rank_acc_mean_diff = subset_rank_acc['Performance Difference to Average Best Subset in Percent'].mean()
print(subset_rank_acc_mean_diff)

217
0.00024710999548348614


In [216]:
methods = ['Random Forest', 'KNN', 'Mean/Mode', 'VAE', 'GAIN', 'Discriminative DL']
#methods.remove(AVERAGE_BEST_IMPUTATION_METHOD)
print(methods)

forest_freq = []
knn_freq = []
mode_freq = []
dl_freq = []
vae_freq = []
gain_freq = []

for i in methods:
    df_temp = subset_rank_acc_right.loc[subset_rank_acc_right['Imputation_Method'] == i]
    df_temp_len = len(df_temp.index)
    if (i == 'Random Forest'):
        forest_freq.append(df_temp_len)
    elif (i == 'KNN'):
        knn_freq.append(df_temp_len)                                       
    elif (i == 'Mean/Mode'):
        mode_freq.append(df_temp_len)                                                 
    elif (i == 'Discriminative DL'):
        dl_freq.append(df_temp_len)                                       
    elif (i == 'VAE'):
        vae_freq.append(df_temp_len)                                         
    elif (i == 'GAIN'):
        gain_freq.append(df_temp_len)                                          
print("Subset with same predicition as full, corrupted")
print(forest_freq, 'Random Forest')
print(knn_freq, 'KNN')
print(mode_freq, 'Mode')
print(dl_freq, 'DL')
print(vae_freq, 'VAE')
print(gain_freq, 'GAIN')

['Random Forest', 'KNN', 'Mean/Mode', 'VAE', 'GAIN', 'Discriminative DL']
Subset with same predicition as full, corrupted
[20] Random Forest
[3] KNN
[14] Mode
[7] DL
[6] VAE
[6] GAIN


In [217]:
methods = ['Random Forest', 'KNN', 'Mean/Mode', 'VAE', 'GAIN', 'Discriminative DL']
#methods.remove(AVERAGE_BEST_IMPUTATION_METHOD)
print(methods)

forest_freq = []
knn_freq = []
mode_freq = []
dl_freq = []
vae_freq = []
gain_freq = []

for i in methods:
    df_temp = subset_rank_acc_wrong.loc[subset_rank_acc_wrong['Imputation_Method'] == i]
    df_temp_len = len(df_temp.index)
    if (i == 'Random Forest'):
        forest_freq.append(df_temp_len)
    elif (i == 'KNN'):
        knn_freq.append(df_temp_len)                                       
    elif (i == 'Mean/Mode'):
        mode_freq.append(df_temp_len)                                                 
    elif (i == 'Discriminative DL'):
        dl_freq.append(df_temp_len)                                       
    elif (i == 'VAE'):
        vae_freq.append(df_temp_len)                                         
    elif (i == 'GAIN'):
        gain_freq.append(df_temp_len)                                          

print("Subset with different predicition than full, corrupted")
print(forest_freq, 'Random Forest')
print(knn_freq, 'KNN')
print(mode_freq, 'Mode')
print(dl_freq, 'DL')
print(vae_freq, 'VAE')
print(gain_freq, 'GAIN')

['Random Forest', 'KNN', 'Mean/Mode', 'VAE', 'GAIN', 'Discriminative DL']
Subset with different predicition than full, corrupted
[28] Random Forest
[21] KNN
[36] Mode
[29] DL
[23] VAE
[24] GAIN
