# Visualize Results: Downstream Performance - Subset Multiclass Classification Corrupted Experiments -> Training and Test identically imputed

Notebook wurde angepasst -> für Tests nutzen!

This notebook should answer the questions: *Does imputation lead to better downstream performances?*

Data needs to be preprocessed with other notebook, her we only import two csv files with raw data regarding the results of the experiment and information about the used datasets!

## Notebook Structure 

* Application Scenario 2 - Downstream Performance  
   * Categorical  Columns (Classification)
   * Numerical Columns (Regression)
   * Heterogenous Columns (Classification and Regression Combined)

In [107]:
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import os
import pandas as pd
import re
import seaborn as sns
from pandas.api.types import CategoricalDtype
from pathlib import Path

import plotly as py
import plotly.express as px
import plotly.graph_objects as go
import xarray as xr


%matplotlib inline

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Settings

In [108]:
sns.set(style="whitegrid")
sns.set_context('paper', font_scale=1.5)
mpl.rcParams['lines.linewidth'] = '2'

In [109]:
CLF_METRIC = "F1_macro"
REG_METRIC = "RMSE"

DOWNSTREAM_RESULT_TYPE = "downstream_performance_mean"
IMPUTE_RESULT_TYPE = "impute_performance_mean"


## Data Preparation

In [110]:
# import preprocessed data from experiments
results = pd.read_csv('../multiclass_classification_corrupted_subset.csv')
results

Unnamed: 0,experiment,imputer,task,missing_type,missing_fraction,strategy,column,result_type,metric,train,test,baseline,corrupted,imputed
0,corrupted_multi_experiment_subset,AutoKerasImputer,1459,MAR,0.01,single_single,V7,impute_performance_std,MAE,3.892257,2.182568,,,
1,corrupted_multi_experiment_subset,AutoKerasImputer,1459,MAR,0.01,single_single,V7,impute_performance_std,MSE,158.302272,19.773168,,,
2,corrupted_multi_experiment_subset,AutoKerasImputer,1459,MAR,0.01,single_single,V7,impute_performance_std,RMSE,5.835969,1.733020,,,
3,corrupted_multi_experiment_subset,AutoKerasImputer,1459,MAR,0.10,single_single,V7,impute_performance_std,MAE,0.966404,0.773066,,,
4,corrupted_multi_experiment_subset,AutoKerasImputer,1459,MAR,0.10,single_single,V7,impute_performance_std,MSE,14.518167,12.685356,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14659,corrupted_multi_experiment_subset,VAEImputer,6,MNAR,0.30,single_single,x-box,downstream_performance_mean,F1_macro,,,0.635892,0.0,0.637935
14660,corrupted_multi_experiment_subset,VAEImputer,6,MNAR,0.30,single_single,x-box,downstream_performance_mean,F1_weighted,,,0.644876,0.0,0.646095
14661,corrupted_multi_experiment_subset,VAEImputer,6,MNAR,0.50,single_single,x-box,downstream_performance_mean,F1_micro,,,0.635000,0.0,0.630000
14662,corrupted_multi_experiment_subset,VAEImputer,6,MNAR,0.50,single_single,x-box,downstream_performance_mean,F1_macro,,,0.623508,0.0,0.620571


In [111]:
# Filtering the relevant data for downstream analysis

na_impute_results = results[
    (results["result_type"] == IMPUTE_RESULT_TYPE) & 
    (results["metric"].isin(["F1_macro", "RMSE"]))
]
na_impute_results.drop(["baseline", "corrupted", "imputed"], axis=1, inplace=True)
na_impute_results = na_impute_results[na_impute_results.isna().any(axis=1)]
na_impute_results.shape



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



(0, 11)

In [112]:
# check if strategy type is correct!
STRATEGY_TYPE = "single_single"

downstream_results = results[
    (results["result_type"] == DOWNSTREAM_RESULT_TYPE) & 
    (results["metric"].isin(["F1_macro", "RMSE"]) &
    (results["strategy"] == STRATEGY_TYPE))
]

# remove experiments where imputation failed
downstream_results = downstream_results.merge(
    na_impute_results,
    how = "left",
    validate = "one_to_one",
    indicator = True,
    suffixes=("", "_imp"),
    on = ["experiment", "imputer", "task", "missing_type", "missing_fraction", "strategy", "column"]
)
downstream_results = downstream_results[downstream_results["_merge"]=="left_only"]

assert len(results["strategy"].unique()) == 1
downstream_results.drop(["experiment", "strategy", "result_type_imp", "metric_imp", "train", "test", "train_imp", "test_imp", "_merge"], axis=1, inplace=True)

downstream_results = downstream_results.rename(
    {
        "imputer": "Imputation_Method",
        "task": "Task",
        "missing_type": "Missing Type",
        "missing_fraction": "Missing Fraction",
        "column": "Column",
        "baseline": "Baseline",
        "imputed": "Imputed_Subset",
        "corrupted": "Corrupted"
    },
    axis = 1
)

In [113]:
rename_imputer_dict = {
    "ModeImputer": "Mean/Mode",
    "KNNImputer": "KNN",
    "ForestImputer": "Random Forest",
    "AutoKerasImputer": "Discriminative DL",
    "VAEImputer": "VAE",
    "GAINImputer": "GAIN"    
}

rename_metric_dict = {
    "F1_macro": CLF_METRIC,
    "RMSE": REG_METRIC
}

downstream_results = downstream_results.replace(rename_imputer_dict)
downstream_results = downstream_results.replace(rename_metric_dict)

downstream_results

Unnamed: 0,Imputation_Method,Task,Missing Type,Missing Fraction,Column,result_type,metric,Baseline,Corrupted,Imputed_Subset
0,Discriminative DL,1459,MAR,0.01,V7,downstream_performance_mean,F1_macro,0.302957,0.0,0.305246
1,Discriminative DL,1459,MAR,0.10,V7,downstream_performance_mean,F1_macro,0.352866,0.0,0.353808
2,Discriminative DL,1459,MAR,0.30,V7,downstream_performance_mean,F1_macro,0.324542,0.0,0.313457
3,Discriminative DL,1459,MAR,0.50,V7,downstream_performance_mean,F1_macro,0.310136,0.0,0.306423
4,Discriminative DL,1459,MCAR,0.10,V7,downstream_performance_mean,F1_macro,0.297890,0.0,0.299266
...,...,...,...,...,...,...,...,...,...,...
1217,VAE,6,MCAR,0.50,x-box,downstream_performance_mean,F1_macro,0.663049,0.0,0.652059
1218,VAE,6,MNAR,0.01,x-box,downstream_performance_mean,F1_macro,0.637040,0.0,0.635638
1219,VAE,6,MNAR,0.10,x-box,downstream_performance_mean,F1_macro,0.630141,0.0,0.631342
1220,VAE,6,MNAR,0.30,x-box,downstream_performance_mean,F1_macro,0.635892,0.0,0.637935


### Robustness: Check which Imputers Yielded `NaN`Values

In [114]:
for col in downstream_results.columns:
    na_sum = downstream_results[col].isna().sum()
    if na_sum > 0:
        print("-----" * 10)        
        print(col, na_sum)
        print("-----" * 10)        
        na_idx = downstream_results[col].isna()
        print(downstream_results.loc[na_idx, "Imputation Method"].value_counts(dropna=False))
        print("\n")

## Adding Dataset Info, Sorting and Ranking

In [115]:
#downstream_results.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1222 entries, 0 to 1221
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Imputation_Method  1222 non-null   object 
 1   Task               1222 non-null   int64  
 2   Missing Type       1222 non-null   object 
 3   Missing Fraction   1222 non-null   float64
 4   Column             1222 non-null   object 
 5   result_type        1222 non-null   object 
 6   metric             1222 non-null   object 
 7   Baseline           1222 non-null   float64
 8   Corrupted          1222 non-null   float64
 9   Imputed_Subset     1222 non-null   float64
dtypes: float64(4), int64(1), object(5)
memory usage: 105.0+ KB


In [116]:
# Sorting of data

#adjust order to fit the processing time -> fastest first
methods_order = CategoricalDtype(['Mean/Mode', 'KNN', 'Random Forest', 'VAE', 'GAIN', 'Discriminative DL'], ordered=True)
downstream_results_full_sort = downstream_results.copy()

downstream_results_full_sort['Imputation_Method'] = downstream_results_full_sort['Imputation_Method'].astype(methods_order)
downstream_results_full_sort = downstream_results_full_sort.sort_values(['Task', 'Missing Type',
                                                                         'Missing Fraction', 'Imputed_Subset','Imputation_Method'], ascending=[True, True, True, True, True])
downstream_results_full_sort


Unnamed: 0,Imputation_Method,Task,Missing Type,Missing Fraction,Column,result_type,metric,Baseline,Corrupted,Imputed_Subset
190,Discriminative DL,6,MAR,0.01,x-box,downstream_performance_mean,F1_macro,0.637297,0.0,0.637839
1006,Mean/Mode,6,MAR,0.01,x-box,downstream_performance_mean,F1_macro,0.638177,0.0,0.639284
394,Random Forest,6,MAR,0.01,x-box,downstream_performance_mean,F1_macro,0.640391,0.0,0.640391
598,GAIN,6,MAR,0.01,x-box,downstream_performance_mean,F1_macro,0.640532,0.0,0.640532
1210,VAE,6,MAR,0.01,x-box,downstream_performance_mean,F1_macro,0.640228,0.0,0.642368
...,...,...,...,...,...,...,...,...,...,...
177,Discriminative DL,41671,MNAR,0.50,a9,downstream_performance_mean,F1_macro,0.206306,0.0,0.204723
993,Mean/Mode,41671,MNAR,0.50,a9,downstream_performance_mean,F1_macro,0.212220,0.0,0.212846
789,KNN,41671,MNAR,0.50,a9,downstream_performance_mean,F1_macro,0.237500,0.0,0.236748
585,GAIN,41671,MNAR,0.50,a9,downstream_performance_mean,F1_macro,0.257648,0.0,0.254694


In [117]:
#downstream_results_full_sort.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1222 entries, 190 to 1197
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype   
---  ------             --------------  -----   
 0   Imputation_Method  1222 non-null   category
 1   Task               1222 non-null   int64   
 2   Missing Type       1222 non-null   object  
 3   Missing Fraction   1222 non-null   float64 
 4   Column             1222 non-null   object  
 5   result_type        1222 non-null   object  
 6   metric             1222 non-null   object  
 7   Baseline           1222 non-null   float64 
 8   Corrupted          1222 non-null   float64 
 9   Imputed_Subset     1222 non-null   float64 
dtypes: category(1), float64(4), int64(1), object(4)
memory usage: 96.9+ KB


In [118]:
# add dataset information from other csv file

dataset_info = pd.read_csv('../datasets_information_overview.csv')
dataset_info = dataset_info.rename(columns={"did": "Task"})

downstream_results_full_sort = pd.merge(downstream_results_full_sort, dataset_info, on='Task')
downstream_results_full_sort.head()

Unnamed: 0.1,Imputation_Method,Task,Missing Type,Missing Fraction,Column,result_type,metric,Baseline,Corrupted,Imputed_Subset,Unnamed: 0,name,MajorityClassSize,MinorityClassSize,NumberOfFeatures,NumberOfInstances,NumberOfNumericFeatures,NumberOfCategoricalFeatures,NumberOfClasses
0,Discriminative DL,6,MAR,0.01,x-box,downstream_performance_mean,F1_macro,0.637297,0.0,0.637839,59,letter,813.0,734.0,17.0,20000.0,16.0,1.0,26.0
1,Mean/Mode,6,MAR,0.01,x-box,downstream_performance_mean,F1_macro,0.638177,0.0,0.639284,59,letter,813.0,734.0,17.0,20000.0,16.0,1.0,26.0
2,Random Forest,6,MAR,0.01,x-box,downstream_performance_mean,F1_macro,0.640391,0.0,0.640391,59,letter,813.0,734.0,17.0,20000.0,16.0,1.0,26.0
3,GAIN,6,MAR,0.01,x-box,downstream_performance_mean,F1_macro,0.640532,0.0,0.640532,59,letter,813.0,734.0,17.0,20000.0,16.0,1.0,26.0
4,VAE,6,MAR,0.01,x-box,downstream_performance_mean,F1_macro,0.640228,0.0,0.642368,59,letter,813.0,734.0,17.0,20000.0,16.0,1.0,26.0


In [119]:
# Ranking of downstream performance per data constellation for every imputation method

EXPERIMENTAL_CONDITIONS = ["Task", "Missing Type", "Missing Fraction", "Column", "result_type"]

downstream_results_rank = downstream_results_full_sort.copy()
downstream_results_rank["Downstream Performance Rank Subset"] = downstream_results_rank.groupby(EXPERIMENTAL_CONDITIONS).rank(ascending=False, na_option="bottom", method="first")["Imputed_Subset"]

# create csv for detailled checks
downstream_results_rank.to_csv('downstream_results_multi_complete_overview.csv')
downstream_results_rank.head()


Unnamed: 0.1,Imputation_Method,Task,Missing Type,Missing Fraction,Column,result_type,metric,Baseline,Corrupted,Imputed_Subset,Unnamed: 0,name,MajorityClassSize,MinorityClassSize,NumberOfFeatures,NumberOfInstances,NumberOfNumericFeatures,NumberOfCategoricalFeatures,NumberOfClasses,Downstream Performance Rank Subset
0,Discriminative DL,6,MAR,0.01,x-box,downstream_performance_mean,F1_macro,0.637297,0.0,0.637839,59,letter,813.0,734.0,17.0,20000.0,16.0,1.0,26.0,6.0
1,Mean/Mode,6,MAR,0.01,x-box,downstream_performance_mean,F1_macro,0.638177,0.0,0.639284,59,letter,813.0,734.0,17.0,20000.0,16.0,1.0,26.0,5.0
2,Random Forest,6,MAR,0.01,x-box,downstream_performance_mean,F1_macro,0.640391,0.0,0.640391,59,letter,813.0,734.0,17.0,20000.0,16.0,1.0,26.0,4.0
3,GAIN,6,MAR,0.01,x-box,downstream_performance_mean,F1_macro,0.640532,0.0,0.640532,59,letter,813.0,734.0,17.0,20000.0,16.0,1.0,26.0,3.0
4,VAE,6,MAR,0.01,x-box,downstream_performance_mean,F1_macro,0.640228,0.0,0.642368,59,letter,813.0,734.0,17.0,20000.0,16.0,1.0,26.0,2.0


In [120]:
# Adjust column type for Imputation_Method
downstream_results_rank['Imputation_Method'] = downstream_results_rank['Imputation_Method'].astype('object')

#downstream_results_rank.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1222 entries, 0 to 1221
Data columns (total 20 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   Imputation_Method                   1222 non-null   object 
 1   Task                                1222 non-null   int64  
 2   Missing Type                        1222 non-null   object 
 3   Missing Fraction                    1222 non-null   float64
 4   Column                              1222 non-null   object 
 5   result_type                         1222 non-null   object 
 6   metric                              1222 non-null   object 
 7   Baseline                            1222 non-null   float64
 8   Corrupted                           1222 non-null   float64
 9   Imputed_Subset                      1222 non-null   float64
 10  Unnamed: 0                          1222 non-null   int64  
 11  name                                1222 no

In [121]:
# Merge the two columns "Missing Type" and "Missing Fraction"

downstream_results_rank['Missing Type'] = downstream_results_rank['Missing Type'].astype(str)
downstream_results_rank['Missing Fraction'] = downstream_results_rank['Missing Fraction'].astype(str)
#datatype_new = downstream_results_rank.dtypes

downstream_results_rank['Data_Constellation'] = downstream_results_rank['Missing Type'] + ' - ' + downstream_results_rank['Missing Fraction']
#downstream_results_rank.to_csv('downstream_results_rank_temp.csv')
downstream_results_rank_heatmap2 = downstream_results_rank.copy()
downstream_results_rank.head()


Unnamed: 0,Imputation_Method,Task,Missing Type,Missing Fraction,Column,result_type,metric,Baseline,Corrupted,Imputed_Subset,...,name,MajorityClassSize,MinorityClassSize,NumberOfFeatures,NumberOfInstances,NumberOfNumericFeatures,NumberOfCategoricalFeatures,NumberOfClasses,Downstream Performance Rank Subset,Data_Constellation
0,Discriminative DL,6,MAR,0.01,x-box,downstream_performance_mean,F1_macro,0.637297,0.0,0.637839,...,letter,813.0,734.0,17.0,20000.0,16.0,1.0,26.0,6.0,MAR - 0.01
1,Mean/Mode,6,MAR,0.01,x-box,downstream_performance_mean,F1_macro,0.638177,0.0,0.639284,...,letter,813.0,734.0,17.0,20000.0,16.0,1.0,26.0,5.0,MAR - 0.01
2,Random Forest,6,MAR,0.01,x-box,downstream_performance_mean,F1_macro,0.640391,0.0,0.640391,...,letter,813.0,734.0,17.0,20000.0,16.0,1.0,26.0,4.0,MAR - 0.01
3,GAIN,6,MAR,0.01,x-box,downstream_performance_mean,F1_macro,0.640532,0.0,0.640532,...,letter,813.0,734.0,17.0,20000.0,16.0,1.0,26.0,3.0,MAR - 0.01
4,VAE,6,MAR,0.01,x-box,downstream_performance_mean,F1_macro,0.640228,0.0,0.642368,...,letter,813.0,734.0,17.0,20000.0,16.0,1.0,26.0,2.0,MAR - 0.01


## Analyzing Performance Based on Rank per Data Constellation

In [122]:
data = downstream_results_rank.copy()

# Count amount of different Data constellations in column "Data_Constellation"
dc_unique = data.Data_Constellation.unique().size
print(dc_unique, "Data Constellations")
print("_____________________")
# Count amount of 1.0 Ranking result in column "Downstream Performance Rank" 
rank_count = data['Downstream Performance Rank Subset'].value_counts()
print(rank_count)
print("_____________________")
# Filter for 1.0 Ranking -> Overview -> save as csv
rank_1 = data.loc[data['Downstream Performance Rank Subset'] == 1.0]
rank_1.to_csv('rank_1.csv')

print("_____________________")
# Count how often each Imputation Method is present -> most "wins"
rank_wins = rank_1['Imputation_Method'].value_counts()
print(rank_wins)
print("_____________________")

# BE AWARE THAT THE AVERAGE RANK DOES NOT CONSIDER MISSING RESULTS, WHICH RESULT IN THE WORST RANK BY DEFAULT
# Take initial overview and filter for each imputation method and calculate average rank and average improvement
methods = ['Random Forest', 'KNN', 'Mean/Mode', 'VAE', 'GAIN', 'Discriminative DL']
for i in methods:
    df_average_rank = data.loc[data['Imputation_Method'] == i]
    len_ar = len(df_average_rank)
    print(len_ar, "Amount of results available")
    rank_pos = df_average_rank['Downstream Performance Rank Subset'].value_counts().sort_index(ascending=True)
    print(rank_pos)
    average_rank = df_average_rank["Downstream Performance Rank Subset"].mean()
    print("Average Rank for", i, "is", average_rank)


12 Data Constellations
_____________________
5.0    204
4.0    204
3.0    204
2.0    204
1.0    204
6.0    202
Name: Downstream Performance Rank Subset, dtype: int64
_____________________
_____________________
KNN                  50
Mean/Mode            38
Random Forest        33
VAE                  32
GAIN                 27
Discriminative DL    24
Name: Imputation_Method, dtype: int64
_____________________
204 Amount of results available
1.0    33
2.0    46
3.0    32
4.0    37
5.0    29
6.0    27
Name: Downstream Performance Rank Subset, dtype: int64
Average Rank for Random Forest is 3.3137254901960786
_____________________
204 Amount of results available
1.0    50
2.0    33
3.0    40
4.0    31
5.0    29
6.0    21
Name: Downstream Performance Rank Subset, dtype: int64
Average Rank for KNN is 3.093137254901961
_____________________
204 Amount of results available
1.0    38
2.0    40
3.0    36
4.0    33
5.0    28
6.0    29
Name: Downstream Performance Rank Subset, dtype: int64
Averag

In [123]:
rank_1_backup = rank_1.copy()
rank_1

Unnamed: 0,Imputation_Method,Task,Missing Type,Missing Fraction,Column,result_type,metric,Baseline,Corrupted,Imputed_Subset,...,name,MajorityClassSize,MinorityClassSize,NumberOfFeatures,NumberOfInstances,NumberOfNumericFeatures,NumberOfCategoricalFeatures,NumberOfClasses,Downstream Performance Rank Subset,Data_Constellation
5,KNN,6,MAR,0.01,x-box,downstream_performance_mean,F1_macro,0.645601,0.0,0.645601,...,letter,813.0,734.0,17.0,20000.0,16.0,1.0,26.0,1.0,MAR - 0.01
11,VAE,6,MAR,0.1,x-box,downstream_performance_mean,F1_macro,0.638195,0.0,0.640068,...,letter,813.0,734.0,17.0,20000.0,16.0,1.0,26.0,1.0,MAR - 0.1
17,VAE,6,MAR,0.3,x-box,downstream_performance_mean,F1_macro,0.641567,0.0,0.643576,...,letter,813.0,734.0,17.0,20000.0,16.0,1.0,26.0,1.0,MAR - 0.3
23,KNN,6,MAR,0.5,x-box,downstream_performance_mean,F1_macro,0.640545,0.0,0.645112,...,letter,813.0,734.0,17.0,20000.0,16.0,1.0,26.0,1.0,MAR - 0.5
29,VAE,6,MCAR,0.01,x-box,downstream_performance_mean,F1_macro,0.643709,0.0,0.643386,...,letter,813.0,734.0,17.0,20000.0,16.0,1.0,26.0,1.0,MCAR - 0.01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1197,KNN,41671,MCAR,0.5,a9,downstream_performance_mean,F1_macro,0.234976,0.0,0.238633,...,microaggregation2,11162.0,743.0,21.0,20000.0,20.0,1.0,5.0,1.0,MCAR - 0.5
1203,KNN,41671,MNAR,0.01,a9,downstream_performance_mean,F1_macro,0.253214,0.0,0.253214,...,microaggregation2,11162.0,743.0,21.0,20000.0,20.0,1.0,5.0,1.0,MNAR - 0.01
1209,VAE,41671,MNAR,0.1,a9,downstream_performance_mean,F1_macro,0.251410,0.0,0.248822,...,microaggregation2,11162.0,743.0,21.0,20000.0,20.0,1.0,5.0,1.0,MNAR - 0.1
1215,GAIN,41671,MNAR,0.3,a9,downstream_performance_mean,F1_macro,0.235808,0.0,0.234488,...,microaggregation2,11162.0,743.0,21.0,20000.0,20.0,1.0,5.0,1.0,MNAR - 0.3


## Set Average Best Imputation Method Manually

In [124]:
# SET AVERAGE BEST IMPUTATION METHOD HERE, BASED ON THE PREVIOUS RESULTS
# Alternatively you can define a baseline method here, which will be used instead, depending on your analysis goals

AVERAGE_BEST_IMPUTATION_METHOD = "KNN"

## Differences in Performance Relative to Average Best Imputation Method

In [125]:
av_best = data.loc[data['Imputation_Method'] == AVERAGE_BEST_IMPUTATION_METHOD]
av_best['Task'] = av_best['Task'].astype(str)
av_best['Data_Constellation'] = av_best['Data_Constellation'] + ' - ' + av_best['Task']

av_best = av_best[['Imputation_Method', 'Imputed_Subset', 'Data_Constellation', 'Downstream Performance Rank Subset']]
av_best = av_best.rename(columns={'Imputation_Method':'Imputation_Method_average', 
                               'Imputed_Subset':'Imputed_average_Subset',
                                 'Downstream Performance Rank Subset':'Downstream Performance Rank Average Subset'})

rank_1['Task'] = rank_1['Task'].astype(str)
rank_1['Data_Constellation'] = rank_1['Data_Constellation'] + ' - ' + rank_1['Task']
rank_1 = rank_1[['Imputation_Method', 'Imputed_Subset', 'Data_Constellation', 'Downstream Performance Rank Subset']]
rank_1 = rank_1.rename(columns={'Imputation_Method':'Imputation_Method_best', 
                               'Imputed_Subset':'Imputed_best_Subset',
                               'Downstream Performance Rank Subset':'Downstream Performance Rank Best Subset'})

performance_difference = pd.merge(av_best, rank_1, on='Data_Constellation')
#performance_difference.head()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/

In [126]:
# Calculate the difference between the best imputation method for each data constellation to the average best imputation method in F1 score

performance_difference['Performance Difference Best to Average'] = performance_difference['Imputed_best_Subset'] - performance_difference['Imputed_average_Subset']
Average_Difference = performance_difference['Performance Difference Best to Average'].mean()
print("Average Difference in Improvement from best method to average best method for F1", Average_Difference)


Average Difference in Improvement from best method to average best method for F1 0.025231108452359773


In [127]:
# Improvement by Percentage

performance_difference['Performance Difference Best to Average in Percentage'] = ((performance_difference['Imputed_best_Subset'] - performance_difference['Imputed_average_Subset'])/performance_difference['Imputed_best_Subset'])*100
Average_Difference_per = performance_difference['Performance Difference Best to Average in Percentage'].mean()

print("Based on F1 Score the Average best method is worse than the best method by this percentage", Average_Difference_per)

Based on F1 Score the Average best method is worse than the best method by this percentage 6.083922289347637


In [128]:
#performance_difference.to_csv('performance_difference.csv')
#performance_difference

## Heatmap to Show Detailled Performance of Each Imputation Method for Each Data Constellation

In [133]:
df_heat = downstream_results_rank.copy()
df_heat.drop(["Missing Type", "Missing Fraction", "Column", "result_type", "metric", "Baseline", "Corrupted", "Unnamed: 0", "Unnamed: 0", "name", "NumberOfClasses", "MajorityClassSize", "MinorityClassSize"], axis=1, inplace=True)
df_heat

Unnamed: 0,Imputation_Method,Task,Imputed_Subset,NumberOfFeatures,NumberOfInstances,NumberOfNumericFeatures,NumberOfCategoricalFeatures,Downstream Performance Rank Subset,Data_Constellation
0,Discriminative DL,6,0.637839,17.0,20000.0,16.0,1.0,6.0,MAR - 0.01
1,Mean/Mode,6,0.639284,17.0,20000.0,16.0,1.0,5.0,MAR - 0.01
2,Random Forest,6,0.640391,17.0,20000.0,16.0,1.0,4.0,MAR - 0.01
3,GAIN,6,0.640532,17.0,20000.0,16.0,1.0,3.0,MAR - 0.01
4,VAE,6,0.642368,17.0,20000.0,16.0,1.0,2.0,MAR - 0.01
...,...,...,...,...,...,...,...,...,...
1217,Discriminative DL,41671,0.204723,21.0,20000.0,20.0,1.0,5.0,MNAR - 0.5
1218,Mean/Mode,41671,0.212846,21.0,20000.0,20.0,1.0,4.0,MNAR - 0.5
1219,KNN,41671,0.236748,21.0,20000.0,20.0,1.0,3.0,MNAR - 0.5
1220,GAIN,41671,0.254694,21.0,20000.0,20.0,1.0,2.0,MNAR - 0.5


In [134]:
# Heatmap for total F1 score for each data constellation for each method

df_heat = df_heat.astype({"Task":"string"})

data_constellations = ['MAR - 0.01', 'MAR - 0.1', 'MAR - 0.3', 'MCAR - 0.5', 'MCAR - 0.01', 'MCAR - 0.1', 'MCAR - 0.3', 'MCAR - 0.5', 'MNAR - 0.01', 'MNAR - 0.1', 'MNAR - 0.3', 'MNAR - 0.5']


for i in data_constellations:
    data_constel = df_heat.loc[df_heat['Data_Constellation'] == i]

    ### uncomment whatever you want to investigate

    ## sort by amount datapoints (ascending)
    data_constel = data_constel.sort_values(by=['NumberOfInstances'])

    ## sort by amount of features (ascending)
    #data_constel = data_constel.sort_values(by=['NumberOfFeatures'])

    ## sort by amount of datapoints and features (ascending)
    #data_constel = data_constel.sort_values(by=['NumberOfInstances', 'NumberOfFeatures'])

    ## sort by amount of categorical features and datapoints (ascending)
    #data_constel = data_constel.sort_values(by=['NumberOfCategoricalFeatures', 'NumberOfInstances'])

    ## sort by amount of numerical features and datapoints (ascending)
    #data_constel = data_constel.sort_values(by=['NumberOfNumericFeatures', 'NumberOfInstances'])
    
    Dataset_number = data_constel["Task"]
    Imputation_Method = data_constel["Imputation_Method"]
    F1_Score = data_constel["Imputed_Subset"]
    

    trace = go.Heatmap(
                   z=F1_Score,
                   x=Dataset_number,
                   y=Imputation_Method,
                   type = 'heatmap',
                    autocolorscale= False,
                    colorscale = 'Reds',
                    zmin=0,
                    )
    data = [trace]
    fig = go.Figure(data=data)
    fig.update_layout(
        title=i,
        xaxis_nticks=36)
    fig.show()



In [135]:
#downstream_results_rank_heatmap2
df_heat_dif = downstream_results_rank_heatmap2.copy()


In [136]:
# Calculate Difference for every Imputation towards average best Imputation Method per Data Constellation
# Calculation for F1 Score Differences (not Percentage)

data = downstream_results_rank.copy()
data['Task'] = data['Task'].astype(str)
data['Data_Constellation_full'] = data['Data_Constellation'] + ' - ' + data['Task']

dc_unique = data.Data_Constellation_full.unique()
#print(dc_unique)

data_constellations = dc_unique.tolist()
methods = ['Random Forest', 'KNN', 'Mean/Mode', 'VAE', 'GAIN', 'Discriminative DL']
heatmap_data_difference = pd.DataFrame()


for i in data_constellations:
    data_constel = data.loc[data['Data_Constellation_full'] == i]
    average_best = data_constel.loc[data_constel['Imputation_Method'] == AVERAGE_BEST_IMPUTATION_METHOD]
    dataset_number = best_score.iloc[0]['Task']
    for i in methods:
        if ((data_constel['Imputation_Method'] == i).any()):
            current_score_row = data_constel.loc[data['Imputation_Method'] == i]
            current_score_int = current_score_row.iloc[0]['Imputed_Subset']
            average_best_int = average_best.iloc[0]['Imputed_Subset']
            calc_result = (current_score_int - average_best_int)
            
            current_score_row['Performance Difference to Average Best'] = calc_result
            heatmap_data_difference = heatmap_data_difference.append(current_score_row)  
        else:
            print("Imputation Method not here ---------------------")

heatmap_data_difference

heatmap_data_difference['Missing Type'] = heatmap_data_difference['Missing Type'].astype(str)
heatmap_data_difference['Missing Fraction'] = heatmap_data_difference['Missing Fraction'].astype(str)




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing

Imputation Method not here ---------------------




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing

Imputation Method not here ---------------------




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing

In [137]:
# Heatmap for F1 score differences for each data constellation for each method relative to average best imputation method

heatmap_data_difference = heatmap_data_difference.astype({"Task":"string"})
data_constellations = ['MAR - 0.01', 'MAR - 0.1', 'MAR - 0.3', 'MAR - 0.5', 'MCAR - 0.01', 'MCAR - 0.1', 'MCAR - 0.3', 'MCAR - 0.5', 'MNAR - 0.01', 'MNAR - 0.1', 'MNAR - 0.3', 'MNAR - 0.5']

for i in data_constellations:
    data_constel = heatmap_data_difference.loc[df_heat['Data_Constellation'] == i]

    ### uncomment whatever you want to investigate

    ## sort by amount datapoints (ascending)
    data_constel = data_constel.sort_values(by=['NumberOfInstances'])

    ## sort by amount of features (ascending)
    #data_constel = data_constel.sort_values(by=['NumberOfFeatures'])

    ## sort by amount of datapoints and features (ascending)
    #data_constel = data_constel.sort_values(by=['NumberOfInstances', 'NumberOfFeatures'])

    ## sort by amount of categorical features and datapoints (ascending)
    #data_constel = data_constel.sort_values(by=['NumberOfCategoricalFeatures', 'NumberOfInstances'])

    ## sort by amount of numerical features and datapoints (ascending)
    #data_constel = data_constel.sort_values(by=['NumberOfNumericFeatures', 'NumberOfInstances'])
    
    Dataset_number = data_constel["Task"]
    Imputation_Method = data_constel["Imputation_Method"]
    Improvement = data_constel["Performance Difference to Average Best"]
    

    trace = go.Heatmap(
                   z=Improvement,
                   x=Dataset_number,
                   y=Imputation_Method,
                   type = 'heatmap',
                    autocolorscale= False,
                    colorscale = 'RdBu_r',
                    zmid=0,
                    zmin=(-0.14),
                    zmax=0.14,
                    )
    data = [trace]
    fig = go.Figure(data=data)
    fig.update_layout(
        title=i,
        xaxis_nticks=36)
    fig.show()
    fig.write_image("multi_subset_heatmap_f1_score_improvement_to_avbest%s.pdf" %i)

In [138]:
#heatmap_data_difference.agg(['min', 'max'])

Unnamed: 0,Imputation_Method,Task,Missing Type,Missing Fraction,Column,result_type,metric,Baseline,Corrupted,Imputed_Subset,...,MinorityClassSize,NumberOfFeatures,NumberOfInstances,NumberOfNumericFeatures,NumberOfCategoricalFeatures,NumberOfClasses,Downstream Performance Rank Subset,Data_Constellation,Data_Constellation_full,Performance Difference to Average Best
min,Discriminative DL,1459,MAR,0.01,A2,downstream_performance_mean,F1_macro,0.046267,0.0,0.045987,...,1.0,5.0,3200.0,0.0,1.0,3.0,1.0,MAR - 0.01,MAR - 0.01 - 1459,-0.284693
max,VAE,6,MNAR,0.5,x-box,downstream_performance_mean,F1_macro,1.0,0.0,1.0,...,4335.0,25.0,58000.0,21.0,25.0,102.0,6.0,MNAR - 0.5,MNAR - 0.5 - 6,0.249897


In [139]:
heatmap_data_difference
heatmap_data_difference.to_csv('multi_subset_full_info.csv', index=False)

## Improvment Proportions for All Data Constellations and Methods Relative to Average Best Method

In [140]:
# data preprocessing here
df_quantiles = heatmap_data_difference.copy()
df_quantiles = df_quantiles.drop(df_quantiles[df_quantiles["Imputation_Method"] == AVERAGE_BEST_IMPUTATION_METHOD].index)

df_10 = df_quantiles.drop(df_quantiles[(df_quantiles["Performance Difference to Average Best"] > (-0.09))].index)
df_09 = df_quantiles.drop(df_quantiles[(df_quantiles["Performance Difference to Average Best"] <= (-0.09)) | (df_quantiles["Performance Difference to Average Best"] > (-0.07))].index)
df_07 = df_quantiles.drop(df_quantiles[(df_quantiles["Performance Difference to Average Best"] <= (-0.07)) | (df_quantiles["Performance Difference to Average Best"] > (-0.05))].index)
df_05 = df_quantiles.drop(df_quantiles[(df_quantiles["Performance Difference to Average Best"] <= (-0.05)) | (df_quantiles["Performance Difference to Average Best"] > (-0.03))].index)
df_03 = df_quantiles.drop(df_quantiles[(df_quantiles["Performance Difference to Average Best"] <= (-0.03)) | (df_quantiles["Performance Difference to Average Best"] > (-0.01))].index)
df_01 = df_quantiles.drop(df_quantiles[(df_quantiles["Performance Difference to Average Best"] <= (-0.01)) | (df_quantiles["Performance Difference to Average Best"] > (0.01))].index)
df01 = df_quantiles.drop(df_quantiles[(df_quantiles["Performance Difference to Average Best"] <= (0.01)) | (df_quantiles["Performance Difference to Average Best"] > (0.03))].index)
df03 = df_quantiles.drop(df_quantiles[(df_quantiles["Performance Difference to Average Best"] <= (0.03)) | (df_quantiles["Performance Difference to Average Best"] > (0.05))].index)
df05 = df_quantiles.drop(df_quantiles[(df_quantiles["Performance Difference to Average Best"] <= (0.05)) | (df_quantiles["Performance Difference to Average Best"] > (0.07))].index)
df07 = df_quantiles.drop(df_quantiles[(df_quantiles["Performance Difference to Average Best"] <= (0.07)) | (df_quantiles["Performance Difference to Average Best"] > (0.09))].index)
df09 = df_quantiles.drop(df_quantiles[(df_quantiles["Performance Difference to Average Best"] <= (0.09))].index)

#df_quantiles
#df_quantiles.dtypes

In [141]:
len_df_10 = len(df_10.index)
len_df_09 = len(df_09.index)
len_df_07 = len(df_07.index)
len_df_05 = len(df_05.index)
len_df_03 = len(df_03.index)
len_df_01 = len(df_01.index)
len_df01 = len(df01.index)
len_df03 = len(df03.index)
len_df05 = len(df05.index)
len_df07 = len(df07.index)
len_df09 = len(df09.index)

quantile_freq = []
#quantile_freq.append()
quantile_freq.extend((len_df_10, len_df_09, len_df_07, len_df_05, len_df_03, len_df_01, len_df01, len_df03, len_df05, len_df07, len_df09))
print(quantile_freq)


quantiles = []
quantiles.extend(['less than -0.09', '-0.09 to -0.07', '-0.07 to -0.05', '-0.05 to -0.03','-0.03 to -0.01', '-0.01 to 0.01', '0.01 to 0.03', '0.03 to 0.05', '0.05 to 0.07', '0.07 to 0.09', 'more than 0.09'])
print(quantiles)

improvement_quantiles = pd.DataFrame(
    {'Improvement to Average Best': quantiles,
     'Amount': quantile_freq,
    })


[63, 26, 41, 71, 141, 432, 112, 56, 39, 13, 24]
['less than -0.09', '-0.09 to -0.07', '-0.07 to -0.05', '-0.05 to -0.03', '-0.03 to -0.01', '-0.01 to 0.01', '0.01 to 0.03', '0.03 to 0.05', '0.05 to 0.07', '0.07 to 0.09', 'more than 0.09']


In [142]:
fig = px.bar(improvement_quantiles, x='Improvement to Average Best', y='Amount')
fig.show()
fig.write_image("sub_improv_rel_to_av_all_DC_no_av_incl.pdf")

In [143]:
# split barchart stacks into methods
#quantile_freq
quantile_datasets = [df_10, df_09, df_07, df_05, df_03, df_01, df01, df03, df05, df07, df09]

methods = ['Random Forest', 'KNN', 'Mean/Mode', 'VAE', 'GAIN', 'Discriminative DL']
methods.remove(AVERAGE_BEST_IMPUTATION_METHOD)
print(methods)

forest_freq = []
knn_freq = []
mode_freq = []
dl_freq = []
vae_freq = []
gain_freq = []
#print(quantile_datasets)

for i in methods:
    for j in quantile_datasets:
        df_temp = j.copy()
        df_temp = df_temp[df_temp['Imputation_Method'].str.contains(i)]
        df_temp_len = len(df_temp.index)
        if (i == 'Random Forest'):
            forest_freq.append(df_temp_len)
        elif (i == 'KNN'):
            knn_freq.append(df_temp_len)                                       
        elif (i == 'Mean/Mode'):
            mode_freq.append(df_temp_len)                                                 
        elif (i == 'Discriminative DL'):
            dl_freq.append(df_temp_len)                                       
        elif (i == 'VAE'):
            vae_freq.append(df_temp_len)                                         
        elif (i == 'GAIN'):
            gain_freq.append(df_temp_len)                                          
                                       
print(forest_freq)
print(knn_freq)
print(mode_freq)
print(dl_freq)
print(vae_freq)
print(gain_freq)

['Random Forest', 'Mean/Mode', 'VAE', 'GAIN', 'Discriminative DL']
[7, 6, 8, 16, 27, 90, 18, 15, 6, 5, 6]
[]
[11, 4, 7, 13, 29, 90, 21, 11, 11, 1, 6]
[10, 4, 9, 14, 26, 99, 21, 9, 6, 1, 3]
[20, 9, 8, 15, 30, 71, 28, 10, 5, 4, 4]
[15, 3, 9, 13, 29, 82, 24, 11, 11, 2, 5]


In [144]:
quantiles = ['less than -0.09', '-0.09 to -0.07', '-0.07 to -0.05', '-0.05 to -0.03','-0.03 to -0.01', '-0.01 to 0.01', '0.01 to 0.03', '0.03 to 0.05', '0.05 to 0.07', '0.07 to 0.09', 'more than 0.09']

fig = go.Figure(data=[
    go.Bar(name='Random Forest', x=quantiles, y=forest_freq),
    go.Bar(name='KNN', x=quantiles, y=knn_freq),
    go.Bar(name='Mean/Mode', x=quantiles, y=mode_freq),
    go.Bar(name='Discriminative DL', x=quantiles, y=dl_freq),
    go.Bar(name='VAE', x=quantiles, y=vae_freq),
    go.Bar(name='GAIN', x=quantiles, y=gain_freq)
])
# Change the bar mode
fig.update_layout(barmode='stack')
fig.show()
fig.write_image("sub_improv_rel_to_av_all_DC_no_av_incl_per_method.pdf")

In [145]:
# split barchart stacks into methods

quantile_datasets = [df_10, df_09, df_07, df_05, df_03, df_01, df01, df03, df05, df07, df09]

fractions = ['0.01', '0.1', '0.3', '0.5']
#print(fractions)

freq_001 = []
freq_01 = []
freq_03 = []
freq_05 = []
#print(quantile_datasets)

for i in fractions:
    for j in quantile_datasets:
        df_temp = j.copy()
        df_temp = df_temp[df_temp['Missing Fraction'].str.contains(i)]
        df_temp_len = len(df_temp.index)
        if (i == '0.01'):
            freq_001.append(df_temp_len)
        elif (i == '0.1'):
            freq_01.append(df_temp_len)                                       
        elif (i == '0.3'):
            freq_03.append(df_temp_len)                                                 
        elif (i == '0.5'):
            freq_05.append(df_temp_len)                                       
                                        
                                       
print(freq_001)
print(freq_01)
print(freq_03)
print(freq_05)

['0.01', '0.1', '0.3', '0.5']
      Imputation_Method   Task Missing Type Missing Fraction  \
102                GAIN     26         MCAR              0.1   
114                 VAE     26         MCAR              0.5   
132       Random Forest     26         MNAR              0.3   
134           Mean/Mode     26         MNAR              0.3   
133   Discriminative DL     26         MNAR              0.3   
...                 ...    ...          ...              ...   
1073  Discriminative DL  40685         MNAR              0.5   
1085      Random Forest  41027          MAR              0.1   
1084                VAE  41027          MAR              0.1   
1186                VAE  41671         MCAR              0.3   
1187  Discriminative DL  41671         MCAR              0.3   

                     Column                  result_type    metric  Baseline  \
102                 parents  downstream_performance_mean  F1_macro  0.702857   
114                 parents  downstream_p

In [146]:
quantiles = ['less than -0.09', '-0.09 to -0.07', '-0.07 to -0.05', '-0.05 to -0.03','-0.03 to -0.01', '-0.01 to 0.01', '0.01 to 0.03', '0.03 to 0.05', '0.05 to 0.07', '0.07 to 0.09', 'more than 0.09']


fig = go.Figure(data=[
    go.Bar(name='1% Missing Data', x=quantiles, y=freq_001, marker_color='#FD3216'),
    go.Bar(name='10% Missing Data', x=quantiles, y=freq_01, marker_color='#00FE35'),
    go.Bar(name='30% Missing Data', x=quantiles, y=freq_03, marker_color='#511CFB'),
    go.Bar(name='50% Missing Data', x=quantiles, y=freq_05, marker_color='#FF7F0E'),
])
# Change the bar mode
fig.update_layout(barmode='stack')
fig.show()
fig.write_image("sub_improv_rel_to_av_all_DC_no_av_incl_frac.pdf")

In [147]:
# split barchart stacks into methods

quantile_datasets = [df_10, df_09, df_07, df_05, df_03, df_01, df01, df03, df05, df07, df09]

fractions = ['MCAR', 'MAR', 'MNAR']
print(fractions)
#print(df_10)

freq_001 = []
freq_01 = []
freq_03 = []
#print(quantile_datasets)

for i in fractions:
    for j in quantile_datasets:
        df_temp = j.copy()
        df_temp = df_temp[df_temp['Missing Type'].str.contains(i)]
        df_temp_len = len(df_temp.index)
        if (i == 'MCAR'):
            freq_001.append(df_temp_len)
        elif (i == 'MAR'):
            freq_01.append(df_temp_len)                                       
        elif (i == 'MNAR'):
            freq_03.append(df_temp_len)                                                 
                                        
                                       
print(freq_001)
print(freq_01)
print(freq_03)

['MCAR', 'MAR', 'MNAR']
[16, 7, 10, 22, 43, 159, 43, 17, 11, 1, 10]
[19, 7, 16, 32, 45, 134, 33, 24, 15, 6, 8]
[28, 12, 15, 17, 53, 139, 36, 15, 13, 6, 6]


In [148]:
quantiles = ['less than -0.09', '-0.09 to -0.07', '-0.07 to -0.05', '-0.05 to -0.03','-0.03 to -0.01', '-0.01 to 0.01', '0.01 to 0.03', '0.03 to 0.05', '0.05 to 0.07', '0.07 to 0.09', 'more than 0.09']


fig = go.Figure(data=[
    go.Bar(name='MCAR', x=quantiles, y=freq_001, marker_color='#222A2A'),
    go.Bar(name='MAR', x=quantiles, y=freq_01, marker_color='#B68100'),
    go.Bar(name='MNAR', x=quantiles, y=freq_03, marker_color='#750D86'),
])
# Change the bar mode
fig.update_layout(barmode='stack')
fig.show()
fig.write_image("sub_improv_rel_to_av_all_DC_no_av_incl_per_patt.pdf")

## Improvment Proportions for the Best Imputation Method per Data Constellation Relative to Average Best Method

In [149]:
improv_to_av_bar = heatmap_data_difference.copy()

improv_to_av_bar = improv_to_av_bar.drop(improv_to_av_bar[improv_to_av_bar["Downstream Performance Rank Subset"] != 1.0].index)

df_01 = improv_to_av_bar.drop(improv_to_av_bar[(improv_to_av_bar["Performance Difference to Average Best"] <= (-0.01)) | (improv_to_av_bar["Performance Difference to Average Best"] > (0.01))].index)
df01 = improv_to_av_bar.drop(improv_to_av_bar[(improv_to_av_bar["Performance Difference to Average Best"] <= (0.01)) | (improv_to_av_bar["Performance Difference to Average Best"] > (0.03))].index)
df03 = improv_to_av_bar.drop(improv_to_av_bar[(improv_to_av_bar["Performance Difference to Average Best"] <= (0.03)) | (improv_to_av_bar["Performance Difference to Average Best"] > (0.05))].index)
df05 = improv_to_av_bar.drop(improv_to_av_bar[(improv_to_av_bar["Performance Difference to Average Best"] <= (0.05)) | (improv_to_av_bar["Performance Difference to Average Best"] > (0.07))].index)
df07 = improv_to_av_bar.drop(improv_to_av_bar[(improv_to_av_bar["Performance Difference to Average Best"] <= (0.07)) | (improv_to_av_bar["Performance Difference to Average Best"] > (0.09))].index)
df09 = improv_to_av_bar.drop(improv_to_av_bar[(improv_to_av_bar["Performance Difference to Average Best"] <= (0.09))].index)

improv_to_av_bar

Unnamed: 0,Imputation_Method,Task,Missing Type,Missing Fraction,Column,result_type,metric,Baseline,Corrupted,Imputed_Subset,...,MinorityClassSize,NumberOfFeatures,NumberOfInstances,NumberOfNumericFeatures,NumberOfCategoricalFeatures,NumberOfClasses,Downstream Performance Rank Subset,Data_Constellation,Data_Constellation_full,Performance Difference to Average Best
5,KNN,6,MAR,0.01,x-box,downstream_performance_mean,F1_macro,0.645601,0.0,0.645601,...,734.0,17.0,20000.0,16.0,1.0,26.0,1.0,MAR - 0.01,MAR - 0.01 - 6,0.000000
11,VAE,6,MAR,0.1,x-box,downstream_performance_mean,F1_macro,0.638195,0.0,0.640068,...,734.0,17.0,20000.0,16.0,1.0,26.0,1.0,MAR - 0.1,MAR - 0.1 - 6,0.010871
17,VAE,6,MAR,0.3,x-box,downstream_performance_mean,F1_macro,0.641567,0.0,0.643576,...,734.0,17.0,20000.0,16.0,1.0,26.0,1.0,MAR - 0.3,MAR - 0.3 - 6,0.014748
23,KNN,6,MAR,0.5,x-box,downstream_performance_mean,F1_macro,0.640545,0.0,0.645112,...,734.0,17.0,20000.0,16.0,1.0,26.0,1.0,MAR - 0.5,MAR - 0.5 - 6,0.000000
29,VAE,6,MCAR,0.01,x-box,downstream_performance_mean,F1_macro,0.643709,0.0,0.643386,...,734.0,17.0,20000.0,16.0,1.0,26.0,1.0,MCAR - 0.01,MCAR - 0.01 - 6,0.005612
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1197,KNN,41671,MCAR,0.5,a9,downstream_performance_mean,F1_macro,0.234976,0.0,0.238633,...,743.0,21.0,20000.0,20.0,1.0,5.0,1.0,MCAR - 0.5,MCAR - 0.5 - 41671,0.000000
1203,KNN,41671,MNAR,0.01,a9,downstream_performance_mean,F1_macro,0.253214,0.0,0.253214,...,743.0,21.0,20000.0,20.0,1.0,5.0,1.0,MNAR - 0.01,MNAR - 0.01 - 41671,0.000000
1209,VAE,41671,MNAR,0.1,a9,downstream_performance_mean,F1_macro,0.251410,0.0,0.248822,...,743.0,21.0,20000.0,20.0,1.0,5.0,1.0,MNAR - 0.1,MNAR - 0.1 - 41671,0.032247
1215,GAIN,41671,MNAR,0.3,a9,downstream_performance_mean,F1_macro,0.235808,0.0,0.234488,...,743.0,21.0,20000.0,20.0,1.0,5.0,1.0,MNAR - 0.3,MNAR - 0.3 - 41671,0.001715


In [150]:
len_df_01 = len(df_01.index)
len_df01 = len(df01.index)
len_df03 = len(df03.index)
len_df05 = len(df05.index)
len_df07 = len(df07.index)
len_df09 = len(df09.index)

quantile_freq = []
quantile_freq.extend((len_df_01, len_df01, len_df03, len_df05, len_df07, len_df09))
print(quantile_freq)


quantiles = []
quantiles.extend(['less than 0.01', '0.01 to 0.03', '0.03 to 0.05', '0.05 to 0.07', '0.07 to 0.09', 'more than 0.09'])
print(quantiles)

improvement_quantiles = pd.DataFrame(
    {'Improvement to Average Best': quantiles,
     'Amount': quantile_freq,
    })

fig = px.bar(improvement_quantiles, x='Improvement to Average Best', y='Amount')
fig.show()
fig.write_image("sub_improv_rel_to_av_all_DC_no_av_incl_only_best.pdf")

[103, 42, 24, 18, 6, 11]
['less than 0.01', '0.01 to 0.03', '0.03 to 0.05', '0.05 to 0.07', '0.07 to 0.09', 'more than 0.09']


In [151]:
# split barchart stacks into methods

quantile_datasets = [df_01, df01, df03, df05, df07, df09]

methods = ['Random Forest', 'KNN', 'Mean/Mode', 'VAE', 'GAIN', 'Discriminative DL']
methods.remove(AVERAGE_BEST_IMPUTATION_METHOD)
print(methods)

forest_freq = []
knn_freq = []
mode_freq = []
dl_freq = []
vae_freq = []
gain_freq = []
#print(quantile_datasets)

for i in methods:
    for j in quantile_datasets:
        df_temp = j.copy()
        df_temp = df_temp[df_temp['Imputation_Method'].str.contains(i)]
        df_temp_len = len(df_temp.index)
        if (i == 'Random Forest'):
            forest_freq.append(df_temp_len)
        elif (i == 'KNN'):
            knn_freq.append(df_temp_len)                                       
        elif (i == 'Mean/Mode'):
            mode_freq.append(df_temp_len)                                                 
        elif (i == 'Discriminative DL'):
            dl_freq.append(df_temp_len)                                       
        elif (i == 'VAE'):
            vae_freq.append(df_temp_len)                                         
        elif (i == 'GAIN'):
            gain_freq.append(df_temp_len)                                          
                                       
print(forest_freq)
print(knn_freq)
print(mode_freq)
print(dl_freq)
print(vae_freq)
print(gain_freq)

['Random Forest', 'Mean/Mode', 'VAE', 'GAIN', 'Discriminative DL']
[12, 6, 6, 3, 3, 3]
[]
[14, 8, 5, 8, 1, 2]
[12, 7, 3, 2, 0, 0]
[9, 13, 5, 1, 1, 3]
[6, 8, 5, 4, 1, 3]


In [152]:
quantiles = ['less than 0.01', '0.01 to 0.03', '0.03 to 0.05', '0.05 to 0.07', '0.07 to 0.09', 'more than 0.09']


fig = go.Figure(data=[
    go.Bar(name='Random Forest', x=quantiles, y=forest_freq),
    go.Bar(name='KNN', x=quantiles, y=knn_freq),
    go.Bar(name='Mean/Mode', x=quantiles, y=mode_freq),
    go.Bar(name='Discriminative DL', x=quantiles, y=dl_freq),
    go.Bar(name='VAE', x=quantiles, y=vae_freq),
    go.Bar(name='GAIN', x=quantiles, y=gain_freq)
])
# Change the bar mode
fig.update_layout(barmode='stack')
fig.show()
fig.write_image("sub_improv_rel_to_av_all_DC_no_av_incl_only_best_per_method.pdf")

In [153]:
# split barchart stacks into missingness fractions

quantile_datasets = [df_01, df01, df03, df05, df07, df09]

fractions = ['0.01', '0.1', '0.3', '0.5']
print(fractions)


freq_001 = []
freq_01 = []
freq_03 = []
freq_05 = []
#print(quantile_datasets)

for i in fractions:
    for j in quantile_datasets:
        df_temp = j.copy()
        df_temp = df_temp[df_temp['Missing Fraction'].str.contains(i)]
        df_temp_len = len(df_temp.index)
        if (i == '0.01'):
            freq_001.append(df_temp_len)
        elif (i == '0.1'):
            freq_01.append(df_temp_len)                                       
        elif (i == '0.3'):
            freq_03.append(df_temp_len)                                                 
        elif (i == '0.5'):
            freq_05.append(df_temp_len)                                       
                                        
                                       
print(freq_001)
print(freq_01)
print(freq_03)
print(freq_05)

['0.01', '0.1', '0.3', '0.5']
[37, 5, 3, 2, 1, 3]
[25, 10, 8, 3, 1, 4]
[18, 14, 10, 5, 2, 2]
[23, 13, 3, 8, 2, 2]


In [154]:
quantiles = ['less than 0.01', '0.01 to 0.03', '0.03 to 0.05', '0.05 to 0.07', '0.07 to 0.09', 'more than 0.09']


fig = go.Figure(data=[
    go.Bar(name='1% Missing Data', x=quantiles, y=freq_001, marker_color='#FD3216'),
    go.Bar(name='10% Missing Data', x=quantiles, y=freq_01, marker_color='#00FE35'),
    go.Bar(name='30% Missing Data', x=quantiles, y=freq_03, marker_color='#511CFB'),
    go.Bar(name='50% Missing Data', x=quantiles, y=freq_05, marker_color='#FF7F0E'),
])
# Change the bar mode
fig.update_layout(barmode='stack')
fig.show()
fig.write_image("sub_improv_rel_to_av_all_DC_no_av_incl_only_best_per_frac.pdf")

In [155]:
# split barchart stacks into missingness fractions

quantile_datasets = [df_01, df01, df03, df05, df07, df09]

fractions = ['MCAR', 'MAR', 'MNAR']
print(fractions)


freq_001 = []
freq_01 = []
freq_03 = []
#print(quantile_datasets)

for i in fractions:
    for j in quantile_datasets:
        df_temp = j.copy()
        df_temp = df_temp[df_temp['Missing Type'].str.contains(i)]
        df_temp_len = len(df_temp.index)
        if (i == 'MCAR'):
            freq_001.append(df_temp_len)
        elif (i == 'MAR'):
            freq_01.append(df_temp_len)                                       
        elif (i == 'MNAR'):
            freq_03.append(df_temp_len)                                                                                     
                                        
                                       
print(freq_001)
print(freq_01)
print(freq_03)

['MCAR', 'MAR', 'MNAR']
[31, 17, 9, 5, 1, 5]
[36, 9, 10, 7, 3, 3]
[36, 16, 5, 6, 2, 3]


In [156]:
quantiles = ['less than 0.01', '0.01 to 0.03', '0.03 to 0.05', '0.05 to 0.07', '0.07 to 0.09', 'more than 0.09']


fig = go.Figure(data=[
    go.Bar(name='MCAR', x=quantiles, y=freq_001, marker_color='#222A2A'),
    go.Bar(name='MAR', x=quantiles, y=freq_01, marker_color='#B68100'),
    go.Bar(name='MNAR', x=quantiles, y=freq_03, marker_color='#750D86'),
])
# Change the bar mode
fig.update_layout(barmode='stack')
fig.show()
fig.write_image("sub_improv_rel_to_av_all_DC_no_av_incl_only_best_per_patt.pdf")

## Extract datasets for Automated Imputation Method Selection -> not used in this thesis

To Do: Explore the possibility, that the average best method replaces the best method for a data constellation, if the improvement gain for the best method is below 1%

### Potential Features:
Missingess Pattern (Missing Type)  
Missing Fraction (Missing Fraction)  
Datapoints (NumberOfInstances)  
Features in total (NumberOfFeatures)  
Numeric Features (NumberOfNumericFeatures)  
Categorical Features (NumberOfCategoricalFeatures)  
Downstream Task Type -> Classification/Regression (metric)
  
    
      
Label: Best Imputation Method (Imputation_Method)

In [157]:
# Use dataset with only the best method for each data constellation
rank_1_backup.to_csv('rank_1_backup.csv')
rank_1_backup

Unnamed: 0,Imputation_Method,Task,Missing Type,Missing Fraction,Column,result_type,metric,Baseline,Corrupted,Imputed_Subset,...,name,MajorityClassSize,MinorityClassSize,NumberOfFeatures,NumberOfInstances,NumberOfNumericFeatures,NumberOfCategoricalFeatures,NumberOfClasses,Downstream Performance Rank Subset,Data_Constellation
5,KNN,6,MAR,0.01,x-box,downstream_performance_mean,F1_macro,0.645601,0.0,0.645601,...,letter,813.0,734.0,17.0,20000.0,16.0,1.0,26.0,1.0,MAR - 0.01
11,VAE,6,MAR,0.1,x-box,downstream_performance_mean,F1_macro,0.638195,0.0,0.640068,...,letter,813.0,734.0,17.0,20000.0,16.0,1.0,26.0,1.0,MAR - 0.1
17,VAE,6,MAR,0.3,x-box,downstream_performance_mean,F1_macro,0.641567,0.0,0.643576,...,letter,813.0,734.0,17.0,20000.0,16.0,1.0,26.0,1.0,MAR - 0.3
23,KNN,6,MAR,0.5,x-box,downstream_performance_mean,F1_macro,0.640545,0.0,0.645112,...,letter,813.0,734.0,17.0,20000.0,16.0,1.0,26.0,1.0,MAR - 0.5
29,VAE,6,MCAR,0.01,x-box,downstream_performance_mean,F1_macro,0.643709,0.0,0.643386,...,letter,813.0,734.0,17.0,20000.0,16.0,1.0,26.0,1.0,MCAR - 0.01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1197,KNN,41671,MCAR,0.5,a9,downstream_performance_mean,F1_macro,0.234976,0.0,0.238633,...,microaggregation2,11162.0,743.0,21.0,20000.0,20.0,1.0,5.0,1.0,MCAR - 0.5
1203,KNN,41671,MNAR,0.01,a9,downstream_performance_mean,F1_macro,0.253214,0.0,0.253214,...,microaggregation2,11162.0,743.0,21.0,20000.0,20.0,1.0,5.0,1.0,MNAR - 0.01
1209,VAE,41671,MNAR,0.1,a9,downstream_performance_mean,F1_macro,0.251410,0.0,0.248822,...,microaggregation2,11162.0,743.0,21.0,20000.0,20.0,1.0,5.0,1.0,MNAR - 0.1
1215,GAIN,41671,MNAR,0.3,a9,downstream_performance_mean,F1_macro,0.235808,0.0,0.234488,...,microaggregation2,11162.0,743.0,21.0,20000.0,20.0,1.0,5.0,1.0,MNAR - 0.3


In [158]:
# Dataset for Training 
properties_train_dataset_8 = rank_1_backup.copy()
properties_train_dataset_8 = properties_train_dataset_8[['Imputation_Method','Missing Type','Missing Fraction',
                                                         'NumberOfInstances','NumberOfFeatures','NumberOfNumericFeatures',
                                                         'NumberOfCategoricalFeatures','metric']]

properties_train_dataset_8


Unnamed: 0,Imputation_Method,Missing Type,Missing Fraction,NumberOfInstances,NumberOfFeatures,NumberOfNumericFeatures,NumberOfCategoricalFeatures,metric
5,KNN,MAR,0.01,20000.0,17.0,16.0,1.0,F1_macro
11,VAE,MAR,0.1,20000.0,17.0,16.0,1.0,F1_macro
17,VAE,MAR,0.3,20000.0,17.0,16.0,1.0,F1_macro
23,KNN,MAR,0.5,20000.0,17.0,16.0,1.0,F1_macro
29,VAE,MCAR,0.01,20000.0,17.0,16.0,1.0,F1_macro
...,...,...,...,...,...,...,...,...
1197,KNN,MCAR,0.5,20000.0,21.0,20.0,1.0,F1_macro
1203,KNN,MNAR,0.01,20000.0,21.0,20.0,1.0,F1_macro
1209,VAE,MNAR,0.1,20000.0,21.0,20.0,1.0,F1_macro
1215,GAIN,MNAR,0.3,20000.0,21.0,20.0,1.0,F1_macro


In [159]:
# Dataset for Training 
properties_train_dataset_7 = rank_1_backup.copy()
properties_train_dataset_7 = properties_train_dataset_7[['Imputation_Method','Missing Type','Missing Fraction',
                                                         'NumberOfInstances','NumberOfFeatures','NumberOfNumericFeatures',
                                                         'NumberOfCategoricalFeatures']]

properties_train_dataset_7
properties_train_dataset_7.to_csv('multi_properties_train_dataset_7.csv', index=False)