# Visualize Results: Downstream Performance - Regression Corrupted Experiments

[Set Average Best Imputation Method Manually](#Set-Average-Best-Imputation-Method-Manually)

This notebook should answer the questions: *Does imputation lead to better downstream performances?*

Data needs to be preprocessed with other notebook, her we only import two csv files with raw data regarding the results of the experiment and information about the used datasets!


In [53]:
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import os
import pandas as pd
import re
import seaborn as sns
from pandas.api.types import CategoricalDtype
from pathlib import Path

import plotly as py
import plotly.express as px
import plotly.graph_objects as go
import xarray as xr


%matplotlib inline

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Settings

In [54]:
sns.set(style="whitegrid")
sns.set_context('paper', font_scale=1.5)
mpl.rcParams['lines.linewidth'] = '2'

In [55]:
CLF_METRIC = "Classification Tasks"
REG_METRIC = "Regression Tasks"

DOWNSTREAM_RESULT_TYPE = "downstream_performance_mean"
IMPUTE_RESULT_TYPE = "impute_performance_mean"


## Data Preparation

In [56]:
# import preprocessed data from experiments
results = pd.read_csv('../regression_corrupted.csv')
results

Unnamed: 0,experiment,imputer,task,missing_type,missing_fraction,strategy,column,result_type,metric,train,test,baseline,corrupted,imputed
0,corrupted_regression_experiment,AutoKerasImputer,1193,MAR,0.01,single_single,UI,impute_performance_std,F1_micro,0.018478,0.016129,,,
1,corrupted_regression_experiment,AutoKerasImputer,1193,MAR,0.01,single_single,UI,impute_performance_std,F1_macro,0.005412,0.104607,,,
2,corrupted_regression_experiment,AutoKerasImputer,1193,MAR,0.01,single_single,UI,impute_performance_std,F1_weighted,0.022187,0.039366,,,
3,corrupted_regression_experiment,AutoKerasImputer,1193,MAR,0.10,single_single,UI,impute_performance_std,F1_micro,0.005025,0.013673,,,
4,corrupted_regression_experiment,AutoKerasImputer,1193,MAR,0.10,single_single,UI,impute_performance_std,F1_macro,0.004450,0.008727,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16135,corrupted_regression_experiment,VAEImputer,42712,MNAR,0.30,single_single,humidity,downstream_performance_mean,MSE,,,22884.632072,0.0,23006.528894
16136,corrupted_regression_experiment,VAEImputer,42712,MNAR,0.30,single_single,humidity,downstream_performance_mean,RMSE,,,151.276674,0.0,151.678701
16137,corrupted_regression_experiment,VAEImputer,42712,MNAR,0.50,single_single,humidity,downstream_performance_mean,MAE,,,104.177676,0.0,103.074606
16138,corrupted_regression_experiment,VAEImputer,42712,MNAR,0.50,single_single,humidity,downstream_performance_mean,MSE,,,22505.093207,0.0,22281.280851


In [57]:
# Filtering the relevant data for downstream analysis

na_impute_results = results[
    (results["result_type"] == IMPUTE_RESULT_TYPE) & 
    (results["metric"].isin(["F1_macro", "RMSE"]))
]
na_impute_results.drop(["baseline", "corrupted", "imputed"], axis=1, inplace=True)
na_impute_results = na_impute_results[na_impute_results.isna().any(axis=1)]
na_impute_results.shape



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



(3, 11)

In [58]:
# check if strategy type is correct!
STRATEGY_TYPE = "single_single"

downstream_results = results[
    (results["result_type"] == DOWNSTREAM_RESULT_TYPE) & 
    (results["metric"].isin(["F1_macro", "RMSE"]) &
    (results["strategy"] == STRATEGY_TYPE))
]

# remove experiments where imputation failed
downstream_results = downstream_results.merge(
    na_impute_results,
    how = "left",
    validate = "one_to_one",
    indicator = True,
    suffixes=("", "_imp"),
    on = ["experiment", "imputer", "task", "missing_type", "missing_fraction", "strategy", "column"]
)
downstream_results = downstream_results[downstream_results["_merge"]=="left_only"]

assert len(results["strategy"].unique()) == 1
downstream_results.drop(["experiment", "strategy", "result_type_imp", "metric_imp", "train", "test", "train_imp", "test_imp", "_merge"], axis=1, inplace=True)

downstream_results = downstream_results.rename(
    {
        "imputer": "Imputation_Method",
        "task": "Task",
        "missing_type": "Missing Type",
        "missing_fraction": "Missing Fraction",
        "column": "Column",
        "baseline": "Baseline",
        "imputed": "Imputed",
        "corrupted": "Corrupted"
    },
    axis = 1
)

In [59]:
rename_imputer_dict = {
    "ModeImputer": "Mean/Mode",
    "KNNImputer": "KNN",
    "ForestImputer": "Random Forest",
    "AutoKerasImputer": "Discriminative DL",
    "VAEImputer": "VAE",
    "GAINImputer": "GAIN"    
}

rename_metric_dict = {
    "F1_macro": CLF_METRIC,
    "RMSE": REG_METRIC
}

downstream_results = downstream_results.replace(rename_imputer_dict)
downstream_results = downstream_results.replace(rename_metric_dict)

downstream_results

Unnamed: 0,Imputation_Method,Task,Missing Type,Missing Fraction,Column,result_type,metric,Baseline,Corrupted,Imputed
0,Discriminative DL,1193,MAR,0.01,UI,downstream_performance_mean,Regression Tasks,949.139559,0.0,948.994662
1,Discriminative DL,1193,MAR,0.10,UI,downstream_performance_mean,Regression Tasks,945.054395,0.0,945.582069
2,Discriminative DL,1193,MAR,0.30,UI,downstream_performance_mean,Regression Tasks,937.919925,0.0,935.517056
3,Discriminative DL,1193,MAR,0.50,UI,downstream_performance_mean,Regression Tasks,928.297444,0.0,928.683026
4,Discriminative DL,1193,MCAR,0.01,UI,downstream_performance_mean,Regression Tasks,949.015231,0.0,948.979379
...,...,...,...,...,...,...,...,...,...,...
1340,VAE,42712,MCAR,0.50,humidity,downstream_performance_mean,Regression Tasks,152.144893,0.0,152.251677
1341,VAE,42712,MNAR,0.01,humidity,downstream_performance_mean,Regression Tasks,149.535129,0.0,149.664127
1342,VAE,42712,MNAR,0.10,humidity,downstream_performance_mean,Regression Tasks,150.072266,0.0,149.894417
1343,VAE,42712,MNAR,0.30,humidity,downstream_performance_mean,Regression Tasks,151.276674,0.0,151.678701


### Robustness: check which imputers yielded `NaN`values

In [60]:
for col in downstream_results.columns:
    na_sum = downstream_results[col].isna().sum()
    if na_sum > 0:
        print("-----" * 10)        
        print(col, na_sum)
        print("-----" * 10)        
        na_idx = downstream_results[col].isna()
        print(downstream_results.loc[na_idx, "Imputation Method"].value_counts(dropna=False))
        print("\n")

## Adding Dataset Info, Sorting and Ranking

In [61]:
clf_row_idx = downstream_results["metric"] == CLF_METRIC
reg_row_idx = downstream_results["metric"] == REG_METRIC

# Sorting of data

#adjust order to fit the processing time -> fastest first
methods_order = CategoricalDtype(['Mean/Mode', 'KNN', 'Random Forest', 'VAE', 'GAIN', 'Discriminative DL'], ordered=True)
downstream_results_full_sort = downstream_results.copy()

downstream_results_full_sort['Imputation_Method'] = downstream_results_full_sort['Imputation_Method'].astype(methods_order)
downstream_results_full_sort = downstream_results_full_sort.sort_values(['Task', 'Missing Type',
                                                                         'Missing Fraction', 'Imputed','Imputation_Method'], ascending=[True, True, True, True, True])


downstream_results.head()

Unnamed: 0,Imputation_Method,Task,Missing Type,Missing Fraction,Column,result_type,metric,Baseline,Corrupted,Imputed
0,Discriminative DL,1193,MAR,0.01,UI,downstream_performance_mean,Regression Tasks,949.139559,0.0,948.994662
1,Discriminative DL,1193,MAR,0.1,UI,downstream_performance_mean,Regression Tasks,945.054395,0.0,945.582069
2,Discriminative DL,1193,MAR,0.3,UI,downstream_performance_mean,Regression Tasks,937.919925,0.0,935.517056
3,Discriminative DL,1193,MAR,0.5,UI,downstream_performance_mean,Regression Tasks,928.297444,0.0,928.683026
4,Discriminative DL,1193,MCAR,0.01,UI,downstream_performance_mean,Regression Tasks,949.015231,0.0,948.979379


In [62]:
#downstream_results_full_sort.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1342 entries, 494 to 660
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype   
---  ------             --------------  -----   
 0   Imputation_Method  1342 non-null   category
 1   Task               1342 non-null   int64   
 2   Missing Type       1342 non-null   object  
 3   Missing Fraction   1342 non-null   float64 
 4   Column             1342 non-null   object  
 5   result_type        1342 non-null   object  
 6   metric             1342 non-null   object  
 7   Baseline           1342 non-null   float64 
 8   Corrupted          1342 non-null   float64 
 9   Imputed            1342 non-null   float64 
dtypes: category(1), float64(4), int64(1), object(4)
memory usage: 106.4+ KB


In [63]:
# add dataset information from other csv file
dataset_info = pd.read_csv('../datasets_information_overview.csv')
dataset_info = dataset_info.rename(columns={"did": "Task"})


downstream_results_full_sort = pd.merge(downstream_results_full_sort, dataset_info, on='Task')
downstream_results_full_sort.head()

Unnamed: 0.1,Imputation_Method,Task,Missing Type,Missing Fraction,Column,result_type,metric,Baseline,Corrupted,Imputed,Unnamed: 0,name,MajorityClassSize,MinorityClassSize,NumberOfFeatures,NumberOfInstances,NumberOfNumericFeatures,NumberOfCategoricalFeatures,NumberOfClasses
0,GAIN,189,MAR,0.01,theta8,downstream_performance_mean,Regression Tasks,0.204873,0.0,0.204926,67,kin8nm,,,9.0,8192.0,9.0,0.0,
1,Random Forest,189,MAR,0.01,theta8,downstream_performance_mean,Regression Tasks,0.204919,0.0,0.204999,67,kin8nm,,,9.0,8192.0,9.0,0.0,
2,Mean/Mode,189,MAR,0.01,theta8,downstream_performance_mean,Regression Tasks,0.204909,0.0,0.205023,67,kin8nm,,,9.0,8192.0,9.0,0.0,
3,VAE,189,MAR,0.01,theta8,downstream_performance_mean,Regression Tasks,0.204895,0.0,0.205026,67,kin8nm,,,9.0,8192.0,9.0,0.0,
4,Discriminative DL,189,MAR,0.01,theta8,downstream_performance_mean,Regression Tasks,0.204909,0.0,0.205028,67,kin8nm,,,9.0,8192.0,9.0,0.0,


In [64]:
# Ranking of downstream performance per data constellation for every imputation method

EXPERIMENTAL_CONDITIONS = ["Task", "Missing Type", "Missing Fraction", "Column", "result_type"]

downstream_results_rank = downstream_results_full_sort.copy()

downstream_results_rank["Downstream Performance Rank"] = downstream_results_rank.groupby(EXPERIMENTAL_CONDITIONS).rank(ascending=True, na_option="bottom", method="first")["Imputed"]
# create csv for detailled checks
downstream_results_rank.to_csv('downstream_results_complete_overview.csv')
downstream_results_rank.head()


Unnamed: 0.1,Imputation_Method,Task,Missing Type,Missing Fraction,Column,result_type,metric,Baseline,Corrupted,Imputed,Unnamed: 0,name,MajorityClassSize,MinorityClassSize,NumberOfFeatures,NumberOfInstances,NumberOfNumericFeatures,NumberOfCategoricalFeatures,NumberOfClasses,Downstream Performance Rank
0,GAIN,189,MAR,0.01,theta8,downstream_performance_mean,Regression Tasks,0.204873,0.0,0.204926,67,kin8nm,,,9.0,8192.0,9.0,0.0,,1.0
1,Random Forest,189,MAR,0.01,theta8,downstream_performance_mean,Regression Tasks,0.204919,0.0,0.204999,67,kin8nm,,,9.0,8192.0,9.0,0.0,,2.0
2,Mean/Mode,189,MAR,0.01,theta8,downstream_performance_mean,Regression Tasks,0.204909,0.0,0.205023,67,kin8nm,,,9.0,8192.0,9.0,0.0,,3.0
3,VAE,189,MAR,0.01,theta8,downstream_performance_mean,Regression Tasks,0.204895,0.0,0.205026,67,kin8nm,,,9.0,8192.0,9.0,0.0,,4.0
4,Discriminative DL,189,MAR,0.01,theta8,downstream_performance_mean,Regression Tasks,0.204909,0.0,0.205028,67,kin8nm,,,9.0,8192.0,9.0,0.0,,5.0


In [65]:
# Adjust column type for Imputation_Method
downstream_results_rank['Imputation_Method'] = downstream_results_rank['Imputation_Method'].astype('object')

#downstream_results_rank.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1342 entries, 0 to 1341
Data columns (total 20 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Imputation_Method            1342 non-null   object 
 1   Task                         1342 non-null   int64  
 2   Missing Type                 1342 non-null   object 
 3   Missing Fraction             1342 non-null   float64
 4   Column                       1342 non-null   object 
 5   result_type                  1342 non-null   object 
 6   metric                       1342 non-null   object 
 7   Baseline                     1342 non-null   float64
 8   Corrupted                    1342 non-null   float64
 9   Imputed                      1342 non-null   float64
 10  Unnamed: 0                   1342 non-null   int64  
 11  name                         1342 non-null   object 
 12  MajorityClassSize            0 non-null      float64
 13  MinorityClassSize 

In [66]:
# Merge the two columns "Missing Type" and "Missing Fraction"

downstream_results_rank['Missing Type'] = downstream_results_rank['Missing Type'].astype(str)
downstream_results_rank['Missing Fraction'] = downstream_results_rank['Missing Fraction'].astype(str)
#datatype_new = downstream_results_rank.dtypes

downstream_results_rank['Data_Constellation'] = downstream_results_rank['Missing Type'] + ' - ' + downstream_results_rank['Missing Fraction']
#downstream_results_rank.to_csv('downstream_results_rank_temp.csv')
downstream_results_rank_heatmap2 = downstream_results_rank.copy()
downstream_results_rank.head()


Unnamed: 0,Imputation_Method,Task,Missing Type,Missing Fraction,Column,result_type,metric,Baseline,Corrupted,Imputed,...,name,MajorityClassSize,MinorityClassSize,NumberOfFeatures,NumberOfInstances,NumberOfNumericFeatures,NumberOfCategoricalFeatures,NumberOfClasses,Downstream Performance Rank,Data_Constellation
0,GAIN,189,MAR,0.01,theta8,downstream_performance_mean,Regression Tasks,0.204873,0.0,0.204926,...,kin8nm,,,9.0,8192.0,9.0,0.0,,1.0,MAR - 0.01
1,Random Forest,189,MAR,0.01,theta8,downstream_performance_mean,Regression Tasks,0.204919,0.0,0.204999,...,kin8nm,,,9.0,8192.0,9.0,0.0,,2.0,MAR - 0.01
2,Mean/Mode,189,MAR,0.01,theta8,downstream_performance_mean,Regression Tasks,0.204909,0.0,0.205023,...,kin8nm,,,9.0,8192.0,9.0,0.0,,3.0,MAR - 0.01
3,VAE,189,MAR,0.01,theta8,downstream_performance_mean,Regression Tasks,0.204895,0.0,0.205026,...,kin8nm,,,9.0,8192.0,9.0,0.0,,4.0,MAR - 0.01
4,Discriminative DL,189,MAR,0.01,theta8,downstream_performance_mean,Regression Tasks,0.204909,0.0,0.205028,...,kin8nm,,,9.0,8192.0,9.0,0.0,,5.0,MAR - 0.01


## Analyzing Performance based on Rank per Data Constellation

In [67]:
data = downstream_results_rank.copy()

# Count amount of different Data constellations in column "Data_Constellation"
dc_unique = data.Data_Constellation.unique().size
print(dc_unique, "Data Constellations")
print("_____________________")
# Count amount of 1.0 Ranking result in column "Downstream Performance Rank" (Numbers must match)
rank_count = data['Downstream Performance Rank'].value_counts()
print(rank_count)
print("_____________________")
# Filter for 1.0 Ranking -> Overview -> save as csv
rank_1 = data.loc[data['Downstream Performance Rank'] == 1.0]
rank_1.to_csv('rank_1.csv')

print("_____________________")
# Count how often each Imputation Method is present -> most "wins"
rank_wins = rank_1['Imputation_Method'].value_counts()
print(rank_wins)
print("_____________________")
# Take initial overview and filter for each imputation method and calculate average rank
methods = ['Random Forest', 'KNN', 'Mean/Mode', 'VAE', 'GAIN', 'Discriminative DL']
for i in methods:
    df_average_rank = data.loc[data['Imputation_Method'] == i]
    len_ar = len(df_average_rank)
    print(len_ar, "Amount of results available")
    rank_pos = df_average_rank['Downstream Performance Rank'].value_counts().sort_index(ascending=True)
    print(rank_pos)
    average_rank = df_average_rank["Downstream Performance Rank"].mean()
    print("Average Rank for", i, "is", average_rank)





12 Data Constellations
_____________________
1.0    228
2.0    228
3.0    228
4.0    228
5.0    228
6.0    202
Name: Downstream Performance Rank, dtype: int64
_____________________
_____________________
Mean/Mode            50
Random Forest        48
GAIN                 38
Discriminative DL    37
VAE                  31
KNN                  24
Name: Imputation_Method, dtype: int64
_____________________
228 Amount of results available
1.0    48
2.0    49
3.0    40
4.0    33
5.0    38
6.0    20
Name: Downstream Performance Rank, dtype: int64
Average Rank for Random Forest is 3.1052631578947367
_____________________
228 Amount of results available
1.0    24
2.0    38
3.0    71
4.0    51
5.0    28
6.0    16
Name: Downstream Performance Rank, dtype: int64
Average Rank for KNN is 3.3026315789473686
_____________________
228 Amount of results available
1.0    50
2.0    37
3.0    26
4.0    49
5.0    41
6.0    25
Name: Downstream Performance Rank, dtype: int64
Average Rank for Mean/Mode is 3.3

In [68]:
rank_1_backup = rank_1.copy()
rank_1

Unnamed: 0,Imputation_Method,Task,Missing Type,Missing Fraction,Column,result_type,metric,Baseline,Corrupted,Imputed,...,name,MajorityClassSize,MinorityClassSize,NumberOfFeatures,NumberOfInstances,NumberOfNumericFeatures,NumberOfCategoricalFeatures,NumberOfClasses,Downstream Performance Rank,Data_Constellation
0,GAIN,189,MAR,0.01,theta8,downstream_performance_mean,Regression Tasks,0.204873,0.0,0.204926,...,kin8nm,,,9.0,8192.0,9.0,0.0,,1.0,MAR - 0.01
6,Discriminative DL,189,MAR,0.1,theta8,downstream_performance_mean,Regression Tasks,0.204813,0.0,0.204953,...,kin8nm,,,9.0,8192.0,9.0,0.0,,1.0,MAR - 0.1
12,Mean/Mode,189,MAR,0.3,theta8,downstream_performance_mean,Regression Tasks,0.204820,0.0,0.205069,...,kin8nm,,,9.0,8192.0,9.0,0.0,,1.0,MAR - 0.3
18,Discriminative DL,189,MAR,0.5,theta8,downstream_performance_mean,Regression Tasks,0.205841,0.0,0.205878,...,kin8nm,,,9.0,8192.0,9.0,0.0,,1.0,MAR - 0.5
24,Discriminative DL,189,MCAR,0.01,theta8,downstream_performance_mean,Regression Tasks,0.205083,0.0,0.204972,...,kin8nm,,,9.0,8192.0,9.0,0.0,,1.0,MCAR - 0.01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1312,Random Forest,42712,MCAR,0.5,humidity,downstream_performance_mean,Regression Tasks,148.991778,0.0,148.880924,...,Bike_Sharing_Demand,,,13.0,17379.0,9.0,4.0,,1.0,MCAR - 0.5
1318,Discriminative DL,42712,MNAR,0.01,humidity,downstream_performance_mean,Regression Tasks,149.507994,0.0,149.475688,...,Bike_Sharing_Demand,,,13.0,17379.0,9.0,4.0,,1.0,MNAR - 0.01
1324,Discriminative DL,42712,MNAR,0.1,humidity,downstream_performance_mean,Regression Tasks,149.523046,0.0,149.372273,...,Bike_Sharing_Demand,,,13.0,17379.0,9.0,4.0,,1.0,MNAR - 0.1
1330,Discriminative DL,42712,MNAR,0.3,humidity,downstream_performance_mean,Regression Tasks,149.431371,0.0,149.346814,...,Bike_Sharing_Demand,,,13.0,17379.0,9.0,4.0,,1.0,MNAR - 0.3


## Set Average Best Imputation Method Manually

In [69]:
# SET AVERAGE BEST IMPUTATION METHOD HERE, BASED ON THE PREVIOUS RESULTS
# Alternatively you can define a baseline method here, which will be used instead, depending on your analysis goals

AVERAGE_BEST_IMPUTATION_METHOD = "Random Forest"

## Differences in Performance Relative to Average Best Imputation Method

In [70]:
av_best = data.loc[data['Imputation_Method'] == AVERAGE_BEST_IMPUTATION_METHOD]
av_best['Task'] = av_best['Task'].astype(str)
av_best['Data_Constellation'] = av_best['Data_Constellation'] + ' - ' + av_best['Task']

av_best = av_best[['Imputation_Method', 'Imputed', 'Data_Constellation', 'Downstream Performance Rank']]
av_best = av_best.rename(columns={'Imputation_Method':'Imputation_Method_average', 
                               'Imputed':'Imputed_average',
                                 'Downstream Performance Rank':'Downstream Performance Rank Average'})


rank_1['Task'] = rank_1['Task'].astype(str)
rank_1['Data_Constellation'] = rank_1['Data_Constellation'] + ' - ' + rank_1['Task']
rank_1 = rank_1[['Imputation_Method', 'Imputed', 'Data_Constellation', 'Downstream Performance Rank']]
rank_1 = rank_1.rename(columns={'Imputation_Method':'Imputation_Method_best', 
                               'Imputed':'Imputed_best',
                               'Downstream Performance Rank':'Downstream Performance Rank Best'})

performance_difference = pd.merge(av_best, rank_1, on='Data_Constellation')
#performance_difference.head()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/

In [71]:
# Calculate the difference between the best imputation method for each data constellation to the average best imputation method in F1 score
performance_difference['Performance Difference Best to Average'] = performance_difference['Imputed_best'] - performance_difference['Imputed_average']
Average_Difference = performance_difference['Performance Difference Best to Average'].mean()
print("Average Difference in Improvement from best method to average best method for RMSE", Average_Difference)


Average Difference in Improvement from best method to average best method for RMSE -0.831525330739394


In [72]:
performance_difference.to_csv('performance_difference.csv')
performance_difference.head()

Unnamed: 0,Imputation_Method_average,Imputed_average,Data_Constellation,Downstream Performance Rank Average,Imputation_Method_best,Imputed_best,Downstream Performance Rank Best,Performance Difference Best to Average
0,Random Forest,0.204999,MAR - 0.01 - 189,2.0,GAIN,0.204926,1.0,-7.3e-05
1,Random Forest,0.205008,MAR - 0.1 - 189,3.0,Discriminative DL,0.204953,1.0,-5.5e-05
2,Random Forest,0.205515,MAR - 0.3 - 189,3.0,Mean/Mode,0.205069,1.0,-0.000447
3,Random Forest,0.205979,MAR - 0.5 - 189,4.0,Discriminative DL,0.205878,1.0,-0.000101
4,Random Forest,0.205077,MCAR - 0.01 - 189,5.0,Discriminative DL,0.204972,1.0,-0.000105


## Analysis and Ranking based on RMSE Score

In [73]:
# Relative Difference in Percent -> Best Method to Average Best Method

data = downstream_results_rank.copy()
data['Task'] = data['Task'].astype(str)
data['Data_Constellation_full'] = data['Data_Constellation'] + ' - ' + data['Task']

dc_unique = data.Data_Constellation_full.unique()

data_constellations = dc_unique.tolist()
methods = ['Random Forest', 'KNN', 'Mean/Mode', 'VAE', 'GAIN', 'Discriminative DL']
average_best_complete = pd.DataFrame()


for i in data_constellations:
    data_constel = data.loc[data['Data_Constellation_full'] == i]
    best_score = data_constel.loc[data_constel['Downstream Performance Rank'] == 1.0]
    average_best = data_constel.loc[data_constel['Imputation_Method'] == AVERAGE_BEST_IMPUTATION_METHOD]
    best_score_int = best_score.iloc[0]['Imputed']
    average_best_int = average_best.iloc[0]['Imputed']
    calc_result = ((best_score_int - average_best_int)/best_score_int)
    calc_result = abs(calc_result)
    average_best['Performance Difference to Best to Average in Percent'] = calc_result
    average_best_complete = average_best_complete.append(average_best)

average_best_complete



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing

Unnamed: 0,Imputation_Method,Task,Missing Type,Missing Fraction,Column,result_type,metric,Baseline,Corrupted,Imputed,...,MinorityClassSize,NumberOfFeatures,NumberOfInstances,NumberOfNumericFeatures,NumberOfCategoricalFeatures,NumberOfClasses,Downstream Performance Rank,Data_Constellation,Data_Constellation_full,Performance Difference to Best to Average in Percent
1,Random Forest,189,MAR,0.01,theta8,downstream_performance_mean,Regression Tasks,0.204919,0.0,0.204999,...,,9.0,8192.0,9.0,0.0,,2.0,MAR - 0.01,MAR - 0.01 - 189,0.000356
8,Random Forest,189,MAR,0.1,theta8,downstream_performance_mean,Regression Tasks,0.204910,0.0,0.205008,...,,9.0,8192.0,9.0,0.0,,3.0,MAR - 0.1,MAR - 0.1 - 189,0.000270
14,Random Forest,189,MAR,0.3,theta8,downstream_performance_mean,Regression Tasks,0.204800,0.0,0.205515,...,,9.0,8192.0,9.0,0.0,,3.0,MAR - 0.3,MAR - 0.3 - 189,0.002178
21,Random Forest,189,MAR,0.5,theta8,downstream_performance_mean,Regression Tasks,0.206002,0.0,0.205979,...,,9.0,8192.0,9.0,0.0,,4.0,MAR - 0.5,MAR - 0.5 - 189,0.000489
28,Random Forest,189,MCAR,0.01,theta8,downstream_performance_mean,Regression Tasks,0.205079,0.0,0.205077,...,,9.0,8192.0,9.0,0.0,,5.0,MCAR - 0.01,MCAR - 0.01 - 189,0.000514
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1312,Random Forest,42712,MCAR,0.5,humidity,downstream_performance_mean,Regression Tasks,148.991778,0.0,148.880924,...,,13.0,17379.0,9.0,4.0,,1.0,MCAR - 0.5,MCAR - 0.5 - 42712,0.000000
1321,Random Forest,42712,MNAR,0.01,humidity,downstream_performance_mean,Regression Tasks,149.513804,0.0,149.548996,...,,13.0,17379.0,9.0,4.0,,4.0,MNAR - 0.01,MNAR - 0.01 - 42712,0.000490
1327,Random Forest,42712,MNAR,0.1,humidity,downstream_performance_mean,Regression Tasks,149.557789,0.0,149.782113,...,,13.0,17379.0,9.0,4.0,,4.0,MNAR - 0.1,MNAR - 0.1 - 42712,0.002744
1332,Random Forest,42712,MNAR,0.3,humidity,downstream_performance_mean,Regression Tasks,149.430117,0.0,150.008179,...,,13.0,17379.0,9.0,4.0,,3.0,MNAR - 0.3,MNAR - 0.3 - 42712,0.004428


In [74]:
#Difference in Percentage
average_difference = average_best_complete['Performance Difference to Best to Average in Percent'].mean()
print(average_difference, "average difference in Percent (Verschlechterung der durchschnittlich besten Methode relativ zu jeweils besten Methode)")

0.002383376610533895 average difference in Percent (Verschlechterung der durchschnittlich besten Methode relativ zu jeweils besten Methode)


## Heatmap to Show Detailled Performance of Each Imputation Method for Each Data Constellation

In [77]:
df_heat = downstream_results_rank.copy()
df_heat.drop(["Missing Type", "Missing Fraction", "Column", "result_type", "metric", "Baseline", "Corrupted", "Unnamed: 0", "Unnamed: 0", "name", "NumberOfClasses", "MajorityClassSize", "MinorityClassSize"], axis=1, inplace=True)
df_heat

Unnamed: 0,Imputation_Method,Task,Imputed,NumberOfFeatures,NumberOfInstances,NumberOfNumericFeatures,NumberOfCategoricalFeatures,Downstream Performance Rank,Data_Constellation
0,GAIN,189,0.204926,9.0,8192.0,9.0,0.0,1.0,MAR - 0.01
1,Random Forest,189,0.204999,9.0,8192.0,9.0,0.0,2.0,MAR - 0.01
2,Mean/Mode,189,0.205023,9.0,8192.0,9.0,0.0,3.0,MAR - 0.01
3,VAE,189,0.205026,9.0,8192.0,9.0,0.0,4.0,MAR - 0.01
4,Discriminative DL,189,0.205028,9.0,8192.0,9.0,0.0,5.0,MAR - 0.01
...,...,...,...,...,...,...,...,...,...
1337,Random Forest,42712,147.707377,13.0,17379.0,9.0,4.0,2.0,MNAR - 0.5
1338,KNN,42712,148.474038,13.0,17379.0,9.0,4.0,3.0,MNAR - 0.5
1339,Discriminative DL,42712,148.593887,13.0,17379.0,9.0,4.0,4.0,MNAR - 0.5
1340,VAE,42712,149.268209,13.0,17379.0,9.0,4.0,5.0,MNAR - 0.5


In [78]:
# Heatmap for total RMSE score for each data constellation for each method

df_heat = df_heat.astype({"Task":"string"})

data_constellations = ['MAR - 0.01', 'MAR - 0.1', 'MAR - 0.3', 'MCAR - 0.5', 'MCAR - 0.01', 'MCAR - 0.1', 'MCAR - 0.3', 'MCAR - 0.5', 'MNAR - 0.01', 'MNAR - 0.1', 'MNAR - 0.3', 'MNAR - 0.5']


for i in data_constellations:
    data_constel = df_heat.loc[df_heat['Data_Constellation'] == i]

    ### uncomment whatever you want to investigate

    ## sort by amount datapoints (ascending)
    data_constel = data_constel.sort_values(by=['NumberOfInstances'])

    ## sort by amount of features (ascending)
    #data_constel = data_constel.sort_values(by=['NumberOfFeatures'])

    ## sort by amount of datapoints and features (ascending)
    #data_constel = data_constel.sort_values(by=['NumberOfInstances', 'NumberOfFeatures'])

    ## sort by amount of categorical features and datapoints (ascending)
    #data_constel = data_constel.sort_values(by=['NumberOfCategoricalFeatures', 'NumberOfInstances'])

    ## sort by amount of numerical features and datapoints (ascending)
    #data_constel = data_constel.sort_values(by=['NumberOfNumericFeatures', 'NumberOfInstances'])
    
    Dataset_number = data_constel["Task"]
    Imputation_Method = data_constel["Imputation_Method"]
    Improvement = data_constel["Imputed"]
    

    trace = go.Heatmap(
                   z=Improvement,
                   x=Dataset_number,
                   y=Imputation_Method,
                   type = 'heatmap',
                    autocolorscale= False,
                    colorscale = 'Reds',
                    zmin=0,
                    )
    data = [trace]
    fig = go.Figure(data=data)
    fig.update_layout(
        title=i,
        xaxis_nticks=36)
    fig.show()

In [79]:

df_heat_dif = downstream_results_rank_heatmap2.copy()
#df_heat_dif

Unnamed: 0,Imputation_Method,Task,Missing Type,Missing Fraction,Column,result_type,metric,Baseline,Corrupted,Imputed,...,name,MajorityClassSize,MinorityClassSize,NumberOfFeatures,NumberOfInstances,NumberOfNumericFeatures,NumberOfCategoricalFeatures,NumberOfClasses,Downstream Performance Rank,Data_Constellation
0,GAIN,189,MAR,0.01,theta8,downstream_performance_mean,Regression Tasks,0.204873,0.0,0.204926,...,kin8nm,,,9.0,8192.0,9.0,0.0,,1.0,MAR - 0.01
1,Random Forest,189,MAR,0.01,theta8,downstream_performance_mean,Regression Tasks,0.204919,0.0,0.204999,...,kin8nm,,,9.0,8192.0,9.0,0.0,,2.0,MAR - 0.01
2,Mean/Mode,189,MAR,0.01,theta8,downstream_performance_mean,Regression Tasks,0.204909,0.0,0.205023,...,kin8nm,,,9.0,8192.0,9.0,0.0,,3.0,MAR - 0.01
3,VAE,189,MAR,0.01,theta8,downstream_performance_mean,Regression Tasks,0.204895,0.0,0.205026,...,kin8nm,,,9.0,8192.0,9.0,0.0,,4.0,MAR - 0.01
4,Discriminative DL,189,MAR,0.01,theta8,downstream_performance_mean,Regression Tasks,0.204909,0.0,0.205028,...,kin8nm,,,9.0,8192.0,9.0,0.0,,5.0,MAR - 0.01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1337,Random Forest,42712,MNAR,0.5,humidity,downstream_performance_mean,Regression Tasks,148.151741,0.0,147.707377,...,Bike_Sharing_Demand,,,13.0,17379.0,9.0,4.0,,2.0,MNAR - 0.5
1338,KNN,42712,MNAR,0.5,humidity,downstream_performance_mean,Regression Tasks,149.553233,0.0,148.474038,...,Bike_Sharing_Demand,,,13.0,17379.0,9.0,4.0,,3.0,MNAR - 0.5
1339,Discriminative DL,42712,MNAR,0.5,humidity,downstream_performance_mean,Regression Tasks,149.191945,0.0,148.593887,...,Bike_Sharing_Demand,,,13.0,17379.0,9.0,4.0,,4.0,MNAR - 0.5
1340,VAE,42712,MNAR,0.5,humidity,downstream_performance_mean,Regression Tasks,150.016976,0.0,149.268209,...,Bike_Sharing_Demand,,,13.0,17379.0,9.0,4.0,,5.0,MNAR - 0.5


In [80]:
# Calculate Difference for every Imputation towards average best Imputation Method per Data Constellation
# Calculation for RMSE Differences in Percentage (RMSE alone not comparable over different datasets)

data = downstream_results_rank.copy()
data['Task'] = data['Task'].astype(str)
data['Data_Constellation_full'] = data['Data_Constellation'] + ' - ' + data['Task']

dc_unique = data.Data_Constellation_full.unique()
#print(dc_unique)

data_constellations = dc_unique.tolist()
methods = ['Random Forest', 'KNN', 'Mean/Mode', 'VAE', 'GAIN', 'Discriminative DL']
heatmap_data_difference = pd.DataFrame()


for i in data_constellations:
    data_constel = data.loc[data['Data_Constellation_full'] == i]
    average_best = data_constel.loc[data_constel['Imputation_Method'] == AVERAGE_BEST_IMPUTATION_METHOD]
    dataset_number = best_score.iloc[0]['Task']
    for i in methods:
        if ((data_constel['Imputation_Method'] == i).any()):
            current_score_row = data_constel.loc[data['Imputation_Method'] == i]
            current_score_int = current_score_row.iloc[0]['Imputed']
            average_best_int = average_best.iloc[0]['Imputed']
            calc_result = ((average_best_int - current_score_int)/current_score_int)

            current_score_row['Performance Difference to Average Best in Percent'] = calc_result
            heatmap_data_difference = heatmap_data_difference.append(current_score_row)  
        else:
            print("Imputation Method not here ---------------------")

heatmap_data_difference





A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing

Imputation Method not here ---------------------
Imputation Method not here ---------------------
Imputation Method not here ---------------------
Imputation Method not here ---------------------




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing

Imputation Method not here ---------------------
Imputation Method not here ---------------------
Imputation Method not here ---------------------




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing

Imputation Method not here ---------------------




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing

Imputation Method not here ---------------------
Imputation Method not here ---------------------




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing

Imputation Method not here ---------------------




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing

Imputation Method not here ---------------------
Imputation Method not here ---------------------
Imputation Method not here ---------------------
Imputation Method not here ---------------------




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing

Imputation Method not here ---------------------




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing

Imputation Method not here ---------------------
Imputation Method not here ---------------------
Imputation Method not here ---------------------




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing

Imputation Method not here ---------------------
Imputation Method not here ---------------------
Imputation Method not here ---------------------
Imputation Method not here ---------------------




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing

Imputation Method not here ---------------------
Imputation Method not here ---------------------
Imputation Method not here ---------------------




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing

Unnamed: 0,Imputation_Method,Task,Missing Type,Missing Fraction,Column,result_type,metric,Baseline,Corrupted,Imputed,...,MinorityClassSize,NumberOfFeatures,NumberOfInstances,NumberOfNumericFeatures,NumberOfCategoricalFeatures,NumberOfClasses,Downstream Performance Rank,Data_Constellation,Data_Constellation_full,Performance Difference to Average Best in Percent
1,Random Forest,189,MAR,0.01,theta8,downstream_performance_mean,Regression Tasks,0.204919,0.0,0.204999,...,,9.0,8192.0,9.0,0.0,,2.0,MAR - 0.01,MAR - 0.01 - 189,0.000000
5,KNN,189,MAR,0.01,theta8,downstream_performance_mean,Regression Tasks,0.204892,0.0,0.205038,...,,9.0,8192.0,9.0,0.0,,6.0,MAR - 0.01,MAR - 0.01 - 189,-0.000189
2,Mean/Mode,189,MAR,0.01,theta8,downstream_performance_mean,Regression Tasks,0.204909,0.0,0.205023,...,,9.0,8192.0,9.0,0.0,,3.0,MAR - 0.01,MAR - 0.01 - 189,-0.000117
3,VAE,189,MAR,0.01,theta8,downstream_performance_mean,Regression Tasks,0.204895,0.0,0.205026,...,,9.0,8192.0,9.0,0.0,,4.0,MAR - 0.01,MAR - 0.01 - 189,-0.000131
0,GAIN,189,MAR,0.01,theta8,downstream_performance_mean,Regression Tasks,0.204873,0.0,0.204926,...,,9.0,8192.0,9.0,0.0,,1.0,MAR - 0.01,MAR - 0.01 - 189,0.000356
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1338,KNN,42712,MNAR,0.5,humidity,downstream_performance_mean,Regression Tasks,149.553233,0.0,148.474038,...,,13.0,17379.0,9.0,4.0,,3.0,MNAR - 0.5,MNAR - 0.5 - 42712,-0.005164
1336,Mean/Mode,42712,MNAR,0.5,humidity,downstream_performance_mean,Regression Tasks,147.017846,0.0,146.311138,...,,13.0,17379.0,9.0,4.0,,1.0,MNAR - 0.5,MNAR - 0.5 - 42712,0.009543
1340,VAE,42712,MNAR,0.5,humidity,downstream_performance_mean,Regression Tasks,150.016976,0.0,149.268209,...,,13.0,17379.0,9.0,4.0,,5.0,MNAR - 0.5,MNAR - 0.5 - 42712,-0.010457
1341,GAIN,42712,MNAR,0.5,humidity,downstream_performance_mean,Regression Tasks,148.397969,0.0,149.936478,...,,13.0,17379.0,9.0,4.0,,6.0,MNAR - 0.5,MNAR - 0.5 - 42712,-0.014867


In [81]:
# Heatmap for RMSE differences in Percentage for each data constellation for each method relative to average best imputation method

heatmap_data_difference = heatmap_data_difference.astype({"Task":"string"})
data_constellations = ['MAR - 0.01', 'MAR - 0.1', 'MAR - 0.3', 'MAR - 0.5', 'MCAR - 0.01', 'MCAR - 0.1', 'MCAR - 0.3', 'MCAR - 0.5', 'MNAR - 0.01', 'MNAR - 0.1', 'MNAR - 0.3', 'MNAR - 0.5']
for i in data_constellations:
    data_constel = heatmap_data_difference.loc[df_heat['Data_Constellation'] == i]

    ### uncomment whatever you want to investigate

    ## sort by amount datapoints (ascending)
    data_constel = data_constel.sort_values(by=['NumberOfInstances'])

    ## sort by amount of features (ascending)
    #data_constel = data_constel.sort_values(by=['NumberOfFeatures'])

    ## sort by amount of datapoints and features (ascending)
    #data_constel = data_constel.sort_values(by=['NumberOfInstances', 'NumberOfFeatures'])

    ## sort by amount of categorical features and datapoints (ascending)
    #data_constel = data_constel.sort_values(by=['NumberOfCategoricalFeatures', 'NumberOfInstances'])

    ## sort by amount of numerical features and datapoints (ascending)
    #data_constel = data_constel.sort_values(by=['NumberOfNumericFeatures', 'NumberOfInstances'])
    
    Dataset_number = data_constel["Task"]
    Imputation_Method = data_constel["Imputation_Method"]
    Improvement = data_constel["Performance Difference to Average Best in Percent"]
    

    trace = go.Heatmap(
                   z=Improvement,
                   x=Dataset_number,
                   y=Imputation_Method,
                   type = 'heatmap',
                    autocolorscale= False,
                    colorscale = 'RdBu_r',
                    zmid=0,
                    zmin=(-0.1),
                    zmax=0.1,
                    )
    data = [trace]
    fig = go.Figure(data=data)
    fig.update_layout(
        title=i,
        xaxis_nticks=36)
    fig.show()
    fig.write_image("regression_heatmap_f1_score_improvement_to_avbest%s.pdf" %i)

In [82]:
#heatmap_data_difference.agg(['min', 'max'])
heatmap_data_difference.to_csv('heatmap_data_difference.csv')
heatmap_data_difference['Performance Difference to Average Best in Percent'].agg(['min', 'max'])

min   -0.220949
max    0.160131
Name: Performance Difference to Average Best in Percent, dtype: float64

In [83]:
heatmap_data_difference
heatmap_data_difference.to_csv('regression_imputed_full_info.csv', index=False)

## Improvment Proportions for All Data Constellations and Methods Relative to Average Best Method

In [84]:
# data preprocessing here
df_quantiles = heatmap_data_difference.copy()
df_quantiles = df_quantiles.drop(df_quantiles[df_quantiles["Imputation_Method"] == AVERAGE_BEST_IMPUTATION_METHOD].index)

df_10 = df_quantiles.drop(df_quantiles[(df_quantiles["Performance Difference to Average Best in Percent"] > (-0.09))].index)
df_09 = df_quantiles.drop(df_quantiles[(df_quantiles["Performance Difference to Average Best in Percent"] <= (-0.09)) | (df_quantiles["Performance Difference to Average Best in Percent"] > (-0.07))].index)
df_07 = df_quantiles.drop(df_quantiles[(df_quantiles["Performance Difference to Average Best in Percent"] <= (-0.07)) | (df_quantiles["Performance Difference to Average Best in Percent"] > (-0.05))].index)
df_05 = df_quantiles.drop(df_quantiles[(df_quantiles["Performance Difference to Average Best in Percent"] <= (-0.05)) | (df_quantiles["Performance Difference to Average Best in Percent"] > (-0.03))].index)
df_03 = df_quantiles.drop(df_quantiles[(df_quantiles["Performance Difference to Average Best in Percent"] <= (-0.03)) | (df_quantiles["Performance Difference to Average Best in Percent"] > (-0.01))].index)
df_01 = df_quantiles.drop(df_quantiles[(df_quantiles["Performance Difference to Average Best in Percent"] <= (-0.01)) | (df_quantiles["Performance Difference to Average Best in Percent"] > (0.01))].index)
df01 = df_quantiles.drop(df_quantiles[(df_quantiles["Performance Difference to Average Best in Percent"] <= (0.01)) | (df_quantiles["Performance Difference to Average Best in Percent"] > (0.03))].index)
df03 = df_quantiles.drop(df_quantiles[(df_quantiles["Performance Difference to Average Best in Percent"] <= (0.03)) | (df_quantiles["Performance Difference to Average Best in Percent"] > (0.05))].index)
df05 = df_quantiles.drop(df_quantiles[(df_quantiles["Performance Difference to Average Best in Percent"] <= (0.05)) | (df_quantiles["Performance Difference to Average Best in Percent"] > (0.07))].index)
df07 = df_quantiles.drop(df_quantiles[(df_quantiles["Performance Difference to Average Best in Percent"] <= (0.07)) | (df_quantiles["Performance Difference to Average Best in Percent"] > (0.09))].index)
df09 = df_quantiles.drop(df_quantiles[(df_quantiles["Performance Difference to Average Best in Percent"] <= (0.09))].index)

#df_quantiles
#df_quantiles.dtypes

In [85]:
len_df_10 = len(df_10.index)
len_df_09 = len(df_09.index)
len_df_07 = len(df_07.index)
len_df_05 = len(df_05.index)
len_df_03 = len(df_03.index)
len_df_01 = len(df_01.index)
len_df01 = len(df01.index)
len_df03 = len(df03.index)
len_df05 = len(df05.index)
len_df07 = len(df07.index)
len_df09 = len(df09.index)

quantile_freq = []
#quantile_freq.append()
quantile_freq.extend((len_df_10, len_df_09, len_df_07, len_df_05, len_df_03, len_df_01, len_df01, len_df03, len_df05, len_df07, len_df09))
print(quantile_freq)


quantiles = []
quantiles.extend(['less than -0.09', '-0.09 to -0.07', '-0.07 to -0.05', '-0.05 to -0.03','-0.03 to -0.01', '-0.01 to 0.01', '0.01 to 0.03', '0.03 to 0.05', '0.05 to 0.07', '0.07 to 0.09', 'more than 0.09'])
print(quantiles)

improvement_quantiles = pd.DataFrame(
    {'Improvement to Average Best in Percent': quantiles,
     'Amount': quantile_freq,
    })


[10, 6, 8, 20, 46, 1009, 12, 2, 0, 0, 1]
['less than -0.09', '-0.09 to -0.07', '-0.07 to -0.05', '-0.05 to -0.03', '-0.03 to -0.01', '-0.01 to 0.01', '0.01 to 0.03', '0.03 to 0.05', '0.05 to 0.07', '0.07 to 0.09', 'more than 0.09']


In [86]:
fig = px.bar(improvement_quantiles, x='Improvement to Average Best in Percent', y='Amount')
fig.show()
fig.write_image("improv_rel_to_av_all_DC_no_av_incl.pdf")

In [87]:
# split barchart stacks into methods

quantile_datasets = [df_10, df_09, df_07, df_05, df_03, df_01, df01, df03, df05, df07, df09]

methods = ['Random Forest', 'KNN', 'Mean/Mode', 'VAE', 'GAIN', 'Discriminative DL']
methods.remove(AVERAGE_BEST_IMPUTATION_METHOD)
print(methods)

forest_freq = []
knn_freq = []
mode_freq = []
dl_freq = []
vae_freq = []
gain_freq = []
#print(quantile_datasets)

for i in methods:
    for j in quantile_datasets:
        df_temp = j.copy()
        df_temp = df_temp[df_temp['Imputation_Method'].str.contains(i)]
        df_temp_len = len(df_temp.index)
        if (i == 'Random Forest'):
            forest_freq.append(df_temp_len)
        elif (i == 'KNN'):
            knn_freq.append(df_temp_len)                                       
        elif (i == 'Mean/Mode'):
            mode_freq.append(df_temp_len)                                                 
        elif (i == 'Discriminative DL'):
            dl_freq.append(df_temp_len)                                       
        elif (i == 'VAE'):
            vae_freq.append(df_temp_len)                                         
        elif (i == 'GAIN'):
            gain_freq.append(df_temp_len)                                          
                                       
print(forest_freq)
print(knn_freq)
print(mode_freq)
print(dl_freq)
print(vae_freq)
print(gain_freq)

['KNN', 'Mean/Mode', 'VAE', 'GAIN', 'Discriminative DL']
[]
[0, 0, 0, 0, 3, 223, 2, 0, 0, 0, 0]
[0, 1, 3, 7, 9, 205, 2, 1, 0, 0, 0]
[0, 0, 0, 0, 3, 216, 3, 0, 0, 0, 0]
[3, 3, 2, 4, 10, 203, 1, 1, 0, 0, 1]
[7, 2, 3, 9, 21, 162, 4, 0, 0, 0, 0]


In [88]:
quantiles = ['less than -0.09', '-0.09 to -0.07', '-0.07 to -0.05', '-0.05 to -0.03','-0.03 to -0.01', '-0.01 to 0.01', '0.01 to 0.03', '0.03 to 0.05', '0.05 to 0.07', '0.07 to 0.09', 'more than 0.09']

fig = go.Figure(data=[
    go.Bar(name='Random Forest', x=quantiles, y=forest_freq),
    go.Bar(name='KNN', x=quantiles, y=knn_freq),
    go.Bar(name='Mean/Mode', x=quantiles, y=mode_freq),
    go.Bar(name='Discriminative DL', x=quantiles, y=dl_freq),
    go.Bar(name='VAE', x=quantiles, y=vae_freq),
    go.Bar(name='GAIN', x=quantiles, y=gain_freq)
])
# Change the bar mode
fig.update_layout(barmode='stack')
fig.show()
fig.write_image("improv_rel_to_av_all_DC_no_av_incl_per_method.pdf")

In [89]:
# split barchart stacks into methods

quantile_datasets = [df_10, df_09, df_07, df_05, df_03, df_01, df01, df03, df05, df07, df09]

fractions = ['0.01', '0.1', '0.3', '0.5']
#print(fractions)
#print(df_10)

freq_001 = []
freq_01 = []
freq_03 = []
freq_05 = []
#print(quantile_datasets)

for i in fractions:
    for j in quantile_datasets:
        df_temp = j.copy()
        df_temp = df_temp[df_temp['Missing Fraction'].str.contains(i)]
        df_temp_len = len(df_temp.index)
        if (i == '0.01'):
            freq_001.append(df_temp_len)
        elif (i == '0.1'):
            freq_01.append(df_temp_len)                                       
        elif (i == '0.3'):
            freq_03.append(df_temp_len)                                                 
        elif (i == '0.5'):
            freq_05.append(df_temp_len)                                       
                                        
                                       
print(freq_001)
print(freq_01)
print(freq_03)
print(freq_05)

[0, 0, 0, 0, 0, 267, 0, 0, 0, 0, 0]
[2, 0, 1, 1, 17, 259, 0, 0, 0, 0, 0]
[3, 3, 5, 7, 12, 248, 5, 0, 0, 0, 1]
[5, 3, 2, 12, 17, 235, 7, 2, 0, 0, 0]


In [90]:
quantiles = ['less than -0.09', '-0.09 to -0.07', '-0.07 to -0.05', '-0.05 to -0.03','-0.03 to -0.01', '-0.01 to 0.01', '0.01 to 0.03', '0.03 to 0.05', '0.05 to 0.07', '0.07 to 0.09', 'more than 0.09']


fig = go.Figure(data=[
    go.Bar(name='1% Missing Data', x=quantiles, y=freq_001, marker_color='#FD3216'),
    go.Bar(name='10% Missing Data', x=quantiles, y=freq_01, marker_color='#00FE35'),
    go.Bar(name='30% Missing Data', x=quantiles, y=freq_03, marker_color='#511CFB'),
    go.Bar(name='50% Missing Data', x=quantiles, y=freq_05, marker_color='#FF7F0E'),
])
# Change the bar mode
fig.update_layout(barmode='stack')
fig.show()
fig.write_image("improv_rel_to_av_all_DC_no_av_incl_per_frac.pdf")

In [91]:
# split barchart stacks into methods

quantile_datasets = [df_10, df_09, df_07, df_05, df_03, df_01, df01, df03, df05, df07, df09]

fractions = ['MCAR', 'MAR', 'MNAR']
print(fractions)
#print(df_10)

freq_001 = []
freq_01 = []
freq_03 = []
#print(quantile_datasets)

for i in fractions:
    for j in quantile_datasets:
        df_temp = j.copy()
        df_temp = df_temp[df_temp['Missing Type'].str.contains(i)]
        df_temp_len = len(df_temp.index)
        if (i == 'MCAR'):
            freq_001.append(df_temp_len)
        elif (i == 'MAR'):
            freq_01.append(df_temp_len)                                       
        elif (i == 'MNAR'):
            freq_03.append(df_temp_len)                                                 

                                       
print(freq_001)
print(freq_01)
print(freq_03)

['MCAR', 'MAR', 'MNAR']
[3, 4, 1, 8, 13, 341, 0, 0, 0, 0, 0]
[4, 2, 1, 6, 19, 336, 3, 0, 0, 0, 0]
[3, 0, 6, 6, 14, 332, 9, 2, 0, 0, 1]


In [92]:
quantiles = ['less than -0.09', '-0.09 to -0.07', '-0.07 to -0.05', '-0.05 to -0.03','-0.03 to -0.01', '-0.01 to 0.01', '0.01 to 0.03', '0.03 to 0.05', '0.05 to 0.07', '0.07 to 0.09', 'more than 0.09']


fig = go.Figure(data=[
    go.Bar(name='MCAR', x=quantiles, y=freq_001, marker_color='#222A2A'),
    go.Bar(name='MAR', x=quantiles, y=freq_01, marker_color='#B68100'),
    go.Bar(name='MNAR', x=quantiles, y=freq_03, marker_color='#750D86'),
])
# Change the bar mode
fig.update_layout(barmode='stack')
fig.show()
fig.write_image("improv_rel_to_av_all_DC_no_av_incl_per_patt.pdf")

## Improvment Proportions for the Best Imputation Method per Data Constellation Relative to Average Best Method

In [93]:
improv_to_av_bar = heatmap_data_difference.copy()

improv_to_av_bar = improv_to_av_bar.drop(improv_to_av_bar[improv_to_av_bar["Downstream Performance Rank"] != 1.0].index)

df_01 = improv_to_av_bar.drop(improv_to_av_bar[(improv_to_av_bar["Performance Difference to Average Best in Percent"] <= (-0.01)) | (improv_to_av_bar["Performance Difference to Average Best in Percent"] > (0.01))].index)
df01 = improv_to_av_bar.drop(improv_to_av_bar[(improv_to_av_bar["Performance Difference to Average Best in Percent"] <= (0.01)) | (improv_to_av_bar["Performance Difference to Average Best in Percent"] > (0.03))].index)
df03 = improv_to_av_bar.drop(improv_to_av_bar[(improv_to_av_bar["Performance Difference to Average Best in Percent"] <= (0.03)) | (improv_to_av_bar["Performance Difference to Average Best in Percent"] > (0.05))].index)
df05 = improv_to_av_bar.drop(improv_to_av_bar[(improv_to_av_bar["Performance Difference to Average Best in Percent"] <= (0.05)) | (improv_to_av_bar["Performance Difference to Average Best in Percent"] > (0.07))].index)
df07 = improv_to_av_bar.drop(improv_to_av_bar[(improv_to_av_bar["Performance Difference to Average Best in Percent"] <= (0.07)) | (improv_to_av_bar["Performance Difference to Average Best in Percent"] > (0.09))].index)
df09 = improv_to_av_bar.drop(improv_to_av_bar[(improv_to_av_bar["Performance Difference to Average Best in Percent"] <= (0.09))].index)

improv_to_av_bar

Unnamed: 0,Imputation_Method,Task,Missing Type,Missing Fraction,Column,result_type,metric,Baseline,Corrupted,Imputed,...,MinorityClassSize,NumberOfFeatures,NumberOfInstances,NumberOfNumericFeatures,NumberOfCategoricalFeatures,NumberOfClasses,Downstream Performance Rank,Data_Constellation,Data_Constellation_full,Performance Difference to Average Best in Percent
0,GAIN,189,MAR,0.01,theta8,downstream_performance_mean,Regression Tasks,0.204873,0.0,0.204926,...,,9.0,8192.0,9.0,0.0,,1.0,MAR - 0.01,MAR - 0.01 - 189,0.000356
6,Discriminative DL,189,MAR,0.1,theta8,downstream_performance_mean,Regression Tasks,0.204813,0.0,0.204953,...,,9.0,8192.0,9.0,0.0,,1.0,MAR - 0.1,MAR - 0.1 - 189,0.000270
12,Mean/Mode,189,MAR,0.3,theta8,downstream_performance_mean,Regression Tasks,0.204820,0.0,0.205069,...,,9.0,8192.0,9.0,0.0,,1.0,MAR - 0.3,MAR - 0.3 - 189,0.002178
18,Discriminative DL,189,MAR,0.5,theta8,downstream_performance_mean,Regression Tasks,0.205841,0.0,0.205878,...,,9.0,8192.0,9.0,0.0,,1.0,MAR - 0.5,MAR - 0.5 - 189,0.000489
24,Discriminative DL,189,MCAR,0.01,theta8,downstream_performance_mean,Regression Tasks,0.205083,0.0,0.204972,...,,9.0,8192.0,9.0,0.0,,1.0,MCAR - 0.01,MCAR - 0.01 - 189,0.000514
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1312,Random Forest,42712,MCAR,0.5,humidity,downstream_performance_mean,Regression Tasks,148.991778,0.0,148.880924,...,,13.0,17379.0,9.0,4.0,,1.0,MCAR - 0.5,MCAR - 0.5 - 42712,0.000000
1318,Discriminative DL,42712,MNAR,0.01,humidity,downstream_performance_mean,Regression Tasks,149.507994,0.0,149.475688,...,,13.0,17379.0,9.0,4.0,,1.0,MNAR - 0.01,MNAR - 0.01 - 42712,0.000490
1324,Discriminative DL,42712,MNAR,0.1,humidity,downstream_performance_mean,Regression Tasks,149.523046,0.0,149.372273,...,,13.0,17379.0,9.0,4.0,,1.0,MNAR - 0.1,MNAR - 0.1 - 42712,0.002744
1330,Discriminative DL,42712,MNAR,0.3,humidity,downstream_performance_mean,Regression Tasks,149.431371,0.0,149.346814,...,,13.0,17379.0,9.0,4.0,,1.0,MNAR - 0.3,MNAR - 0.3 - 42712,0.004428


In [94]:
#improv_to_av_bar.agg(['min', 'max'])

Unnamed: 0,Imputation_Method,Task,Missing Type,Missing Fraction,Column,result_type,metric,Baseline,Corrupted,Imputed,...,MinorityClassSize,NumberOfFeatures,NumberOfInstances,NumberOfNumericFeatures,NumberOfCategoricalFeatures,NumberOfClasses,Downstream Performance Rank,Data_Constellation,Data_Constellation_full,Performance Difference to Average Best in Percent
min,Discriminative DL,1193,MAR,0.01,Material,downstream_performance_mean,Regression Tasks,0.001417,0.0,0.001418,...,,6.0,4477.0,3.0,0.0,,1.0,MAR - 0.01,MAR - 0.01 - 1193,0.0
max,VAE,42712,MNAR,0.5,x6,downstream_performance_mean,Regression Tasks,72725.060283,0.0,72725.053954,...,,22.0,89640.0,22.0,7.0,,1.0,MNAR - 0.5,MNAR - 0.5 - 42712,0.160131


In [95]:
len_df_01 = len(df_01.index)
len_df01 = len(df01.index)
len_df03 = len(df03.index)
len_df05 = len(df05.index)
len_df07 = len(df07.index)
len_df09 = len(df09.index)

quantile_freq = []
quantile_freq.extend((len_df_01, len_df01, len_df03, len_df05, len_df07, len_df09))
print(quantile_freq)


quantiles = []
quantiles.extend(['less than 0.01', '0.01 to 0.03', '0.03 to 0.05', '0.05 to 0.07', '0.07 to 0.09', 'more than 0.09'])
print(quantiles)

improvement_quantiles = pd.DataFrame(
    {'Improvement to Average Best in Percent': quantiles,
     'Amount': quantile_freq,
    })

fig = px.bar(improvement_quantiles, x='Improvement to Average Best in Percent', y='Amount')
fig.show()
fig.write_image("improv_rel_to_av_all_DC_no_av_incl_only_best.pdf")

[218, 7, 2, 0, 0, 1]
['less than 0.01', '0.01 to 0.03', '0.03 to 0.05', '0.05 to 0.07', '0.07 to 0.09', 'more than 0.09']


In [96]:
# split barchart stacks into methods

quantile_datasets = [df_01, df01, df03, df05, df07, df09]

methods = ['Random Forest', 'KNN', 'Mean/Mode', 'VAE', 'GAIN', 'Discriminative DL']
methods.remove(AVERAGE_BEST_IMPUTATION_METHOD)
print(methods)

forest_freq = []
knn_freq = []
mode_freq = []
dl_freq = []
vae_freq = []
gain_freq = []
#print(quantile_datasets)

for i in methods:
    for j in quantile_datasets:
        df_temp = j.copy()
        df_temp = df_temp[df_temp['Imputation_Method'].str.contains(i)]
        df_temp_len = len(df_temp.index)
        if (i == 'Random Forest'):
            forest_freq.append(df_temp_len)
        elif (i == 'KNN'):
            knn_freq.append(df_temp_len)                                       
        elif (i == 'Mean/Mode'):
            mode_freq.append(df_temp_len)                                                 
        elif (i == 'Discriminative DL'):
            dl_freq.append(df_temp_len)                                       
        elif (i == 'VAE'):
            vae_freq.append(df_temp_len)                                         
        elif (i == 'GAIN'):
            gain_freq.append(df_temp_len)                                          
                                       
print(forest_freq)
print(knn_freq)
print(mode_freq)
print(dl_freq)
print(vae_freq)
print(gain_freq)

['KNN', 'Mean/Mode', 'VAE', 'GAIN', 'Discriminative DL']
[]
[23, 1, 0, 0, 0, 0]
[47, 2, 1, 0, 0, 0]
[35, 2, 0, 0, 0, 0]
[29, 0, 1, 0, 0, 1]
[36, 2, 0, 0, 0, 0]


In [97]:
quantiles = ['less than 0.01', '0.01 to 0.03', '0.03 to 0.05', '0.05 to 0.07', '0.07 to 0.09', 'more than 0.09']


fig = go.Figure(data=[
    go.Bar(name='Random Forest', x=quantiles, y=forest_freq),
    go.Bar(name='KNN', x=quantiles, y=knn_freq),
    go.Bar(name='Mean/Mode', x=quantiles, y=mode_freq),
    go.Bar(name='Discriminative DL', x=quantiles, y=dl_freq),
    go.Bar(name='VAE', x=quantiles, y=vae_freq),
    go.Bar(name='GAIN', x=quantiles, y=gain_freq)
])
# Change the bar mode
fig.update_layout(barmode='stack')
fig.show()
fig.write_image("improv_rel_to_av_all_DC_no_av_incl_only_best_per_method.pdf")

In [98]:
# split barchart stacks into missingness fractions

quantile_datasets = [df_01, df01, df03, df05, df07, df09]

fractions = ['0.01', '0.1', '0.3', '0.5']
print(fractions)


freq_001 = []
freq_01 = []
freq_03 = []
freq_05 = []
#print(quantile_datasets)

for i in fractions:
    for j in quantile_datasets:
        df_temp = j.copy()
        df_temp = df_temp[df_temp['Missing Fraction'].str.contains(i)]
        df_temp_len = len(df_temp.index)
        if (i == '0.01'):
            freq_001.append(df_temp_len)
        elif (i == '0.1'):
            freq_01.append(df_temp_len)                                       
        elif (i == '0.3'):
            freq_03.append(df_temp_len)                                                 
        elif (i == '0.5'):
            freq_05.append(df_temp_len)                                       
                                        
                                       
print(freq_001)
print(freq_01)
print(freq_03)
print(freq_05)

['0.01', '0.1', '0.3', '0.5']
[57, 0, 0, 0, 0, 0]
[57, 0, 0, 0, 0, 0]
[53, 3, 0, 0, 0, 1]
[51, 4, 2, 0, 0, 0]


In [99]:
quantiles = ['less than 0.01', '0.01 to 0.03', '0.03 to 0.05', '0.05 to 0.07', '0.07 to 0.09', 'more than 0.09']


fig = go.Figure(data=[
    go.Bar(name='1% Missing Data', x=quantiles, y=freq_001, marker_color='#FD3216'),
    go.Bar(name='10% Missing Data', x=quantiles, y=freq_01, marker_color='#00FE35'),
    go.Bar(name='30% Missing Data', x=quantiles, y=freq_03, marker_color='#511CFB'),
    go.Bar(name='50% Missing Data', x=quantiles, y=freq_05, marker_color='#FF7F0E'),
])
# Change the bar mode
fig.update_layout(barmode='stack')
fig.show()
fig.write_image("improv_rel_to_av_all_DC_no_av_incl_only_best_per_frac.pdf")

In [100]:
# split barchart stacks into missingness fractions

quantile_datasets = [df_01, df01, df03, df05, df07, df09]

fractions = ['MCAR', 'MAR', 'MNAR']
print(fractions)


freq_001 = []
freq_01 = []
freq_03 = []
#print(quantile_datasets)

for i in fractions:
    for j in quantile_datasets:
        df_temp = j.copy()
        df_temp = df_temp[df_temp['Missing Type'].str.contains(i)]
        df_temp_len = len(df_temp.index)
        if (i == 'MCAR'):
            freq_001.append(df_temp_len)
        elif (i == 'MAR'):
            freq_01.append(df_temp_len)                                       
        elif (i == 'MNAR'):
            freq_03.append(df_temp_len)                                                 

                                       
print(freq_001)
print(freq_01)
print(freq_03)

['MCAR', 'MAR', 'MNAR']
[76, 0, 0, 0, 0, 0]
[73, 3, 0, 0, 0, 0]
[69, 4, 2, 0, 0, 1]


In [101]:
quantiles = ['less than 0.01', '0.01 to 0.03', '0.03 to 0.05', '0.05 to 0.07', '0.07 to 0.09', 'more than 0.09']


fig = go.Figure(data=[
    go.Bar(name='MCAR', x=quantiles, y=freq_001, marker_color='#222A2A'),
    go.Bar(name='MAR', x=quantiles, y=freq_01, marker_color='#B68100'),
    go.Bar(name='MNAR', x=quantiles, y=freq_03, marker_color='#750D86'),
])
# Change the bar mode
fig.update_layout(barmode='stack')
fig.show()
fig.write_image("improv_rel_to_av_all_DC_no_av_incl_only_best_per_patt.pdf")

## Extract datasets for Automated Imputation Method Selection -> not used for thesis

To Do: Explore the possibility, that the average best method replaces the best method for a data constellation, if the improvement gain for the best method is below 1%

### Potential Features:
Missingess Pattern (Missing Type)  
Missing Fraction (Missing Fraction)  
Datapoints (NumberOfInstances)  
Features in total (NumberOfFeatures)  
Numeric Features (NumberOfNumericFeatures)  
Categorical Features (NumberOfCategoricalFeatures)  
Downstream Task Type -> Classification/Regression (metric)
  
    
      
Label: Best Imputation Method (Imputation_Method)

In [102]:
# Use dataset with only the best method for each data constellation
rank_1_backup.to_csv('rank_1_backup.csv')
rank_1_backup

Unnamed: 0,Imputation_Method,Task,Missing Type,Missing Fraction,Column,result_type,metric,Baseline,Corrupted,Imputed,...,name,MajorityClassSize,MinorityClassSize,NumberOfFeatures,NumberOfInstances,NumberOfNumericFeatures,NumberOfCategoricalFeatures,NumberOfClasses,Downstream Performance Rank,Data_Constellation
0,GAIN,189,MAR,0.01,theta8,downstream_performance_mean,Regression Tasks,0.204873,0.0,0.204926,...,kin8nm,,,9.0,8192.0,9.0,0.0,,1.0,MAR - 0.01
6,Discriminative DL,189,MAR,0.1,theta8,downstream_performance_mean,Regression Tasks,0.204813,0.0,0.204953,...,kin8nm,,,9.0,8192.0,9.0,0.0,,1.0,MAR - 0.1
12,Mean/Mode,189,MAR,0.3,theta8,downstream_performance_mean,Regression Tasks,0.204820,0.0,0.205069,...,kin8nm,,,9.0,8192.0,9.0,0.0,,1.0,MAR - 0.3
18,Discriminative DL,189,MAR,0.5,theta8,downstream_performance_mean,Regression Tasks,0.205841,0.0,0.205878,...,kin8nm,,,9.0,8192.0,9.0,0.0,,1.0,MAR - 0.5
24,Discriminative DL,189,MCAR,0.01,theta8,downstream_performance_mean,Regression Tasks,0.205083,0.0,0.204972,...,kin8nm,,,9.0,8192.0,9.0,0.0,,1.0,MCAR - 0.01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1312,Random Forest,42712,MCAR,0.5,humidity,downstream_performance_mean,Regression Tasks,148.991778,0.0,148.880924,...,Bike_Sharing_Demand,,,13.0,17379.0,9.0,4.0,,1.0,MCAR - 0.5
1318,Discriminative DL,42712,MNAR,0.01,humidity,downstream_performance_mean,Regression Tasks,149.507994,0.0,149.475688,...,Bike_Sharing_Demand,,,13.0,17379.0,9.0,4.0,,1.0,MNAR - 0.01
1324,Discriminative DL,42712,MNAR,0.1,humidity,downstream_performance_mean,Regression Tasks,149.523046,0.0,149.372273,...,Bike_Sharing_Demand,,,13.0,17379.0,9.0,4.0,,1.0,MNAR - 0.1
1330,Discriminative DL,42712,MNAR,0.3,humidity,downstream_performance_mean,Regression Tasks,149.431371,0.0,149.346814,...,Bike_Sharing_Demand,,,13.0,17379.0,9.0,4.0,,1.0,MNAR - 0.3


In [103]:
# Dataset for Training 
properties_train_dataset_8 = rank_1_backup.copy()
properties_train_dataset_8 = properties_train_dataset_8[['Imputation_Method','Missing Type','Missing Fraction',
                                                         'NumberOfInstances','NumberOfFeatures','NumberOfNumericFeatures',
                                                         'NumberOfCategoricalFeatures','metric']]

properties_train_dataset_8


Unnamed: 0,Imputation_Method,Missing Type,Missing Fraction,NumberOfInstances,NumberOfFeatures,NumberOfNumericFeatures,NumberOfCategoricalFeatures,metric
0,GAIN,MAR,0.01,8192.0,9.0,9.0,0.0,Regression Tasks
6,Discriminative DL,MAR,0.1,8192.0,9.0,9.0,0.0,Regression Tasks
12,Mean/Mode,MAR,0.3,8192.0,9.0,9.0,0.0,Regression Tasks
18,Discriminative DL,MAR,0.5,8192.0,9.0,9.0,0.0,Regression Tasks
24,Discriminative DL,MCAR,0.01,8192.0,9.0,9.0,0.0,Regression Tasks
...,...,...,...,...,...,...,...,...
1312,Random Forest,MCAR,0.5,17379.0,13.0,9.0,4.0,Regression Tasks
1318,Discriminative DL,MNAR,0.01,17379.0,13.0,9.0,4.0,Regression Tasks
1324,Discriminative DL,MNAR,0.1,17379.0,13.0,9.0,4.0,Regression Tasks
1330,Discriminative DL,MNAR,0.3,17379.0,13.0,9.0,4.0,Regression Tasks


In [104]:
# Dataset for Training 
properties_train_dataset_7 = rank_1_backup.copy()
properties_train_dataset_7 = properties_train_dataset_7[['Imputation_Method','Missing Type','Missing Fraction',
                                                         'NumberOfInstances','NumberOfFeatures','NumberOfNumericFeatures',
                                                         'NumberOfCategoricalFeatures']]

properties_train_dataset_7
properties_train_dataset_7.to_csv('properties_train_dataset_7.csv', index=False)