# Visualize Results: Downstream Performance - Subset Binary Classification Corrupted Experiments -> Training and Test identically imputed


[Set Average Best Imputation Method Manually](#Set-Average-Best-Imputation-Method-Manually)


This notebook should answer the question: *Does imputation lead to better downstream performance?*

The data needs to be preprocessed with another notebook; here we only import two CSV files containing the raw experiment results and information about the datasets used.

## Notebook Structure 

* Application Scenario 2 - Downstream Performance  
   * Categorical  Columns (Classification)
   * Numerical Columns (Regression)
   * Heterogeneous Columns (Classification and Regression Combined)

In [105]:
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import os
import pandas as pd
import re
import seaborn as sns
from pandas.api.types import CategoricalDtype
from pathlib import Path

import plotly as py
import plotly.express as px
import plotly.graph_objects as go
import xarray as xr


%matplotlib inline

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Settings

In [106]:
# Global plot styling: whitegrid theme in the "paper" context with enlarged fonts.
sns.set(context='paper', style="whitegrid", font_scale=1.5)
# Applied after the seaborn call so the line width is not overridden by the context.
mpl.rcParams.update({'lines.linewidth': '2'})

In [107]:
# Display labels that replace the raw metric names (F1_macro / RMSE) further below.
CLF_METRIC, REG_METRIC = "Classification Tasks", "Regression Tasks"

# Values of the `result_type` column used to select rows from the raw results.
DOWNSTREAM_RESULT_TYPE, IMPUTE_RESULT_TYPE = (
    "downstream_performance_mean",
    "impute_performance_mean",
)


## Data Preparation

In [108]:
# Load the preprocessed experiment results (produced by the preprocessing notebook).
results_csv = '../binary_classification_corrupted_subset.csv'
results = pd.read_csv(results_csv)
results

Unnamed: 0,experiment,imputer,task,missing_type,missing_fraction,strategy,column,result_type,metric,train,test,baseline,corrupted,imputed
0,corrupted_binary_experiment_subset,AutoKerasImputer,1046,MAR,0.01,single_single,end,impute_performance_std,MAE,1.137442e+05,3.887922e+04,,,
1,corrupted_binary_experiment_subset,AutoKerasImputer,1046,MAR,0.01,single_single,end,impute_performance_std,MSE,5.654021e+10,2.603024e+10,,,
2,corrupted_binary_experiment_subset,AutoKerasImputer,1046,MAR,0.01,single_single,end,impute_performance_std,RMSE,1.080402e+05,6.591475e+04,,,
3,corrupted_binary_experiment_subset,AutoKerasImputer,1046,MAR,0.10,single_single,end,impute_performance_std,MAE,1.137216e+05,6.174183e+04,,,
4,corrupted_binary_experiment_subset,AutoKerasImputer,1046,MAR,0.10,single_single,end,impute_performance_std,MSE,8.905253e+10,1.204022e+11,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26659,corrupted_binary_experiment_subset,VAEImputer,923,MNAR,0.30,single_single,isns,downstream_performance_mean,F1_macro,,,0.964694,0.0,0.964655
26660,corrupted_binary_experiment_subset,VAEImputer,923,MNAR,0.30,single_single,isns,downstream_performance_mean,F1_weighted,,,0.965209,0.0,0.965186
26661,corrupted_binary_experiment_subset,VAEImputer,923,MNAR,0.50,single_single,isns,downstream_performance_mean,F1_micro,,,0.947977,0.0,0.938343
26662,corrupted_binary_experiment_subset,VAEImputer,923,MNAR,0.50,single_single,isns,downstream_performance_mean,F1_macro,,,0.947272,0.0,0.937742


In [109]:
# Identify experiments whose imputation failed.
#
# Keep only the impute-performance rows for the two metrics of interest, drop
# the downstream-only columns, and retain the rows that still contain a NaN.
# The resulting frame is used further below to filter the downstream results.
impute_rows = (
    (results["result_type"] == IMPUTE_RESULT_TYPE)
    & results["metric"].isin(["F1_macro", "RMSE"])
)

# Non-inplace drop: `inplace=True` on a filtered selection triggers pandas'
# SettingWithCopyWarning and makes the cell harder to re-run safely.
na_impute_results = results.loc[impute_rows].drop(columns=["baseline", "corrupted", "imputed"])
na_impute_results = na_impute_results[na_impute_results.isna().any(axis=1)]
na_impute_results.shape



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



(0, 11)

In [110]:
# check if strategy type is correct!
STRATEGY_TYPE = "single_single"

# The analysis assumes the raw results contain exactly one strategy.
# Checked up front so a violation is reported before any frame is rebuilt.
assert results["strategy"].nunique(dropna=False) == 1

# Downstream-performance rows for the two metrics of interest and the selected strategy.
downstream_rows = (
    (results["result_type"] == DOWNSTREAM_RESULT_TYPE)
    & results["metric"].isin(["F1_macro", "RMSE"])
    & (results["strategy"] == STRATEGY_TYPE)
)

# remove experiments where imputation failed:
# left-join against the failed-imputation rows and keep only the rows WITHOUT
# a match ("left_only"). `validate` makes pandas raise if the key columns do
# not identify rows uniquely on both sides.
downstream_results = results.loc[downstream_rows].merge(
    na_impute_results,
    how="left",
    validate="one_to_one",
    indicator=True,
    suffixes=("", "_imp"),
    on=["experiment", "imputer", "task", "missing_type", "missing_fraction", "strategy", "column"],
)
downstream_results = downstream_results[downstream_results["_merge"] == "left_only"]

# Drop bookkeeping columns and the columns contributed by the impute-side of
# the merge (suffix "_imp"). Non-inplace to avoid mutating a filtered selection.
downstream_results = downstream_results.drop(
    columns=[
        "experiment", "strategy", "result_type_imp", "metric_imp",
        "train", "test", "train_imp", "test_imp", "_merge",
    ]
)

# Human-readable column names used throughout the rest of the notebook.
downstream_results = downstream_results.rename(
    columns={
        "imputer": "Imputation_Method",
        "task": "Task",
        "missing_type": "Missing Type",
        "missing_fraction": "Missing Fraction",
        "column": "Column",
        "baseline": "Baseline",
        "imputed": "Imputed_Subset",
        "corrupted": "Corrupted",
    }
)

In [111]:
# Display names for the imputer class names found in the raw results.
rename_imputer_dict = dict(
    ModeImputer="Mean/Mode",
    KNNImputer="KNN",
    ForestImputer="Random Forest",
    AutoKerasImputer="Discriminative DL",
    VAEImputer="VAE",
    GAINImputer="GAIN",
)

# Raw metric names mapped to the task family they represent.
rename_metric_dict = {"F1_macro": CLF_METRIC, "RMSE": REG_METRIC}

# `DataFrame.replace` with a dict substitutes matching values in every column.
downstream_results = (
    downstream_results
    .replace(rename_imputer_dict)
    .replace(rename_metric_dict)
)

downstream_results

Unnamed: 0,Imputation_Method,Task,Missing Type,Missing Fraction,Column,result_type,metric,Baseline,Corrupted,Imputed_Subset
0,Discriminative DL,1046,MAR,0.01,end,downstream_performance_mean,Classification Tasks,0.803655,0.0,0.803655
1,Discriminative DL,1046,MAR,0.10,end,downstream_performance_mean,Classification Tasks,0.801405,0.0,0.804529
2,Discriminative DL,1046,MAR,0.30,end,downstream_performance_mean,Classification Tasks,0.805895,0.0,0.799645
3,Discriminative DL,1046,MCAR,0.01,end,downstream_performance_mean,Classification Tasks,0.799632,0.0,0.799632
4,Discriminative DL,1046,MCAR,0.30,end,downstream_performance_mean,Classification Tasks,0.785320,0.0,0.794331
...,...,...,...,...,...,...,...,...,...,...
2217,VAE,923,MCAR,0.50,isns,downstream_performance_mean,Classification Tasks,0.848246,0.0,0.861480
2218,VAE,923,MNAR,0.01,isns,downstream_performance_mean,Classification Tasks,0.964694,0.0,0.964694
2219,VAE,923,MNAR,0.10,isns,downstream_performance_mean,Classification Tasks,0.964694,0.0,0.964694
2220,VAE,923,MNAR,0.30,isns,downstream_performance_mean,Classification Tasks,0.964694,0.0,0.964655


In [112]:
#downstream_results.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2222 entries, 0 to 2221
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Imputation_Method  2222 non-null   object 
 1   Task               2222 non-null   int64  
 2   Missing Type       2222 non-null   object 
 3   Missing Fraction   2222 non-null   float64
 4   Column             2222 non-null   object 
 5   result_type        2222 non-null   object 
 6   metric             2222 non-null   object 
 7   Baseline           2222 non-null   float64
 8   Corrupted          2222 non-null   float64
 9   Imputed_Subset     2222 non-null   float64
dtypes: float64(4), int64(1), object(5)
memory usage: 191.0+ KB


### Robustness: Check which Imputers Yielded `NaN` Values

In [113]:
# For every column that contains NaNs, print the NaN count and how the
# affected rows are distributed over the imputation methods.
separator = "-----" * 10
for col in downstream_results.columns:
    na_idx = downstream_results[col].isna()
    na_sum = na_idx.sum()
    if na_sum > 0:
        print(separator)
        print(col, na_sum)
        print(separator)
        # The column is named "Imputation_Method" (with underscore) after the
        # rename above; the previous lookup of "Imputation Method" raised a
        # KeyError as soon as any NaN was present.
        print(downstream_results.loc[na_idx, "Imputation_Method"].value_counts(dropna=False))
        print("\n")

## Adding Dataset Info, Sorting and Ranking

In [114]:
# Boolean row selectors for the two task families.
clf_row_idx = downstream_results["metric"].eq(CLF_METRIC)
reg_row_idx = downstream_results["metric"].eq(REG_METRIC)

# Sorting of data

# Ordered categorical for the imputation methods; per the original note the
# order is meant to reflect processing time (fastest first).
methods_order = CategoricalDtype(
    categories=['Mean/Mode', 'KNN', 'Random Forest', 'VAE', 'GAIN', 'Discriminative DL'],
    ordered=True,
)

sort_keys = ['Task', 'Missing Type', 'Missing Fraction', 'Imputed_Subset', 'Imputation_Method']
downstream_results_full_sort = (
    downstream_results
    .assign(Imputation_Method=downstream_results['Imputation_Method'].astype(methods_order))
    .sort_values(sort_keys, ascending=True)
)
downstream_results_full_sort


Unnamed: 0,Imputation_Method,Task,Missing Type,Missing Fraction,Column,result_type,metric,Baseline,Corrupted,Imputed_Subset
31,Discriminative DL,137,MAR,0.01,top-middle-square,downstream_performance_mean,Classification Tasks,0.695789,0.0,0.694867
1142,KNN,137,MAR,0.01,top-middle-square,downstream_performance_mean,Classification Tasks,0.698074,0.0,0.698074
398,Random Forest,137,MAR,0.01,top-middle-square,downstream_performance_mean,Classification Tasks,0.698074,0.0,0.698074
770,GAIN,137,MAR,0.01,top-middle-square,downstream_performance_mean,Classification Tasks,0.698555,0.0,0.698555
1514,Mean/Mode,137,MAR,0.01,top-middle-square,downstream_performance_mean,Classification Tasks,0.699217,0.0,0.699217
...,...,...,...,...,...,...,...,...,...,...
1729,Mean/Mode,42493,MNAR,0.50,Length,downstream_performance_mean,Classification Tasks,0.591024,0.0,0.592436
985,GAIN,42493,MNAR,0.50,Length,downstream_performance_mean,Classification Tasks,0.592603,0.0,0.592837
613,Random Forest,42493,MNAR,0.50,Length,downstream_performance_mean,Classification Tasks,0.594657,0.0,0.595566
1357,KNN,42493,MNAR,0.50,Length,downstream_performance_mean,Classification Tasks,0.612391,0.0,0.606716


In [115]:
#downstream_results_full_sort.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2222 entries, 31 to 2101
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype   
---  ------             --------------  -----   
 0   Imputation_Method  2222 non-null   category
 1   Task               2222 non-null   int64   
 2   Missing Type       2222 non-null   object  
 3   Missing Fraction   2222 non-null   float64 
 4   Column             2222 non-null   object  
 5   result_type        2222 non-null   object  
 6   metric             2222 non-null   object  
 7   Baseline           2222 non-null   float64 
 8   Corrupted          2222 non-null   float64 
 9   Imputed_Subset     2222 non-null   float64 
dtypes: category(1), float64(4), int64(1), object(4)
memory usage: 176.0+ KB


In [116]:
# Attach the dataset metadata from the second CSV file. Its `did` column holds
# the identifier that is called `Task` in the results, so it is renamed to
# serve as the join key.
dataset_info = (
    pd.read_csv('../datasets_information_overview.csv')
    .rename(columns={"did": "Task"})
)

# Default (inner) merge on the shared `Task` column.
downstream_results_full_sort = downstream_results_full_sort.merge(dataset_info, on='Task')
#downstream_results_full_sort.head()

Unnamed: 0.1,Imputation_Method,Task,Missing Type,Missing Fraction,Column,result_type,metric,Baseline,Corrupted,Imputed_Subset,Unnamed: 0,name,MajorityClassSize,MinorityClassSize,NumberOfFeatures,NumberOfInstances,NumberOfNumericFeatures,NumberOfCategoricalFeatures,NumberOfClasses
0,Discriminative DL,137,MAR,0.01,top-middle-square,downstream_performance_mean,Classification Tasks,0.695789,0.0,0.694867,34,BNG(tic-tac-toe),25702.0,13664.0,10.0,39366.0,0.0,10.0,
1,KNN,137,MAR,0.01,top-middle-square,downstream_performance_mean,Classification Tasks,0.698074,0.0,0.698074,34,BNG(tic-tac-toe),25702.0,13664.0,10.0,39366.0,0.0,10.0,
2,Random Forest,137,MAR,0.01,top-middle-square,downstream_performance_mean,Classification Tasks,0.698074,0.0,0.698074,34,BNG(tic-tac-toe),25702.0,13664.0,10.0,39366.0,0.0,10.0,
3,GAIN,137,MAR,0.01,top-middle-square,downstream_performance_mean,Classification Tasks,0.698555,0.0,0.698555,34,BNG(tic-tac-toe),25702.0,13664.0,10.0,39366.0,0.0,10.0,
4,Mean/Mode,137,MAR,0.01,top-middle-square,downstream_performance_mean,Classification Tasks,0.699217,0.0,0.699217,34,BNG(tic-tac-toe),25702.0,13664.0,10.0,39366.0,0.0,10.0,


In [117]:
# Ranking of downstream performance per data constellation for every imputation method

# Columns that together identify one experimental condition; the imputation
# methods are ranked against each other within each such group.
EXPERIMENTAL_CONDITIONS = ["Task", "Missing Type", "Missing Fraction", "Column", "result_type"]

downstream_results_rank = downstream_results_full_sort.copy()

# Select the score column BEFORE ranking: ranking the whole grouped frame
# computes ranks for every column (and can fail on non-numeric ones) only to
# discard all but one of them.
# Rank 1 = highest score; NaNs are ranked last; ties are broken by row order.
downstream_results_rank["Downstream Performance Rank Subset"] = (
    downstream_results_rank
    .groupby(EXPERIMENTAL_CONDITIONS)["Imputed_Subset"]
    .rank(ascending=False, na_option="bottom", method="first")
)


# create csv for detailed checks
downstream_results_rank.to_csv('downstream_results_binary_complete_overview.csv')
downstream_results_rank.head()


Unnamed: 0.1,Imputation_Method,Task,Missing Type,Missing Fraction,Column,result_type,metric,Baseline,Corrupted,Imputed_Subset,Unnamed: 0,name,MajorityClassSize,MinorityClassSize,NumberOfFeatures,NumberOfInstances,NumberOfNumericFeatures,NumberOfCategoricalFeatures,NumberOfClasses,Downstream Performance Rank Subset
0,Discriminative DL,137,MAR,0.01,top-middle-square,downstream_performance_mean,Classification Tasks,0.695789,0.0,0.694867,34,BNG(tic-tac-toe),25702.0,13664.0,10.0,39366.0,0.0,10.0,,6.0
1,KNN,137,MAR,0.01,top-middle-square,downstream_performance_mean,Classification Tasks,0.698074,0.0,0.698074,34,BNG(tic-tac-toe),25702.0,13664.0,10.0,39366.0,0.0,10.0,,4.0
2,Random Forest,137,MAR,0.01,top-middle-square,downstream_performance_mean,Classification Tasks,0.698074,0.0,0.698074,34,BNG(tic-tac-toe),25702.0,13664.0,10.0,39366.0,0.0,10.0,,5.0
3,GAIN,137,MAR,0.01,top-middle-square,downstream_performance_mean,Classification Tasks,0.698555,0.0,0.698555,34,BNG(tic-tac-toe),25702.0,13664.0,10.0,39366.0,0.0,10.0,,3.0
4,Mean/Mode,137,MAR,0.01,top-middle-square,downstream_performance_mean,Classification Tasks,0.699217,0.0,0.699217,34,BNG(tic-tac-toe),25702.0,13664.0,10.0,39366.0,0.0,10.0,,2.0


In [118]:
# Convert Imputation_Method from the ordered categorical back to plain object dtype.
downstream_results_rank = downstream_results_rank.astype({'Imputation_Method': 'object'})

#downstream_results_rank.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2222 entries, 0 to 2221
Data columns (total 20 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   Imputation_Method                   2222 non-null   object 
 1   Task                                2222 non-null   int64  
 2   Missing Type                        2222 non-null   object 
 3   Missing Fraction                    2222 non-null   float64
 4   Column                              2222 non-null   object 
 5   result_type                         2222 non-null   object 
 6   metric                              2222 non-null   object 
 7   Baseline                            2222 non-null   float64
 8   Corrupted                           2222 non-null   float64
 9   Imputed_Subset                      2222 non-null   float64
 10  Unnamed: 0                          2222 non-null   int64  
 11  name                                2222 no

In [119]:
# Combine "Missing Type" and "Missing Fraction" into one label column.

# Both parts are cast to string first so they can be concatenated.
for key_col in ('Missing Type', 'Missing Fraction'):
    downstream_results_rank[key_col] = downstream_results_rank[key_col].astype(str)

# Label format: "<Missing Type> - <Missing Fraction>", e.g. "MAR - 0.01".
missing_type_part = downstream_results_rank['Missing Type']
missing_fraction_part = downstream_results_rank['Missing Fraction']
downstream_results_rank['Data_Constellation'] = missing_type_part + ' - ' + missing_fraction_part

# Independent snapshot of the frame at this stage.
downstream_results_rank_heatmap2 = downstream_results_rank.copy()
downstream_results_rank


Unnamed: 0,Imputation_Method,Task,Missing Type,Missing Fraction,Column,result_type,metric,Baseline,Corrupted,Imputed_Subset,...,name,MajorityClassSize,MinorityClassSize,NumberOfFeatures,NumberOfInstances,NumberOfNumericFeatures,NumberOfCategoricalFeatures,NumberOfClasses,Downstream Performance Rank Subset,Data_Constellation
0,Discriminative DL,137,MAR,0.01,top-middle-square,downstream_performance_mean,Classification Tasks,0.695789,0.0,0.694867,...,BNG(tic-tac-toe),25702.0,13664.0,10.0,39366.0,0.0,10.0,,6.0,MAR - 0.01
1,KNN,137,MAR,0.01,top-middle-square,downstream_performance_mean,Classification Tasks,0.698074,0.0,0.698074,...,BNG(tic-tac-toe),25702.0,13664.0,10.0,39366.0,0.0,10.0,,4.0,MAR - 0.01
2,Random Forest,137,MAR,0.01,top-middle-square,downstream_performance_mean,Classification Tasks,0.698074,0.0,0.698074,...,BNG(tic-tac-toe),25702.0,13664.0,10.0,39366.0,0.0,10.0,,5.0,MAR - 0.01
3,GAIN,137,MAR,0.01,top-middle-square,downstream_performance_mean,Classification Tasks,0.698555,0.0,0.698555,...,BNG(tic-tac-toe),25702.0,13664.0,10.0,39366.0,0.0,10.0,,3.0,MAR - 0.01
4,Mean/Mode,137,MAR,0.01,top-middle-square,downstream_performance_mean,Classification Tasks,0.699217,0.0,0.699217,...,BNG(tic-tac-toe),25702.0,13664.0,10.0,39366.0,0.0,10.0,,2.0,MAR - 0.01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2217,Mean/Mode,42493,MNAR,0.5,Length,downstream_performance_mean,Classification Tasks,0.591024,0.0,0.592436,...,airlines,14934.0,12035.0,8.0,26969.0,2.0,6.0,,5.0,MNAR - 0.5
2218,GAIN,42493,MNAR,0.5,Length,downstream_performance_mean,Classification Tasks,0.592603,0.0,0.592837,...,airlines,14934.0,12035.0,8.0,26969.0,2.0,6.0,,4.0,MNAR - 0.5
2219,Random Forest,42493,MNAR,0.5,Length,downstream_performance_mean,Classification Tasks,0.594657,0.0,0.595566,...,airlines,14934.0,12035.0,8.0,26969.0,2.0,6.0,,3.0,MNAR - 0.5
2220,KNN,42493,MNAR,0.5,Length,downstream_performance_mean,Classification Tasks,0.612391,0.0,0.606716,...,airlines,14934.0,12035.0,8.0,26969.0,2.0,6.0,,2.0,MNAR - 0.5


## Analyzing Performance Based on Rank per Data Constellation

In [120]:
data = downstream_results_rank.copy()

rank_col = 'Downstream Performance Rank Subset'
divider = "_____________________"

# Number of distinct values in "Data_Constellation".
dc_unique = len(data['Data_Constellation'].unique())
print(dc_unique, "Data Constellations")
print(divider)

# How often each rank position occurs (the count of rank 1.0 should match the
# number of ranked groups).
rank_count = data[rank_col].value_counts()
print(rank_count)
print(divider)

# Rows that reached rank 1.0, exported as CSV for inspection.
rank_1 = data.loc[data[rank_col].eq(1.0)]
rank_1.to_csv('rank_1.csv')

print(divider)
# Number of "wins" (rank 1.0) per imputation method.
rank_wins = rank_1['Imputation_Method'].value_counts()
print(rank_wins)
print(divider)


# CAUTION: the average rank below only covers results that are present; missing
# results (which would count as the worst rank) are not taken into account.
# Per imputation method: number of results, rank distribution and mean rank.
methods = ['Random Forest', 'KNN', 'Mean/Mode', 'VAE', 'GAIN', 'Discriminative DL']
for method_name in methods:
    df_average_rank = data.loc[data['Imputation_Method'] == method_name]
    print(len(df_average_rank), "Amount of results available")
    print(df_average_rank[rank_col].value_counts().sort_index(ascending=True))
    average_rank = df_average_rank[rank_col].mean()
    print("Average Rank for", method_name, "is", average_rank)
    print(divider)

12 Data Constellations
_____________________
4.0    372
5.0    372
3.0    372
2.0    372
1.0    372
6.0    362
Name: Downstream Performance Rank Subset, dtype: int64
_____________________
_____________________
Mean/Mode            101
Random Forest         68
KNN                   66
Discriminative DL     49
VAE                   46
GAIN                  42
Name: Imputation_Method, dtype: int64
_____________________
372 Amount of results available
1.0    68
2.0    60
3.0    88
4.0    52
5.0    58
6.0    46
Name: Downstream Performance Rank Subset, dtype: int64
Average Rank for Random Forest is 3.295698924731183
_____________________
372 Amount of results available
1.0    66
2.0    88
3.0    70
4.0    55
5.0    53
6.0    40
Name: Downstream Performance Rank Subset, dtype: int64
Average Rank for KNN is 3.163978494623656
_____________________
372 Amount of results available
1.0    101
2.0     88
3.0     52
4.0     56
5.0     41
6.0     34
Name: Downstream Performance Rank Subset, dtype: i

In [121]:
# Keep an untouched deep copy of the rank-1 rows before `rank_1` is reshaped below.
rank_1_backup = rank_1.copy(deep=True)
rank_1


Unnamed: 0,Imputation_Method,Task,Missing Type,Missing Fraction,Column,result_type,metric,Baseline,Corrupted,Imputed_Subset,...,name,MajorityClassSize,MinorityClassSize,NumberOfFeatures,NumberOfInstances,NumberOfNumericFeatures,NumberOfCategoricalFeatures,NumberOfClasses,Downstream Performance Rank Subset,Data_Constellation
5,VAE,137,MAR,0.01,top-middle-square,downstream_performance_mean,Classification Tasks,0.699699,0.0,0.699699,...,BNG(tic-tac-toe),25702.0,13664.0,10.0,39366.0,0.0,10.0,,1.0,MAR - 0.01
11,VAE,137,MAR,0.1,top-middle-square,downstream_performance_mean,Classification Tasks,0.700844,0.0,0.700844,...,BNG(tic-tac-toe),25702.0,13664.0,10.0,39366.0,0.0,10.0,,1.0,MAR - 0.1
17,VAE,137,MAR,0.3,top-middle-square,downstream_performance_mean,Classification Tasks,0.702163,0.0,0.700799,...,BNG(tic-tac-toe),25702.0,13664.0,10.0,39366.0,0.0,10.0,,1.0,MAR - 0.3
23,KNN,137,MAR,0.5,top-middle-square,downstream_performance_mean,Classification Tasks,0.705702,0.0,0.706240,...,BNG(tic-tac-toe),25702.0,13664.0,10.0,39366.0,0.0,10.0,,1.0,MAR - 0.5
29,KNN,137,MCAR,0.01,top-middle-square,downstream_performance_mean,Classification Tasks,0.699217,0.0,0.699217,...,BNG(tic-tac-toe),25702.0,13664.0,10.0,39366.0,0.0,10.0,,1.0,MCAR - 0.01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2197,Random Forest,42493,MCAR,0.5,Length,downstream_performance_mean,Classification Tasks,0.606068,0.0,0.606759,...,airlines,14934.0,12035.0,8.0,26969.0,2.0,6.0,,1.0,MCAR - 0.5
2201,Mean/Mode,42493,MNAR,0.01,Length,downstream_performance_mean,Classification Tasks,0.614237,0.0,0.614237,...,airlines,14934.0,12035.0,8.0,26969.0,2.0,6.0,,1.0,MNAR - 0.01
2208,Mean/Mode,42493,MNAR,0.1,Length,downstream_performance_mean,Classification Tasks,0.612068,0.0,0.614235,...,airlines,14934.0,12035.0,8.0,26969.0,2.0,6.0,,1.0,MNAR - 0.1
2215,Mean/Mode,42493,MNAR,0.3,Length,downstream_performance_mean,Classification Tasks,0.598213,0.0,0.600125,...,airlines,14934.0,12035.0,8.0,26969.0,2.0,6.0,,1.0,MNAR - 0.3


## Set Average Best Imputation Method Manually

In [122]:
# Manually chosen reference method, picked from the rank statistics printed above.
# A baseline method may be set here instead, depending on the goal of the analysis.

AVERAGE_BEST_IMPUTATION_METHOD = "Mean/Mode"

## Differences in Performance Relative to Average Best Imputation Method

In [123]:
# Pair, for every data constellation and task, the result of the manually
# selected reference method with the result of the rank-1 method.

# Rows of the reference method. `.copy()` gives an independent frame so the
# column assignments below do not raise SettingWithCopyWarning.
av_best = data.loc[data['Imputation_Method'] == AVERAGE_BEST_IMPUTATION_METHOD].copy()
av_best['Task'] = av_best['Task'].astype(str)
# Extend the constellation label with the task id to make it a join key.
av_best['Data_Constellation'] = av_best['Data_Constellation'] + ' - ' + av_best['Task']

av_best = av_best.rename(columns={'Imputation_Method':'Imputation_Method_average',
                               'Imputed_Subset':'Imputed_average_Subset',
                                 'Downstream Performance Rank Subset':'Downstream Performance Rank Average Subset'})


# Start from the untouched backup rather than from `rank_1` itself: `rank_1`
# is overwritten with a renamed column subset below, so reading it here would
# make this cell fail (KeyError on 'Task') when it is run a second time.
rank_1 = rank_1_backup.copy()
rank_1['Task'] = rank_1['Task'].astype(str)
rank_1['Data_Constellation'] = rank_1['Data_Constellation'] + ' - ' + rank_1['Task']
rank_1 = rank_1[['Imputation_Method', 'Imputed_Subset', 'Data_Constellation', 'Downstream Performance Rank Subset']]
rank_1 = rank_1.rename(columns={'Imputation_Method':'Imputation_Method_best',
                               'Imputed_Subset':'Imputed_best_Subset',
                               'Downstream Performance Rank Subset':'Downstream Performance Rank Best Subset'})

# Inner merge on the extended "<Missing Type> - <Missing Fraction> - <Task>" label.
performance_difference = pd.merge(av_best, rank_1, on='Data_Constellation')



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/

In [124]:
# Absolute score gap between the rank-1 method of each data constellation and
# the manually selected average-best method, followed by its mean over all rows.
diff_col = 'Performance Difference Best to Average'
best_scores = performance_difference['Imputed_best_Subset']
reference_scores = performance_difference['Imputed_average_Subset']
performance_difference[diff_col] = best_scores - reference_scores

Average_Difference = performance_difference[diff_col].mean()
print("Average Difference in Improvement from best method to average best method for F1", Average_Difference)


Average Difference in Improvement from best method to average best method for F1 0.017610637359559676


In [125]:
# Persist the comparison table and preview its first rows.
performance_difference.to_csv(path_or_buf='performance_difference.csv')
performance_difference.head(5)

Unnamed: 0,Imputation_Method_average,Task,Missing Type,Missing Fraction,Column,result_type,metric,Baseline,Corrupted,Imputed_average_Subset,...,NumberOfInstances,NumberOfNumericFeatures,NumberOfCategoricalFeatures,NumberOfClasses,Downstream Performance Rank Average Subset,Data_Constellation,Imputation_Method_best,Imputed_best_Subset,Downstream Performance Rank Best Subset,Performance Difference Best to Average
0,Mean/Mode,137,MAR,0.01,top-middle-square,downstream_performance_mean,Classification Tasks,0.699217,0.0,0.699217,...,39366.0,0.0,10.0,,2.0,MAR - 0.01 - 137,VAE,0.699699,1.0,0.000482
1,Mean/Mode,137,MAR,0.1,top-middle-square,downstream_performance_mean,Classification Tasks,0.697587,0.0,0.697368,...,39366.0,0.0,10.0,,5.0,MAR - 0.1 - 137,VAE,0.700844,1.0,0.003475
2,Mean/Mode,137,MAR,0.3,top-middle-square,downstream_performance_mean,Classification Tasks,0.695953,0.0,0.697526,...,39366.0,0.0,10.0,,5.0,MAR - 0.3 - 137,VAE,0.700799,1.0,0.003273
3,Mean/Mode,137,MAR,0.5,top-middle-square,downstream_performance_mean,Classification Tasks,0.697094,0.0,0.697475,...,39366.0,0.0,10.0,,4.0,MAR - 0.5 - 137,KNN,0.70624,1.0,0.008765
4,Mean/Mode,137,MCAR,0.01,top-middle-square,downstream_performance_mean,Classification Tasks,0.698074,0.0,0.698074,...,39366.0,0.0,10.0,,2.0,MCAR - 0.01 - 137,KNN,0.699217,1.0,0.001143


## Analysis and Ranking based on F1 Score

In [126]:
# Relative Difference in Percent -> Best Method to Average Best Method
#
# For every "<Missing Type> - <Missing Fraction> - <Task>" constellation, take
# the row(s) of the manually selected reference method and attach the relative
# gap between the rank-1 score and the reference score.
# NOTE: the stored value is the plain ratio (best - reference) / reference;
# it is NOT multiplied by 100 despite the column name.

data = downstream_results_rank.copy()
data['Task'] = data['Task'].astype(str)
data['Data_Constellation_full'] = data['Data_Constellation'] + ' - ' + data['Task']


dc_unique = data.Data_Constellation_full.unique()

data_constellations = dc_unique.tolist()
methods = ['Random Forest', 'KNN', 'Mean/Mode', 'VAE', 'GAIN', 'Discriminative DL']

# NOTE(review): this task is excluded from the comparison; the reason is not
# visible in this notebook - confirm and document it.
EXCLUDED_TASK = '4135'
DIFF_COLUMN = 'Performance Difference to Best to Average in Percent'

# Collect the per-constellation frames and concatenate once at the end:
# `DataFrame.append` is deprecated and growing a frame inside a loop copies
# all previous rows on every iteration.
average_best_frames = []

# sort=False keeps the constellations in order of first appearance, i.e. the
# same order as `data_constellations`.
for constellation, data_constel in data.groupby('Data_Constellation_full', sort=False):
    best_score = data_constel.loc[data_constel['Downstream Performance Rank Subset'] == 1.0]
    average_best = data_constel.loc[data_constel['Imputation_Method'] == AVERAGE_BEST_IMPUTATION_METHOD]
    # All rows of one constellation share the same task id.
    dataset_number = data_constel['Task'].iloc[0]

    if dataset_number == EXCLUDED_TASK:
        print("4135 else ---------------------")
        continue

    # Skip constellations without a rank-1 row or without a result for the
    # reference method (e.g. a failed imputation) instead of raising IndexError.
    if best_score.empty or average_best.empty:
        continue

    best_score_int = best_score.iloc[0]['Imputed_Subset']
    average_best_int = average_best.iloc[0]['Imputed_Subset']
    calc_result = (best_score_int - average_best_int) / average_best_int

    # `.assign` returns a new frame, so no value is written into a slice of
    # `data` (previously a SettingWithCopyWarning on every iteration).
    average_best_frames.append(average_best.assign(**{DIFF_COLUMN: calc_result}))

# `pd.concat` raises on an empty list; fall back to an empty frame instead.
average_best_complete = pd.concat(average_best_frames) if average_best_frames else pd.DataFrame()

average_best_complete



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing

4135 else ---------------------
4135 else ---------------------
4135 else ---------------------
4135 else ---------------------
4135 else ---------------------
4135 else ---------------------
4135 else ---------------------
4135 else ---------------------
4135 else ---------------------
4135 else ---------------------
4135 else ---------------------
4135 else ---------------------




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing

Unnamed: 0,Imputation_Method,Task,Missing Type,Missing Fraction,Column,result_type,metric,Baseline,Corrupted,Imputed_Subset,...,MinorityClassSize,NumberOfFeatures,NumberOfInstances,NumberOfNumericFeatures,NumberOfCategoricalFeatures,NumberOfClasses,Downstream Performance Rank Subset,Data_Constellation,Data_Constellation_full,Performance Difference to Best to Average in Percent
4,Mean/Mode,137,MAR,0.01,top-middle-square,downstream_performance_mean,Classification Tasks,0.699217,0.0,0.699217,...,13664.0,10.0,39366.0,0.0,10.0,,2.0,MAR - 0.01,MAR - 0.01 - 137,0.000689
7,Mean/Mode,137,MAR,0.1,top-middle-square,downstream_performance_mean,Classification Tasks,0.697587,0.0,0.697368,...,13664.0,10.0,39366.0,0.0,10.0,,5.0,MAR - 0.1,MAR - 0.1 - 137,0.004983
13,Mean/Mode,137,MAR,0.3,top-middle-square,downstream_performance_mean,Classification Tasks,0.695953,0.0,0.697526,...,13664.0,10.0,39366.0,0.0,10.0,,5.0,MAR - 0.3,MAR - 0.3 - 137,0.004693
20,Mean/Mode,137,MAR,0.5,top-middle-square,downstream_performance_mean,Classification Tasks,0.697094,0.0,0.697475,...,13664.0,10.0,39366.0,0.0,10.0,,4.0,MAR - 0.5,MAR - 0.5 - 137,0.012567
27,Mean/Mode,137,MCAR,0.01,top-middle-square,downstream_performance_mean,Classification Tasks,0.698074,0.0,0.698074,...,13664.0,10.0,39366.0,0.0,10.0,,2.0,MCAR - 0.01,MCAR - 0.01 - 137,0.001638
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2196,Mean/Mode,42493,MCAR,0.5,Length,downstream_performance_mean,Classification Tasks,0.599894,0.0,0.600162,...,12035.0,8.0,26969.0,2.0,6.0,,2.0,MCAR - 0.5,MCAR - 0.5 - 42493,0.010993
2201,Mean/Mode,42493,MNAR,0.01,Length,downstream_performance_mean,Classification Tasks,0.614237,0.0,0.614237,...,12035.0,8.0,26969.0,2.0,6.0,,1.0,MNAR - 0.01,MNAR - 0.01 - 42493,0.000000
2208,Mean/Mode,42493,MNAR,0.1,Length,downstream_performance_mean,Classification Tasks,0.612068,0.0,0.614235,...,12035.0,8.0,26969.0,2.0,6.0,,1.0,MNAR - 0.1,MNAR - 0.1 - 42493,0.000000
2215,Mean/Mode,42493,MNAR,0.3,Length,downstream_performance_mean,Classification Tasks,0.598213,0.0,0.600125,...,12035.0,8.0,26969.0,2.0,6.0,,1.0,MNAR - 0.3,MNAR - 0.3 - 42493,0.000000


In [127]:
# Average of the percentage-difference column over every row of average_best_complete.
difference_column = 'Performance Difference to Best to Average in Percent'
average_difference = average_best_complete[difference_column].mean()
print(average_difference, "average difference in Percent")

0.02865669221543993 average difference in Percent


## Heatmap to Show Detailed Performance of Each Imputation Method for Each Data Constellation

In [128]:
# Build the heatmap input: the ranked downstream results without the columns
# listed below. `drop(columns=...)` returns a new frame, so
# `downstream_results_rank` itself is left unmodified and the cell can be
# re-run safely.
df_heat = downstream_results_rank.drop(
    columns=[
        "Missing Type",
        "Missing Fraction",
        "Column",
        "result_type",
        "metric",
        "Baseline",
        "Corrupted",
        "Unnamed: 0",
        "name",
        "NumberOfClasses",
        "MajorityClassSize",
        "MinorityClassSize",
    ]
)
df_heat

Unnamed: 0,Imputation_Method,Task,Imputed_Subset,NumberOfFeatures,NumberOfInstances,NumberOfNumericFeatures,NumberOfCategoricalFeatures,Downstream Performance Rank Subset,Data_Constellation
0,Discriminative DL,137,0.694867,10.0,39366.0,0.0,10.0,6.0,MAR - 0.01
1,KNN,137,0.698074,10.0,39366.0,0.0,10.0,4.0,MAR - 0.01
2,Random Forest,137,0.698074,10.0,39366.0,0.0,10.0,5.0,MAR - 0.01
3,GAIN,137,0.698555,10.0,39366.0,0.0,10.0,3.0,MAR - 0.01
4,Mean/Mode,137,0.699217,10.0,39366.0,0.0,10.0,2.0,MAR - 0.01
...,...,...,...,...,...,...,...,...,...
2217,Mean/Mode,42493,0.592436,8.0,26969.0,2.0,6.0,5.0,MNAR - 0.5
2218,GAIN,42493,0.592837,8.0,26969.0,2.0,6.0,4.0,MNAR - 0.5
2219,Random Forest,42493,0.595566,8.0,26969.0,2.0,6.0,3.0,MNAR - 0.5
2220,KNN,42493,0.606716,8.0,26969.0,2.0,6.0,2.0,MNAR - 0.5


In [129]:
# Heatmap for total F1 score for each data constellation for each method

# Task ids are cast to strings before plotting
# (presumably so plotly treats the x axis as categorical -- TODO confirm).
df_heat = df_heat.astype({"Task":"string"})

# Every combination of missingness pattern (MAR, MCAR, MNAR) and missing
# fraction (0.01, 0.1, 0.3, 0.5). One figure is drawn per entry.
data_constellations = [
    'MAR - 0.01', 'MAR - 0.1', 'MAR - 0.3', 'MAR - 0.5',
    'MCAR - 0.01', 'MCAR - 0.1', 'MCAR - 0.3', 'MCAR - 0.5',
    'MNAR - 0.01', 'MNAR - 0.1', 'MNAR - 0.3', 'MNAR - 0.5',
]


for constellation in data_constellations:
    # Rows belonging to the current data constellation only.
    data_constel = df_heat.loc[df_heat['Data_Constellation'] == constellation]

    ### uncomment whatever you want to investigate

    ## sort by amount datapoints (ascending)
    data_constel = data_constel.sort_values(by=['NumberOfInstances'])

    ## sort by amount of features (ascending)
    #data_constel = data_constel.sort_values(by=['NumberOfFeatures'])

    ## sort by amount of datapoints and features (ascending)
    #data_constel = data_constel.sort_values(by=['NumberOfInstances', 'NumberOfFeatures'])

    ## sort by amount of categorical features and datapoints (ascending)
    #data_constel = data_constel.sort_values(by=['NumberOfCategoricalFeatures', 'NumberOfInstances'])

    ## sort by amount of numerical features and datapoints (ascending)
    #data_constel = data_constel.sort_values(by=['NumberOfNumericFeatures', 'NumberOfInstances'])

    Dataset_number = data_constel["Task"]
    Imputation_Method = data_constel["Imputation_Method"]
    F1_Score = data_constel["Imputed_Subset"]

    # x: dataset (task id), y: imputation method, colour: score after imputation.
    trace = go.Heatmap(
        z=F1_Score,
        x=Dataset_number,
        y=Imputation_Method,
        autocolorscale=False,
        colorscale='Reds',
        zmin=0,  # anchor the colour scale at 0 for every figure
    )
    fig = go.Figure(data=[trace])
    fig.update_layout(
        title=constellation,
        xaxis_nticks=36)
    fig.show()

In [130]:
# Take a copy of downstream_results_rank_heatmap2 (expected to be defined in an
# earlier cell) so that changes made to df_heat_dif do not modify that frame.
df_heat_dif = downstream_results_rank_heatmap2.copy()

In [131]:
# Calculate Difference for every Imputation towards average best Imputation Method per Data Constellation
# Calculation for F1 Score Differences (not Percentage)

data = downstream_results_rank.copy()
data['Task'] = data['Task'].astype(str)
# Full constellation key: "<Data_Constellation> - <Task>".
data['Data_Constellation_full'] = data['Data_Constellation'] + ' - ' + data['Task']

dc_unique = data.Data_Constellation_full.unique()
#print(dc_unique)

data_constellations = dc_unique.tolist()
methods = ['Random Forest', 'KNN', 'Mean/Mode', 'VAE', 'GAIN', 'Discriminative DL']

# Collect the per-method frames in a list and concatenate once at the end
# instead of growing a DataFrame with the deprecated DataFrame.append.
difference_frames = []

for constellation in data_constellations:
    data_constel = data.loc[data['Data_Constellation_full'] == constellation]
    average_best = data_constel.loc[data_constel['Imputation_Method'] == AVERAGE_BEST_IMPUTATION_METHOD]
    # Take the task id from the rows being processed; every row of
    # data_constel shares the same Task because Task is part of the key.
    dataset_number = data_constel.iloc[0]['Task']

    for method in methods:
        method_mask = data_constel['Imputation_Method'] == method
        if not method_mask.any():
            print(dataset_number,"Imputation Method not here ---------------------")
            continue
        if average_best.empty:
            print("No average best result for", dataset_number)
            continue

        # Explicit copy so that adding a column below does not write to a
        # slice of `data` (avoids SettingWithCopyWarning).
        current_score_row = data_constel.loc[method_mask].copy()
        # NOTE(review): only the first matching row of each selection is used
        # for the score; this assumes one row per method and constellation.
        current_score_int = current_score_row.iloc[0]['Imputed_Subset']
        average_best_int = average_best.iloc[0]['Imputed_Subset']

        current_score_row['Performance Difference to Average Best'] = current_score_int - average_best_int
        difference_frames.append(current_score_row)

if difference_frames:
    heatmap_data_difference = pd.concat(difference_frames)
else:
    # Nothing matched: keep an empty frame with the expected columns so the
    # casts below still work.
    heatmap_data_difference = data.iloc[:0].copy()
    heatmap_data_difference['Performance Difference to Average Best'] = pd.Series(dtype=float)

heatmap_data_difference['Missing Type'] = heatmap_data_difference['Missing Type'].astype(str)
heatmap_data_difference['Missing Fraction'] = heatmap_data_difference['Missing Fraction'].astype(str)
heatmap_data_difference



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing

42493 Imputation Method not here ---------------------
42493 Imputation Method not here ---------------------




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing

42493 Imputation Method not here ---------------------
42493 Imputation Method not here ---------------------
42493 Imputation Method not here ---------------------
42493 Imputation Method not here ---------------------




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing

42493 Imputation Method not here ---------------------




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing

42493 Imputation Method not here ---------------------
42493 Imputation Method not here ---------------------




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing

42493 Imputation Method not here ---------------------




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing

Unnamed: 0,Imputation_Method,Task,Missing Type,Missing Fraction,Column,result_type,metric,Baseline,Corrupted,Imputed_Subset,...,MinorityClassSize,NumberOfFeatures,NumberOfInstances,NumberOfNumericFeatures,NumberOfCategoricalFeatures,NumberOfClasses,Downstream Performance Rank Subset,Data_Constellation,Data_Constellation_full,Performance Difference to Average Best
2,Random Forest,137,MAR,0.01,top-middle-square,downstream_performance_mean,Classification Tasks,0.698074,0.0,0.698074,...,13664.0,10.0,39366.0,0.0,10.0,,5.0,MAR - 0.01,MAR - 0.01 - 137,-0.001143
1,KNN,137,MAR,0.01,top-middle-square,downstream_performance_mean,Classification Tasks,0.698074,0.0,0.698074,...,13664.0,10.0,39366.0,0.0,10.0,,4.0,MAR - 0.01,MAR - 0.01 - 137,-0.001143
4,Mean/Mode,137,MAR,0.01,top-middle-square,downstream_performance_mean,Classification Tasks,0.699217,0.0,0.699217,...,13664.0,10.0,39366.0,0.0,10.0,,2.0,MAR - 0.01,MAR - 0.01 - 137,0.000000
5,VAE,137,MAR,0.01,top-middle-square,downstream_performance_mean,Classification Tasks,0.699699,0.0,0.699699,...,13664.0,10.0,39366.0,0.0,10.0,,1.0,MAR - 0.01,MAR - 0.01 - 137,0.000482
3,GAIN,137,MAR,0.01,top-middle-square,downstream_performance_mean,Classification Tasks,0.698555,0.0,0.698555,...,13664.0,10.0,39366.0,0.0,10.0,,3.0,MAR - 0.01,MAR - 0.01 - 137,-0.000662
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2220,KNN,42493,MNAR,0.5,Length,downstream_performance_mean,Classification Tasks,0.612391,0.0,0.606716,...,12035.0,8.0,26969.0,2.0,6.0,,2.0,MNAR - 0.5,MNAR - 0.5 - 42493,0.014281
2217,Mean/Mode,42493,MNAR,0.5,Length,downstream_performance_mean,Classification Tasks,0.591024,0.0,0.592436,...,12035.0,8.0,26969.0,2.0,6.0,,5.0,MNAR - 0.5,MNAR - 0.5 - 42493,0.000000
2221,VAE,42493,MNAR,0.5,Length,downstream_performance_mean,Classification Tasks,0.610725,0.0,0.611129,...,12035.0,8.0,26969.0,2.0,6.0,,1.0,MNAR - 0.5,MNAR - 0.5 - 42493,0.018693
2218,GAIN,42493,MNAR,0.5,Length,downstream_performance_mean,Classification Tasks,0.592603,0.0,0.592837,...,12035.0,8.0,26969.0,2.0,6.0,,4.0,MNAR - 0.5,MNAR - 0.5 - 42493,0.000401


In [132]:
# Heatmap of F1 score differences: one figure per data constellation, showing for every
# task (x) and imputation method (y) the difference to the average best imputation method.

# Task ids become strings here (presumably so plotly treats the x axis as categorical -- TODO confirm).
heatmap_data_difference = heatmap_data_difference.astype({"Task":"string"})
data_constellations = ['MAR - 0.01', 'MAR - 0.1', 'MAR - 0.3', 'MAR - 0.5', 'MCAR - 0.01', 'MCAR - 0.1', 'MCAR - 0.3', 'MCAR - 0.5', 'MNAR - 0.01', 'MNAR - 0.1', 'MNAR - 0.3', 'MNAR - 0.5']

for i in data_constellations:
    # Build the row mask from the frame that is being indexed. Taking the mask from a
    # different frame (df_heat) only works while both frames share the same index and
    # contents, and makes this cell depend on hidden state from earlier cells.
    data_constel = heatmap_data_difference.loc[heatmap_data_difference['Data_Constellation'] == i]

    ### uncomment whatever you want to investigate

    ## sort by amount datapoints (ascending)
    data_constel = data_constel.sort_values(by=['NumberOfInstances'])

    ## sort by amount of features (ascending)
    #data_constel = data_constel.sort_values(by=['NumberOfFeatures'])

    ## sort by amount of datapoints and features (ascending)
    #data_constel = data_constel.sort_values(by=['NumberOfInstances', 'NumberOfFeatures'])

    ## sort by amount of categorical features and datapoints (ascending)
    #data_constel = data_constel.sort_values(by=['NumberOfCategoricalFeatures', 'NumberOfInstances'])

    ## sort by amount of numerical features and datapoints (ascending)
    #data_constel = data_constel.sort_values(by=['NumberOfNumericFeatures', 'NumberOfInstances'])

    Dataset_number = data_constel["Task"]
    Imputation_Method = data_constel["Imputation_Method"]
    Improvement = data_constel["Performance Difference to Average Best"]

    # Diverging colour scale centred on 0 with fixed limits of +/-0.11, so that all
    # constellation figures share the same colour mapping.
    trace = go.Heatmap(
                   z=Improvement,
                   x=Dataset_number,
                   y=Imputation_Method,
                   type = 'heatmap',
                    autocolorscale= False,
                    colorscale = 'RdBu_r',
                    zmid=0,
                    zmin=(-0.11),
                    zmax=0.11,
                    )
    data = [trace]
    fig = go.Figure(data=data)
    fig.update_layout(
        title=i,
        xaxis_nticks=36)
    fig.show()
    # One PDF per constellation; the constellation label is appended to the file name.
    fig.write_image("binary_subset_heatmap_f1_score_improvement_to_avbest%s.pdf" %i)

In [133]:
#heatmap_data_difference.agg(['min', 'max'])

Unnamed: 0,Imputation_Method,Task,Missing Type,Missing Fraction,Column,result_type,metric,Baseline,Corrupted,Imputed_Subset,...,MinorityClassSize,NumberOfFeatures,NumberOfInstances,NumberOfNumericFeatures,NumberOfCategoricalFeatures,NumberOfClasses,Downstream Performance Rank Subset,Data_Constellation,Data_Constellation_full,Performance Difference to Average Best
min,Discriminative DL,1046,MAR,0.01,Cell_Shape_Uniformity,downstream_performance_mean,Classification Tasks,0.33587,0.0,0.331796,...,260.0,5.0,3107.0,0.0,1.0,,1.0,MAR - 0.01,MAR - 0.01 - 1046,-0.208714
max,VAE,923,MNAR,0.5,year,downstream_performance_mean,Classification Tasks,0.994156,0.0,0.994156,...,47662.0,24.0,96320.0,23.0,10.0,,6.0,MNAR - 0.5,MNAR - 0.5 - 923,0.232155


## Improvement Proportions for All Data Constellations and Methods Relative to Average Best Method

In [134]:
# Write the complete difference table to CSV (index column omitted).
# The former bare `heatmap_data_difference` expression was removed: it was not the last
# statement of the cell, so it never displayed anything.
heatmap_data_difference.to_csv('binary_subset_full_info.csv', index=False)

In [135]:
# Bin the results by their performance difference to the average best imputation method.
# The rows of the average best method itself are excluded first.

def _rows_in_difference_bin(frame, lower=None, upper=None,
                            column="Performance Difference to Average Best"):
    """Return the rows of ``frame`` whose ``column`` value lies in ``(lower, upper]``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table containing ``column``.
    lower : float or None
        Exclusive lower bound; ``None`` leaves the bin open towards smaller values.
    upper : float or None
        Inclusive upper bound; ``None`` leaves the bin open towards larger values.
    column : str
        Name of the column holding the difference values.

    Returns
    -------
    pandas.DataFrame
        The matching rows. Rows with a missing difference are never selected, so
        they cannot end up in every bin at once.
    """
    difference = frame[column]
    in_bin = difference.notna()
    if lower is not None:
        in_bin &= difference > lower
    if upper is not None:
        in_bin &= difference <= upper
    # Positive boolean selection (instead of dropping index labels) stays correct
    # even if index labels are not unique.
    return frame[in_bin]


df_quantiles = heatmap_data_difference.copy()

df_quantiles = df_quantiles[df_quantiles["Imputation_Method"] != AVERAGE_BEST_IMPUTATION_METHOD]


df_10 = _rows_in_difference_bin(df_quantiles, upper=-0.09)
df_09 = _rows_in_difference_bin(df_quantiles, -0.09, -0.07)
df_07 = _rows_in_difference_bin(df_quantiles, -0.07, -0.05)
df_05 = _rows_in_difference_bin(df_quantiles, -0.05, -0.03)
df_03 = _rows_in_difference_bin(df_quantiles, -0.03, -0.01)
df_01 = _rows_in_difference_bin(df_quantiles, -0.01, 0.01)
df01 = _rows_in_difference_bin(df_quantiles, 0.01, 0.03)
df03 = _rows_in_difference_bin(df_quantiles, 0.03, 0.05)
df05 = _rows_in_difference_bin(df_quantiles, 0.05, 0.07)
df07 = _rows_in_difference_bin(df_quantiles, 0.07, 0.09)
df09 = _rows_in_difference_bin(df_quantiles, lower=0.09)

#df_quantiles
#df_quantiles.dtypes

In [136]:
# Row count of every difference bin, ordered from the most negative to the most positive bin.
quantile_freq = [
    len(bin_frame.index)
    for bin_frame in (df_10, df_09, df_07, df_05, df_03, df_01, df01, df03, df05, df07, df09)
]
# Expose the individual counts under their per-bin names as well.
(len_df_10, len_df_09, len_df_07, len_df_05, len_df_03, len_df_01,
 len_df01, len_df03, len_df05, len_df07, len_df09) = quantile_freq
print(quantile_freq)


# Human-readable label for each bin, in the same order as the counts above.
quantiles = [
    'less than -0.09',
    '-0.09 to -0.07',
    '-0.07 to -0.05',
    '-0.05 to -0.03',
    '-0.03 to -0.01',
    '-0.01 to 0.01',
    '0.01 to 0.03',
    '0.03 to 0.05',
    '0.05 to 0.07',
    '0.07 to 0.09',
    'more than 0.09',
]
print(quantiles)

# Two-column summary table (bin label, number of rows) used for plotting.
improvement_quantiles = pd.DataFrame({
    'Improvement to Average Best': quantiles,
    'Amount': quantile_freq,
})


[32, 16, 41, 98, 204, 1170, 166, 47, 28, 11, 37]
['less than -0.09', '-0.09 to -0.07', '-0.07 to -0.05', '-0.05 to -0.03', '-0.03 to -0.01', '-0.01 to 0.01', '0.01 to 0.03', '0.03 to 0.05', '0.05 to 0.07', '0.07 to 0.09', 'more than 0.09']


In [137]:
# Bar chart of the number of results per difference bin; shown inline and saved as PDF.
fig = px.bar(
    improvement_quantiles,
    x='Improvement to Average Best',
    y='Amount',
)
fig.show()
fig.write_image("sub_improv_rel_to_av_all_DC_no_av_incl.pdf")

In [138]:
# Split the bar chart stacks by imputation method: count, for every difference bin,
# how many rows belong to each method.

quantile_datasets = [df_10, df_09, df_07, df_05, df_03, df_01, df01, df03, df05, df07, df09]

methods = ['Random Forest', 'KNN', 'Mean/Mode', 'VAE', 'GAIN', 'Discriminative DL']
# The average best method was removed from the binned data, so it is not counted.
methods.remove(AVERAGE_BEST_IMPUTATION_METHOD)
print(methods)

forest_freq = []
knn_freq = []
mode_freq = []
dl_freq = []
vae_freq = []
gain_freq = []

# Lookup table from method name to the list collecting its per-bin counts.
# The list of the removed average best method is never filled and stays empty.
freq_by_method = {
    'Random Forest': forest_freq,
    'KNN': knn_freq,
    'Mean/Mode': mode_freq,
    'Discriminative DL': dl_freq,
    'VAE': vae_freq,
    'GAIN': gain_freq,
}

for method in methods:
    method_counts = freq_by_method[method]
    for quantile_df in quantile_datasets:
        # Exact comparison instead of `.str.contains`: the latter interprets the name
        # as a regular expression, also matches substrings and fails on missing values.
        is_method = quantile_df['Imputation_Method'] == method
        method_counts.append(int(is_method.sum()))

print(forest_freq)
print(knn_freq)
print(mode_freq)
print(dl_freq)
print(vae_freq)
print(gain_freq)

['Random Forest', 'KNN', 'VAE', 'GAIN', 'Discriminative DL']
[4, 3, 9, 13, 43, 224, 44, 11, 9, 3, 9]
[6, 3, 10, 22, 36, 233, 37, 10, 3, 2, 10]
[]
[8, 4, 7, 16, 38, 235, 35, 7, 3, 3, 6]
[6, 4, 8, 21, 44, 238, 29, 4, 10, 2, 6]
[8, 2, 7, 26, 43, 240, 21, 15, 3, 1, 6]


In [139]:
# Bin labels for the x axis, ordered from the most negative to the most positive bin.
quantiles = [
    'less than -0.09',
    '-0.09 to -0.07',
    '-0.07 to -0.05',
    '-0.05 to -0.03',
    '-0.03 to -0.01',
    '-0.01 to 0.01',
    '0.01 to 0.03',
    '0.03 to 0.05',
    '0.05 to 0.07',
    '0.07 to 0.09',
    'more than 0.09',
]

# (legend label, per-bin counts) for each stacked series, in stacking order.
method_series = [
    ('Random Forest', forest_freq),
    ('KNN', knn_freq),
    ('Mean/Mode', mode_freq),
    ('Discriminative DL', dl_freq),
    ('VAE', vae_freq),
    ('GAIN', gain_freq),
]

fig = go.Figure(data=[
    go.Bar(name=label, x=quantiles, y=counts)
    for label, counts in method_series
])
# Stack the per-method bars on top of each other within every bin.
fig.update_layout(barmode='stack')
fig.show()
fig.write_image("sub_improv_rel_to_av_all_DC_no_av_incl_per_method.pdf")

In [140]:
# Split the bar chart stacks by missing fraction: count, for every difference bin,
# how many rows belong to each missing fraction.

quantile_datasets = [df_10, df_09, df_07, df_05, df_03, df_01, df01, df03, df05, df07, df09]

# The fractions are compared as strings, matching how 'Missing Fraction' is stored here.
fractions = ['0.01', '0.1', '0.3', '0.5']
print(fractions)

freq_001 = []
freq_01 = []
freq_03 = []
freq_05 = []

# Lookup table from fraction label to the list collecting its per-bin counts.
freq_by_fraction = {
    '0.01': freq_001,
    '0.1': freq_01,
    '0.3': freq_03,
    '0.5': freq_05,
}

for fraction in fractions:
    fraction_counts = freq_by_fraction[fraction]
    for quantile_df in quantile_datasets:
        # Exact comparison instead of `.str.contains`: as a regular expression the '.'
        # in e.g. '0.1' matches any character, and substring matching would also count
        # values that merely contain the label.
        is_fraction = quantile_df['Missing Fraction'] == fraction
        fraction_counts.append(int(is_fraction.sum()))

print(freq_001)
print(freq_01)
print(freq_03)
print(freq_05)

['0.01', '0.1', '0.3', '0.5']
[3, 4, 1, 10, 19, 371, 28, 5, 6, 1, 14]
[11, 3, 10, 22, 38, 320, 43, 7, 0, 1, 8]
[11, 2, 14, 29, 78, 254, 43, 16, 9, 1, 6]
[7, 7, 16, 37, 69, 225, 52, 19, 13, 8, 9]


In [141]:
# Stacked bar chart of the performance-difference bins, one trace per
# missing fraction.
quantiles = [
    'less than -0.09',
    '-0.09 to -0.07',
    '-0.07 to -0.05',
    '-0.05 to -0.03',
    '-0.03 to -0.01',
    '-0.01 to 0.01',
    '0.01 to 0.03',
    '0.03 to 0.05',
    '0.05 to 0.07',
    '0.07 to 0.09',
    'more than 0.09',
]

# (legend label, per-bin counts, bar colour) in stacking order
fraction_series = [
    ('1% Missing Data', freq_001, '#FD3216'),
    ('10% Missing Data', freq_01, '#00FE35'),
    ('30% Missing Data', freq_03, '#511CFB'),
    ('50% Missing Data', freq_05, '#FF7F0E'),
]

fig = go.Figure(data=[
    go.Bar(name=label, x=quantiles, y=counts, marker_color=colour)
    for label, counts, colour in fraction_series
])
# Stack the fraction traces on top of each other instead of grouping them
fig.update_layout(barmode='stack')
fig.show()
fig.write_image("sub_improv_rel_to_av_all_DC_no_av_incl_per_frac.pdf")

In [142]:
# Split the bar chart stacks by missingness pattern: for every pattern, count
# the rows of each performance-difference bin that carry that pattern.

quantile_datasets = [df_10, df_09, df_07, df_05, df_03, df_01, df01, df03, df05, df07, df09]

# The list is still called `fractions`, but it holds the missingness patterns.
fractions = ['MCAR', 'MAR', 'MNAR']
print(fractions)

# Exact equality rather than str.contains(): contains() does regex substring
# matching, which is only correct while no pattern name occurs inside another.
freq_by_pattern = {
    pattern: [
        int((bin_df['Missing Type'] == pattern).sum())
        for bin_df in quantile_datasets
    ]
    for pattern in fractions
}

# The plotting cell that follows reads these names, so they are kept even
# though they hold per-pattern counts here:
#   freq_001 -> MCAR, freq_01 -> MAR, freq_03 -> MNAR
freq_001 = freq_by_pattern['MCAR']
freq_01 = freq_by_pattern['MAR']
freq_03 = freq_by_pattern['MNAR']

print(freq_001)
print(freq_01)
print(freq_03)

['MCAR', 'MAR', 'MNAR']
[12, 5, 18, 40, 79, 393, 40, 15, 6, 0, 9]
[9, 2, 13, 25, 63, 387, 64, 17, 11, 9, 17]
[11, 9, 10, 33, 62, 390, 62, 15, 11, 2, 11]


In [143]:
# Stacked bar chart of the performance-difference bins, one trace per
# missingness pattern.
quantiles = [
    'less than -0.09',
    '-0.09 to -0.07',
    '-0.07 to -0.05',
    '-0.05 to -0.03',
    '-0.03 to -0.01',
    '-0.01 to 0.01',
    '0.01 to 0.03',
    '0.03 to 0.05',
    '0.05 to 0.07',
    '0.07 to 0.09',
    'more than 0.09',
]

# (legend label, per-bin counts, bar colour); the freq_* lists hold the
# per-pattern counts at this point
pattern_series = [
    ('MCAR', freq_001, '#222A2A'),
    ('MAR', freq_01, '#B68100'),
    ('MNAR', freq_03, '#750D86'),
]

fig = go.Figure(data=[
    go.Bar(name=label, x=quantiles, y=counts, marker_color=colour)
    for label, counts, colour in pattern_series
])
# Stack the pattern traces on top of each other instead of grouping them
fig.update_layout(barmode='stack')
fig.show()
fig.write_image("sub_improv_rel_to_av_all_DC_no_av_incl_per_patt.pdf")

## Improvement Proportions for the Best Imputation Method per Data Constellation Relative to Average Best Method

In [144]:
# Bin the rank-1 rows by their "Performance Difference to Average Best".
#
# NOTE: this rebinds df_01, df01, df03, df05, df07 and df09, names that the
# 11-bin cells further up also read. Re-running those cells after this one
# picks up the frames defined here.

def _rows_in_bin(frame, lower, upper=None):
    """Return the rows of `frame` whose difference lies in (lower, upper].

    `upper=None` leaves the bin open-ended. Rows are selected positively, so
    a NaN difference belongs to no bin, and selection is by row position
    rather than by index label.
    """
    diff = frame["Performance Difference to Average Best"]
    in_bin = diff > lower
    if upper is not None:
        in_bin = in_bin & (diff <= upper)
    return frame[in_bin]


improv_to_av_bar = heatmap_data_difference.copy()

# Keep only the rows ranked first
improv_to_av_bar = improv_to_av_bar[improv_to_av_bar["Downstream Performance Rank Subset"] == 1.0]

# Lower bounds are exclusive, upper bounds inclusive. Differences of -0.01 or
# below fall into no bin.
df_01 = _rows_in_bin(improv_to_av_bar, -0.01, 0.01)
df01 = _rows_in_bin(improv_to_av_bar, 0.01, 0.03)
df03 = _rows_in_bin(improv_to_av_bar, 0.03, 0.05)
df05 = _rows_in_bin(improv_to_av_bar, 0.05, 0.07)
df07 = _rows_in_bin(improv_to_av_bar, 0.07, 0.09)
df09 = _rows_in_bin(improv_to_av_bar, 0.09)

improv_to_av_bar

Unnamed: 0,Imputation_Method,Task,Missing Type,Missing Fraction,Column,result_type,metric,Baseline,Corrupted,Imputed_Subset,...,MinorityClassSize,NumberOfFeatures,NumberOfInstances,NumberOfNumericFeatures,NumberOfCategoricalFeatures,NumberOfClasses,Downstream Performance Rank Subset,Data_Constellation,Data_Constellation_full,Performance Difference to Average Best
5,VAE,137,MAR,0.01,top-middle-square,downstream_performance_mean,Classification Tasks,0.699699,0.0,0.699699,...,13664.0,10.0,39366.0,0.0,10.0,,1.0,MAR - 0.01,MAR - 0.01 - 137,0.000482
11,VAE,137,MAR,0.1,top-middle-square,downstream_performance_mean,Classification Tasks,0.700844,0.0,0.700844,...,13664.0,10.0,39366.0,0.0,10.0,,1.0,MAR - 0.1,MAR - 0.1 - 137,0.003475
17,VAE,137,MAR,0.3,top-middle-square,downstream_performance_mean,Classification Tasks,0.702163,0.0,0.700799,...,13664.0,10.0,39366.0,0.0,10.0,,1.0,MAR - 0.3,MAR - 0.3 - 137,0.003273
23,KNN,137,MAR,0.5,top-middle-square,downstream_performance_mean,Classification Tasks,0.705702,0.0,0.706240,...,13664.0,10.0,39366.0,0.0,10.0,,1.0,MAR - 0.5,MAR - 0.5 - 137,0.008765
29,KNN,137,MCAR,0.01,top-middle-square,downstream_performance_mean,Classification Tasks,0.699217,0.0,0.699217,...,13664.0,10.0,39366.0,0.0,10.0,,1.0,MCAR - 0.01,MCAR - 0.01 - 137,0.001143
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2197,Random Forest,42493,MCAR,0.5,Length,downstream_performance_mean,Classification Tasks,0.606068,0.0,0.606759,...,12035.0,8.0,26969.0,2.0,6.0,,1.0,MCAR - 0.5,MCAR - 0.5 - 42493,0.006597
2201,Mean/Mode,42493,MNAR,0.01,Length,downstream_performance_mean,Classification Tasks,0.614237,0.0,0.614237,...,12035.0,8.0,26969.0,2.0,6.0,,1.0,MNAR - 0.01,MNAR - 0.01 - 42493,0.000000
2208,Mean/Mode,42493,MNAR,0.1,Length,downstream_performance_mean,Classification Tasks,0.612068,0.0,0.614235,...,12035.0,8.0,26969.0,2.0,6.0,,1.0,MNAR - 0.1,MNAR - 0.1 - 42493,0.000000
2215,Mean/Mode,42493,MNAR,0.3,Length,downstream_performance_mean,Classification Tasks,0.598213,0.0,0.600125,...,12035.0,8.0,26969.0,2.0,6.0,,1.0,MNAR - 0.3,MNAR - 0.3 - 42493,0.000000


In [145]:
# Number of rows per improvement bin, in ascending bin order
quantile_freq = [len(bin_df.index) for bin_df in (df_01, df01, df03, df05, df07, df09)]
# The individual counts stay available under their per-bin names
len_df_01, len_df01, len_df03, len_df05, len_df07, len_df09 = quantile_freq
print(quantile_freq)

# Axis labels, one per bin
quantiles = [
    'less than 0.01',
    '0.01 to 0.03',
    '0.03 to 0.05',
    '0.05 to 0.07',
    '0.07 to 0.09',
    'more than 0.09',
]
print(quantiles)

# Two-column frame (bin label, row count) that feeds the bar chart
improvement_quantiles = pd.DataFrame({
    'Improvement to Average Best': quantiles,
    'Amount': quantile_freq,
})

fig = px.bar(improvement_quantiles, x='Improvement to Average Best', y='Amount')
fig.show()
fig.write_image("sub_improv_rel_to_av_all_DC_no_av_incl_only_best.pdf")

[243, 67, 23, 15, 6, 18]
['less than 0.01', '0.01 to 0.03', '0.03 to 0.05', '0.05 to 0.07', '0.07 to 0.09', 'more than 0.09']


In [146]:
# Split the bar chart stacks by imputation method: for every method, count
# the rows of each improvement bin that name that method.

quantile_datasets = [df_01, df01, df03, df05, df07, df09]

methods = ['Random Forest', 'KNN', 'Mean/Mode', 'VAE', 'GAIN', 'Discriminative DL']
# NOTE(review): the method named by AVERAGE_BEST_IMPUTATION_METHOD is taken
# out here, so its rows are never counted and its list below stays empty.
# The per-method stacks therefore need not add up to the per-bin totals --
# confirm this is intended.
methods.remove(AVERAGE_BEST_IMPUTATION_METHOD)
print(methods)

forest_freq, knn_freq, mode_freq = [], [], []
dl_freq, vae_freq, gain_freq = [], [], []

# Method name -> the list that collects its per-bin counts
freq_by_method = {
    'Random Forest': forest_freq,
    'KNN': knn_freq,
    'Mean/Mode': mode_freq,
    'Discriminative DL': dl_freq,
    'VAE': vae_freq,
    'GAIN': gain_freq,
}

for method_name in methods:
    collector = freq_by_method.get(method_name)
    for bin_df in quantile_datasets:
        matching = bin_df[bin_df['Imputation_Method'].str.contains(method_name)]
        if collector is not None:
            collector.append(len(matching.index))

for counts in (forest_freq, knn_freq, mode_freq, dl_freq, vae_freq, gain_freq):
    print(counts)

['Random Forest', 'KNN', 'VAE', 'GAIN', 'Discriminative DL']
[29, 18, 8, 5, 2, 6]
[34, 19, 3, 2, 0, 8]
[]
[27, 15, 4, 1, 1, 1]
[26, 9, 1, 7, 2, 1]
[26, 6, 7, 0, 1, 2]


In [147]:
# Stacked bar chart of the improvement bins, one trace per imputation method
quantiles = [
    'less than 0.01',
    '0.01 to 0.03',
    '0.03 to 0.05',
    '0.05 to 0.07',
    '0.07 to 0.09',
    'more than 0.09',
]

# (legend label, per-bin counts) in stacking order
method_series = [
    ('Random Forest', forest_freq),
    ('KNN', knn_freq),
    ('Mean/Mode', mode_freq),
    ('Discriminative DL', dl_freq),
    ('VAE', vae_freq),
    ('GAIN', gain_freq),
]

fig = go.Figure(data=[
    go.Bar(name=label, x=quantiles, y=counts)
    for label, counts in method_series
])
# Stack the method traces on top of each other instead of grouping them
fig.update_layout(barmode='stack')
fig.show()
fig.write_image("sub_improv_rel_to_av_all_DC_no_av_incl_only_best_per_method.pdf")

In [148]:
# Split the bar chart stacks by missing fraction: for every fraction, count
# the rows of each improvement bin that carry that fraction.

quantile_datasets = [df_01, df01, df03, df05, df07, df09]

fractions = ['0.01', '0.1', '0.3', '0.5']
print(fractions)

# Exact equality rather than str.contains(): contains() interprets its
# argument as a regular expression (so '.' matches any character) and also
# matches substrings, which only works as long as no label happens to occur
# inside another value.
# Assumes 'Missing Fraction' holds strings such as '0.1' -- TODO confirm.
freq_by_fraction = {
    fraction: [
        int((bin_df['Missing Fraction'] == fraction).sum())
        for bin_df in quantile_datasets
    ]
    for fraction in fractions
}

# These names are read by the plotting cell that follows, so they are kept.
freq_001 = freq_by_fraction['0.01']
freq_01 = freq_by_fraction['0.1']
freq_03 = freq_by_fraction['0.3']
freq_05 = freq_by_fraction['0.5']

print(freq_001)
print(freq_01)
print(freq_03)
print(freq_05)

['0.01', '0.1', '0.3', '0.5']
[71, 12, 1, 2, 0, 7]
[61, 24, 4, 0, 0, 4]
[58, 16, 7, 8, 1, 3]
[53, 15, 11, 5, 5, 4]


In [149]:
# Stacked bar chart of the improvement bins, one trace per missing fraction
quantiles = [
    'less than 0.01',
    '0.01 to 0.03',
    '0.03 to 0.05',
    '0.05 to 0.07',
    '0.07 to 0.09',
    'more than 0.09',
]

# (legend label, per-bin counts, bar colour) in stacking order
fraction_series = [
    ('1% Missing Data', freq_001, '#FD3216'),
    ('10% Missing Data', freq_01, '#00FE35'),
    ('30% Missing Data', freq_03, '#511CFB'),
    ('50% Missing Data', freq_05, '#FF7F0E'),
]

fig = go.Figure(data=[
    go.Bar(name=label, x=quantiles, y=counts, marker_color=colour)
    for label, counts, colour in fraction_series
])
# Stack the fraction traces on top of each other instead of grouping them
fig.update_layout(barmode='stack')
fig.show()
fig.write_image("sub_improv_rel_to_av_all_DC_no_av_incl_only_best_per_frac.pdf")

In [150]:
# Split the bar chart stacks by missingness pattern: for every pattern, count
# the rows of each improvement bin that carry that pattern.

quantile_datasets = [df_01, df01, df03, df05, df07, df09]

# The list is still called `fractions`, but it holds the missingness patterns.
fractions = ['MCAR', 'MAR', 'MNAR']
print(fractions)

# Exact equality rather than str.contains(): contains() does regex substring
# matching, which is only correct while no pattern name occurs inside another.
freq_by_pattern = {
    pattern: [
        int((bin_df['Missing Type'] == pattern).sum())
        for bin_df in quantile_datasets
    ]
    for pattern in fractions
}

# The plotting cell that follows reads these names, so they are kept even
# though they hold per-pattern counts here:
#   freq_001 -> MCAR, freq_01 -> MAR, freq_03 -> MNAR
freq_001 = freq_by_pattern['MCAR']
freq_01 = freq_by_pattern['MAR']
freq_03 = freq_by_pattern['MNAR']

print(freq_001)
print(freq_01)
print(freq_03)

['MCAR', 'MAR', 'MNAR']
[92, 17, 6, 3, 0, 6]
[70, 26, 9, 5, 5, 9]
[81, 24, 8, 7, 1, 3]


In [151]:
# Stacked bar chart of the improvement bins, one trace per missingness pattern
quantiles = [
    'less than 0.01',
    '0.01 to 0.03',
    '0.03 to 0.05',
    '0.05 to 0.07',
    '0.07 to 0.09',
    'more than 0.09',
]

# (legend label, per-bin counts, bar colour); the freq_* lists hold the
# per-pattern counts at this point
pattern_series = [
    ('MCAR', freq_001, '#222A2A'),
    ('MAR', freq_01, '#B68100'),
    ('MNAR', freq_03, '#750D86'),
]

fig = go.Figure(data=[
    go.Bar(name=label, x=quantiles, y=counts, marker_color=colour)
    for label, counts, colour in pattern_series
])
# Stack the pattern traces on top of each other instead of grouping them
fig.update_layout(barmode='stack')
fig.show()
fig.write_image("sub_improv_rel_to_av_all_DC_no_av_incl_only_best_per_patt.pdf")

## Extract datasets for Automated Imputation Method Selection -> not used in this thesis

To Do: Explore the possibility that the average best method replaces the best method for a data constellation if the improvement gain of the best method is below 1%.

### Potential Features:
Missingness Pattern (Missing Type)  
Missing Fraction (Missing Fraction)  
Datapoints (NumberOfInstances)  
Features in total (NumberOfFeatures)  
Numeric Features (NumberOfNumericFeatures)  
Categorical Features (NumberOfCategoricalFeatures)  
Downstream Task Type -> Classification/Regression (metric)
  
    
      
Label: Best Imputation Method (Imputation_Method)

In [152]:
# Export the frame that holds only the best method for each data
# constellation, then display it. to_csv() is called without index=False, so
# the row index is written as the first CSV column.
backup_csv_path = 'rank_1_backup.csv'
rank_1_backup.to_csv(backup_csv_path)
rank_1_backup

Unnamed: 0,Imputation_Method,Task,Missing Type,Missing Fraction,Column,result_type,metric,Baseline,Corrupted,Imputed_Subset,...,name,MajorityClassSize,MinorityClassSize,NumberOfFeatures,NumberOfInstances,NumberOfNumericFeatures,NumberOfCategoricalFeatures,NumberOfClasses,Downstream Performance Rank Subset,Data_Constellation
5,VAE,137,MAR,0.01,top-middle-square,downstream_performance_mean,Classification Tasks,0.699699,0.0,0.699699,...,BNG(tic-tac-toe),25702.0,13664.0,10.0,39366.0,0.0,10.0,,1.0,MAR - 0.01
11,VAE,137,MAR,0.1,top-middle-square,downstream_performance_mean,Classification Tasks,0.700844,0.0,0.700844,...,BNG(tic-tac-toe),25702.0,13664.0,10.0,39366.0,0.0,10.0,,1.0,MAR - 0.1
17,VAE,137,MAR,0.3,top-middle-square,downstream_performance_mean,Classification Tasks,0.702163,0.0,0.700799,...,BNG(tic-tac-toe),25702.0,13664.0,10.0,39366.0,0.0,10.0,,1.0,MAR - 0.3
23,KNN,137,MAR,0.5,top-middle-square,downstream_performance_mean,Classification Tasks,0.705702,0.0,0.706240,...,BNG(tic-tac-toe),25702.0,13664.0,10.0,39366.0,0.0,10.0,,1.0,MAR - 0.5
29,KNN,137,MCAR,0.01,top-middle-square,downstream_performance_mean,Classification Tasks,0.699217,0.0,0.699217,...,BNG(tic-tac-toe),25702.0,13664.0,10.0,39366.0,0.0,10.0,,1.0,MCAR - 0.01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2197,Random Forest,42493,MCAR,0.5,Length,downstream_performance_mean,Classification Tasks,0.606068,0.0,0.606759,...,airlines,14934.0,12035.0,8.0,26969.0,2.0,6.0,,1.0,MCAR - 0.5
2201,Mean/Mode,42493,MNAR,0.01,Length,downstream_performance_mean,Classification Tasks,0.614237,0.0,0.614237,...,airlines,14934.0,12035.0,8.0,26969.0,2.0,6.0,,1.0,MNAR - 0.01
2208,Mean/Mode,42493,MNAR,0.1,Length,downstream_performance_mean,Classification Tasks,0.612068,0.0,0.614235,...,airlines,14934.0,12035.0,8.0,26969.0,2.0,6.0,,1.0,MNAR - 0.1
2215,Mean/Mode,42493,MNAR,0.3,Length,downstream_performance_mean,Classification Tasks,0.598213,0.0,0.600125,...,airlines,14934.0,12035.0,8.0,26969.0,2.0,6.0,,1.0,MNAR - 0.3


In [153]:
# Training dataset 1: the Imputation_Method column plus the missingness and
# dataset-property columns, including the downstream task type ('metric').
train_columns_1 = [
    'Imputation_Method',
    'Missing Type',
    'Missing Fraction',
    'NumberOfInstances',
    'NumberOfFeatures',
    'NumberOfNumericFeatures',
    'NumberOfCategoricalFeatures',
    'metric',
]
properties_train_dataset_1 = rank_1_backup.loc[:, train_columns_1].copy()

properties_train_dataset_1


Unnamed: 0,Imputation_Method,Missing Type,Missing Fraction,NumberOfInstances,NumberOfFeatures,NumberOfNumericFeatures,NumberOfCategoricalFeatures,metric
5,VAE,MAR,0.01,39366.0,10.0,0.0,10.0,Classification Tasks
11,VAE,MAR,0.1,39366.0,10.0,0.0,10.0,Classification Tasks
17,VAE,MAR,0.3,39366.0,10.0,0.0,10.0,Classification Tasks
23,KNN,MAR,0.5,39366.0,10.0,0.0,10.0,Classification Tasks
29,KNN,MCAR,0.01,39366.0,10.0,0.0,10.0,Classification Tasks
...,...,...,...,...,...,...,...,...
2197,Random Forest,MCAR,0.5,26969.0,8.0,2.0,6.0,Classification Tasks
2201,Mean/Mode,MNAR,0.01,26969.0,8.0,2.0,6.0,Classification Tasks
2208,Mean/Mode,MNAR,0.1,26969.0,8.0,2.0,6.0,Classification Tasks
2215,Mean/Mode,MNAR,0.3,26969.0,8.0,2.0,6.0,Classification Tasks


In [154]:
# Training dataset 7: same columns as training dataset 1 but without 'metric'.
properties_train_dataset_7 = rank_1_backup.copy()
properties_train_dataset_7 = properties_train_dataset_7[['Imputation_Method','Missing Type','Missing Fraction',
                                                         'NumberOfInstances','NumberOfFeatures','NumberOfNumericFeatures',
                                                         'NumberOfCategoricalFeatures']]

# Write the CSV without the index column.
properties_train_dataset_7.to_csv('binary_properties_train_dataset_7.csv', index=False)

# A notebook cell only renders its last expression, so the frame has to come
# after the to_csv() call to be displayed.
properties_train_dataset_7

In [155]:
# Dataset for Training 

In [156]:
# Dataset for Training 