In [120]:
#cleaned

# Imputation Method Selection based on Dataset Properties for Binary Classification

In [121]:
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import os
import pandas as pd
import re
import seaborn as sns
from pandas.api.types import CategoricalDtype
from pathlib import Path

from numpy import mean
from numpy import std

import plotly as py
import plotly.express as px
import plotly.graph_objects as go
import xarray as xr

from sklearn.compose import make_column_transformer

from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

import tensorflow as tf

import csv
from sklearn.utils import Bunch
from sklearn import metrics
from math import sqrt

from numpy import loadtxt
from keras.models import Sequential, Model
from keras.layers import Dense, Activation, Input
from keras.utils import to_categorical 

## Training

In [122]:
data = pd.read_csv('binary_properties_train_dataset_original.csv')

data

Unnamed: 0,Imputation_Method,Missing Type,Missing Fraction,NumberOfInstances,NumberOfFeatures,NumberOfNumericFeatures,NumberOfCategoricalFeatures
0,Random Forest,MAR,0.01,39366.0,10.0,0.0,10.0
1,KNN,MAR,0.10,39366.0,10.0,0.0,10.0
2,Random Forest,MAR,0.30,39366.0,10.0,0.0,10.0
3,KNN,MAR,0.50,39366.0,10.0,0.0,10.0
4,VAE,MCAR,0.01,39366.0,10.0,0.0,10.0
...,...,...,...,...,...,...,...
367,Random Forest,MCAR,0.50,26969.0,8.0,2.0,6.0
368,VAE,MNAR,0.01,26969.0,8.0,2.0,6.0
369,VAE,MNAR,0.10,26969.0,8.0,2.0,6.0
370,Discriminative DL,MNAR,0.30,26969.0,8.0,2.0,6.0


In [123]:
# OneHot-encoder -> multiple column approach
# 

transformer = make_column_transformer(
    (OneHotEncoder(), ['Imputation_Method', 'Missing Type']),
    remainder='passthrough')

transformed = transformer.fit_transform(data)
data_preprocessed = pd.DataFrame(transformed, columns=transformer.get_feature_names_out())
data_preprocessed


Unnamed: 0,onehotencoder__Imputation_Method_Discriminative DL,onehotencoder__Imputation_Method_GAIN,onehotencoder__Imputation_Method_KNN,onehotencoder__Imputation_Method_Mean/Mode,onehotencoder__Imputation_Method_Random Forest,onehotencoder__Imputation_Method_VAE,onehotencoder__Missing Type_MAR,onehotencoder__Missing Type_MCAR,onehotencoder__Missing Type_MNAR,remainder__Missing Fraction,remainder__NumberOfInstances,remainder__NumberOfFeatures,remainder__NumberOfNumericFeatures,remainder__NumberOfCategoricalFeatures
0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.01,39366.0,10.0,0.0,10.0
1,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.10,39366.0,10.0,0.0,10.0
2,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.30,39366.0,10.0,0.0,10.0
3,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.50,39366.0,10.0,0.0,10.0
4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.01,39366.0,10.0,0.0,10.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
367,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.50,26969.0,8.0,2.0,6.0
368,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.01,26969.0,8.0,2.0,6.0
369,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.10,26969.0,8.0,2.0,6.0
370,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.30,26969.0,8.0,2.0,6.0


In [124]:
# Umwandlung in numpy arrays
X = data_preprocessed[['onehotencoder__Missing Type_MAR', 'onehotencoder__Missing Type_MCAR', 
                         'onehotencoder__Missing Type_MNAR', 'remainder__Missing Fraction', 
                         'remainder__NumberOfInstances',  
                         'remainder__NumberOfNumericFeatures', 'remainder__NumberOfCategoricalFeatures']].to_numpy()
#X = data_preprocessed_x
y = data[['Imputation_Method']].to_numpy()
print(np.unique(y))

# train/test split --> 20 % Testdata
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) 
X_train.shape
#print(X_train)

['Discriminative DL' 'GAIN' 'KNN' 'Mean/Mode' 'Random Forest' 'VAE']


(297, 7)

In [125]:
y_train.shape

(297, 1)

In [126]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

print(X_train)
print(y)



[[ 1.43591632e+00 -7.39509973e-01 -6.85782057e-01 ...  7.71231779e-01
  -1.10012865e-03 -5.46433986e-01]
 [ 1.43591632e+00 -7.39509973e-01 -6.85782057e-01 ... -8.34531274e-01
  -4.91207443e-01 -5.46433986e-01]
 [ 1.43591632e+00 -7.39509973e-01 -6.85782057e-01 ... -6.59227157e-01
  -6.54576547e-01 -5.46433986e-01]
 ...
 [-6.96419414e-01  1.35224681e+00 -6.85782057e-01 ...  4.58325697e-01
  -1.47142207e+00  2.48389391e+00]
 [ 1.43591632e+00 -7.39509973e-01 -6.85782057e-01 ...  3.37635219e-01
   2.28606734e+00 -5.46433986e-01]
 [-6.96419414e-01  1.35224681e+00 -6.85782057e-01 ... -7.03325931e-02
  -1.64469233e-01 -5.46433986e-01]]
[['Random Forest']
 ['KNN']
 ['Random Forest']
 ['KNN']
 ['VAE']
 ['KNN']
 ['KNN']
 ['Random Forest']
 ['VAE']
 ['KNN']
 ['Random Forest']
 ['Random Forest']
 ['Random Forest']
 ['Random Forest']
 ['Random Forest']
 ['Random Forest']
 ['VAE']
 ['Discriminative DL']
 ['Random Forest']
 ['Random Forest']
 ['Random Forest']
 ['Random Forest']
 ['Random Forest']
 ['

In [127]:
X_train.shape

(297, 7)

In [128]:
y.shape

(372, 1)

## Original Experiment Results

For the training we use the original experiment results here

### Random Forest with Nested Cross Validation, with GridSearch

In [135]:
X_train = X
y_train = y

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

outer_cv = KFold(n_splits=10, shuffle=True, random_state=42)
outer_results = list()
predicitons_list = list()
test_data_list = list()
ground_truth_list = list()

inner_nested_cv_results= []
inner_nested_best_score= []
inner_nested_best_params= []
inner_nested_test_score = []

for train_index, test_index in outer_cv.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    inner_cv = KFold(n_splits=5, shuffle=True, random_state=42)
    
    clf = RandomForestClassifier(random_state=42, class_weight='balanced')

    param_grid = {
        'n_estimators': [50, 100, 500, 1000, 10000, 30000],#, 3000, 4000, 5000, 6000, 7000],
#    'max_features': ['sqrt', 'log2'],
#    'criterion' :['gini', 'entropy'],
        'max_depth': [1,3,5,9,15,50]#,3,4,5,6,7]
    }

    grid_clf = GridSearchCV(clf, param_grid, cv=inner_cv)
    result = grid_clf.fit(X_train, y_train.ravel())
    best_model = result.best_estimator_
    print(grid_clf.best_params_)
    print(best_model)

    inner_nested_cv_results.append(grid_clf.cv_results_)
    inner_nested_best_score.append(grid_clf.best_score_)
    inner_nested_best_params.append(grid_clf.best_params_)
    inner_nested_test_score.append(grid_clf.score(X_test, y_test))

    yhat = best_model.predict(X_test)
#    print(yhat)
#    print('___________________----------\n')
#    print(X_test)
#    print('___________________----------\n')
#    print(y_test)
    acc = accuracy_score(y_test, yhat)
    outer_results.append(acc)
    predicitons_list.append(yhat)
    test_data_list.append(X_test)
    ground_truth_list.append(y_test)
    
   
    print('>acc=%.3f, est=%.3f, cfg=%s' % (acc, result.best_score_, result.best_params_))
    print('_____________________________')
    print('\n')
    
print('Accuracy: %.3f (%.3f)' % (mean(outer_results), std(outer_results)))    



{'max_depth': 9, 'n_estimators': 50}
RandomForestClassifier(class_weight='balanced', max_depth=9, n_estimators=50,
                       random_state=42)
>acc=0.211, est=0.254, cfg={'max_depth': 9, 'n_estimators': 50}
_____________________________


{'max_depth': 9, 'n_estimators': 50}
RandomForestClassifier(class_weight='balanced', max_depth=9, n_estimators=50,
                       random_state=42)
>acc=0.237, est=0.255, cfg={'max_depth': 9, 'n_estimators': 50}
_____________________________


{'max_depth': 9, 'n_estimators': 100}
RandomForestClassifier(class_weight='balanced', max_depth=9, random_state=42)
>acc=0.135, est=0.307, cfg={'max_depth': 9, 'n_estimators': 100}
_____________________________


{'max_depth': 5, 'n_estimators': 10000}
RandomForestClassifier(class_weight='balanced', max_depth=5, n_estimators=10000,
                       random_state=42)
>acc=0.297, est=0.242, cfg={'max_depth': 5, 'n_estimators': 10000}
_____________________________


{'max_depth': 5, 'n_estim

In [136]:
print(type(yhat))
print(type(X_test))
print(type(y_test))
print(outer_results)

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
[0.21052631578947367, 0.23684210526315788, 0.13513513513513514, 0.2972972972972973, 0.2972972972972973, 0.13513513513513514, 0.2702702702702703, 0.13513513513513514, 0.2972972972972973, 0.24324324324324326]


In [137]:
#np.mean(inner_nested_test_score)

In [138]:
#inner_nested_cv_results

In [139]:
inner_nested_best_params

[{'max_depth': 9, 'n_estimators': 50},
 {'max_depth': 9, 'n_estimators': 50},
 {'max_depth': 9, 'n_estimators': 100},
 {'max_depth': 5, 'n_estimators': 10000},
 {'max_depth': 5, 'n_estimators': 10000},
 {'max_depth': 9, 'n_estimators': 100},
 {'max_depth': 9, 'n_estimators': 10000},
 {'max_depth': 9, 'n_estimators': 50},
 {'max_depth': 50, 'n_estimators': 50},
 {'max_depth': 3, 'n_estimators': 10000}]

In [140]:
inner_nested_test_score

[0.21052631578947367,
 0.23684210526315788,
 0.13513513513513514,
 0.2972972972972973,
 0.2972972972972973,
 0.13513513513513514,
 0.2702702702702703,
 0.13513513513513514,
 0.2972972972972973,
 0.24324324324324326]

In [225]:
# Manually select the right model (best accuracy)

outer_results_item = outer_results[3]
print(outer_results_item)
df_predict_item = predicitons_list[3]
#df_predict_item
test_data_item = test_data_list[3]
ground_truth_item = ground_truth_list[3]

0.2972972972972973


In [226]:
df_predict = pd.DataFrame()
df_predict['prediction'] = df_predict_item
df_predict

Unnamed: 0,prediction
0,Random Forest
1,KNN
2,Discriminative DL
3,Discriminative DL
4,Discriminative DL
5,Mean/Mode
6,VAE
7,VAE
8,Mean/Mode
9,Discriminative DL


In [227]:
df_testdata = pd.DataFrame({'MAR': test_data_item[:, 0], 'MCAR': test_data_item[:, 1], 'MNAR': test_data_item[:, 2], 'Missing Fraction': test_data_item[:, 3], 'NumberOfInstances': test_data_item[:, 4],'NumberOfNumericFeatures': test_data_item[:, 5], 'NumberOfCategoricalFeatures': test_data_item[:, 6]})
df_testdata


Unnamed: 0,MAR,MCAR,MNAR,Missing Fraction,NumberOfInstances,NumberOfNumericFeatures,NumberOfCategoricalFeatures
0,0.0,1.0,0.0,0.3,39366.0,0.0,10.0
1,0.0,0.0,1.0,0.3,39366.0,0.0,10.0
2,0.0,1.0,0.0,0.3,45312.0,7.0,2.0
3,0.0,1.0,0.0,0.5,45312.0,7.0,2.0
4,0.0,1.0,0.0,0.5,39366.0,9.0,1.0
5,1.0,0.0,0.0,0.01,11183.0,6.0,1.0
6,0.0,0.0,1.0,0.5,8192.0,8.0,1.0
7,0.0,1.0,0.0,0.5,3107.0,6.0,1.0
8,0.0,0.0,1.0,0.1,3107.0,6.0,1.0
9,0.0,1.0,0.0,0.5,20640.0,8.0,1.0


In [228]:
model_prediction = pd.DataFrame()
model_prediction = pd.concat([df_predict, df_testdata], axis=1)

#model_prediction.to_csv('model_prediction.csv', index=False)
model_prediction

Unnamed: 0,prediction,MAR,MCAR,MNAR,Missing Fraction,NumberOfInstances,NumberOfNumericFeatures,NumberOfCategoricalFeatures
0,Random Forest,0.0,1.0,0.0,0.3,39366.0,0.0,10.0
1,KNN,0.0,0.0,1.0,0.3,39366.0,0.0,10.0
2,Discriminative DL,0.0,1.0,0.0,0.3,45312.0,7.0,2.0
3,Discriminative DL,0.0,1.0,0.0,0.5,45312.0,7.0,2.0
4,Discriminative DL,0.0,1.0,0.0,0.5,39366.0,9.0,1.0
5,Mean/Mode,1.0,0.0,0.0,0.01,11183.0,6.0,1.0
6,VAE,0.0,0.0,1.0,0.5,8192.0,8.0,1.0
7,VAE,0.0,1.0,0.0,0.5,3107.0,6.0,1.0
8,Mean/Mode,0.0,0.0,1.0,0.1,3107.0,6.0,1.0
9,Discriminative DL,0.0,1.0,0.0,0.5,20640.0,8.0,1.0


In [229]:
one_hot_reverse = model_prediction.copy()
one_hot_reverse = one_hot_reverse[['MAR', 'MCAR', 'MNAR']]

one_hot_reverse['Missing Type'] = one_hot_reverse.idxmax(1)
one_hot_reverse = one_hot_reverse[['Missing Type']]
one_hot_reverse

Unnamed: 0,Missing Type
0,MCAR
1,MNAR
2,MCAR
3,MCAR
4,MCAR
5,MAR
6,MNAR
7,MCAR
8,MNAR
9,MCAR


In [230]:
model_prediction = pd.concat([model_prediction, one_hot_reverse], axis=1)
model_prediction = model_prediction.drop(['MAR', 'MCAR', 'MNAR'], axis=1)
model_prediction['control'] = 20
model_prediction = model_prediction.rename(columns={"prediction": "Imputation_Method"})

model_prediction

Unnamed: 0,Imputation_Method,Missing Fraction,NumberOfInstances,NumberOfNumericFeatures,NumberOfCategoricalFeatures,Missing Type,control
0,Random Forest,0.3,39366.0,0.0,10.0,MCAR,20
1,KNN,0.3,39366.0,0.0,10.0,MNAR,20
2,Discriminative DL,0.3,45312.0,7.0,2.0,MCAR,20
3,Discriminative DL,0.5,45312.0,7.0,2.0,MCAR,20
4,Discriminative DL,0.5,39366.0,9.0,1.0,MCAR,20
5,Mean/Mode,0.01,11183.0,6.0,1.0,MAR,20
6,VAE,0.5,8192.0,8.0,1.0,MNAR,20
7,VAE,0.5,3107.0,6.0,1.0,MCAR,20
8,Mean/Mode,0.1,3107.0,6.0,1.0,MNAR,20
9,Discriminative DL,0.5,20640.0,8.0,1.0,MCAR,20


In [231]:
data_control = pd.read_csv('binary_imputed_full_info.csv')


data_control


Unnamed: 0,Imputation_Method,Task,Missing Type,Missing Fraction,Column,result_type,metric,Baseline,Corrupted,Imputed,...,MinorityClassSize,NumberOfFeatures,NumberOfInstances,NumberOfNumericFeatures,NumberOfCategoricalFeatures,NumberOfClasses,Downstream Performance Rank,Data_Constellation,Data_Constellation_full,Performance Difference to Average Best
0,Random Forest,137,MAR,0.01,top-middle-square,downstream_performance_mean,Classification Tasks,0.674909,0.0,0.674760,...,13664.0,10.0,39366.0,0.0,10.0,,1.0,MAR - 0.01,MAR - 0.01 - 137,0.000000e+00
1,KNN,137,MAR,0.01,top-middle-square,downstream_performance_mean,Classification Tasks,0.674729,0.0,0.674760,...,13664.0,10.0,39366.0,0.0,10.0,,2.0,MAR - 0.01,MAR - 0.01 - 137,-2.949553e-08
2,Mean/Mode,137,MAR,0.01,top-middle-square,downstream_performance_mean,Classification Tasks,0.674145,0.0,0.674116,...,13664.0,10.0,39366.0,0.0,10.0,,5.0,MAR - 0.01,MAR - 0.01 - 137,-6.439471e-04
3,VAE,137,MAR,0.01,top-middle-square,downstream_performance_mean,Classification Tasks,0.674729,0.0,0.674488,...,13664.0,10.0,39366.0,0.0,10.0,,3.0,MAR - 0.01,MAR - 0.01 - 137,-2.716528e-04
4,Discriminative DL,137,MAR,0.01,top-middle-square,downstream_performance_mean,Classification Tasks,0.674505,0.0,0.674251,...,13664.0,10.0,39366.0,0.0,10.0,,4.0,MAR - 0.01,MAR - 0.01 - 137,-5.093850e-04
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2120,KNN,42493,MNAR,0.50,Length,downstream_performance_mean,Classification Tasks,0.619680,0.0,0.620028,...,12035.0,8.0,26969.0,2.0,6.0,,6.0,MNAR - 0.5,MNAR - 0.5 - 42493,-2.447951e-03
2121,Mean/Mode,42493,MNAR,0.50,Length,downstream_performance_mean,Classification Tasks,0.627750,0.0,0.627879,...,12035.0,8.0,26969.0,2.0,6.0,,1.0,MNAR - 0.5,MNAR - 0.5 - 42493,5.402509e-03
2122,VAE,42493,MNAR,0.50,Length,downstream_performance_mean,Classification Tasks,0.625687,0.0,0.626914,...,12035.0,8.0,26969.0,2.0,6.0,,2.0,MNAR - 0.5,MNAR - 0.5 - 42493,4.437433e-03
2123,GAIN,42493,MNAR,0.50,Length,downstream_performance_mean,Classification Tasks,0.625497,0.0,0.626178,...,12035.0,8.0,26969.0,2.0,6.0,,3.0,MNAR - 0.5,MNAR - 0.5 - 42493,3.701885e-03


In [232]:
result_improv = pd.merge(model_prediction, data_control, how="left", on=["Imputation_Method", "Missing Fraction", "NumberOfInstances", "NumberOfNumericFeatures", "NumberOfCategoricalFeatures", "Missing Type"])
result_improv = result_improv[result_improv['Performance Difference to Average Best'].notna()]
#result_improv.to_csv('result_merge_control.csv')

result_improv

Unnamed: 0.1,Imputation_Method,Missing Fraction,NumberOfInstances,NumberOfNumericFeatures,NumberOfCategoricalFeatures,Missing Type,control,Task,Column,result_type,...,Unnamed: 0,name,MajorityClassSize,MinorityClassSize,NumberOfFeatures,NumberOfClasses,Downstream Performance Rank,Data_Constellation,Data_Constellation_full,Performance Difference to Average Best
0,Random Forest,0.3,39366.0,0.0,10.0,MCAR,20,137.0,top-middle-square,downstream_performance_mean,...,34.0,BNG(tic-tac-toe),25702.0,13664.0,10.0,,2.0,MCAR - 0.3,MCAR - 0.3 - 137,0.0
1,KNN,0.3,39366.0,0.0,10.0,MNAR,20,137.0,top-middle-square,downstream_performance_mean,...,34.0,BNG(tic-tac-toe),25702.0,13664.0,10.0,,4.0,MNAR - 0.3,MNAR - 0.3 - 137,-0.001647584
2,Discriminative DL,0.3,45312.0,7.0,2.0,MCAR,20,151.0,nswprice,downstream_performance_mean,...,37.0,electricity,26075.0,19237.0,9.0,,3.0,MCAR - 0.3,MCAR - 0.3 - 151,-0.007765528
3,Discriminative DL,0.5,45312.0,7.0,2.0,MCAR,20,151.0,nswprice,downstream_performance_mean,...,37.0,electricity,26075.0,19237.0,9.0,,3.0,MCAR - 0.5,MCAR - 0.5 - 151,-0.02230848
4,Discriminative DL,0.5,39366.0,9.0,1.0,MCAR,20,251.0,Cell_Shape_Uniformity,downstream_performance_mean,...,35.0,BNG(breast-w),25820.0,13546.0,10.0,,3.0,MCAR - 0.5,MCAR - 0.5 - 251,-0.0006800371
5,Mean/Mode,0.01,11183.0,6.0,1.0,MAR,20,310.0,attr2,downstream_performance_mean,...,12.0,mammography,10923.0,260.0,7.0,,4.0,MAR - 0.01,MAR - 0.01 - 310,-0.01163999
6,VAE,0.5,8192.0,8.0,1.0,MNAR,20,725.0,a2pop,downstream_performance_mean,...,9.0,bank8FM,4885.0,3307.0,9.0,,1.0,MNAR - 0.5,MNAR - 0.5 - 725,0.08407018
7,VAE,0.5,3107.0,6.0,1.0,MCAR,20,737.0,INCOME,downstream_performance_mean,...,0.0,space_ga,1566.0,1541.0,7.0,,1.0,MCAR - 0.5,MCAR - 0.5 - 737,0.005708595
8,Mean/Mode,0.1,3107.0,6.0,1.0,MNAR,20,737.0,INCOME,downstream_performance_mean,...,0.0,space_ga,1566.0,1541.0,7.0,,2.0,MNAR - 0.1,MNAR - 0.1 - 737,0.002712497
9,Discriminative DL,0.5,20640.0,8.0,1.0,MCAR,20,823.0,total_rooms,downstream_performance_mean,...,24.0,houses,11726.0,8914.0,9.0,,1.0,MCAR - 0.5,MCAR - 0.5 - 823,0.0006378209


In [233]:
av_improv_model = result_improv['Performance Difference to Average Best'].mean()
print(av_improv_model)

0.004139915566466067


## Experiment Results adjusted for 0.01 F1 Score Points

For the training we use the experiment results, but replaced the best method for each data constellation with the average best method, if the average best method is not outscored by at least 0.01 F1 score points.

### Training

In [150]:
data = pd.read_csv('binary_properties_train_dataset_1_percent.csv')
#data.drop(['Unnamed: 0'], axis=1)
data

Unnamed: 0.1,Unnamed: 0,Imputation_Method,Missing Type,Missing Fraction,NumberOfInstances,NumberOfNumericFeatures,NumberOfCategoricalFeatures
0,999,Random Forest,MAR,0.01,15545.0,5.0,1.0
1,1071,Random Forest,MAR,0.01,19020.0,11.0,1.0
2,1140,Random Forest,MAR,0.01,39948.0,9.0,1.0
3,4,Random Forest,MAR,0.01,39366.0,0.0,10.0
4,1209,Random Forest,MAR,0.01,14980.0,14.0,1.0
...,...,...,...,...,...,...,...
355,711,Random Forest,MNAR,0.50,6574.0,14.0,1.0
356,783,Random Forest,MNAR,0.50,3848.0,5.0,1.0
357,849,Random Forest,MNAR,0.50,40768.0,7.0,4.0
358,920,Random Forest,MNAR,0.50,40768.0,10.0,1.0


In [151]:
# OneHot-encoder -> multiple column approach
# 

transformer = make_column_transformer(
    (OneHotEncoder(), ['Imputation_Method', 'Missing Type']),
    remainder='passthrough')

transformed = transformer.fit_transform(data)
data_preprocessed = pd.DataFrame(transformed, columns=transformer.get_feature_names_out())
data_preprocessed


Unnamed: 0,onehotencoder__Imputation_Method_Discriminative DL,onehotencoder__Imputation_Method_GAIN,onehotencoder__Imputation_Method_KNN,onehotencoder__Imputation_Method_Mean/Mode,onehotencoder__Imputation_Method_Random Forest,onehotencoder__Imputation_Method_VAE,onehotencoder__Missing Type_MAR,onehotencoder__Missing Type_MCAR,onehotencoder__Missing Type_MNAR,remainder__Unnamed: 0,remainder__Missing Fraction,remainder__NumberOfInstances,remainder__NumberOfNumericFeatures,remainder__NumberOfCategoricalFeatures
0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,999.0,0.01,15545.0,5.0,1.0
1,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1071.0,0.01,19020.0,11.0,1.0
2,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1140.0,0.01,39948.0,9.0,1.0
3,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,4.0,0.01,39366.0,0.0,10.0
4,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1209.0,0.01,14980.0,14.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
355,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,711.0,0.50,6574.0,14.0,1.0
356,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,783.0,0.50,3848.0,5.0,1.0
357,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,849.0,0.50,40768.0,7.0,4.0
358,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,920.0,0.50,40768.0,10.0,1.0


In [152]:
# Umwandlung in numpy arrays
X = data_preprocessed[['onehotencoder__Missing Type_MAR', 'onehotencoder__Missing Type_MCAR', 
                         'onehotencoder__Missing Type_MNAR', 'remainder__Missing Fraction', 
                         'remainder__NumberOfInstances',  
                         'remainder__NumberOfNumericFeatures', 'remainder__NumberOfCategoricalFeatures']].to_numpy()
#X = data_preprocessed_x
y = data[['Imputation_Method']].to_numpy()


# Aufteilung in train und test Daten --> 20 % Testdaten
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) 
X_train.shape
#print(X_train)

(288, 7)

In [153]:
y_train.shape

(288, 1)

In [154]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

print(X_train)
print(y)



[[-0.67419986 -0.72932496  1.39239919 ...  0.19532753 -1.28703691
   1.56988381]
 [-0.67419986 -0.72932496  1.39239919 ... -0.61641947  1.69286453
  -0.46672221]
 [-0.67419986  1.37113092 -0.71818485 ... -0.75529876 -0.45928651
  -0.46672221]
 ...
 [-0.67419986 -0.72932496  1.39239919 ... -0.27855451 -0.79038667
  -0.46672221]
 [-0.67419986 -0.72932496  1.39239919 ...  0.32105726  2.18951478
  -0.46672221]
 [ 1.4832397  -0.72932496 -0.71818485 ... -0.45949582 -0.62483659
  -0.46672221]]
[['Random Forest']
 ['Random Forest']
 ['Random Forest']
 ['Random Forest']
 ['Random Forest']
 ['Random Forest']
 ['Random Forest']
 ['Random Forest']
 ['Random Forest']
 ['Random Forest']
 ['Random Forest']
 ['Random Forest']
 ['Random Forest']
 ['Random Forest']
 ['Random Forest']
 ['VAE']
 ['Random Forest']
 ['Random Forest']
 ['Random Forest']
 ['Random Forest']
 ['Random Forest']
 ['Random Forest']
 ['Random Forest']
 ['Random Forest']
 ['Random Forest']
 ['Random Forest']
 ['Random Forest']
 ['Ra

In [155]:
X_train.shape

(288, 7)

In [156]:
y.shape

(360, 1)

### Random Forest with Nested Cross Validation, with GridSearch

In [163]:
X_train = X
y_train = y

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

outer_cv = KFold(n_splits=10, shuffle=True, random_state=42)
outer_results_1 = list()
predicitons_list_1 = list()
test_data_list_1 = list()
ground_truth_list_1 = list()

inner_nested_cv_results= []
inner_nested_best_score= []
inner_nested_best_params= []
inner_nested_test_score = []

for train_index, test_index in outer_cv.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    inner_cv = KFold(n_splits=5, shuffle=True, random_state=42)


    
    clf = RandomForestClassifier(random_state=42, class_weight='balanced')#, class_weight='balanced')

    param_grid = {
        'n_estimators': [25, 100, 500, 1000, 3000, 20000],# 10000],#, 3000, 4000, 5000, 6000, 7000],
#    'max_features': ['sqrt', 'log2'],
#    'criterion' :['gini', 'entropy'],
        'max_depth': [1,9,12,15,40]#,3,4,5,6,7]
    }

    grid_clf = GridSearchCV(clf, param_grid, cv=inner_cv)
    result = grid_clf.fit(X_train, y_train.ravel())
    best_model = result.best_estimator_
    print(grid_clf.best_params_)
    print(best_model)

    inner_nested_cv_results.append(grid_clf.cv_results_)
    inner_nested_best_score.append(grid_clf.best_score_)
    inner_nested_best_params.append(grid_clf.best_params_)
    inner_nested_test_score.append(grid_clf.score(X_test, y_test))

    yhat = best_model.predict(X_test)
#    print(yhat)
#    print('___________________----------\n')
#    print(X_test)
#    print('___________________----------\n')
#    print(y_test)
    acc = accuracy_score(y_test, yhat)
    outer_results_1.append(acc)
    predicitons_list_1.append(yhat)
    test_data_list_1.append(X_test)
    ground_truth_list_1.append(y_test)
    
   
    print('>acc=%.3f, est=%.3f, cfg=%s' % (acc, result.best_score_, result.best_params_))
    print('_____________________________')
    print('\n')
    
print('Accuracy: %.3f (%.3f)' % (mean(outer_results_1), std(outer_results_1)))    



{'max_depth': 12, 'n_estimators': 1000}
RandomForestClassifier(class_weight='balanced', max_depth=12, n_estimators=1000,
                       random_state=42)
>acc=0.750, est=0.827, cfg={'max_depth': 12, 'n_estimators': 1000}
_____________________________


{'max_depth': 12, 'n_estimators': 500}
RandomForestClassifier(class_weight='balanced', max_depth=12, n_estimators=500,
                       random_state=42)
>acc=0.694, est=0.824, cfg={'max_depth': 12, 'n_estimators': 500}
_____________________________


{'max_depth': 15, 'n_estimators': 100}
RandomForestClassifier(class_weight='balanced', max_depth=15, random_state=42)
>acc=0.778, est=0.821, cfg={'max_depth': 15, 'n_estimators': 100}
_____________________________


{'max_depth': 9, 'n_estimators': 500}
RandomForestClassifier(class_weight='balanced', max_depth=9, n_estimators=500,
                       random_state=42)
>acc=0.861, est=0.815, cfg={'max_depth': 9, 'n_estimators': 500}
_____________________________


{'max_depth':

In [164]:
print(type(yhat))
print(type(X_test))
print(type(y_test))
print(outer_results_1)

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
[0.75, 0.6944444444444444, 0.7777777777777778, 0.8611111111111112, 0.8333333333333334, 0.8333333333333334, 0.8611111111111112, 0.8333333333333334, 0.8055555555555556, 0.8888888888888888]


In [165]:
#np.mean(inner_nested_test_score)

In [166]:
#inner_nested_cv_results

In [167]:
inner_nested_best_params

[{'max_depth': 12, 'n_estimators': 1000},
 {'max_depth': 12, 'n_estimators': 500},
 {'max_depth': 15, 'n_estimators': 100},
 {'max_depth': 9, 'n_estimators': 500},
 {'max_depth': 9, 'n_estimators': 1000},
 {'max_depth': 9, 'n_estimators': 100},
 {'max_depth': 12, 'n_estimators': 3000},
 {'max_depth': 12, 'n_estimators': 1000},
 {'max_depth': 9, 'n_estimators': 25},
 {'max_depth': 12, 'n_estimators': 20000}]

In [168]:
inner_nested_test_score

[0.75,
 0.6944444444444444,
 0.7777777777777778,
 0.8611111111111112,
 0.8333333333333334,
 0.8333333333333334,
 0.8611111111111112,
 0.8333333333333334,
 0.8055555555555556,
 0.8888888888888888]

In [216]:
# Manually select the right model (best accuracy)

outer_results_item = outer_results_1[9]
print(outer_results_item)
df_predict_item = predicitons_list_1[9]
#df_predict_item
test_data_item = test_data_list_1[9]
ground_truth_item = ground_truth_list_1[9]

0.8888888888888888


In [217]:
df_predict = pd.DataFrame()
df_predict['prediction'] = df_predict_item
df_predict

Unnamed: 0,prediction
0,Random Forest
1,Random Forest
2,Random Forest
3,Random Forest
4,Random Forest
5,Random Forest
6,Random Forest
7,Random Forest
8,VAE
9,Random Forest


In [218]:
df_testdata = pd.DataFrame({'MAR': test_data_item[:, 0], 'MCAR': test_data_item[:, 1], 'MNAR': test_data_item[:, 2], 'Missing Fraction': test_data_item[:, 3], 'NumberOfInstances': test_data_item[:, 4],'NumberOfNumericFeatures': test_data_item[:, 5], 'NumberOfCategoricalFeatures': test_data_item[:, 6]})
df_testdata


Unnamed: 0,MAR,MCAR,MNAR,Missing Fraction,NumberOfInstances,NumberOfNumericFeatures,NumberOfCategoricalFeatures
0,1.0,0.0,0.0,0.01,8192.0,8.0,1.0
1,1.0,0.0,0.0,0.01,4052.0,7.0,1.0
2,1.0,0.0,0.0,0.1,30000.0,23.0,1.0
3,1.0,0.0,0.0,0.1,8192.0,8.0,1.0
4,1.0,0.0,0.0,0.1,20640.0,8.0,1.0
5,1.0,0.0,0.0,0.1,40768.0,10.0,1.0
6,1.0,0.0,0.0,0.3,39366.0,9.0,1.0
7,1.0,0.0,0.0,0.3,40768.0,7.0,4.0
8,1.0,0.0,0.0,0.5,4521.0,7.0,10.0
9,1.0,0.0,0.0,0.5,11183.0,6.0,1.0


In [219]:
model_prediction = pd.DataFrame()
model_prediction = pd.concat([df_predict, df_testdata], axis=1)

#model_prediction.to_csv('model_prediction.csv', index=False)
model_prediction

Unnamed: 0,prediction,MAR,MCAR,MNAR,Missing Fraction,NumberOfInstances,NumberOfNumericFeatures,NumberOfCategoricalFeatures
0,Random Forest,1.0,0.0,0.0,0.01,8192.0,8.0,1.0
1,Random Forest,1.0,0.0,0.0,0.01,4052.0,7.0,1.0
2,Random Forest,1.0,0.0,0.0,0.1,30000.0,23.0,1.0
3,Random Forest,1.0,0.0,0.0,0.1,8192.0,8.0,1.0
4,Random Forest,1.0,0.0,0.0,0.1,20640.0,8.0,1.0
5,Random Forest,1.0,0.0,0.0,0.1,40768.0,10.0,1.0
6,Random Forest,1.0,0.0,0.0,0.3,39366.0,9.0,1.0
7,Random Forest,1.0,0.0,0.0,0.3,40768.0,7.0,4.0
8,VAE,1.0,0.0,0.0,0.5,4521.0,7.0,10.0
9,Random Forest,1.0,0.0,0.0,0.5,11183.0,6.0,1.0


In [220]:
one_hot_reverse = model_prediction.copy()
one_hot_reverse = one_hot_reverse[['MAR', 'MCAR', 'MNAR']]

one_hot_reverse['Missing Type'] = one_hot_reverse.idxmax(1)
one_hot_reverse = one_hot_reverse[['Missing Type']]
one_hot_reverse

Unnamed: 0,Missing Type
0,MAR
1,MAR
2,MAR
3,MAR
4,MAR
5,MAR
6,MAR
7,MAR
8,MAR
9,MAR


In [221]:
model_prediction = pd.concat([model_prediction, one_hot_reverse], axis=1)
model_prediction = model_prediction.drop(['MAR', 'MCAR', 'MNAR'], axis=1)
model_prediction['control'] = 20
model_prediction = model_prediction.rename(columns={"prediction": "Imputation_Method"})

model_prediction

Unnamed: 0,Imputation_Method,Missing Fraction,NumberOfInstances,NumberOfNumericFeatures,NumberOfCategoricalFeatures,Missing Type,control
0,Random Forest,0.01,8192.0,8.0,1.0,MAR,20
1,Random Forest,0.01,4052.0,7.0,1.0,MAR,20
2,Random Forest,0.1,30000.0,23.0,1.0,MAR,20
3,Random Forest,0.1,8192.0,8.0,1.0,MAR,20
4,Random Forest,0.1,20640.0,8.0,1.0,MAR,20
5,Random Forest,0.1,40768.0,10.0,1.0,MAR,20
6,Random Forest,0.3,39366.0,9.0,1.0,MAR,20
7,Random Forest,0.3,40768.0,7.0,4.0,MAR,20
8,VAE,0.5,4521.0,7.0,10.0,MAR,20
9,Random Forest,0.5,11183.0,6.0,1.0,MAR,20


In [222]:
data_control = pd.read_csv('binary_imputed_full_info.csv')
#data_multi = pd.read_csv('multi_imputed_full_info.csv')

data_control

Unnamed: 0,Imputation_Method,Task,Missing Type,Missing Fraction,Column,result_type,metric,Baseline,Corrupted,Imputed,...,MinorityClassSize,NumberOfFeatures,NumberOfInstances,NumberOfNumericFeatures,NumberOfCategoricalFeatures,NumberOfClasses,Downstream Performance Rank,Data_Constellation,Data_Constellation_full,Performance Difference to Average Best
0,Random Forest,137,MAR,0.01,top-middle-square,downstream_performance_mean,Classification Tasks,0.674909,0.0,0.674760,...,13664.0,10.0,39366.0,0.0,10.0,,1.0,MAR - 0.01,MAR - 0.01 - 137,0.000000e+00
1,KNN,137,MAR,0.01,top-middle-square,downstream_performance_mean,Classification Tasks,0.674729,0.0,0.674760,...,13664.0,10.0,39366.0,0.0,10.0,,2.0,MAR - 0.01,MAR - 0.01 - 137,-2.949553e-08
2,Mean/Mode,137,MAR,0.01,top-middle-square,downstream_performance_mean,Classification Tasks,0.674145,0.0,0.674116,...,13664.0,10.0,39366.0,0.0,10.0,,5.0,MAR - 0.01,MAR - 0.01 - 137,-6.439471e-04
3,VAE,137,MAR,0.01,top-middle-square,downstream_performance_mean,Classification Tasks,0.674729,0.0,0.674488,...,13664.0,10.0,39366.0,0.0,10.0,,3.0,MAR - 0.01,MAR - 0.01 - 137,-2.716528e-04
4,Discriminative DL,137,MAR,0.01,top-middle-square,downstream_performance_mean,Classification Tasks,0.674505,0.0,0.674251,...,13664.0,10.0,39366.0,0.0,10.0,,4.0,MAR - 0.01,MAR - 0.01 - 137,-5.093850e-04
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2120,KNN,42493,MNAR,0.50,Length,downstream_performance_mean,Classification Tasks,0.619680,0.0,0.620028,...,12035.0,8.0,26969.0,2.0,6.0,,6.0,MNAR - 0.5,MNAR - 0.5 - 42493,-2.447951e-03
2121,Mean/Mode,42493,MNAR,0.50,Length,downstream_performance_mean,Classification Tasks,0.627750,0.0,0.627879,...,12035.0,8.0,26969.0,2.0,6.0,,1.0,MNAR - 0.5,MNAR - 0.5 - 42493,5.402509e-03
2122,VAE,42493,MNAR,0.50,Length,downstream_performance_mean,Classification Tasks,0.625687,0.0,0.626914,...,12035.0,8.0,26969.0,2.0,6.0,,2.0,MNAR - 0.5,MNAR - 0.5 - 42493,4.437433e-03
2123,GAIN,42493,MNAR,0.50,Length,downstream_performance_mean,Classification Tasks,0.625497,0.0,0.626178,...,12035.0,8.0,26969.0,2.0,6.0,,3.0,MNAR - 0.5,MNAR - 0.5 - 42493,3.701885e-03


In [223]:
result_improv = pd.merge(model_prediction, data_control, how="left", on=["Imputation_Method", "Missing Fraction", "NumberOfInstances", "NumberOfNumericFeatures", "NumberOfCategoricalFeatures", "Missing Type"])
result_improv = result_improv[result_improv['Performance Difference to Average Best'].notna()]
#result_improv.to_csv('result_merge_control.csv')

result_improv

Unnamed: 0.1,Imputation_Method,Missing Fraction,NumberOfInstances,NumberOfNumericFeatures,NumberOfCategoricalFeatures,Missing Type,control,Task,Column,result_type,...,Unnamed: 0,name,MajorityClassSize,MinorityClassSize,NumberOfFeatures,NumberOfClasses,Downstream Performance Rank,Data_Constellation,Data_Constellation_full,Performance Difference to Average Best
0,Random Forest,0.01,8192.0,8.0,1.0,MAR,20,725,a2pop,downstream_performance_mean,...,9,bank8FM,4885.0,3307.0,9.0,,4.0,MAR - 0.01,MAR - 0.01 - 725,0.0
1,Random Forest,0.01,4052.0,7.0,1.0,MAR,20,728,Lower_court_disagreement,downstream_performance_mean,...,3,analcatdata_supreme,3081.0,971.0,8.0,,3.0,MAR - 0.01,MAR - 0.01 - 728,0.0
2,Random Forest,0.1,30000.0,23.0,1.0,MAR,20,42477,x1,downstream_performance_mean,...,42,default-of-credit-card-clients,23364.0,6636.0,24.0,,6.0,MAR - 0.1,MAR - 0.1 - 42477,0.0
3,Random Forest,0.1,8192.0,8.0,1.0,MAR,20,725,a2pop,downstream_performance_mean,...,9,bank8FM,4885.0,3307.0,9.0,,1.0,MAR - 0.1,MAR - 0.1 - 725,0.0
4,Random Forest,0.1,20640.0,8.0,1.0,MAR,20,823,total_rooms,downstream_performance_mean,...,24,houses,11726.0,8914.0,9.0,,4.0,MAR - 0.1,MAR - 0.1 - 823,0.0
5,Random Forest,0.1,40768.0,10.0,1.0,MAR,20,901,X2,downstream_performance_mean,...,38,fried,20427.0,20341.0,11.0,,2.0,MAR - 0.1,MAR - 0.1 - 901,0.0
6,Random Forest,0.3,39366.0,9.0,1.0,MAR,20,251,Cell_Shape_Uniformity,downstream_performance_mean,...,35,BNG(breast-w),25820.0,13546.0,10.0,,1.0,MAR - 0.3,MAR - 0.3 - 251,0.0
7,Random Forest,0.3,40768.0,7.0,4.0,MAR,20,881,x3,downstream_performance_mean,...,39,mv,24321.0,16447.0,11.0,,4.0,MAR - 0.3,MAR - 0.3 - 881,0.0
8,VAE,0.5,4521.0,7.0,10.0,MAR,20,1558,V7,downstream_performance_mean,...,11,bank-marketing,4000.0,521.0,17.0,,2.0,MAR - 0.5,MAR - 0.5 - 1558,0.013527
9,Random Forest,0.5,11183.0,6.0,1.0,MAR,20,310,attr2,downstream_performance_mean,...,12,mammography,10923.0,260.0,7.0,,5.0,MAR - 0.5,MAR - 0.5 - 310,0.0


In [224]:
av_improv_model = result_improv['Performance Difference to Average Best'].mean()
print(av_improv_model)

0.002764540164239589


## Experiment Results adjusted for 0.03 F1 Score Points

For the training we use the experiment results, but replaced the best method for each data constellation with the average best method, if the average best method is not outscored by at least 0.03 F1 score points.

### Training

In [178]:
data = pd.read_csv('binary_properties_train_dataset_3_percent.csv')
#data.drop(['Unnamed: 0'], axis=1)
data

Unnamed: 0.1,Unnamed: 0,Imputation_Method,Missing Type,Missing Fraction,NumberOfInstances,NumberOfNumericFeatures,NumberOfCategoricalFeatures
0,999,Random Forest,MAR,0.01,15545.0,5.0,1.0
1,1071,Random Forest,MAR,0.01,19020.0,11.0,1.0
2,1140,Random Forest,MAR,0.01,39948.0,9.0,1.0
3,4,Random Forest,MAR,0.01,39366.0,0.0,10.0
4,1209,Random Forest,MAR,0.01,14980.0,14.0,1.0
...,...,...,...,...,...,...,...
355,711,Random Forest,MNAR,0.50,6574.0,14.0,1.0
356,783,Random Forest,MNAR,0.50,3848.0,5.0,1.0
357,849,Random Forest,MNAR,0.50,40768.0,7.0,4.0
358,920,Random Forest,MNAR,0.50,40768.0,10.0,1.0


In [179]:
# OneHot-encoder -> multiple column approach
# 

transformer = make_column_transformer(
    (OneHotEncoder(), ['Imputation_Method', 'Missing Type']),
    remainder='passthrough')

transformed = transformer.fit_transform(data)
data_preprocessed = pd.DataFrame(transformed, columns=transformer.get_feature_names_out())
data_preprocessed


Unnamed: 0,onehotencoder__Imputation_Method_Discriminative DL,onehotencoder__Imputation_Method_GAIN,onehotencoder__Imputation_Method_KNN,onehotencoder__Imputation_Method_Mean/Mode,onehotencoder__Imputation_Method_Random Forest,onehotencoder__Imputation_Method_VAE,onehotencoder__Missing Type_MAR,onehotencoder__Missing Type_MCAR,onehotencoder__Missing Type_MNAR,remainder__Unnamed: 0,remainder__Missing Fraction,remainder__NumberOfInstances,remainder__NumberOfNumericFeatures,remainder__NumberOfCategoricalFeatures
0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,999.0,0.01,15545.0,5.0,1.0
1,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1071.0,0.01,19020.0,11.0,1.0
2,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1140.0,0.01,39948.0,9.0,1.0
3,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,4.0,0.01,39366.0,0.0,10.0
4,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1209.0,0.01,14980.0,14.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
355,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,711.0,0.50,6574.0,14.0,1.0
356,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,783.0,0.50,3848.0,5.0,1.0
357,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,849.0,0.50,40768.0,7.0,4.0
358,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,920.0,0.50,40768.0,10.0,1.0


In [180]:
# Umwandlung in numpy arrays
X = data_preprocessed[['onehotencoder__Missing Type_MAR', 'onehotencoder__Missing Type_MCAR', 
                         'onehotencoder__Missing Type_MNAR', 'remainder__Missing Fraction', 
                         'remainder__NumberOfInstances',  
                         'remainder__NumberOfNumericFeatures', 'remainder__NumberOfCategoricalFeatures']].to_numpy()
#X = data_preprocessed_x
y = data[['Imputation_Method']].to_numpy()


# train/test split --> 20 % Testdata
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) 
X_train.shape
#print(X_train)

(288, 7)

In [181]:
y_train.shape

(288, 1)

In [182]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

print(X_train)
print(y)



[[-0.67419986 -0.72932496  1.39239919 ...  0.19532753 -1.28703691
   1.56988381]
 [-0.67419986 -0.72932496  1.39239919 ... -0.61641947  1.69286453
  -0.46672221]
 [-0.67419986  1.37113092 -0.71818485 ... -0.75529876 -0.45928651
  -0.46672221]
 ...
 [-0.67419986 -0.72932496  1.39239919 ... -0.27855451 -0.79038667
  -0.46672221]
 [-0.67419986 -0.72932496  1.39239919 ...  0.32105726  2.18951478
  -0.46672221]
 [ 1.4832397  -0.72932496 -0.71818485 ... -0.45949582 -0.62483659
  -0.46672221]]
[['Random Forest']
 ['Random Forest']
 ['Random Forest']
 ['Random Forest']
 ['Random Forest']
 ['Random Forest']
 ['Random Forest']
 ['Random Forest']
 ['Random Forest']
 ['Random Forest']
 ['Random Forest']
 ['Random Forest']
 ['Random Forest']
 ['Random Forest']
 ['Random Forest']
 ['VAE']
 ['Random Forest']
 ['Random Forest']
 ['Random Forest']
 ['Random Forest']
 ['Random Forest']
 ['Random Forest']
 ['Random Forest']
 ['Random Forest']
 ['Random Forest']
 ['Random Forest']
 ['Random Forest']
 ['Ra

In [183]:
X_train.shape

(288, 7)

In [184]:
y.shape

(360, 1)

### Random Forest with Nested Cross Validation, with GridSearch

In [191]:
X_train = X
y_train = y

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

outer_cv = KFold(n_splits=10, shuffle=True, random_state=42)
outer_results_3 = list()
predicitons_list_3 = list()
test_data_list_3 = list()
ground_truth_list_3 = list()

inner_nested_cv_results= []
inner_nested_best_score= []
inner_nested_best_params= []
inner_nested_test_score = []

for train_index, test_index in outer_cv.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    inner_cv = KFold(n_splits=5, shuffle=True, random_state=42)
    
    clf = RandomForestClassifier(random_state=42, class_weight='balanced')

    param_grid = {
        'n_estimators': [25, 100, 500, 1000, 20000],#, 3000, 4000, 5000, 6000, 7000],
#    'max_features': ['sqrt', 'log2'],
#    'criterion' :['gini', 'entropy'],
        'max_depth': [1,9,12,15,40]#,3,4,5,6,7]
    }

    grid_clf = GridSearchCV(clf, param_grid, cv=inner_cv)
    result = grid_clf.fit(X_train, y_train.ravel())
    best_model = result.best_estimator_
    print(grid_clf.best_params_)
    print(best_model)

    inner_nested_cv_results.append(grid_clf.cv_results_)
    inner_nested_best_score.append(grid_clf.best_score_)
    inner_nested_best_params.append(grid_clf.best_params_)
    inner_nested_test_score.append(grid_clf.score(X_test, y_test))

    yhat = best_model.predict(X_test)
#    print(yhat)
#    print('___________________----------\n')
#    print(X_test)
#    print('___________________----------\n')
#    print(y_test)
    acc = accuracy_score(y_test, yhat)
    outer_results_3.append(acc)
    predicitons_list_3.append(yhat)
    test_data_list_3.append(X_test)
    ground_truth_list_3.append(y_test)
    
   
    print('>acc=%.3f, est=%.3f, cfg=%s' % (acc, result.best_score_, result.best_params_))
    print('_____________________________')
    print('\n')
    
print('Accuracy: %.3f (%.3f)' % (mean(outer_results_3), std(outer_results_3)))    

{'max_depth': 9, 'n_estimators': 25}
RandomForestClassifier(class_weight='balanced', max_depth=9, n_estimators=25,
                       random_state=42)
>acc=0.889, est=0.911, cfg={'max_depth': 9, 'n_estimators': 25}
_____________________________


{'max_depth': 9, 'n_estimators': 500}
RandomForestClassifier(class_weight='balanced', max_depth=9, n_estimators=500,
                       random_state=42)
>acc=0.917, est=0.910, cfg={'max_depth': 9, 'n_estimators': 500}
_____________________________


{'max_depth': 9, 'n_estimators': 25}
RandomForestClassifier(class_weight='balanced', max_depth=9, n_estimators=25,
                       random_state=42)
>acc=0.861, est=0.911, cfg={'max_depth': 9, 'n_estimators': 25}
_____________________________


{'max_depth': 9, 'n_estimators': 25}
RandomForestClassifier(class_weight='balanced', max_depth=9, n_estimators=25,
                       random_state=42)
>acc=0.944, est=0.901, cfg={'max_depth': 9, 'n_estimators': 25}
_________________________

In [192]:
print(type(yhat))
print(type(X_test))
print(type(y_test))
print(outer_results_3)

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
[0.8888888888888888, 0.9166666666666666, 0.8611111111111112, 0.9444444444444444, 0.9166666666666666, 0.8611111111111112, 0.9722222222222222, 0.9722222222222222, 0.8888888888888888, 0.8888888888888888]


In [193]:
#np.mean(inner_nested_test_score)

In [194]:
inner_nested_cv_results

[{'mean_fit_time': array([ 0.04120975,  0.14562993,  0.70155973,  1.40711985, 27.92376227,
          0.03820882,  0.15883617,  0.74517293,  1.49434004, 29.61074519,
          0.03800879,  0.14883404,  0.73496714,  1.46473365, 29.7328548 ,
          0.03880582,  0.14883056,  0.73736806,  1.50654335, 29.74097843,
          0.03801165,  0.14883699,  0.73856931,  1.47433619, 29.9044219 ]),
  'std_fit_time': array([5.84595028e-03, 7.09055164e-03, 7.65907986e-03, 9.51896526e-03,
         1.00850906e-01, 4.00137997e-04, 1.47460824e-02, 1.38056785e-02,
         2.53754523e-02, 1.66815797e-01, 2.43140197e-07, 1.32702107e-03,
         3.48784277e-03, 7.11793990e-03, 1.96458332e-01, 3.98846125e-04,
         1.16376222e-03, 3.86875609e-03, 7.04977131e-02, 2.15286790e-01,
         6.03458100e-06, 7.49283172e-04, 4.02756022e-03, 8.92306366e-03,
         2.21494145e-01]),
  'mean_score_time': array([0.00320067, 0.01060538, 0.04881096, 0.09762244, 2.09027576,
         0.00320048, 0.01240253, 0.0510087

In [195]:
inner_nested_best_params

[{'max_depth': 9, 'n_estimators': 25},
 {'max_depth': 9, 'n_estimators': 500},
 {'max_depth': 9, 'n_estimators': 25},
 {'max_depth': 9, 'n_estimators': 25},
 {'max_depth': 12, 'n_estimators': 100},
 {'max_depth': 9, 'n_estimators': 500},
 {'max_depth': 12, 'n_estimators': 100},
 {'max_depth': 9, 'n_estimators': 500},
 {'max_depth': 9, 'n_estimators': 25},
 {'max_depth': 12, 'n_estimators': 500}]

In [196]:
inner_nested_test_score

[0.8888888888888888,
 0.9166666666666666,
 0.8611111111111112,
 0.9444444444444444,
 0.9166666666666666,
 0.8611111111111112,
 0.9722222222222222,
 0.9722222222222222,
 0.8888888888888888,
 0.8888888888888888]

In [207]:
# Manually select the right model (best accuracy)

outer_results_item = outer_results_3[6]
print(outer_results_item)
df_predict_item = predicitons_list_3[6]
#df_predict_item
test_data_item = test_data_list_3[6]
ground_truth_item = ground_truth_list_3[6]

0.9722222222222222


In [208]:
df_predict = pd.DataFrame()
df_predict['prediction'] = df_predict_item
df_predict

Unnamed: 0,prediction
0,Random Forest
1,Random Forest
2,Random Forest
3,Random Forest
4,Random Forest
5,Random Forest
6,Random Forest
7,Random Forest
8,Random Forest
9,Random Forest


In [209]:
df_testdata = pd.DataFrame({'MAR': test_data_item[:, 0], 'MCAR': test_data_item[:, 1], 'MNAR': test_data_item[:, 2], 'Missing Fraction': test_data_item[:, 3], 'NumberOfInstances': test_data_item[:, 4],'NumberOfNumericFeatures': test_data_item[:, 5], 'NumberOfCategoricalFeatures': test_data_item[:, 6]})
df_testdata


Unnamed: 0,MAR,MCAR,MNAR,Missing Fraction,NumberOfInstances,NumberOfNumericFeatures,NumberOfCategoricalFeatures
0,1.0,0.0,0.0,0.01,14980.0,14.0,1.0
1,1.0,0.0,0.0,0.01,11183.0,6.0,1.0
2,1.0,0.0,0.0,0.01,3848.0,5.0,1.0
3,1.0,0.0,0.0,0.01,40768.0,10.0,1.0
4,1.0,0.0,0.0,0.1,5404.0,5.0,1.0
5,1.0,0.0,0.0,0.1,39366.0,9.0,1.0
6,1.0,0.0,0.0,0.1,88588.0,6.0,1.0
7,1.0,0.0,0.0,0.1,4052.0,7.0,1.0
8,1.0,0.0,0.0,0.3,5404.0,5.0,1.0
9,1.0,0.0,0.0,0.3,96320.0,21.0,1.0


In [210]:
model_prediction = pd.DataFrame()
model_prediction = pd.concat([df_predict, df_testdata], axis=1)
#model_prediction = pd.concat([model_prediction, df_ground], axis=1)

#model_prediction.to_csv('model_prediction.csv', index=False)
model_prediction

Unnamed: 0,prediction,MAR,MCAR,MNAR,Missing Fraction,NumberOfInstances,NumberOfNumericFeatures,NumberOfCategoricalFeatures
0,Random Forest,1.0,0.0,0.0,0.01,14980.0,14.0,1.0
1,Random Forest,1.0,0.0,0.0,0.01,11183.0,6.0,1.0
2,Random Forest,1.0,0.0,0.0,0.01,3848.0,5.0,1.0
3,Random Forest,1.0,0.0,0.0,0.01,40768.0,10.0,1.0
4,Random Forest,1.0,0.0,0.0,0.1,5404.0,5.0,1.0
5,Random Forest,1.0,0.0,0.0,0.1,39366.0,9.0,1.0
6,Random Forest,1.0,0.0,0.0,0.1,88588.0,6.0,1.0
7,Random Forest,1.0,0.0,0.0,0.1,4052.0,7.0,1.0
8,Random Forest,1.0,0.0,0.0,0.3,5404.0,5.0,1.0
9,Random Forest,1.0,0.0,0.0,0.3,96320.0,21.0,1.0


In [211]:
one_hot_reverse = model_prediction.copy()
one_hot_reverse = one_hot_reverse[['MAR', 'MCAR', 'MNAR']]

one_hot_reverse['Missing Type'] = one_hot_reverse.idxmax(1)
one_hot_reverse = one_hot_reverse[['Missing Type']]
one_hot_reverse

Unnamed: 0,Missing Type
0,MAR
1,MAR
2,MAR
3,MAR
4,MAR
5,MAR
6,MAR
7,MAR
8,MAR
9,MAR


In [212]:
model_prediction = pd.concat([model_prediction, one_hot_reverse], axis=1)
model_prediction = model_prediction.drop(['MAR', 'MCAR', 'MNAR'], axis=1)
model_prediction['control'] = 20
model_prediction = model_prediction.rename(columns={"prediction": "Imputation_Method"})

model_prediction

Unnamed: 0,Imputation_Method,Missing Fraction,NumberOfInstances,NumberOfNumericFeatures,NumberOfCategoricalFeatures,Missing Type,control
0,Random Forest,0.01,14980.0,14.0,1.0,MAR,20
1,Random Forest,0.01,11183.0,6.0,1.0,MAR,20
2,Random Forest,0.01,3848.0,5.0,1.0,MAR,20
3,Random Forest,0.01,40768.0,10.0,1.0,MAR,20
4,Random Forest,0.1,5404.0,5.0,1.0,MAR,20
5,Random Forest,0.1,39366.0,9.0,1.0,MAR,20
6,Random Forest,0.1,88588.0,6.0,1.0,MAR,20
7,Random Forest,0.1,4052.0,7.0,1.0,MAR,20
8,Random Forest,0.3,5404.0,5.0,1.0,MAR,20
9,Random Forest,0.3,96320.0,21.0,1.0,MAR,20


In [213]:
data_control = pd.read_csv('binary_imputed_full_info.csv')
#data_multi = pd.read_csv('multi_imputed_full_info.csv')

data_control

Unnamed: 0,Imputation_Method,Task,Missing Type,Missing Fraction,Column,result_type,metric,Baseline,Corrupted,Imputed,...,MinorityClassSize,NumberOfFeatures,NumberOfInstances,NumberOfNumericFeatures,NumberOfCategoricalFeatures,NumberOfClasses,Downstream Performance Rank,Data_Constellation,Data_Constellation_full,Performance Difference to Average Best
0,Random Forest,137,MAR,0.01,top-middle-square,downstream_performance_mean,Classification Tasks,0.674909,0.0,0.674760,...,13664.0,10.0,39366.0,0.0,10.0,,1.0,MAR - 0.01,MAR - 0.01 - 137,0.000000e+00
1,KNN,137,MAR,0.01,top-middle-square,downstream_performance_mean,Classification Tasks,0.674729,0.0,0.674760,...,13664.0,10.0,39366.0,0.0,10.0,,2.0,MAR - 0.01,MAR - 0.01 - 137,-2.949553e-08
2,Mean/Mode,137,MAR,0.01,top-middle-square,downstream_performance_mean,Classification Tasks,0.674145,0.0,0.674116,...,13664.0,10.0,39366.0,0.0,10.0,,5.0,MAR - 0.01,MAR - 0.01 - 137,-6.439471e-04
3,VAE,137,MAR,0.01,top-middle-square,downstream_performance_mean,Classification Tasks,0.674729,0.0,0.674488,...,13664.0,10.0,39366.0,0.0,10.0,,3.0,MAR - 0.01,MAR - 0.01 - 137,-2.716528e-04
4,Discriminative DL,137,MAR,0.01,top-middle-square,downstream_performance_mean,Classification Tasks,0.674505,0.0,0.674251,...,13664.0,10.0,39366.0,0.0,10.0,,4.0,MAR - 0.01,MAR - 0.01 - 137,-5.093850e-04
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2120,KNN,42493,MNAR,0.50,Length,downstream_performance_mean,Classification Tasks,0.619680,0.0,0.620028,...,12035.0,8.0,26969.0,2.0,6.0,,6.0,MNAR - 0.5,MNAR - 0.5 - 42493,-2.447951e-03
2121,Mean/Mode,42493,MNAR,0.50,Length,downstream_performance_mean,Classification Tasks,0.627750,0.0,0.627879,...,12035.0,8.0,26969.0,2.0,6.0,,1.0,MNAR - 0.5,MNAR - 0.5 - 42493,5.402509e-03
2122,VAE,42493,MNAR,0.50,Length,downstream_performance_mean,Classification Tasks,0.625687,0.0,0.626914,...,12035.0,8.0,26969.0,2.0,6.0,,2.0,MNAR - 0.5,MNAR - 0.5 - 42493,4.437433e-03
2123,GAIN,42493,MNAR,0.50,Length,downstream_performance_mean,Classification Tasks,0.625497,0.0,0.626178,...,12035.0,8.0,26969.0,2.0,6.0,,3.0,MNAR - 0.5,MNAR - 0.5 - 42493,3.701885e-03


In [214]:
result_improv = pd.merge(model_prediction, data_control, how="left", on=["Imputation_Method", "Missing Fraction", "NumberOfInstances", "NumberOfNumericFeatures", "NumberOfCategoricalFeatures", "Missing Type"])
result_improv = result_improv[result_improv['Performance Difference to Average Best'].notna()]
#result_improv.to_csv('result_merge_control.csv')

result_improv

Unnamed: 0.1,Imputation_Method,Missing Fraction,NumberOfInstances,NumberOfNumericFeatures,NumberOfCategoricalFeatures,Missing Type,control,Task,Column,result_type,...,Unnamed: 0,name,MajorityClassSize,MinorityClassSize,NumberOfFeatures,NumberOfClasses,Downstream Performance Rank,Data_Constellation,Data_Constellation_full,Performance Difference to Average Best
0,Random Forest,0.01,14980.0,14.0,1.0,MAR,20,1471,V9,downstream_performance_mean,...,28,eeg-eye-state,8257.0,6723.0,15.0,,3.0,MAR - 0.01,MAR - 0.01 - 1471,0.0
1,Random Forest,0.01,11183.0,6.0,1.0,MAR,20,310,attr2,downstream_performance_mean,...,12,mammography,10923.0,260.0,7.0,,2.0,MAR - 0.01,MAR - 0.01 - 310,0.0
2,Random Forest,0.01,3848.0,5.0,1.0,MAR,20,871,RIDGE,downstream_performance_mean,...,1,pollen,1924.0,1924.0,6.0,,6.0,MAR - 0.01,MAR - 0.01 - 871,0.0
3,Random Forest,0.01,40768.0,10.0,1.0,MAR,20,901,X2,downstream_performance_mean,...,38,fried,20427.0,20341.0,11.0,,3.0,MAR - 0.01,MAR - 0.01 - 901,0.0
4,Random Forest,0.1,5404.0,5.0,1.0,MAR,20,1489,V1,downstream_performance_mean,...,4,phoneme,3818.0,1586.0,6.0,,2.0,MAR - 0.1,MAR - 0.1 - 1489,0.0
5,Random Forest,0.1,39366.0,9.0,1.0,MAR,20,251,Cell_Shape_Uniformity,downstream_performance_mean,...,35,BNG(breast-w),25820.0,13546.0,10.0,,4.0,MAR - 0.1,MAR - 0.1 - 251,0.0
6,Random Forest,0.1,88588.0,6.0,1.0,MAR,20,40922,gyro_y,downstream_performance_mean,...,41,Run_or_walk_information,44365.0,44223.0,7.0,,3.0,MAR - 0.1,MAR - 0.1 - 40922,0.0
7,Random Forest,0.1,4052.0,7.0,1.0,MAR,20,728,Lower_court_disagreement,downstream_performance_mean,...,3,analcatdata_supreme,3081.0,971.0,8.0,,1.0,MAR - 0.1,MAR - 0.1 - 728,0.0
8,Random Forest,0.3,5404.0,5.0,1.0,MAR,20,1489,V1,downstream_performance_mean,...,4,phoneme,3818.0,1586.0,6.0,,5.0,MAR - 0.3,MAR - 0.3 - 1489,0.0
9,Random Forest,0.3,96320.0,21.0,1.0,MAR,20,23517,attribute_10,downstream_performance_mean,...,44,numerai28.6,48658.0,47662.0,22.0,,1.0,MAR - 0.3,MAR - 0.3 - 23517,0.0


In [215]:
av_improv_model = result_improv['Performance Difference to Average Best'].mean()
print(av_improv_model)

0.0
