In [1]:
#cleaned

# Imputation Method Selection based on Dataset Properties for Multiclass Classification

In [2]:
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import os
import pandas as pd
import re
import seaborn as sns
from pandas.api.types import CategoricalDtype
from pathlib import Path

from numpy import mean
from numpy import std

import plotly as py
import plotly.express as px
import plotly.graph_objects as go
import xarray as xr

from sklearn.compose import make_column_transformer

from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

import tensorflow as tf

import csv
from sklearn.utils import Bunch
from sklearn import metrics
from math import sqrt

from numpy import loadtxt
from keras.models import Sequential, Model
from keras.layers import Dense, Activation, Input
from keras.utils import to_categorical 

## Training

In [3]:
data = pd.read_csv('multi_properties_train_dataset_original.csv')

data

Unnamed: 0,Imputation_Method,Missing Type,Missing Fraction,NumberOfInstances,NumberOfNumericFeatures,NumberOfCategoricalFeatures
0,Discriminative DL,MAR,0.01,20000.0,16.0,1.0
1,KNN,MAR,0.10,20000.0,16.0,1.0
2,KNN,MAR,0.30,20000.0,16.0,1.0
3,Random Forest,MAR,0.50,20000.0,16.0,1.0
4,Mean/Mode,MCAR,0.01,20000.0,16.0,1.0
...,...,...,...,...,...,...
199,GAIN,MCAR,0.50,20000.0,20.0,1.0
200,KNN,MNAR,0.01,20000.0,20.0,1.0
201,Discriminative DL,MNAR,0.10,20000.0,20.0,1.0
202,Mean/Mode,MNAR,0.30,20000.0,20.0,1.0


In [4]:
# OneHot-encoder -> multiple column approach
# 

transformer = make_column_transformer(
    (OneHotEncoder(), ['Imputation_Method', 'Missing Type']),
    remainder='passthrough')

transformed = transformer.fit_transform(data)
data_preprocessed = pd.DataFrame(transformed, columns=transformer.get_feature_names_out())
data_preprocessed


Unnamed: 0,onehotencoder__Imputation_Method_Discriminative DL,onehotencoder__Imputation_Method_GAIN,onehotencoder__Imputation_Method_KNN,onehotencoder__Imputation_Method_Mean/Mode,onehotencoder__Imputation_Method_Random Forest,onehotencoder__Imputation_Method_VAE,onehotencoder__Missing Type_MAR,onehotencoder__Missing Type_MCAR,onehotencoder__Missing Type_MNAR,remainder__Missing Fraction,remainder__NumberOfInstances,remainder__NumberOfNumericFeatures,remainder__NumberOfCategoricalFeatures
0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.01,20000.0,16.0,1.0
1,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.10,20000.0,16.0,1.0
2,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.30,20000.0,16.0,1.0
3,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.50,20000.0,16.0,1.0
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.01,20000.0,16.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
199,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.50,20000.0,20.0,1.0
200,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.01,20000.0,20.0,1.0
201,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.10,20000.0,20.0,1.0
202,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.30,20000.0,20.0,1.0


In [5]:
# Umwandlung in numpy arrays
X = data_preprocessed[['onehotencoder__Missing Type_MAR', 'onehotencoder__Missing Type_MCAR', 
                         'onehotencoder__Missing Type_MNAR', 'remainder__Missing Fraction', 
                         'remainder__NumberOfInstances',  
                         'remainder__NumberOfNumericFeatures', 'remainder__NumberOfCategoricalFeatures']].to_numpy()
#X = data_preprocessed_x
y = data[['Imputation_Method']].to_numpy()


# train/test split --> 20 % Testdata
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) 
X_train.shape
#print(X_train)

(163, 7)

In [6]:
y_train.shape

(163, 1)

In [7]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

print(X_train)
print(y)



[[-0.75319831  1.50332964 -0.7038557  ...  0.23605398  1.69921697
  -0.48688983]
 [-0.75319831 -0.66519011  1.42074576 ... -0.40228511 -0.27481853
  -0.48688983]
 [ 1.3276716  -0.66519011 -0.7038557  ... -0.35177658  1.09182143
  -0.48688983]
 ...
 [-0.75319831 -0.66519011  1.42074576 ... -0.40228511 -0.27481853
  -0.48688983]
 [-0.75319831 -0.66519011  1.42074576 ...  2.71580101  0.02887924
  -0.48688983]
 [-0.75319831  1.50332964 -0.7038557  ...  0.76176035 -0.88221407
  -0.00687143]]
[['Discriminative DL']
 ['KNN']
 ['KNN']
 ['Random Forest']
 ['Mean/Mode']
 ['Random Forest']
 ['Random Forest']
 ['Random Forest']
 ['Random Forest']
 ['Random Forest']
 ['VAE']
 ['GAIN']
 ['Mean/Mode']
 ['Random Forest']
 ['GAIN']
 ['Random Forest']
 ['Discriminative DL']
 ['VAE']
 ['Random Forest']
 ['VAE']
 ['KNN']
 ['GAIN']
 ['GAIN']
 ['GAIN']
 ['VAE']
 ['GAIN']
 ['VAE']
 ['Random Forest']
 ['Mean/Mode']
 ['VAE']
 ['GAIN']
 ['Random Forest']
 ['Discriminative DL']
 ['KNN']
 ['GAIN']
 ['Mean/Mode']


In [8]:
X_train.shape

(163, 7)

In [9]:
y.shape

(204, 1)

## Original Experiment Results

For the training we use the original experiment results here

### Random Forest with Nested Cross Validation, with GridSearch

In [16]:
X_train = X
y_train = y

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

outer_cv = KFold(n_splits=10, shuffle=True, random_state=42)
outer_results = list()
predicitons_list = list()
test_data_list = list()
ground_truth_list = list()

inner_nested_cv_results= []
inner_nested_best_score= []
inner_nested_best_params= []
inner_nested_test_score = []

for train_index, test_index in outer_cv.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    inner_cv = KFold(n_splits=5, shuffle=True, random_state=42)
    
    clf = RandomForestClassifier(random_state=42, class_weight='balanced')

    param_grid = {
        'n_estimators': [50, 100, 250, 500, 1000, 3000, 4500, 6000],#, 3000, 4000, 5000, 6000, 7000],
#    'max_features': ['sqrt', 'log2'],
#    'criterion' :['gini', 'entropy'],
        'max_depth': [1,3,5,9,12,15]#,3,4,5,6,7]
    }

    grid_clf = GridSearchCV(clf, param_grid, cv=inner_cv)
    result = grid_clf.fit(X_train, y_train.ravel())
    best_model = result.best_estimator_
    print(grid_clf.best_params_)
    print(best_model)

    inner_nested_cv_results.append(grid_clf.cv_results_)
    inner_nested_best_score.append(grid_clf.best_score_)
    inner_nested_best_params.append(grid_clf.best_params_)
    inner_nested_test_score.append(grid_clf.score(X_test, y_test))

    yhat = best_model.predict(X_test)
#    print(yhat)
#    print('___________________----------\n')
#    print(X_test)
#    print('___________________----------\n')
#    print(y_test)
    acc = accuracy_score(y_test, yhat)
    outer_results.append(acc)
    predicitons_list.append(yhat)
    test_data_list.append(X_test)
    ground_truth_list.append(y_test)
    
   
    print('>acc=%.3f, est=%.3f, cfg=%s' % (acc, result.best_score_, result.best_params_))
    print('_____________________________')
    print('\n')
    
print('Accuracy: %.3f (%.3f)' % (mean(outer_results), std(outer_results)))    


{'max_depth': 9, 'n_estimators': 100}
RandomForestClassifier(class_weight='balanced', max_depth=9, random_state=42)
>acc=0.190, est=0.241, cfg={'max_depth': 9, 'n_estimators': 100}
_____________________________


{'max_depth': 9, 'n_estimators': 100}
RandomForestClassifier(class_weight='balanced', max_depth=9, random_state=42)
>acc=0.143, est=0.224, cfg={'max_depth': 9, 'n_estimators': 100}
_____________________________


{'max_depth': 9, 'n_estimators': 250}
RandomForestClassifier(class_weight='balanced', max_depth=9, n_estimators=250,
                       random_state=42)
>acc=0.238, est=0.234, cfg={'max_depth': 9, 'n_estimators': 250}
_____________________________


{'max_depth': 15, 'n_estimators': 50}
RandomForestClassifier(class_weight='balanced', max_depth=15, n_estimators=50,
                       random_state=42)
>acc=0.429, est=0.208, cfg={'max_depth': 15, 'n_estimators': 50}
_____________________________


{'max_depth': 9, 'n_estimators': 500}
RandomForestClassifier(class

In [17]:
print(type(yhat))
print(type(X_test))
print(type(y_test))
print(outer_results)

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
[0.19047619047619047, 0.14285714285714285, 0.23809523809523808, 0.42857142857142855, 0.25, 0.05, 0.15, 0.4, 0.15, 0.2]


In [18]:
#np.mean(inner_nested_test_score)

In [19]:
#inner_nested_cv_results

In [20]:
inner_nested_best_params

[{'max_depth': 9, 'n_estimators': 100},
 {'max_depth': 9, 'n_estimators': 100},
 {'max_depth': 9, 'n_estimators': 250},
 {'max_depth': 15, 'n_estimators': 50},
 {'max_depth': 9, 'n_estimators': 500},
 {'max_depth': 12, 'n_estimators': 100},
 {'max_depth': 3, 'n_estimators': 3000},
 {'max_depth': 12, 'n_estimators': 100},
 {'max_depth': 5, 'n_estimators': 100},
 {'max_depth': 9, 'n_estimators': 1000}]

In [21]:
inner_nested_test_score

[0.19047619047619047,
 0.14285714285714285,
 0.23809523809523808,
 0.42857142857142855,
 0.25,
 0.05,
 0.15,
 0.4,
 0.15,
 0.2]

In [31]:
# Manually select the right model (best accuracy)

outer_results_item = outer_results[3]
print(outer_results_item)
df_predict_item = predicitons_list[3]
#df_predict_item
test_data_item = test_data_list[3]
ground_truth_item = ground_truth_list[3]

0.42857142857142855


In [32]:
df_predict = pd.DataFrame()
df_predict['prediction'] = df_predict_item
df_predict

Unnamed: 0,prediction
0,KNN
1,Random Forest
2,Random Forest
3,GAIN
4,Mean/Mode
5,GAIN
6,KNN
7,KNN
8,VAE
9,Discriminative DL


In [33]:
df_testdata = pd.DataFrame({'MAR': test_data_item[:, 0], 'MCAR': test_data_item[:, 1], 'MNAR': test_data_item[:, 2], 'Missing Fraction': test_data_item[:, 3], 'NumberOfInstances': test_data_item[:, 4],'NumberOfNumericFeatures': test_data_item[:, 5], 'NumberOfCategoricalFeatures': test_data_item[:, 6]})
df_testdata


Unnamed: 0,MAR,MCAR,MNAR,Missing Fraction,NumberOfInstances,NumberOfNumericFeatures,NumberOfCategoricalFeatures
0,1.0,0.0,0.0,0.3,20000.0,16.0,1.0
1,1.0,0.0,0.0,0.01,12960.0,0.0,9.0
2,1.0,0.0,0.0,0.3,5473.0,10.0,1.0
3,0.0,1.0,0.0,0.5,5473.0,10.0,1.0
4,0.0,0.0,1.0,0.5,5473.0,10.0,1.0
5,0.0,1.0,0.0,0.1,10992.0,16.0,1.0
6,0.0,1.0,0.0,0.3,10992.0,16.0,1.0
7,0.0,0.0,1.0,0.3,10992.0,16.0,1.0
8,1.0,0.0,0.0,0.5,4177.0,7.0,2.0
9,1.0,0.0,0.0,0.5,9961.0,14.0,1.0


In [34]:
model_prediction = pd.DataFrame()
model_prediction = pd.concat([df_predict, df_testdata], axis=1)

#model_prediction.to_csv('model_prediction.csv', index=False)
model_prediction

Unnamed: 0,prediction,MAR,MCAR,MNAR,Missing Fraction,NumberOfInstances,NumberOfNumericFeatures,NumberOfCategoricalFeatures
0,KNN,1.0,0.0,0.0,0.3,20000.0,16.0,1.0
1,Random Forest,1.0,0.0,0.0,0.01,12960.0,0.0,9.0
2,Random Forest,1.0,0.0,0.0,0.3,5473.0,10.0,1.0
3,GAIN,0.0,1.0,0.0,0.5,5473.0,10.0,1.0
4,Mean/Mode,0.0,0.0,1.0,0.5,5473.0,10.0,1.0
5,GAIN,0.0,1.0,0.0,0.1,10992.0,16.0,1.0
6,KNN,0.0,1.0,0.0,0.3,10992.0,16.0,1.0
7,KNN,0.0,0.0,1.0,0.3,10992.0,16.0,1.0
8,VAE,1.0,0.0,0.0,0.5,4177.0,7.0,2.0
9,Discriminative DL,1.0,0.0,0.0,0.5,9961.0,14.0,1.0


In [35]:
one_hot_reverse = model_prediction.copy()
one_hot_reverse = one_hot_reverse[['MAR', 'MCAR', 'MNAR']]

one_hot_reverse['Missing Type'] = one_hot_reverse.idxmax(1)
one_hot_reverse = one_hot_reverse[['Missing Type']]
one_hot_reverse

Unnamed: 0,Missing Type
0,MAR
1,MAR
2,MAR
3,MCAR
4,MNAR
5,MCAR
6,MCAR
7,MNAR
8,MAR
9,MAR


In [36]:
model_prediction = pd.concat([model_prediction, one_hot_reverse], axis=1)
model_prediction = model_prediction.drop(['MAR', 'MCAR', 'MNAR'], axis=1)
model_prediction['control'] = 20
model_prediction = model_prediction.rename(columns={"prediction": "Imputation_Method"})

model_prediction

Unnamed: 0,Imputation_Method,Missing Fraction,NumberOfInstances,NumberOfNumericFeatures,NumberOfCategoricalFeatures,Missing Type,control
0,KNN,0.3,20000.0,16.0,1.0,MAR,20
1,Random Forest,0.01,12960.0,0.0,9.0,MAR,20
2,Random Forest,0.3,5473.0,10.0,1.0,MAR,20
3,GAIN,0.5,5473.0,10.0,1.0,MCAR,20
4,Mean/Mode,0.5,5473.0,10.0,1.0,MNAR,20
5,GAIN,0.1,10992.0,16.0,1.0,MCAR,20
6,KNN,0.3,10992.0,16.0,1.0,MCAR,20
7,KNN,0.3,10992.0,16.0,1.0,MNAR,20
8,VAE,0.5,4177.0,7.0,2.0,MAR,20
9,Discriminative DL,0.5,9961.0,14.0,1.0,MAR,20


In [37]:
data_control = pd.read_csv('multi_imputed_full_info.csv')
#data_multi = pd.read_csv('multi_imputed_full_info.csv')

data_control


Unnamed: 0,Imputation_Method,Task,Missing Type,Missing Fraction,Column,result_type,metric,Baseline,Corrupted,Imputed,...,MinorityClassSize,NumberOfFeatures,NumberOfInstances,NumberOfNumericFeatures,NumberOfCategoricalFeatures,NumberOfClasses,Downstream Performance Rank,Data_Constellation,Data_Constellation_full,Performance Difference to Average Best
0,Random Forest,6,MAR,0.01,x-box,downstream_performance_mean,F1_macro,0.727330,0.0,0.727075,...,734.0,17.0,20000.0,16.0,1.0,26.0,3.0,MAR - 0.01,MAR - 0.01 - 6,0.000000
1,KNN,6,MAR,0.01,x-box,downstream_performance_mean,F1_macro,0.727724,0.0,0.727724,...,734.0,17.0,20000.0,16.0,1.0,26.0,2.0,MAR - 0.01,MAR - 0.01 - 6,0.000649
2,Mean/Mode,6,MAR,0.01,x-box,downstream_performance_mean,F1_macro,0.725643,0.0,0.725766,...,734.0,17.0,20000.0,16.0,1.0,26.0,5.0,MAR - 0.01,MAR - 0.01 - 6,-0.001309
3,VAE,6,MAR,0.01,x-box,downstream_performance_mean,F1_macro,0.725821,0.0,0.725778,...,734.0,17.0,20000.0,16.0,1.0,26.0,4.0,MAR - 0.01,MAR - 0.01 - 6,-0.001296
4,Discriminative DL,6,MAR,0.01,x-box,downstream_performance_mean,F1_macro,0.727469,0.0,0.727828,...,734.0,17.0,20000.0,16.0,1.0,26.0,1.0,MAR - 0.01,MAR - 0.01 - 6,0.000753
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1205,KNN,41671,MNAR,0.50,a9,downstream_performance_mean,F1_macro,0.264992,0.0,0.265395,...,743.0,21.0,20000.0,20.0,1.0,5.0,1.0,MNAR - 0.5,MNAR - 0.5 - 41671,0.024776
1206,Mean/Mode,41671,MNAR,0.50,a9,downstream_performance_mean,F1_macro,0.239275,0.0,0.240780,...,743.0,21.0,20000.0,20.0,1.0,5.0,4.0,MNAR - 0.5,MNAR - 0.5 - 41671,0.000160
1207,VAE,41671,MNAR,0.50,a9,downstream_performance_mean,F1_macro,0.241995,0.0,0.242421,...,743.0,21.0,20000.0,20.0,1.0,5.0,3.0,MNAR - 0.5,MNAR - 0.5 - 41671,0.001801
1208,GAIN,41671,MNAR,0.50,a9,downstream_performance_mean,F1_macro,0.239877,0.0,0.243434,...,743.0,21.0,20000.0,20.0,1.0,5.0,2.0,MNAR - 0.5,MNAR - 0.5 - 41671,0.002815


In [38]:
result_improv = pd.merge(model_prediction, data_control, how="left", on=["Imputation_Method", "Missing Fraction", "NumberOfInstances", "NumberOfNumericFeatures", "NumberOfCategoricalFeatures", "Missing Type"])
result_improv = result_improv[result_improv['Performance Difference to Average Best'].notna()]
#result_improv.to_csv('result_merge_control.csv')

result_improv

Unnamed: 0.1,Imputation_Method,Missing Fraction,NumberOfInstances,NumberOfNumericFeatures,NumberOfCategoricalFeatures,Missing Type,control,Task,Column,result_type,...,Unnamed: 0,name,MajorityClassSize,MinorityClassSize,NumberOfFeatures,NumberOfClasses,Downstream Performance Rank,Data_Constellation,Data_Constellation_full,Performance Difference to Average Best
0,KNN,0.3,20000.0,16.0,1.0,MAR,20,6,x-box,downstream_performance_mean,...,59,letter,813.0,734.0,17.0,26.0,1.0,MAR - 0.3,MAR - 0.3 - 6,0.00201
1,Random Forest,0.01,12960.0,0.0,9.0,MAR,20,26,parents,downstream_performance_mean,...,53,nursery,4320.0,2.0,9.0,5.0,6.0,MAR - 0.01,MAR - 0.01 - 26,0.0
2,Random Forest,0.3,5473.0,10.0,1.0,MAR,20,30,eccen,downstream_performance_mean,...,48,page-blocks,4913.0,28.0,11.0,5.0,4.0,MAR - 0.3,MAR - 0.3 - 30,0.0
3,GAIN,0.5,5473.0,10.0,1.0,MCAR,20,30,eccen,downstream_performance_mean,...,48,page-blocks,4913.0,28.0,11.0,5.0,2.0,MCAR - 0.5,MCAR - 0.5 - 30,-0.040156
4,Mean/Mode,0.5,5473.0,10.0,1.0,MNAR,20,30,eccen,downstream_performance_mean,...,48,page-blocks,4913.0,28.0,11.0,5.0,1.0,MNAR - 0.5,MNAR - 0.5 - 30,0.018207
5,GAIN,0.1,10992.0,16.0,1.0,MCAR,20,32,input4,downstream_performance_mean,...,55,pendigits,1144.0,1055.0,17.0,10.0,1.0,MCAR - 0.1,MCAR - 0.1 - 32,0.00146
6,KNN,0.3,10992.0,16.0,1.0,MCAR,20,32,input4,downstream_performance_mean,...,55,pendigits,1144.0,1055.0,17.0,10.0,1.0,MCAR - 0.3,MCAR - 0.3 - 32,0.00058
7,KNN,0.3,10992.0,16.0,1.0,MNAR,20,32,input4,downstream_performance_mean,...,55,pendigits,1144.0,1055.0,17.0,10.0,1.0,MNAR - 0.3,MNAR - 0.3 - 32,0.004005
8,VAE,0.5,4177.0,7.0,2.0,MAR,20,183,Length,downstream_performance_mean,...,46,abalone,689.0,1.0,9.0,28.0,1.0,MAR - 0.5,MAR - 0.5 - 183,0.012025
9,Discriminative DL,0.5,9961.0,14.0,1.0,MAR,20,375,coefficient3,downstream_performance_mean,...,54,JapaneseVowels,1614.0,782.0,15.0,9.0,1.0,MAR - 0.5,MAR - 0.5 - 375,0.011476


In [39]:
av_improv_model = result_improv['Performance Difference to Average Best'].mean()
print(av_improv_model)

0.0024474759037190525


## Experiment Results adjusted for 0.01 F1 Score Points

For the training we use the experiment results, but replaced the best method for each data constellation with the average best method, if the average best method is not outscored by at least 0.01 F1 score points.

### Training

In [87]:
data = pd.read_csv('multi_properties_train_dataset_1_percent.csv')

data

Unnamed: 0.1,Unnamed: 0,Imputation_Method,Missing Type,Missing Fraction,NumberOfInstances,NumberOfNumericFeatures,NumberOfCategoricalFeatures
0,504,Random Forest,MAR,0.01,10218.0,7.0,1.0
1,573,Random Forest,MAR,0.01,28056.0,3.0,4.0
2,645,Random Forest,MAR,0.01,5456.0,4.0,1.0
3,284,Random Forest,MAR,0.01,4177.0,7.0,2.0
4,360,Random Forest,MAR,0.01,28056.0,0.0,7.0
...,...,...,...,...,...,...,...
199,1070,VAE,MNAR,0.50,58000.0,9.0,1.0
200,1140,Random Forest,MNAR,0.50,44819.0,6.0,1.0
201,1209,KNN,MNAR,0.50,20000.0,20.0,1.0
202,786,Mean/Mode,MNAR,0.50,5665.0,2.0,15.0


In [88]:
# OneHot-encoder -> multiple column approach
# 

transformer = make_column_transformer(
    (OneHotEncoder(), ['Imputation_Method', 'Missing Type']),
    remainder='passthrough')

transformed = transformer.fit_transform(data)
data_preprocessed = pd.DataFrame(transformed, columns=transformer.get_feature_names_out())
data_preprocessed


Unnamed: 0,onehotencoder__Imputation_Method_Discriminative DL,onehotencoder__Imputation_Method_GAIN,onehotencoder__Imputation_Method_KNN,onehotencoder__Imputation_Method_Mean/Mode,onehotencoder__Imputation_Method_Random Forest,onehotencoder__Imputation_Method_VAE,onehotencoder__Missing Type_MAR,onehotencoder__Missing Type_MCAR,onehotencoder__Missing Type_MNAR,remainder__Unnamed: 0,remainder__Missing Fraction,remainder__NumberOfInstances,remainder__NumberOfNumericFeatures,remainder__NumberOfCategoricalFeatures
0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,504.0,0.01,10218.0,7.0,1.0
1,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,573.0,0.01,28056.0,3.0,4.0
2,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,645.0,0.01,5456.0,4.0,1.0
3,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,284.0,0.01,4177.0,7.0,2.0
4,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,360.0,0.01,28056.0,0.0,7.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1070.0,0.50,58000.0,9.0,1.0
200,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1140.0,0.50,44819.0,6.0,1.0
201,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1209.0,0.50,20000.0,20.0,1.0
202,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,786.0,0.50,5665.0,2.0,15.0


In [89]:
# Umwandlung in numpy arrays
X = data_preprocessed[['onehotencoder__Missing Type_MAR', 'onehotencoder__Missing Type_MCAR', 
                         'onehotencoder__Missing Type_MNAR', 'remainder__Missing Fraction', 
                         'remainder__NumberOfInstances',  
                         'remainder__NumberOfNumericFeatures', 'remainder__NumberOfCategoricalFeatures']].to_numpy()
#X = data_preprocessed_x
y = data[['Imputation_Method']].to_numpy()


# trin/test split --> 20 % Testdata
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) 
X_train.shape
#print(X_train)

(163, 7)

In [90]:
y_train.shape

(163, 1)

In [91]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

print(X_train)
print(y)



[[-0.73330475 -0.67480156  1.4012981  ...  2.69478628  0.08115931
  -0.49914772]
 [-0.73330475  1.48191715 -0.71362403 ... -0.38092409  0.85028534
  -0.49914772]
 [ 1.36368953 -0.67480156 -0.71362403 ...  0.77761333 -1.30326755
   0.45992197]
 ...
 [-0.73330475  1.48191715 -0.71362403 ... -0.31491403  1.15793576
  -0.49914772]
 [-0.73330475 -0.67480156  1.4012981  ... -0.77717654  1.92706179
  -0.49914772]
 [-0.73330475  1.48191715 -0.71362403 ... -0.3644696  -0.2264911
  -0.49914772]]
[['Random Forest']
 ['Random Forest']
 ['Random Forest']
 ['Random Forest']
 ['Random Forest']
 ['Mean/Mode']
 ['VAE']
 ['Random Forest']
 ['Random Forest']
 ['Mean/Mode']
 ['GAIN']
 ['Random Forest']
 ['KNN']
 ['Random Forest']
 ['Random Forest']
 ['Random Forest']
 ['Random Forest']
 ['KNN']
 ['Random Forest']
 ['Random Forest']
 ['Random Forest']
 ['Random Forest']
 ['Random Forest']
 ['GAIN']
 ['Random Forest']
 ['Random Forest']
 ['VAE']
 ['GAIN']
 ['Random Forest']
 ['Random Forest']
 ['VAE']
 ['Me

In [92]:
X_train.shape

(163, 7)

In [93]:
y.shape

(204, 1)

### Random Forest with Nested Cross Validation, with GridSearch

In [100]:
X_train = X
y_train = y

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

outer_cv = KFold(n_splits=10, shuffle=True, random_state=42)
outer_results_1 = list()
predicitons_list_1 = list()
test_data_list_1 = list()
ground_truth_list_1 = list()

inner_nested_cv_results= []
inner_nested_best_score= []
inner_nested_best_params= []
inner_nested_test_score = []

for train_index, test_index in outer_cv.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    inner_cv = KFold(n_splits=5, shuffle=True, random_state=42)
    
    clf = RandomForestClassifier(random_state=42, class_weight='balanced')

    param_grid = {
        'n_estimators': [25, 50, 100, 500, 1000, 3000, 4500],#, 3000, 4000, 5000, 6000, 7000],
#    'max_features': ['sqrt', 'log2'],
#    'criterion' :['gini', 'entropy'],
        'max_depth': [3,6,9,12,15,18]#,3,4,5,6,7]
    }

    grid_clf = GridSearchCV(clf, param_grid, cv=inner_cv)
    result = grid_clf.fit(X_train, y_train.ravel())
    best_model = result.best_estimator_
    print(grid_clf.best_params_)
    print(best_model)

    inner_nested_cv_results.append(grid_clf.cv_results_)
    inner_nested_best_score.append(grid_clf.best_score_)
    inner_nested_best_params.append(grid_clf.best_params_)
    inner_nested_test_score.append(grid_clf.score(X_test, y_test))

    yhat = best_model.predict(X_test)
#    print(yhat)
#    print('___________________----------\n')
#    print(X_test)
#    print('___________________----------\n')
#    print(y_test)
    acc = accuracy_score(y_test, yhat)
    outer_results_1.append(acc)
    predicitons_list_1.append(yhat)
    test_data_list_1.append(X_test)
    ground_truth_list_1.append(y_test)
    
   
    print('>acc=%.3f, est=%.3f, cfg=%s' % (acc, result.best_score_, result.best_params_))
    print('_____________________________')
    print('\n')
    
print('Accuracy: %.3f (%.3f)' % (mean(outer_results_1), std(outer_results_1)))    



{'max_depth': 12, 'n_estimators': 1000}
RandomForestClassifier(class_weight='balanced', max_depth=12, n_estimators=1000,
                       random_state=42)
>acc=0.429, est=0.438, cfg={'max_depth': 12, 'n_estimators': 1000}
_____________________________


{'max_depth': 15, 'n_estimators': 100}
RandomForestClassifier(class_weight='balanced', max_depth=15, random_state=42)
>acc=0.381, est=0.454, cfg={'max_depth': 15, 'n_estimators': 100}
_____________________________


{'max_depth': 15, 'n_estimators': 25}
RandomForestClassifier(class_weight='balanced', max_depth=15, n_estimators=25,
                       random_state=42)
>acc=0.524, est=0.448, cfg={'max_depth': 15, 'n_estimators': 25}
_____________________________


{'max_depth': 9, 'n_estimators': 100}
RandomForestClassifier(class_weight='balanced', max_depth=9, random_state=42)
>acc=0.333, est=0.464, cfg={'max_depth': 9, 'n_estimators': 100}
_____________________________


{'max_depth': 12, 'n_estimators': 3000}
RandomForestClass

In [101]:
print(type(yhat))
print(type(X_test))
print(type(y_test))
print(outer_results_1)

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
[0.42857142857142855, 0.38095238095238093, 0.5238095238095238, 0.3333333333333333, 0.5, 0.35, 0.4, 0.35, 0.4, 0.45]


In [102]:
#np.mean(inner_nested_test_score)

In [103]:
#inner_nested_cv_results

In [104]:
inner_nested_best_params

[{'max_depth': 12, 'n_estimators': 1000},
 {'max_depth': 15, 'n_estimators': 100},
 {'max_depth': 15, 'n_estimators': 25},
 {'max_depth': 9, 'n_estimators': 100},
 {'max_depth': 12, 'n_estimators': 3000},
 {'max_depth': 15, 'n_estimators': 25},
 {'max_depth': 12, 'n_estimators': 25},
 {'max_depth': 9, 'n_estimators': 500},
 {'max_depth': 9, 'n_estimators': 3000},
 {'max_depth': 9, 'n_estimators': 50}]

In [105]:
inner_nested_test_score

[0.42857142857142855,
 0.38095238095238093,
 0.5238095238095238,
 0.3333333333333333,
 0.5,
 0.35,
 0.4,
 0.35,
 0.4,
 0.45]

In [106]:
# Manually select the right model (best accuracy)

outer_results_item = outer_results_1[2]
print(outer_results_item)
df_predict_item = predicitons_list_1[2]
#df_predict_item
test_data_item = test_data_list_1[2]
ground_truth_item = ground_truth_list_1[2]

0.5238095238095238


In [107]:
df_predict = pd.DataFrame()
df_predict['prediction'] = df_predict_item
df_predict

Unnamed: 0,prediction
0,Random Forest
1,Random Forest
2,Random Forest
3,KNN
4,Random Forest
5,VAE
6,Random Forest
7,Random Forest
8,Random Forest
9,Random Forest


In [108]:
df_testdata = pd.DataFrame({'MAR': test_data_item[:, 0], 'MCAR': test_data_item[:, 1], 'MNAR': test_data_item[:, 2], 'Missing Fraction': test_data_item[:, 3], 'NumberOfInstances': test_data_item[:, 4],'NumberOfNumericFeatures': test_data_item[:, 5], 'NumberOfCategoricalFeatures': test_data_item[:, 6]})
df_testdata


Unnamed: 0,MAR,MCAR,MNAR,Missing Fraction,NumberOfInstances,NumberOfNumericFeatures,NumberOfCategoricalFeatures
0,1.0,0.0,0.0,0.01,12960.0,0.0,9.0
1,1.0,0.0,0.0,0.1,5456.0,4.0,1.0
2,1.0,0.0,0.0,0.1,10992.0,16.0,1.0
3,1.0,0.0,0.0,0.1,58000.0,9.0,1.0
4,1.0,0.0,0.0,0.3,28056.0,0.0,7.0
5,1.0,0.0,0.0,0.5,3772.0,21.0,1.0
6,0.0,1.0,0.0,0.01,12960.0,0.0,9.0
7,0.0,1.0,0.0,0.1,12960.0,0.0,9.0
8,0.0,1.0,0.0,0.1,9961.0,14.0,1.0
9,0.0,1.0,0.0,0.1,3200.0,0.0,25.0


In [109]:
model_prediction = pd.DataFrame()
model_prediction = pd.concat([df_predict, df_testdata], axis=1)

#model_prediction.to_csv('model_prediction.csv', index=False)
model_prediction

Unnamed: 0,prediction,MAR,MCAR,MNAR,Missing Fraction,NumberOfInstances,NumberOfNumericFeatures,NumberOfCategoricalFeatures
0,Random Forest,1.0,0.0,0.0,0.01,12960.0,0.0,9.0
1,Random Forest,1.0,0.0,0.0,0.1,5456.0,4.0,1.0
2,Random Forest,1.0,0.0,0.0,0.1,10992.0,16.0,1.0
3,KNN,1.0,0.0,0.0,0.1,58000.0,9.0,1.0
4,Random Forest,1.0,0.0,0.0,0.3,28056.0,0.0,7.0
5,VAE,1.0,0.0,0.0,0.5,3772.0,21.0,1.0
6,Random Forest,0.0,1.0,0.0,0.01,12960.0,0.0,9.0
7,Random Forest,0.0,1.0,0.0,0.1,12960.0,0.0,9.0
8,Random Forest,0.0,1.0,0.0,0.1,9961.0,14.0,1.0
9,Random Forest,0.0,1.0,0.0,0.1,3200.0,0.0,25.0


In [110]:
one_hot_reverse = model_prediction.copy()
one_hot_reverse = one_hot_reverse[['MAR', 'MCAR', 'MNAR']]

one_hot_reverse['Missing Type'] = one_hot_reverse.idxmax(1)
one_hot_reverse = one_hot_reverse[['Missing Type']]
one_hot_reverse

Unnamed: 0,Missing Type
0,MAR
1,MAR
2,MAR
3,MAR
4,MAR
5,MAR
6,MCAR
7,MCAR
8,MCAR
9,MCAR


In [111]:
model_prediction = pd.concat([model_prediction, one_hot_reverse], axis=1)
model_prediction = model_prediction.drop(['MAR', 'MCAR', 'MNAR'], axis=1)
model_prediction['control'] = 20
model_prediction = model_prediction.rename(columns={"prediction": "Imputation_Method"})

model_prediction

Unnamed: 0,Imputation_Method,Missing Fraction,NumberOfInstances,NumberOfNumericFeatures,NumberOfCategoricalFeatures,Missing Type,control
0,Random Forest,0.01,12960.0,0.0,9.0,MAR,20
1,Random Forest,0.1,5456.0,4.0,1.0,MAR,20
2,Random Forest,0.1,10992.0,16.0,1.0,MAR,20
3,KNN,0.1,58000.0,9.0,1.0,MAR,20
4,Random Forest,0.3,28056.0,0.0,7.0,MAR,20
5,VAE,0.5,3772.0,21.0,1.0,MAR,20
6,Random Forest,0.01,12960.0,0.0,9.0,MCAR,20
7,Random Forest,0.1,12960.0,0.0,9.0,MCAR,20
8,Random Forest,0.1,9961.0,14.0,1.0,MCAR,20
9,Random Forest,0.1,3200.0,0.0,25.0,MCAR,20


In [112]:
data_control = pd.read_csv('multi_imputed_full_info.csv')
#data_multi = pd.read_csv('multi_imputed_full_info.csv')
data_control

Unnamed: 0,Imputation_Method,Task,Missing Type,Missing Fraction,Column,result_type,metric,Baseline,Corrupted,Imputed,...,MinorityClassSize,NumberOfFeatures,NumberOfInstances,NumberOfNumericFeatures,NumberOfCategoricalFeatures,NumberOfClasses,Downstream Performance Rank,Data_Constellation,Data_Constellation_full,Performance Difference to Average Best
0,Random Forest,6,MAR,0.01,x-box,downstream_performance_mean,F1_macro,0.727330,0.0,0.727075,...,734.0,17.0,20000.0,16.0,1.0,26.0,3.0,MAR - 0.01,MAR - 0.01 - 6,0.000000
1,KNN,6,MAR,0.01,x-box,downstream_performance_mean,F1_macro,0.727724,0.0,0.727724,...,734.0,17.0,20000.0,16.0,1.0,26.0,2.0,MAR - 0.01,MAR - 0.01 - 6,0.000649
2,Mean/Mode,6,MAR,0.01,x-box,downstream_performance_mean,F1_macro,0.725643,0.0,0.725766,...,734.0,17.0,20000.0,16.0,1.0,26.0,5.0,MAR - 0.01,MAR - 0.01 - 6,-0.001309
3,VAE,6,MAR,0.01,x-box,downstream_performance_mean,F1_macro,0.725821,0.0,0.725778,...,734.0,17.0,20000.0,16.0,1.0,26.0,4.0,MAR - 0.01,MAR - 0.01 - 6,-0.001296
4,Discriminative DL,6,MAR,0.01,x-box,downstream_performance_mean,F1_macro,0.727469,0.0,0.727828,...,734.0,17.0,20000.0,16.0,1.0,26.0,1.0,MAR - 0.01,MAR - 0.01 - 6,0.000753
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1205,KNN,41671,MNAR,0.50,a9,downstream_performance_mean,F1_macro,0.264992,0.0,0.265395,...,743.0,21.0,20000.0,20.0,1.0,5.0,1.0,MNAR - 0.5,MNAR - 0.5 - 41671,0.024776
1206,Mean/Mode,41671,MNAR,0.50,a9,downstream_performance_mean,F1_macro,0.239275,0.0,0.240780,...,743.0,21.0,20000.0,20.0,1.0,5.0,4.0,MNAR - 0.5,MNAR - 0.5 - 41671,0.000160
1207,VAE,41671,MNAR,0.50,a9,downstream_performance_mean,F1_macro,0.241995,0.0,0.242421,...,743.0,21.0,20000.0,20.0,1.0,5.0,3.0,MNAR - 0.5,MNAR - 0.5 - 41671,0.001801
1208,GAIN,41671,MNAR,0.50,a9,downstream_performance_mean,F1_macro,0.239877,0.0,0.243434,...,743.0,21.0,20000.0,20.0,1.0,5.0,2.0,MNAR - 0.5,MNAR - 0.5 - 41671,0.002815


In [113]:
result_improv = pd.merge(model_prediction, data_control, how="left", on=["Imputation_Method", "Missing Fraction", "NumberOfInstances", "NumberOfNumericFeatures", "NumberOfCategoricalFeatures", "Missing Type"])
result_improv = result_improv[result_improv['Performance Difference to Average Best'].notna()]
#result_improv.to_csv('result_merge_control.csv')

result_improv

Unnamed: 0.1,Imputation_Method,Missing Fraction,NumberOfInstances,NumberOfNumericFeatures,NumberOfCategoricalFeatures,Missing Type,control,Task,Column,result_type,...,Unnamed: 0,name,MajorityClassSize,MinorityClassSize,NumberOfFeatures,NumberOfClasses,Downstream Performance Rank,Data_Constellation,Data_Constellation_full,Performance Difference to Average Best
0,Random Forest,0.01,12960.0,0.0,9.0,MAR,20,26,parents,downstream_performance_mean,...,53,nursery,4320.0,2.0,9.0,5.0,6.0,MAR - 0.01,MAR - 0.01 - 26,0.0
1,Random Forest,0.1,5456.0,4.0,1.0,MAR,20,1526,V3,downstream_performance_mean,...,45,wall-robot-navigation,2205.0,328.0,5.0,4.0,1.0,MAR - 0.1,MAR - 0.1 - 1526,0.0
2,Random Forest,0.1,10992.0,16.0,1.0,MAR,20,32,input4,downstream_performance_mean,...,55,pendigits,1144.0,1055.0,17.0,10.0,3.0,MAR - 0.1,MAR - 0.1 - 32,0.0
3,KNN,0.1,58000.0,9.0,1.0,MAR,20,40685,A2,downstream_performance_mean,...,62,shuttle,45586.0,10.0,10.0,7.0,4.0,MAR - 0.1,MAR - 0.1 - 40685,-0.000154
4,Random Forest,0.3,28056.0,0.0,7.0,MAR,20,184,black_king_col,downstream_performance_mean,...,57,kropt,4553.0,27.0,7.0,18.0,1.0,MAR - 0.3,MAR - 0.3 - 184,0.0
5,VAE,0.5,3772.0,21.0,1.0,MAR,20,40497,V13,downstream_performance_mean,...,51,thyroid-ann,3488.0,93.0,22.0,3.0,6.0,MAR - 0.5,MAR - 0.5 - 40497,-0.126342
6,Random Forest,0.01,12960.0,0.0,9.0,MCAR,20,26,parents,downstream_performance_mean,...,53,nursery,4320.0,2.0,9.0,5.0,6.0,MCAR - 0.01,MCAR - 0.01 - 26,0.0
7,Random Forest,0.1,12960.0,0.0,9.0,MCAR,20,26,parents,downstream_performance_mean,...,53,nursery,4320.0,2.0,9.0,5.0,6.0,MCAR - 0.1,MCAR - 0.1 - 26,0.0
8,Random Forest,0.1,9961.0,14.0,1.0,MCAR,20,375,coefficient3,downstream_performance_mean,...,54,JapaneseVowels,1614.0,782.0,15.0,9.0,3.0,MCAR - 0.1,MCAR - 0.1 - 375,0.0
9,Random Forest,0.1,3200.0,0.0,25.0,MCAR,20,40677,attribute#3,downstream_performance_mean,...,49,led24,337.0,296.0,25.0,10.0,3.0,MCAR - 0.1,MCAR - 0.1 - 40677,0.0


In [114]:
av_improv_model = result_improv['Performance Difference to Average Best'].mean()
print(av_improv_model)

-0.005424857164122671


## Experiment Results adjusted for 0.03 F1 Score Points

For the training we use the experiment results, but replaced the best method for each data constellation with the average best method, if the average best method is not outscored by at least 0.03 F1 score points.

### Training

In [115]:
data = pd.read_csv('multi_properties_train_dataset_3_percent.csv')
#data.drop(['Unnamed: 0'], axis=1)
data

Unnamed: 0.1,Unnamed: 0,Imputation_Method,Missing Type,Missing Fraction,NumberOfInstances,NumberOfNumericFeatures,NumberOfCategoricalFeatures
0,504,Random Forest,MAR,0.01,10218.0,7.0,1.0
1,573,Random Forest,MAR,0.01,28056.0,3.0,4.0
2,645,Random Forest,MAR,0.01,5456.0,4.0,1.0
3,284,Random Forest,MAR,0.01,4177.0,7.0,2.0
4,360,Random Forest,MAR,0.01,28056.0,0.0,7.0
...,...,...,...,...,...,...,...
199,1067,Random Forest,MNAR,0.50,58000.0,9.0,1.0
200,1140,Random Forest,MNAR,0.50,44819.0,6.0,1.0
201,1204,Random Forest,MNAR,0.50,20000.0,20.0,1.0
202,784,Random Forest,MNAR,0.50,5665.0,2.0,15.0


In [116]:
# OneHot-encoder -> multiple column approach
# 

transformer = make_column_transformer(
    (OneHotEncoder(), ['Imputation_Method', 'Missing Type']),
    remainder='passthrough')

transformed = transformer.fit_transform(data)
data_preprocessed = pd.DataFrame(transformed, columns=transformer.get_feature_names_out())
data_preprocessed


Unnamed: 0,onehotencoder__Imputation_Method_GAIN,onehotencoder__Imputation_Method_KNN,onehotencoder__Imputation_Method_Mean/Mode,onehotencoder__Imputation_Method_Random Forest,onehotencoder__Imputation_Method_VAE,onehotencoder__Missing Type_MAR,onehotencoder__Missing Type_MCAR,onehotencoder__Missing Type_MNAR,remainder__Unnamed: 0,remainder__Missing Fraction,remainder__NumberOfInstances,remainder__NumberOfNumericFeatures,remainder__NumberOfCategoricalFeatures
0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,504.0,0.01,10218.0,7.0,1.0
1,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,573.0,0.01,28056.0,3.0,4.0
2,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,645.0,0.01,5456.0,4.0,1.0
3,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,284.0,0.01,4177.0,7.0,2.0
4,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,360.0,0.01,28056.0,0.0,7.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
199,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1067.0,0.50,58000.0,9.0,1.0
200,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1140.0,0.50,44819.0,6.0,1.0
201,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1204.0,0.50,20000.0,20.0,1.0
202,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,784.0,0.50,5665.0,2.0,15.0


In [117]:
# Umwandlung in numpy arrays
X = data_preprocessed[['onehotencoder__Missing Type_MAR', 'onehotencoder__Missing Type_MCAR', 
                         'onehotencoder__Missing Type_MNAR', 'remainder__Missing Fraction', 
                         'remainder__NumberOfInstances',  
                         'remainder__NumberOfNumericFeatures', 'remainder__NumberOfCategoricalFeatures']].to_numpy()
#X = data_preprocessed_x
y = data[['Imputation_Method']].to_numpy()


# Aufteilung in train und test Daten --> 20 % Testdaten
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) 
X_train.shape
#print(X_train)

(163, 7)

In [118]:
y_train.shape

(163, 1)

In [119]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

print(X_train)
print(y)



[[-0.73330475 -0.67480156  1.4012981  ...  2.69478628  0.08115931
  -0.49914772]
 [-0.73330475  1.48191715 -0.71362403 ... -0.38092409  0.85028534
  -0.49914772]
 [ 1.36368953 -0.67480156 -0.71362403 ...  0.77761333 -1.30326755
   0.45992197]
 ...
 [-0.73330475  1.48191715 -0.71362403 ... -0.31491403  1.15793576
  -0.49914772]
 [-0.73330475 -0.67480156  1.4012981  ... -0.77717654  1.92706179
  -0.49914772]
 [-0.73330475  1.48191715 -0.71362403 ... -0.3644696  -0.2264911
  -0.49914772]]
[['Random Forest']
 ['Random Forest']
 ['Random Forest']
 ['Random Forest']
 ['Random Forest']
 ['Random Forest']
 ['VAE']
 ['Random Forest']
 ['Random Forest']
 ['Random Forest']
 ['Random Forest']
 ['Random Forest']
 ['Random Forest']
 ['Random Forest']
 ['Random Forest']
 ['Random Forest']
 ['Random Forest']
 ['KNN']
 ['Random Forest']
 ['Random Forest']
 ['Random Forest']
 ['Random Forest']
 ['Random Forest']
 ['GAIN']
 ['Random Forest']
 ['Random Forest']
 ['VAE']
 ['GAIN']
 ['Random Forest']
 ['Ran

In [120]:
X_train.shape

(163, 7)

In [121]:
y.shape

(204, 1)

### Random Forest with Nested Cross Validation, with GridSearch

In [128]:
X_train = X
y_train = y

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

outer_cv = KFold(n_splits=10, shuffle=True, random_state=42)
outer_results_3 = list()
predicitons_list_3 = list()
test_data_list_3 = list()
ground_truth_list_3 = list()

inner_nested_cv_results= []
inner_nested_best_score= []
inner_nested_best_params= []
inner_nested_test_score = []

for train_index, test_index in outer_cv.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    inner_cv = KFold(n_splits=5, shuffle=True, random_state=42)
    
    clf = RandomForestClassifier(random_state=42, class_weight='balanced')

    param_grid = {
        'n_estimators': [25, 50, 100, 500, 1000, 3000, 6000, 10000],#, 3000, 4000, 5000, 6000, 7000],
#    'max_features': ['sqrt', 'log2'],
#    'criterion' :['gini', 'entropy'],
        'max_depth': [3,6,9,12,15]#,3,4,5,6,7]
    }

    grid_clf = GridSearchCV(clf, param_grid, cv=inner_cv)
    result = grid_clf.fit(X_train, y_train.ravel())
    best_model = result.best_estimator_
    print(grid_clf.best_params_)
    print(best_model)

    inner_nested_cv_results.append(grid_clf.cv_results_)
    inner_nested_best_score.append(grid_clf.best_score_)
    inner_nested_best_params.append(grid_clf.best_params_)
    inner_nested_test_score.append(grid_clf.score(X_test, y_test))

    yhat = best_model.predict(X_test)
#    print(yhat)
#    print('___________________----------\n')
#    print(X_test)
#    print('___________________----------\n')
#    print(y_test)
    acc = accuracy_score(y_test, yhat)
    outer_results_3.append(acc)
    predicitons_list_3.append(yhat)
    test_data_list_3.append(X_test)
    ground_truth_list_3.append(y_test)
    
   
    print('>acc=%.3f, est=%.3f, cfg=%s' % (acc, result.best_score_, result.best_params_))
    print('_____________________________')
    print('\n')
    
print('Accuracy: %.3f (%.3f)' % (mean(outer_results_3), std(outer_results_3)))    


{'max_depth': 9, 'n_estimators': 25}
RandomForestClassifier(class_weight='balanced', max_depth=9, n_estimators=25,
                       random_state=42)
>acc=0.857, est=0.836, cfg={'max_depth': 9, 'n_estimators': 25}
_____________________________


{'max_depth': 9, 'n_estimators': 6000}
RandomForestClassifier(class_weight='balanced', max_depth=9, n_estimators=6000,
                       random_state=42)
>acc=0.905, est=0.809, cfg={'max_depth': 9, 'n_estimators': 6000}
_____________________________


{'max_depth': 9, 'n_estimators': 50}
RandomForestClassifier(class_weight='balanced', max_depth=9, n_estimators=50,
                       random_state=42)
>acc=0.810, est=0.820, cfg={'max_depth': 9, 'n_estimators': 50}
_____________________________


{'max_depth': 6, 'n_estimators': 500}
RandomForestClassifier(class_weight='balanced', max_depth=6, n_estimators=500,
                       random_state=42)
>acc=0.810, est=0.847, cfg={'max_depth': 6, 'n_estimators': 500}
___________________

In [129]:
print(type(yhat))
print(type(X_test))
print(type(y_test))
print(outer_results_3)

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
[0.8571428571428571, 0.9047619047619048, 0.8095238095238095, 0.8095238095238095, 0.65, 0.8, 0.75, 0.9, 0.8, 0.85]


In [130]:
#np.mean(inner_nested_test_score)

In [131]:
#inner_nested_cv_results

In [132]:
inner_nested_best_params

[{'max_depth': 9, 'n_estimators': 25},
 {'max_depth': 9, 'n_estimators': 6000},
 {'max_depth': 9, 'n_estimators': 50},
 {'max_depth': 6, 'n_estimators': 500},
 {'max_depth': 9, 'n_estimators': 6000},
 {'max_depth': 12, 'n_estimators': 500},
 {'max_depth': 6, 'n_estimators': 100},
 {'max_depth': 12, 'n_estimators': 1000},
 {'max_depth': 6, 'n_estimators': 25},
 {'max_depth': 12, 'n_estimators': 6000}]

In [133]:
inner_nested_test_score

[0.8571428571428571,
 0.9047619047619048,
 0.8095238095238095,
 0.8095238095238095,
 0.65,
 0.8,
 0.75,
 0.9,
 0.8,
 0.85]

In [143]:
# Manually select the right model (best accuracy)

outer_results_item = outer_results_3[1]
print(outer_results_item)
df_predict_item = predicitons_list_3[1]
#df_predict_item
test_data_item = test_data_list_3[1]
ground_truth_item = ground_truth_list_3[1]

0.9047619047619048


In [144]:
df_predict = pd.DataFrame()
df_predict['prediction'] = df_predict_item
df_predict

Unnamed: 0,prediction
0,Random Forest
1,Random Forest
2,Random Forest
3,Random Forest
4,Random Forest
5,Random Forest
6,Random Forest
7,Random Forest
8,Random Forest
9,GAIN


In [145]:
df_testdata = pd.DataFrame({'MAR': test_data_item[:, 0], 'MCAR': test_data_item[:, 1], 'MNAR': test_data_item[:, 2], 'Missing Fraction': test_data_item[:, 3], 'NumberOfInstances': test_data_item[:, 4],'NumberOfNumericFeatures': test_data_item[:, 5], 'NumberOfCategoricalFeatures': test_data_item[:, 6]})
df_testdata


Unnamed: 0,MAR,MCAR,MNAR,Missing Fraction,NumberOfInstances,NumberOfNumericFeatures,NumberOfCategoricalFeatures
0,1.0,0.0,0.0,0.01,20000.0,16.0,1.0
1,1.0,0.0,0.0,0.1,44819.0,6.0,1.0
2,1.0,0.0,0.0,0.5,20000.0,20.0,1.0
3,1.0,0.0,0.0,0.5,20000.0,16.0,1.0
4,0.0,1.0,0.0,0.01,10218.0,7.0,1.0
5,0.0,1.0,0.0,0.01,9961.0,14.0,1.0
6,0.0,1.0,0.0,0.01,20000.0,20.0,1.0
7,0.0,1.0,0.0,0.01,20000.0,16.0,1.0
8,0.0,1.0,0.0,0.1,28056.0,3.0,4.0
9,0.0,1.0,0.0,0.1,4898.0,11.0,1.0


In [146]:
model_prediction = pd.DataFrame()
model_prediction = pd.concat([df_predict, df_testdata], axis=1)

#model_prediction.to_csv('model_prediction.csv', index=False)
model_prediction

Unnamed: 0,prediction,MAR,MCAR,MNAR,Missing Fraction,NumberOfInstances,NumberOfNumericFeatures,NumberOfCategoricalFeatures
0,Random Forest,1.0,0.0,0.0,0.01,20000.0,16.0,1.0
1,Random Forest,1.0,0.0,0.0,0.1,44819.0,6.0,1.0
2,Random Forest,1.0,0.0,0.0,0.5,20000.0,20.0,1.0
3,Random Forest,1.0,0.0,0.0,0.5,20000.0,16.0,1.0
4,Random Forest,0.0,1.0,0.0,0.01,10218.0,7.0,1.0
5,Random Forest,0.0,1.0,0.0,0.01,9961.0,14.0,1.0
6,Random Forest,0.0,1.0,0.0,0.01,20000.0,20.0,1.0
7,Random Forest,0.0,1.0,0.0,0.01,20000.0,16.0,1.0
8,Random Forest,0.0,1.0,0.0,0.1,28056.0,3.0,4.0
9,GAIN,0.0,1.0,0.0,0.1,4898.0,11.0,1.0


In [147]:
one_hot_reverse = model_prediction.copy()
one_hot_reverse = one_hot_reverse[['MAR', 'MCAR', 'MNAR']]

one_hot_reverse['Missing Type'] = one_hot_reverse.idxmax(1)
one_hot_reverse = one_hot_reverse[['Missing Type']]
one_hot_reverse

Unnamed: 0,Missing Type
0,MAR
1,MAR
2,MAR
3,MAR
4,MCAR
5,MCAR
6,MCAR
7,MCAR
8,MCAR
9,MCAR


In [148]:
model_prediction = pd.concat([model_prediction, one_hot_reverse], axis=1)
model_prediction = model_prediction.drop(['MAR', 'MCAR', 'MNAR'], axis=1)
model_prediction['control'] = 20
model_prediction = model_prediction.rename(columns={"prediction": "Imputation_Method"})

model_prediction

Unnamed: 0,Imputation_Method,Missing Fraction,NumberOfInstances,NumberOfNumericFeatures,NumberOfCategoricalFeatures,Missing Type,control
0,Random Forest,0.01,20000.0,16.0,1.0,MAR,20
1,Random Forest,0.1,44819.0,6.0,1.0,MAR,20
2,Random Forest,0.5,20000.0,20.0,1.0,MAR,20
3,Random Forest,0.5,20000.0,16.0,1.0,MAR,20
4,Random Forest,0.01,10218.0,7.0,1.0,MCAR,20
5,Random Forest,0.01,9961.0,14.0,1.0,MCAR,20
6,Random Forest,0.01,20000.0,20.0,1.0,MCAR,20
7,Random Forest,0.01,20000.0,16.0,1.0,MCAR,20
8,Random Forest,0.1,28056.0,3.0,4.0,MCAR,20
9,GAIN,0.1,4898.0,11.0,1.0,MCAR,20


In [149]:
data_control = pd.read_csv('multi_imputed_full_info.csv')
#data_multi = pd.read_csv('multi_imputed_full_info.csv')

data_control

Unnamed: 0,Imputation_Method,Task,Missing Type,Missing Fraction,Column,result_type,metric,Baseline,Corrupted,Imputed,...,MinorityClassSize,NumberOfFeatures,NumberOfInstances,NumberOfNumericFeatures,NumberOfCategoricalFeatures,NumberOfClasses,Downstream Performance Rank,Data_Constellation,Data_Constellation_full,Performance Difference to Average Best
0,Random Forest,6,MAR,0.01,x-box,downstream_performance_mean,F1_macro,0.727330,0.0,0.727075,...,734.0,17.0,20000.0,16.0,1.0,26.0,3.0,MAR - 0.01,MAR - 0.01 - 6,0.000000
1,KNN,6,MAR,0.01,x-box,downstream_performance_mean,F1_macro,0.727724,0.0,0.727724,...,734.0,17.0,20000.0,16.0,1.0,26.0,2.0,MAR - 0.01,MAR - 0.01 - 6,0.000649
2,Mean/Mode,6,MAR,0.01,x-box,downstream_performance_mean,F1_macro,0.725643,0.0,0.725766,...,734.0,17.0,20000.0,16.0,1.0,26.0,5.0,MAR - 0.01,MAR - 0.01 - 6,-0.001309
3,VAE,6,MAR,0.01,x-box,downstream_performance_mean,F1_macro,0.725821,0.0,0.725778,...,734.0,17.0,20000.0,16.0,1.0,26.0,4.0,MAR - 0.01,MAR - 0.01 - 6,-0.001296
4,Discriminative DL,6,MAR,0.01,x-box,downstream_performance_mean,F1_macro,0.727469,0.0,0.727828,...,734.0,17.0,20000.0,16.0,1.0,26.0,1.0,MAR - 0.01,MAR - 0.01 - 6,0.000753
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1205,KNN,41671,MNAR,0.50,a9,downstream_performance_mean,F1_macro,0.264992,0.0,0.265395,...,743.0,21.0,20000.0,20.0,1.0,5.0,1.0,MNAR - 0.5,MNAR - 0.5 - 41671,0.024776
1206,Mean/Mode,41671,MNAR,0.50,a9,downstream_performance_mean,F1_macro,0.239275,0.0,0.240780,...,743.0,21.0,20000.0,20.0,1.0,5.0,4.0,MNAR - 0.5,MNAR - 0.5 - 41671,0.000160
1207,VAE,41671,MNAR,0.50,a9,downstream_performance_mean,F1_macro,0.241995,0.0,0.242421,...,743.0,21.0,20000.0,20.0,1.0,5.0,3.0,MNAR - 0.5,MNAR - 0.5 - 41671,0.001801
1208,GAIN,41671,MNAR,0.50,a9,downstream_performance_mean,F1_macro,0.239877,0.0,0.243434,...,743.0,21.0,20000.0,20.0,1.0,5.0,2.0,MNAR - 0.5,MNAR - 0.5 - 41671,0.002815


In [150]:
result_improv = pd.merge(model_prediction, data_control, how="left", on=["Imputation_Method", "Missing Fraction", "NumberOfInstances", "NumberOfNumericFeatures", "NumberOfCategoricalFeatures", "Missing Type"])
result_improv = result_improv[result_improv['Performance Difference to Average Best'].notna()]
#result_improv.to_csv('result_merge_control.csv')

result_improv

Unnamed: 0.1,Imputation_Method,Missing Fraction,NumberOfInstances,NumberOfNumericFeatures,NumberOfCategoricalFeatures,Missing Type,control,Task,Column,result_type,...,Unnamed: 0,name,MajorityClassSize,MinorityClassSize,NumberOfFeatures,NumberOfClasses,Downstream Performance Rank,Data_Constellation,Data_Constellation_full,Performance Difference to Average Best
0,Random Forest,0.01,20000.0,16.0,1.0,MAR,20,6,x-box,downstream_performance_mean,...,59,letter,813.0,734.0,17.0,26.0,3.0,MAR - 0.01,MAR - 0.01 - 6,0.0
1,Random Forest,0.1,44819.0,6.0,1.0,MAR,20,41027,black_piece0_strength,downstream_performance_mean,...,58,jungle_chess_2pcs_raw_endgame_complete,23062.0,4335.0,7.0,3.0,3.0,MAR - 0.1,MAR - 0.1 - 41027,0.0
2,Random Forest,0.5,20000.0,20.0,1.0,MAR,20,41671,a9,downstream_performance_mean,...,61,microaggregation2,11162.0,743.0,21.0,5.0,5.0,MAR - 0.5,MAR - 0.5 - 41671,0.0
3,Random Forest,0.5,20000.0,16.0,1.0,MAR,20,6,x-box,downstream_performance_mean,...,59,letter,813.0,734.0,17.0,26.0,1.0,MAR - 0.5,MAR - 0.5 - 6,0.0
4,Random Forest,0.01,10218.0,7.0,1.0,MCAR,20,1459,V7,downstream_performance_mean,...,50,artificial-characters,1416.0,600.0,8.0,10.0,5.0,MCAR - 0.01,MCAR - 0.01 - 1459,0.0
5,Random Forest,0.01,9961.0,14.0,1.0,MCAR,20,375,coefficient3,downstream_performance_mean,...,54,JapaneseVowels,1614.0,782.0,15.0,9.0,2.0,MCAR - 0.01,MCAR - 0.01 - 375,0.0
6,Random Forest,0.01,20000.0,20.0,1.0,MCAR,20,41671,a9,downstream_performance_mean,...,61,microaggregation2,11162.0,743.0,21.0,5.0,1.0,MCAR - 0.01,MCAR - 0.01 - 41671,0.0
7,Random Forest,0.01,20000.0,16.0,1.0,MCAR,20,6,x-box,downstream_performance_mean,...,59,letter,813.0,734.0,17.0,26.0,2.0,MCAR - 0.01,MCAR - 0.01 - 6,0.0
8,Random Forest,0.1,28056.0,3.0,4.0,MCAR,20,1481,V2,downstream_performance_mean,...,56,kr-vs-k,4553.0,27.0,7.0,18.0,4.0,MCAR - 0.1,MCAR - 0.1 - 1481,0.0
9,GAIN,0.1,4898.0,11.0,1.0,MCAR,20,40498,V11,downstream_performance_mean,...,47,wine-quality-white,2198.0,5.0,12.0,7.0,6.0,MCAR - 0.1,MCAR - 0.1 - 40498,-0.068748


In [151]:
av_improv_model = result_improv['Performance Difference to Average Best'].mean()
print(av_improv_model)

-0.0032737338025079286
