In [1]:
from jenga.tasks.openml import OpenMLBinaryClassificationTask, OpenMLMultiClassClassificationTask, OpenMLRegressionTask
    
import pandas as pd
import numpy as np

from data_imputation_paper.imputation.simple import ModeImputer
from data_imputation_paper.imputation.ml import KNNImputer, ForestImputer
from data_imputation_paper.imputation.dl import AutoKerasImputer
from data_imputation_paper.imputation.generative import GAINImputer, VAEImputer
from data_imputation_paper.evaluation import SingleColumnEvaluator, MultipleColumnsEvaluator, SingleColumnAllMissingEvaluator, MultipleColumnsAllMissingEvaluator

# Enable IPython's autoreload extension in mode 2, which re-imports modified
# modules before each cell executes, so edits to the imported project code are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

## Make Deterministic

In [2]:
# Single seed shared by the task constructor and by every imputer's
# `arguments` dict in the cells below.
seed = 42

## Create example tasks

In [3]:
# Multi-class classification task built from OpenML dataset id 4552; this
# `task` object is passed to every evaluator in the cells below.
task = OpenMLMultiClassClassificationTask(openml_id=4552, seed=seed)

## Select the target columns for inserting missing values using jenga

In [4]:
# Columns handed to the evaluators below as imputation targets. In the recorded
# outputs V2 is scored with MAE/MSE/RMSE, while V4, V8 and V15 are scored with F1.
target_columns = ['V2', 'V4', 'V8', 'V15']

## Mode Imputation

In [5]:
arguments = {
    "seed": seed
}

%time SingleColumnEvaluator(task, 0.5, "MCAR", target_columns[0], ModeImputer, arguments).evaluate(3).report()

Evaluation result contains 1 target columns: V2
All are in a round-robin fashion imputed and performances are as follows:

Target Column: V2 - Necessary train time in seconds: 0.0009
            train         test
MAE     29.534246    29.329695
MSE   1382.765079  1395.011245
RMSE    37.182935    37.339889

             baseline  corrupted   imputed
F1_micro     0.729038   0.729038  0.729038
F1_macro     0.443047   0.432501  0.432501
F1_weighted  0.732119   0.731751  0.731751


CPU times: user 2.38 s, sys: 138 ms, total: 2.52 s
Wall time: 3.91 s


In [6]:
arguments = {
    "seed": seed
}

%time MultipleColumnsEvaluator(task, 0.5, "MCAR", target_columns[:2], ModeImputer, arguments).evaluate(3).report()

Evaluation result contains 2 target columns: V2, V4
All are in a round-robin fashion imputed and performances are as follows:

Target Column: V2 - Necessary train time in seconds: 0.0009
            train         test
MAE     29.920306    28.966446
MSE   1426.373567  1385.846668
RMSE    37.763419    37.226873

             baseline  corrupted   imputed
F1_micro     0.729038   0.721389  0.721389
F1_macro     0.443047   0.431010  0.431010
F1_weighted  0.732119   0.723656  0.723656


Target Column: V4 - Necessary train time in seconds: 0.0012
                train      test
F1_micro     0.829067  0.842167
F1_macro     0.453269  0.457155
F1_weighted  0.751597  0.770024

             baseline  corrupted   imputed
F1_micro     0.729038   0.723448  0.723448
F1_macro     0.443047   0.418190  0.418190
F1_weighted  0.732119   0.725510  0.725510


CPU times: user 731 ms, sys: 14.3 ms, total: 745 ms
Wall time: 747 ms


In [7]:
arguments = {
    "seed": seed
}

%time SingleColumnAllMissingEvaluator(task, 0.5, "MCAR", target_columns[-1], ModeImputer, arguments).evaluate(3).report()

Evaluation result contains 1 target columns: V15
All are in a round-robin fashion imputed and performances are as follows:

Target Column: V15 - Necessary train time in seconds: 0.0009
                train      test
F1_micro     0.120567  0.114286
F1_macro     0.015002  0.019373
F1_weighted  0.026938  0.023443

             baseline  corrupted   imputed
F1_micro     0.729038   0.701971  0.701971
F1_macro     0.443047   0.432568  0.432568
F1_weighted  0.732119   0.704395  0.704395


CPU times: user 426 ms, sys: 4.85 ms, total: 431 ms
Wall time: 431 ms


In [8]:
arguments = {
    "seed": seed
}

%time MultipleColumnsAllMissingEvaluator(task, 0.5, "MCAR", target_columns[2:], ModeImputer, arguments).evaluate(3).report()

Evaluation result contains 2 target columns: V8, V15
All are in a round-robin fashion imputed and performances are as follows:

Target Column: V8 - Necessary train time in seconds: 0.001
                train      test
F1_micro     0.787234  0.761905
F1_macro     0.440313  0.431796
F1_weighted  0.693841  0.660217

             baseline  corrupted   imputed
F1_micro     0.729038   0.701971  0.701971
F1_macro     0.443047   0.432568  0.432568
F1_weighted  0.732119   0.704395  0.704395


Target Column: V15 - Necessary train time in seconds: 0.001
                train      test
F1_micro     0.118203  0.085714
F1_macro     0.014738  0.013557
F1_weighted  0.025295  0.013534

             baseline  corrupted   imputed
F1_micro     0.729038   0.700206  0.700206
F1_macro     0.443047   0.420314  0.420314
F1_weighted  0.732119   0.702995  0.702995


CPU times: user 916 ms, sys: 13.3 ms, total: 929 ms
Wall time: 932 ms


## KNN imputation

In [9]:
arguments = {
    "seed": seed,
    "hyperparameter_grid_categorical_imputer": {
        "n_neighbors": [3, 5]
    },
    "hyperparameter_grid_numerical_imputer": {
        "n_neighbors": [3, 5]
    }
}

%time MultipleColumnsAllMissingEvaluator(task, 0.5, "MCAR", target_columns, KNNImputer, arguments).evaluate(3).report()

Evaluation result contains 4 target columns: V2, V4, V8, V15
All are in a round-robin fashion imputed and performances are as follows:

Target Column: V2 - Necessary train time in seconds: 0.6004
            train         test
MAE     23.897400    30.668571
MSE   1048.574657  1671.436571
RMSE    32.364484    40.819010

             baseline  corrupted   imputed
F1_micro     0.729038   0.701971  0.701677
F1_macro     0.443047   0.432568  0.432445
F1_weighted  0.732119   0.704395  0.704211


Target Column: V4 - Necessary train time in seconds: 0.6781
                train      test
F1_micro     0.964539  0.933333
F1_macro     0.926341  0.815358
F1_weighted  0.963794  0.926863

             baseline  corrupted   imputed
F1_micro     0.729038   0.700206  0.701089
F1_macro     0.443047   0.420314  0.422655
F1_weighted  0.732119   0.702995  0.704136


Target Column: V8 - Necessary train time in seconds: 0.5943
                train      test
F1_micro     0.962175  0.952381
F1_macro     0.944

## Forest imputation

In [10]:
arguments = {
    "seed": seed,
    "hyperparameter_grid_categorical_imputer": {
        "n_estimators": [50, 100]
    },
    "hyperparameter_grid_numerical_imputer": {
        "n_estimators": [50, 100]
    }
}

%time MultipleColumnsAllMissingEvaluator(task, 0.5, "MCAR", target_columns, ForestImputer, arguments).evaluate(3).report()

Evaluation result contains 4 target columns: V2, V4, V8, V15
All are in a round-robin fashion imputed and performances are as follows:

Target Column: V2 - Necessary train time in seconds: 4.1498
           train         test
MAE    18.222446    31.471091
MSE   704.123501  1515.773382
RMSE   26.510395    38.788418

             baseline  corrupted   imputed
F1_micro     0.729038   0.702265  0.701971
F1_macro     0.443047   0.427093  0.426165
F1_weighted  0.732119   0.703904  0.703498


Target Column: V4 - Necessary train time in seconds: 0.9897
                train      test
F1_micro     0.983452  0.961905
F1_macro     0.946292  0.865853
F1_weighted  0.984611  0.970422

             baseline  corrupted   imputed
F1_micro     0.729038   0.700206  0.700794
F1_macro     0.443047   0.423017  0.423775
F1_weighted  0.732119   0.702965  0.703729


Target Column: V8 - Necessary train time in seconds: 0.9636
                train      test
F1_micro     0.985816  0.961905
F1_macro     0.979314 

## AutoKeras imputation

In [11]:
arguments = {
    "seed": seed,
    'max_trials': 2,
    'tuner': 'greedy',
    'validation_split': 0.1,
    'epochs': 2
}

%time SingleColumnAllMissingEvaluator(task, 0.2, "MNAR", target_columns[1], AutoKerasImputer, arguments).evaluate(3).report()

Trial 2 Complete [00h 00m 02s]
val_accuracy: 0.831447958946228

Best val_accuracy So Far: 0.831447958946228
Total elapsed time: 00h 00m 04s
Epoch 1/2
Epoch 2/2
Evaluation result contains 1 target columns: V4
All are in a round-robin fashion imputed and performances are as follows:

Target Column: V4 - Necessary train time in seconds: 10.31
                train      test
F1_micro     0.976190  0.642857
F1_macro     0.660578  0.493827
F1_weighted  0.987823  0.654321

             baseline  corrupted   imputed
F1_micro     0.729038   0.715799  0.715799
F1_macro     0.443047   0.432783  0.432779
F1_weighted  0.732119   0.717499  0.717520


CPU times: user 38.2 s, sys: 2.72 s, total: 40.9 s
Wall time: 36.3 s


## VAE imputation (TODO: the cell below still runs `AutoKerasImputer`, not `VAEImputer`)

In [12]:
arguments = {
    "seed": seed,
    'max_trials': 2,
    'tuner': None,
    'validation_split': 0.1,
    'epochs': 2
}

%time MultipleColumnsAllMissingEvaluator(task, 0.5, "MCAR", target_columns, AutoKerasImputer, arguments).evaluate(3).report()

Trial 2 Complete [00h 00m 02s]
val_accuracy: 0.43325790762901306

Best val_accuracy So Far: 0.5124434232711792
Total elapsed time: 00h 00m 05s
Epoch 1/2
Epoch 2/2
Evaluation result contains 4 target columns: V2, V4, V8, V15
All are in a round-robin fashion imputed and performances are as follows:

Target Column: V2 - Necessary train time in seconds: 9.0171
            train         test
MAE     27.895165    31.321839
MSE   1205.543670  1552.634940
RMSE    34.717389    39.317011

             baseline  corrupted   imputed
F1_micro     0.729038   0.701971  0.701677
F1_macro     0.443047   0.432568  0.432445
F1_weighted  0.732119   0.704395  0.704211


Target Column: V4 - Necessary train time in seconds: 10.1188
                train      test
F1_micro     0.933806  0.904762
F1_macro     0.870755  0.761982
F1_weighted  0.934826  0.900628

             baseline  corrupted   imputed
F1_micro     0.729038   0.700500  0.700500
F1_macro     0.443047   0.419817  0.420398
F1_weighted  0.732119  

## GAIN imputation

In [13]:
arguments = {
    "seed": seed,
    "hyperparameter_grid": {
        "gain": {
            "alpha": [50, 100],
            "hint_rate": [0.7, 0.9],
        },
        "training": {
            "epochs": [5, 15]
        }
    }
}

%time MultipleColumnsAllMissingEvaluator(task, 0.5, "MCAR", target_columns, GAINImputer, arguments).evaluate(3).report()

Evaluation result contains 4 target columns: V2, V4, V8, V15
All are in a round-robin fashion imputed and performances are as follows:

Target Column: V2 - Necessary train time in seconds: 64.1294
            train         test
MAE     36.224380    36.531204
MSE   2085.301528  1973.905786
RMSE    45.543494    44.367411

             baseline  corrupted   imputed
F1_micro     0.729038   0.697852  0.697852
F1_macro     0.443047   0.418050  0.421731
F1_weighted  0.732119   0.700929  0.700929


Target Column: V4 - Necessary train time in seconds: 68.0043
                train      test
F1_micro     0.839243  0.819048
F1_macro     0.456177  0.450231
F1_weighted  0.766133  0.737632

             baseline  corrupted   imputed
F1_micro     0.729038   0.700794  0.700794
F1_macro     0.443047   0.412708  0.412708
F1_weighted  0.732119   0.701845  0.701845


Target Column: V8 - Necessary train time in seconds: 73.4543
                train      test
F1_micro     0.780142  0.761905
F1_macro     0.