In [1]:
from jenga.tasks.openml import OpenMLBinaryClassificationTask, OpenMLMultiClassClassificationTask, OpenMLRegressionTask
    
import pandas as pd
import numpy as np

from data_imputation_paper.imputation.simple import ModeImputer
from data_imputation_paper.imputation.ml import KNNImputer, ForestImputer
from data_imputation_paper.imputation.dl import AutoKerasImputer
from data_imputation_paper.imputation.generative import GAINImputer, VAEImputer
from data_imputation_paper.evaluation import SingleColumnEvaluator, MultipleColumnsEvaluator, SingleColumnAllMissingEvaluator, MultipleColumnsAllMissingEvaluator

%load_ext autoreload
%autoreload 2

## Make Deterministic

In [2]:
# Single source of randomness for this notebook: the same value is handed to
# the OpenML task and to every imputer's `arguments` dict in the cells below.
seed: int = 42

## Create example tasks

In [3]:
# Downstream multi-class classification task (OpenML dataset id 4552) that
# every evaluator in this notebook is run against.
task = OpenMLMultiClassClassificationTask(
    openml_id=4552,
    seed=seed,
)

## Select target columns for inserting missing values using jenga

In [4]:
# Columns that the evaluators below corrupt and then impute.
# Per the recorded reports further down, V2 is scored with MAE/MSE/RMSE while
# V4, V8 and V15 are scored with F1 metrics.
target_columns = [
    "V2",
    "V4",
    "V8",
    "V15",
]

## Mode Imputation

In [5]:
arguments = {
    "seed": seed
}

%time SingleColumnEvaluator(task, 0.5, "MCAR", target_columns[0], ModeImputer, arguments).evaluate(3).report()

Evaluation result contains 1 target columns: V2
All are in a round-robin fashion imputed and performances are as follows:

Target Column: V2 - Necessary train time in seconds: 0.0011
            train         test
MAE     29.233595    28.900308
MSE   1338.990191  1375.117794
RMSE    36.589374    37.078149

             baseline  corrupted   imputed
F1_micro     0.714916   0.715799  0.714034
F1_macro     0.424069   0.416123  0.416774
F1_weighted  0.697985   0.698429  0.695847


CPU times: user 2.47 s, sys: 180 ms, total: 2.65 s
Wall time: 4.22 s


In [6]:
arguments = {
    "seed": seed
}

%time MultipleColumnsEvaluator(task, 0.5, "MCAR", target_columns[:2], ModeImputer, arguments).evaluate(3).report()

Evaluation result contains 2 target columns: V2, V4
All are in a round-robin fashion imputed and performances are as follows:

Target Column: V2 - Necessary train time in seconds: 0.001
            train         test
MAE     29.095922    28.527269
MSE   1315.426330  1339.126494
RMSE    36.256054    36.574130

             baseline  corrupted   imputed
F1_micro     0.714916   0.709915  0.707267
F1_macro     0.424069   0.426744  0.424680
F1_weighted  0.697985   0.695453  0.691909


Target Column: V4 - Necessary train time in seconds: 0.0011
                train      test
F1_micro     0.828479  0.818610
F1_macro     0.453096  0.450050
F1_weighted  0.750767  0.737121

             baseline  corrupted   imputed
F1_micro     0.714916   0.710209  0.707855
F1_macro     0.424069   0.413996  0.409783
F1_weighted  0.697985   0.694445  0.689864


CPU times: user 730 ms, sys: 15.2 ms, total: 745 ms
Wall time: 746 ms


In [7]:
arguments = {
    "seed": seed
}

%time SingleColumnAllMissingEvaluator(task, 0.5, "MCAR", target_columns[-1], ModeImputer, arguments).evaluate(3).report()

Evaluation result contains 1 target columns: V15
All are in a round-robin fashion imputed and performances are as follows:

Target Column: V15 - Necessary train time in seconds: 0.0011
                train      test
F1_micro     0.096927  0.104762
F1_macro     0.012295  0.017982
F1_weighted  0.019625  0.020927

             baseline  corrupted   imputed
F1_micro     0.714916   0.703736  0.699618
F1_macro     0.424069   0.403165  0.400577
F1_weighted  0.697985   0.688191  0.684768


CPU times: user 457 ms, sys: 6.8 ms, total: 464 ms
Wall time: 464 ms


In [8]:
arguments = {
    "seed": seed
}

%time MultipleColumnsAllMissingEvaluator(task, 0.5, "MCAR", target_columns[2:], ModeImputer, arguments).evaluate(3).report()

Evaluation result contains 2 target columns: V8, V15
All are in a round-robin fashion imputed and performances are as follows:

Target Column: V8 - Necessary train time in seconds: 0.0009
                train      test
F1_micro     0.789598  0.800000
F1_macro     0.441119  0.443803
F1_weighted  0.696957  0.712393

             baseline  corrupted   imputed
F1_micro     0.714916   0.703736  0.700794
F1_macro     0.424069   0.403165  0.405031
F1_weighted  0.697985   0.688191  0.684326


Target Column: V15 - Necessary train time in seconds: 0.001
                train      test
F1_micro     0.115839  0.142857
F1_macro     0.015273  0.024040
F1_weighted  0.024067  0.036444

             baseline  corrupted   imputed
F1_micro     0.714916   0.701971  0.696087
F1_macro     0.424069   0.402340  0.400751
F1_weighted  0.697985   0.688804  0.683104


CPU times: user 898 ms, sys: 11.5 ms, total: 909 ms
Wall time: 910 ms


## KNN imputation

In [9]:
arguments = {
    "seed": seed,
    "hyperparameter_grid_categorical_imputer": {
        "n_neighbors": [3, 5]
    },
    "hyperparameter_grid_numerical_imputer": {
        "n_neighbors": [3, 5]
    }
}

%time MultipleColumnsAllMissingEvaluator(task, 0.5, "MCAR", target_columns, KNNImputer, arguments).evaluate(3).report()

Evaluation result contains 4 target columns: V2, V4, V8, V15
All are in a round-robin fashion imputed and performances are as follows:

Target Column: V2 - Necessary train time in seconds: 0.7122
           train         test
MAE    23.560757    32.820952
MSE   976.544208  1848.329524
RMSE   31.206823    42.626019

             baseline  corrupted   imputed
F1_micro     0.714916   0.703736  0.703442
F1_macro     0.424069   0.403165  0.404114
F1_weighted  0.697985   0.688191  0.687819


Target Column: V4 - Necessary train time in seconds: 0.6696
                train      test
F1_micro     0.952719  0.914286
F1_macro     0.911372  0.865618
F1_weighted  0.952284  0.909679

             baseline  corrupted   imputed
F1_micro     0.714916   0.701971  0.702560
F1_macro     0.424069   0.402340  0.402102
F1_weighted  0.697985   0.688804  0.689189


Target Column: V8 - Necessary train time in seconds: 0.6555
                train      test
F1_micro     0.959811  0.876190
F1_macro     0.939325 

## Forest imputation

In [10]:
arguments = {
    "seed": seed,
    "hyperparameter_grid_categorical_imputer": {
        "n_estimators": [10, 50, 100]
    },
    "hyperparameter_grid_numerical_imputer": {
        "n_estimators": [10, 50, 100]
    }
}

%time MultipleColumnsAllMissingEvaluator(task, 0.5, "MCAR", target_columns, ForestImputer, arguments)

CPU times: user 83 µs, sys: 0 ns, total: 83 µs
Wall time: 87 µs


<data_imputation_paper.evaluation.MultipleColumnsAllMissingEvaluator at 0x7febb7d5beb0>

## AutoKeras imputation

In [11]:
arguments = {
    "seed": seed,
    'max_trials': 2,
    'tuner': 'greedy',
    'validation_split': 0.1,
    'epochs': 2
}

%time SingleColumnAllMissingEvaluator(task, 0.2, "MNAR", target_columns[1], AutoKerasImputer, arguments).evaluate(3).report()

Trial 2 Complete [00h 00m 02s]
val_accuracy: 0.8529411554336548

Best val_accuracy So Far: 0.8540723919868469
Total elapsed time: 00h 00m 05s
Epoch 1/2
Epoch 2/2
Evaluation result contains 1 target columns: V4
All are in a round-robin fashion imputed and performances are as follows:

Target Column: V4 - Necessary train time in seconds: 10.3166
             train  test
F1_micro       1.0   1.0
F1_macro       1.0   1.0
F1_weighted    1.0   1.0

             baseline  corrupted   imputed
F1_micro     0.714916   0.713151  0.712857
F1_macro     0.424069   0.421263  0.419422
F1_weighted  0.697985   0.697947  0.697334


CPU times: user 16.5 s, sys: 1.55 s, total: 18 s
Wall time: 16.4 s


## VAE imputation

In [12]:
arguments = {
    "seed": seed,
    "hyperparameter_grid": {
        "optimizer": {
            "learning_rate": [0.01]
        }
    }
}

%time SingleColumnEvaluator(task, 0.5, "MCAR", target_columns[1], VAEImputer, arguments).evaluate(3).report()

Evaluation result contains 1 target columns: V4
All are in a round-robin fashion imputed and performances are as follows:

Target Column: V4 - Necessary train time in seconds: 27.4564
                train      test
F1_micro     0.839217  0.826266
F1_macro     0.559599  0.550766
F1_weighted  0.796622  0.778012

             baseline  corrupted   imputed
F1_micro     0.714916   0.697264  0.698147
F1_macro     0.424069   0.412526  0.395049
F1_weighted  0.697985   0.683110  0.679533


CPU times: user 34.9 s, sys: 3.41 s, total: 38.3 s
Wall time: 28.3 s


## GAIN imputation

In [13]:
arguments = {
    "seed": seed,
    "hyperparameter_grid": {
        "gain": {
            "alpha": [0.1, 1, 10],
            "hint_rate": [0.7, 0.9],
        },
        "generator": {
            "learning_rate": [0.0001],
        },
        "discriminator": {
            "learning_rate": [0.00001],
        }
    }
}

%time SingleColumnEvaluator(task, 0.5, "MCAR", target_columns[1], GAINImputer, arguments).evaluate(3).report()

Evaluation result contains 1 target columns: V4
All are in a round-robin fashion imputed and performances are as follows:

Target Column: V4 - Necessary train time in seconds: 49.6354
                train      test
F1_micro     0.837011  0.823910
F1_macro     0.489580  0.482837
F1_weighted  0.768530  0.750427

             baseline  corrupted   imputed
F1_micro     0.714916   0.699029  0.700500
F1_macro     0.424069   0.415572  0.381183
F1_weighted  0.697985   0.685549  0.680522


CPU times: user 1min 17s, sys: 22.6 s, total: 1min 40s
Wall time: 50.5 s
