In [1]:
from jenga.tasks.openml import OpenMLBinaryClassificationTask, OpenMLMultiClassClassificationTask, OpenMLRegressionTask
    
import pandas as pd
import numpy as np

from data_imputation_paper.imputation.simple import ModeImputer
from data_imputation_paper.imputation.ml import KNNImputer, ForestImputer
from data_imputation_paper.imputation.dl import AutoKerasImputer
from data_imputation_paper.imputation.generative import GAINImputer, VAEImputer
from data_imputation_paper.evaluation import SingleColumnEvaluator, MultipleColumnsEvaluator, SingleColumnAllMissingEvaluator, MultipleColumnsAllMissingEvaluator

# Enable IPython's autoreload extension; mode 2 reloads imported modules before
# each cell executes, so edits to the project packages are picked up live.
%load_ext autoreload
%autoreload 2

## Make Deterministic

In [2]:
# Single random seed shared by the task and by every imputer's `arguments` dict below.
seed: int = 42

## Create example task

In [3]:
# Build the multi-class classification task from OpenML dataset id 4552,
# seeded with the notebook-wide `seed`.
task = OpenMLMultiClassClassificationTask(
    openml_id=4552,
    seed=seed,
)

## Choose the target columns for missing-value insertion (jenga)

In [4]:
# Columns handed to the evaluators below as imputation targets
# (individually or as slices of this list).
target_columns = [
    "V2",
    "V4",
    "V8",
    "V15",
]

## Mode imputation

In [None]:
arguments = {
    "seed": seed
}

%time SingleColumnEvaluator(task, 0.5, "MCAR", target_columns[0], ModeImputer, arguments).evaluate(3).report()

In [None]:
arguments = {
    "seed": seed
}

%time MultipleColumnsEvaluator(task, 0.5, "MCAR", target_columns[:2], ModeImputer, arguments).evaluate(3).report()

In [None]:
arguments = {
    "seed": seed
}

%time SingleColumnAllMissingEvaluator(task, 0.5, "MCAR", target_columns[-1], ModeImputer, arguments).evaluate(3).report()

In [None]:
arguments = {
    "seed": seed
}

%time MultipleColumnsAllMissingEvaluator(task, 0.5, "MCAR", target_columns[2:], ModeImputer, arguments).evaluate(3).report()

## KNN imputation

In [None]:
arguments = {
    "seed": seed,
    "hyperparameter_grid_categorical_imputer": {
        "n_neighbors": [3, 5]
    },
    "hyperparameter_grid_numerical_imputer": {
        "n_neighbors": [3, 5]
    }
}

%time MultipleColumnsAllMissingEvaluator(task, 0.5, "MCAR", target_columns, KNNImputer, arguments).evaluate(3).report()

## Forest imputation

In [42]:
arguments = {
    "seed": seed,
    "hyperparameter_grid_categorical_imputer": {
        "n_estimators": [10, 50, 100]
    },
    "hyperparameter_grid_numerical_imputer": {
        "n_estimators": [10, 50, 100]
    }
}

%time MultipleColumnsAllMissingEvaluator(task, 0.5, "MCAR", target_columns, ForestImputer, arguments)

Evaluation result contains 4 target columns: V2, V4, V8, V15
All are in a round-robin fashion imputed and performances are as follows:

Target Column: V2 - Necessary train time in seconds: 8.1952
           train         test
MAE    19.198786    30.260443
MSE   854.300856  1516.824269
RMSE   28.924081    38.789983

             baseline  corrupted   imputed
F1_micro     0.737864   0.710503  0.710797
F1_macro     0.355802   0.328961  0.329345
F1_weighted  0.722304   0.694288  0.694682


Target Column: V4 - Necessary train time in seconds: 1.7107
                train      test
F1_micro     0.988180  0.942857
F1_macro     0.980633  0.883160
F1_weighted  0.988334  0.941274

             baseline  corrupted   imputed
F1_micro     0.737864   0.714916  0.714034
F1_macro     0.355802   0.340967  0.340999
F1_weighted  0.722304   0.699270  0.698517


Target Column: V8 - Necessary train time in seconds: 1.8287
                train      test
F1_micro     0.981087  0.933333
F1_macro     0.970960 

[{'V4': {'bootstrap': True,
   'ccp_alpha': 0.0,
   'class_weight': None,
   'criterion': 'gini',
   'max_depth': None,
   'max_features': 'auto',
   'max_leaf_nodes': None,
   'max_samples': None,
   'min_impurity_decrease': 0.0,
   'min_impurity_split': None,
   'min_samples_leaf': 1,
   'min_samples_split': 2,
   'min_weight_fraction_leaf': 0.0,
   'n_estimators': 50,
   'n_jobs': -1,
   'oob_score': False,
   'random_state': None,
   'verbose': 0,
   'warm_start': False}},
 {'V4': {'bootstrap': True,
   'ccp_alpha': 0.0,
   'class_weight': None,
   'criterion': 'gini',
   'max_depth': None,
   'max_features': 'auto',
   'max_leaf_nodes': None,
   'max_samples': None,
   'min_impurity_decrease': 0.0,
   'min_impurity_split': None,
   'min_samples_leaf': 1,
   'min_samples_split': 2,
   'min_weight_fraction_leaf': 0.0,
   'n_estimators': 50,
   'n_jobs': -1,
   'oob_score': False,
   'random_state': None,
   'verbose': 0,
   'warm_start': False}},
 {'V4': {'bootstrap': True,
   'ccp_

## AutoKeras imputation

In [None]:
arguments = {
    "seed": seed,
    'max_trials': 2,
    'tuner': 'greedy',
    'validation_split': 0.1,
    'epochs': 2
}

%time SingleColumnAllMissingEvaluator(task, 0.2, "MNAR", target_columns[1], AutoKerasImputer, arguments).evaluate(3).report()

## VAE imputation

In [9]:
arguments = {
    "seed": seed,
    "hyperparameter_grid": {
        "optimizer": {
            "learning_rate": [0.01]
        }
    }
}

%time SingleColumnEvaluator(task, 0.5, "MCAR", target_columns[1], VAEImputer, arguments).evaluate(3).report()

Evaluation result contains 1 target columns: V4
All are in a round-robin fashion imputed and performances are as follows:

Target Column: V4 - Necessary train time in seconds: 20.8341
                train      test
F1_micro     0.830685  0.845112
F1_macro     0.604045  0.594036
F1_weighted  0.799612  0.813892

             baseline  corrupted   imputed
F1_micro     0.737864   0.728155  0.724625
F1_macro     0.355802   0.340454  0.338239
F1_weighted  0.722304   0.709734  0.707290


CPU times: user 1min 20s, sys: 7.19 s, total: 1min 27s
Wall time: 1min 3s


## GAIN imputation

In [6]:
arguments = {
    "seed": seed,
    "hyperparameter_grid": {
        "gain": {
            "alpha": [0.1, 1, 10],
            "hint_rate": [0.7, 0.9],
        },
        "generator": {
            "learning_rate": [0.01],
        },
        "discriminator": {
            "learning_rate": [0.001],
        }
    }
}

%time SingleColumnEvaluator(task, 0.5, "MCAR", target_columns[1], GAINImputer, arguments).evaluate(3).report()

[33m[W 2021-03-17 10:26:39,663][0m Trial 0 failed, because the objective function returned nan.[0m
[33m[W 2021-03-17 10:26:42,063][0m Trial 1 failed, because the objective function returned nan.[0m
[33m[W 2021-03-17 10:26:43,845][0m Trial 2 failed, because the objective function returned nan.[0m
[33m[W 2021-03-17 10:26:45,734][0m Trial 3 failed, because the objective function returned nan.[0m
[33m[W 2021-03-17 10:26:47,416][0m Trial 4 failed, because the objective function returned nan.[0m
[33m[W 2021-03-17 10:26:49,564][0m Trial 5 failed, because the objective function returned nan.[0m
[33m[W 2021-03-17 10:26:51,991][0m Trial 0 failed, because the objective function returned nan.[0m
[33m[W 2021-03-17 10:26:54,363][0m Trial 1 failed, because the objective function returned nan.[0m
[33m[W 2021-03-17 10:26:56,221][0m Trial 2 failed, because the objective function returned nan.[0m
[33m[W 2021-03-17 10:26:58,104][0m Trial 3 failed, because the objective functio

Evaluation result contains 1 target columns: V4
All are in a round-robin fashion imputed and performances are as follows:

Target Column: V4 - Necessary train time in seconds: 11.9973
             train  test
F1_micro       NaN   NaN
F1_macro       NaN   NaN
F1_weighted    NaN   NaN

             baseline  corrupted   imputed
F1_micro     0.730803   0.705502  0.705502
F1_macro     0.478282   0.451989  0.451989
F1_weighted  0.728001   0.700695  0.700695


CPU times: user 47.6 s, sys: 8.31 s, total: 55.9 s
Wall time: 36.7 s
