In [1]:
from jenga.tasks.openml import OpenMLBinaryClassificationTask, OpenMLMultiClassClassificationTask
from jenga.corruptions.generic import MissingValues
    
import pandas as pd
import numpy as np

from data_imputation_paper.imputation.simple import ModeImputer
from data_imputation_paper.imputation.ml import KNNImputer, ForestImputer
from data_imputation_paper.imputation.dl import AutoKerasImputer
from data_imputation_paper.imputation.generative import GAINImputer
from data_imputation_paper.evaluation import Evaluator

## Make deterministic

In [2]:
# Single seed shared by the task and by every imputer's `arguments` dict in this notebook.
seed = 42

## Create an example task

In [3]:
# Multi-class classification task built from OpenML dataset id 4552,
# constructed with the notebook-wide seed.
task = OpenMLMultiClassClassificationTask(
    openml_id=4552,
    seed=seed,
)

## Insert missing values using jenga

In [4]:
# One MCAR corruption per target column; each replaces a 0.5 fraction of the
# column's values with NaN.
missing_values = [
    MissingValues(column=target_column, fraction=0.5, na_value=np.nan, missingness='MCAR')
    for target_column in ('V2', 'V4', 'V15')
]

## Mode imputation

In [5]:
arguments = {
    "seed": seed
}

%time Evaluator(task, missing_values, ModeImputer, arguments).evaluate(10).report()

AttributeError: 'numpy.float64' object has no attribute 'get_params'

## KNN imputation

In [6]:
arguments = {
    "seed": seed,
    "hyperparameter_grid_categorical_imputer": {
        "n_neighbors": [3, 5]
    },
    "hyperparameter_grid_numerical_imputer": {
        "n_neighbors": [3, 5]
    }
}

%time Evaluator(task, missing_values, KNNImputer, arguments).evaluate(5).report()

Evaluation result contains 3 target columns: V2, V4, V15
All are in a round-robin fashion imputed and performances are as follows:

Target Column: V2
            train         test
MAE     30.917723    29.751590
MSE   1515.781398  1390.057936
RMSE    38.928770    37.280107

Target Column: V4
                train      test
F1_micro     0.935746  0.951943
F1_macro     0.880279  0.905541
F1_weighted  0.934269  0.951198

Target Column: V15
                train      test
F1_micro     0.513857  0.520848
F1_macro     0.426046  0.413273
F1_weighted  0.506835  0.515812

CPU times: user 12.7 s, sys: 2.81 s, total: 15.5 s
Wall time: 12.5 s


## Forest imputation

In [7]:
arguments = {
    "seed": seed,
    "hyperparameter_grid_categorical_imputer": {
        "n_estimators": [50, 100]
    },
    "hyperparameter_grid_numerical_imputer": {
        "n_estimators": [50, 100]
    }
}

%time Evaluator(task, missing_values, ForestImputer, arguments).evaluate(5).report()

Evaluation result contains 3 target columns: V2, V4, V15
All are in a round-robin fashion imputed and performances are as follows:

Target Column: V2
            train         test
MAE     27.881242    27.383110
MSE   1240.528647  1184.928448
RMSE    35.219424    34.420308

Target Column: V4
                train      test
F1_micro     0.950750  0.961484
F1_macro     0.910301  0.919274
F1_weighted  0.949747  0.960749

Target Column: V15
                train      test
F1_micro     0.555869  0.561837
F1_macro     0.476510  0.444321
F1_weighted  0.551574  0.554794

CPU times: user 43.2 s, sys: 1.12 s, total: 44.3 s
Wall time: 32.1 s


## AutoKeras imputation

In [8]:
arguments = {
    "seed": seed,
    'max_trials': 2,
    'tuner': 'greedy',
    'validation_split': 0.1,
    'epochs': 2
}

%time Evaluator(task, missing_values, AutoKerasImputer, arguments).evaluate(5).report()

Trial 2 Complete [00h 00m 02s]
val_accuracy: 0.1085972860455513

Best val_accuracy So Far: 0.13574661314487457
Total elapsed time: 00h 00m 03s
Epoch 1/2
Epoch 2/2
















































































































Evaluation result contains 3 target columns: V2, V4, V15
All are in a round-robin fashion imputed and performances are as follows:

Target Column: V2
            train         test
MAE     33.284113    32.314233
MSE   1847.754911  1725.609578
RMSE    41.653004    40.215217

Target Column: V4
                train      test
F1_micro     0.850574  0.863604
F1_macro     0.536234  0.536997
F1_weighted  0.788944  0.806193

Target Column: V15
                train      test
F1_micro     0.145455  0.143816
F1_macro     0.060187  0.058988
F1_weighted  0.098985  0.098157

CPU times: user 2min 30s, sys: 7.12 s, total: 2min 37s
Wall time: 2min 24s


## GAIN imputation

In [9]:
arguments = {
    "num_data_columns": task.train_data.shape[1],
    "seed": seed,
    "hyperparameter_grid": {
        "gain": {
            "alpha": [80, 120],
            "hint_rate": [0.5, 0.9],
            "noise": [0.001, 0.1]
        },
        "training": {
            "batch_size": [64, 256],
            "epochs": [5, 15]
        }
    }
}

%time Evaluator(task, missing_values, GAINImputer, arguments).evaluate(2).report()

[33m[W 2021-03-03 15:22:56,780][0m Trial 0 failed, because the objective function returned nan.[0m


ValueError: No trials are completed yet.