In [1]:
from jenga.tasks.openml import OpenMLBinaryClassificationTask, OpenMLMultiClassClassificationTask
from jenga.corruptions.generic import MissingValues
    
import pandas as pd
import numpy as np

from data_imputation_paper.imputation.simple import ModeImputer
from data_imputation_paper.imputation.ml import KNNImputer, ForestImputer
from data_imputation_paper.imputation.generative import GAINImputer
from data_imputation_paper.evaluation import Evaluator

## Make deterministic

In [2]:
# Single seed shared by the task and by every imputer's `arguments` dict in this notebook.
seed = 42

## Create example task

In [3]:
# Build the multi-class classification task for OpenML dataset 4552, passing the shared seed.
# The recorded output reports 14 categorical columns and 2 numeric columns (V2, V16).
task = OpenMLMultiClassClassificationTask(openml_id=4552, seed=seed)

Found 14 categorical columns: ['V1', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15']
Found 2 numeric columns: ['V2', 'V16']


## Insert missing values using jenga

In [4]:
# One MCAR corruption per target column, each with fraction=0.5 and NaN as the
# placeholder. Per the task's column report, V2 is numeric while V4 and V15 are
# categorical, so both imputation paths get exercised.
missing_values = [
    MissingValues(column=target_column, fraction=0.5, na_value=np.nan, missingness='MCAR')
    for target_column in ('V2', 'V4', 'V15')
]

## Mode imputation

In [5]:
arguments = {
    "seed": seed
}

%time Evaluator(task, missing_values, ModeImputer, arguments).evaluate(10).report()

Evaluation result contains 3 target columns: V2, V4, V15
All are in a round-robin fashion imputed and performances are as follows:

Target Column: V2
            train         test
MAE     30.041142    28.867716
MSE   1444.706256  1245.957937
RMSE    38.008059    35.292878

Target Column: V4
                train      test
F1_micro     0.828199  0.841696
F1_macro     0.453008  0.457008
F1_weighted  0.750384  0.769375

Target Column: V15
                train      test
F1_micro     0.121756  0.112014
F1_macro     0.013566  0.012749
F1_weighted  0.026462  0.022708

CPU times: user 2.2 s, sys: 24.2 ms, total: 2.22 s
Wall time: 2.22 s


## KNN imputation

In [6]:
arguments = {
    "seed": seed,
    "hyperparameter_grid_categorical_imputer": {
        "n_neighbors": [3, 5]
    },
    "hyperparameter_grid_numerical_imputer": {
        "n_neighbors": [3, 5]
    }
}

%time Evaluator(task, missing_values, KNNImputer, arguments).evaluate(5).report()

Evaluation result contains 3 target columns: V2, V4, V15
All are in a round-robin fashion imputed and performances are as follows:

Target Column: V2
            train         test
MAE     31.184731    29.788198
MSE   1549.078627  1389.964961
RMSE    39.356120    37.282137

Target Column: V4
                train      test
F1_micro     0.935922  0.942403
F1_macro     0.881156  0.881266
F1_weighted  0.934604  0.941230

Target Column: V15
                train      test
F1_micro     0.510591  0.525442
F1_macro     0.417909  0.414359
F1_weighted  0.503308  0.514905

CPU times: user 11 s, sys: 2.72 s, total: 13.8 s
Wall time: 12 s


## Forest imputation

In [7]:
arguments = {
    "seed": seed,
    "hyperparameter_grid_categorical_imputer": {
        "n_estimators": [50, 100]
    },
    "hyperparameter_grid_numerical_imputer": {
        "n_estimators": [50, 100]
    }
}

%time Evaluator(task, missing_values, ForestImputer, arguments).evaluate(5).report()

Evaluation result contains 3 target columns: V2, V4, V15
All are in a round-robin fashion imputed and performances are as follows:

Target Column: V2
            train         test
MAE     28.215426    26.842464
MSE   1276.326227  1138.658181
RMSE    35.720128    33.736769

Target Column: V4
                train      test
F1_micro     0.953045  0.955124
F1_macro     0.911836  0.911809
F1_weighted  0.951996  0.954757

Target Column: V15
                train      test
F1_micro     0.563195  0.553710
F1_macro     0.475594  0.436930
F1_weighted  0.558138  0.547215

CPU times: user 43.6 s, sys: 1.2 s, total: 44.8 s
Wall time: 32.1 s


## GAIN imputation

In [5]:
# GAIN imputer configuration.
# NOTE(review): the recorded output of this cell shows the run failing: trial 0's
# objective returned nan, followed by "ValueError: No trials are completed yet."
# The cause is not visible in this notebook and needs investigation in GAINImputer.
# NOTE(review): this cell's execution count is 5 while the preceding cells are 5-7,
# so it was run out of sequence; re-run the notebook top to bottom after fixing.
arguments = {
    # Width of the training frame (number of columns), read from the task.
    "num_data_columns": task.train_data.shape[1],
    "seed": seed,
    "hyperparameter_grid": {
        # Candidate values for the GAIN-specific hyperparameters.
        "gain": {
            "alpha": [80, 120],
            "hint_rate": [0.5, 0.9],
            "noise": [0.001, 0.1]
        },
        # Candidate values for the training settings.
        "training": {
            "batch_size": [64, 256],
            "epochs": [5, 15]
        }
    }
}

# Time the full evaluation and print the per-column report.
# NOTE(review): the 2 passed to evaluate() is presumably the number of repetitions — confirm.
%time Evaluator(task, missing_values, GAINImputer, arguments).evaluate(2).report()

[33m[W 2021-02-15 18:14:21,014][0m Trial 0 failed, because the objective function returned nan.[0m


ValueError: No trials are completed yet.