In [1]:
from jenga.tasks.openml import OpenMLTask
from jenga.corruptions.generic import MissingValues
    
import pandas as pd
import numpy as np

from data_imputation_paper.imputation.simple import ModeImputer
from data_imputation_paper.imputation.ml import KNNImputer, ForestImputer
from data_imputation_paper.imputation.generative import GAINImputer
from data_imputation_paper.evaluation import Evaluator

## Make deterministic

In [2]:
# Single seed handed to the task and to every imputer configuration in this notebook.
seed = 42

# Defining `seed` alone leaves NumPy's global RNG unseeded, so any code that
# draws from `np.random` directly would differ between runs. Seed it as well.
# NOTE(review): presumably jenga's corruptions sample through the global RNG — confirm.
np.random.seed(seed)

## Create example task

In [3]:
# Build the example task from OpenML dataset 4552, using the notebook-wide seed.
task = OpenMLTask(openml_id=4552, seed=seed)

# Guard: the dataset must be complete up front, otherwise there is no full
# ground truth to compare imputed values against (see the error message).
if task.contains_missing_values():
    raise ValueError("This would distort the evaluation because we wouldn't have a full ground truth.")

Found 14 categorical columns: ['V1', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15']
Found 2 numeric columns: ['V2', 'V16']


## Define the missing values to insert using jenga

In [4]:
# One corruption per target column, all with identical settings
# (fraction 0.5, NaN as the missing-value marker, 'MCAR' missingness).
missing_values = [
    MissingValues(column=target_column, fraction=0.5, na_value=np.nan, missingness='MCAR')
    for target_column in ('V2', 'V4', 'V15')
]

## Mode imputation

In [5]:
arguments = {
    "seed": seed
}

%time Evaluator(task, missing_values, ModeImputer, arguments).evaluate(10).report()

Evaluation result contains 3 target columns: V2, V4, V15
All are in a round-robin fashion imputed and performances are as follows:

Target Column: V2
            train         test
MAE     29.837246    30.403368
MSE   1384.426758  1460.703305
RMSE    37.207886    38.219148

Target Column: V4
                train      test
F1_micro     0.819064  0.840989
F1_macro     0.450267  0.456814
F1_weighted  0.737595  0.768351

Target Column: V15
                train      test
F1_micro     0.120918  0.125442
F1_macro     0.013484  0.013932
F1_weighted  0.026088  0.027963

CPU times: user 2.33 s, sys: 27.8 ms, total: 2.36 s
Wall time: 2.36 s


## KNN imputation

In [6]:
arguments = {
    "seed": seed,
    "hyperparameter_grid_categorical_imputer": {
        "n_neighbors": [3, 5]
    },
    "hyperparameter_grid_numerical_imputer": {
        "n_neighbors": [3, 5]
    }
}

%time Evaluator(task, missing_values, KNNImputer, arguments).evaluate(5).report()

Evaluation result contains 3 target columns: V2, V4, V15
All are in a round-robin fashion imputed and performances are as follows:

Target Column: V2
            train         test
MAE     30.943248    31.103887
MSE   1508.708826  1561.592226
RMSE    38.842101    39.516987

Target Column: V4
                train      test
F1_micro     0.924095  0.929329
F1_macro     0.863178  0.865443
F1_weighted  0.921436  0.928673

Target Column: V15
                train      test
F1_micro     0.488085  0.528269
F1_macro     0.395713  0.441334
F1_weighted  0.481903  0.521591

CPU times: user 11.4 s, sys: 2.77 s, total: 14.1 s
Wall time: 11.6 s


## Forest imputation

In [7]:
arguments = {
    "seed": seed,
    "hyperparameter_grid_categorical_imputer": {
        "n_estimators": [50, 100]
    },
    "hyperparameter_grid_numerical_imputer": {
        "n_estimators": [50, 100]
    }
}

%time Evaluator(task, missing_values, ForestImputer, arguments).evaluate(5).report()

Evaluation result contains 3 target columns: V2, V4, V15
All are in a round-robin fashion imputed and performances are as follows:

Target Column: V2
            train         test
MAE     27.974878    28.399103
MSE   1254.706000  1293.592315
RMSE    35.419423    35.966237

Target Column: V4
                train      test
F1_micro     0.945896  0.955477
F1_macro     0.901058  0.916465
F1_weighted  0.944101  0.954554

Target Column: V15
                train      test
F1_micro     0.528420  0.574912
F1_macro     0.436174  0.477179
F1_weighted  0.524181  0.569348

CPU times: user 38 s, sys: 1.25 s, total: 39.3 s
Wall time: 30.6 s


## GAIN imputation

In [8]:
arguments = {
    "num_data_columns": task.train_data.shape[1],
    "seed": seed,
    "hyperparameter_grid": {
        "gain": {
            "alpha": [80, 120],
            "hint_rate": [0.5, 0.9],
            "noise": [0.001, 0.1]
        },
        "training": {
            "batch_size": [64, 256],
            "epochs": [5, 15]
        }
    }
}

%time Evaluator(task, missing_values, GAINImputer, arguments).evaluate(3).report()

Evaluation result contains 3 target columns: V2, V4, V15
All are in a round-robin fashion imputed and performances are as follows:

Target Column: V2
            train         test
MAE     31.898151    31.497310
MSE   1725.601505  1628.960444
RMSE    41.525550    40.296772

Target Column: V4
                train      test
F1_micro     0.853486  0.849823
F1_macro     0.626829  0.628170
F1_weighted  0.817550  0.811782

Target Column: V15
                train      test
F1_micro     0.050750  0.067138
F1_macro     0.021622  0.031871
F1_weighted  0.016197  0.021116

CPU times: user 14min 1s, sys: 2min 28s, total: 16min 29s
Wall time: 10min 53s
