In [1]:
from jenga.tasks.openml import OpenMLBinaryClassificationTask, OpenMLMultiClassClassificationTask
from jenga.corruptions.generic import MissingValues
    
import pandas as pd
import numpy as np

from data_imputation_paper.imputation.generative import VAEImputer
from data_imputation_paper.evaluation import Evaluator

# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

## Make deterministic

In [2]:
# Single seed shared by the task and the imputer arguments further down.
seed = 42

# Also seed NumPy's global RNG: assigning `seed` alone does not make any code
# that draws from `np.random` repeatable.
# NOTE(review): confirm whether jenga / the Evaluator draw from the global RNG
# or seed themselves; this call is harmless either way.
np.random.seed(seed)

## Create example task

In [3]:
# Build the multi-class classification task for OpenML id 4552, reusing the
# shared seed defined above.
task = OpenMLMultiClassClassificationTask(
    seed=seed,
    openml_id=4552,
)

## Define the missing values to insert using jenga

In [4]:
# One corruption per target column, all configured identically:
# fraction=0.5, NaN as the missing-value placeholder, 'MCAR' missingness.
missing_values = [
    MissingValues(column=target, fraction=0.5, na_value=np.nan, missingness='MCAR')
    for target in ('V2', 'V4', 'V15')
]

## VAE imputation

In [5]:
arguments = {
    "seed": seed,
    "hyperparameter_grid": {
        "training": {
            "batch_size": [64, 256],
            "epochs": [5, 15]
        }
    }
}

%time Evaluator(task, missing_values, VAEImputer, arguments).evaluate(2).report()

Evaluation result contains 3 target columns: V2, V4, V15
All are in a round-robin fashion imputed and performances are as follows:

Target Column: V2
            train         test
MAE     34.490946    34.349771
MSE   2128.888811  2131.283890
RMSE    45.966730    45.969173

Target Column: V4
                train      test
F1_micro     0.826346  0.838339
F1_macro     0.452458  0.456016
F1_weighted  0.747776  0.764647

Target Column: V15
                train      test
F1_micro     0.062665  0.063604
F1_macro     0.023002  0.022782
F1_weighted  0.026904  0.030857

CPU times: user 2min 12s, sys: 30.9 s, total: 2min 42s
Wall time: 1min 1s
