In [1]:
from jenga.tasks.openml import OpenMLTask
from jenga.corruptions.generic import MissingValues
    
import pandas as pd
import numpy as np

from data_imputation_paper.imputation.simple import ModeImputer
from data_imputation_paper.imputation.ml import KNNImputer, ForestImputer
from data_imputation_paper.imputation.generative import GAINImputer
from data_imputation_paper.evaluation import Evaluator

## Make deterministic

In [2]:
# Single random seed shared by the task and every imputer below, so that
# repeated runs of the notebook use the same seed everywhere.
seed: int = 42

## Create example task

In [3]:
# Build the evaluation task from OpenML dataset 4552, seeded for reproducibility.
task = OpenMLTask(openml_id=4552, seed=seed)

# The imputation metrics compare imputed values against the original ones, so
# the dataset must be complete before any missing values are injected.
if task.contains_missing_values():
    raise ValueError(
        "This would distort the evaluation because we wouldn't have a full ground truth."
    )

Found 14 categorical columns: ['V1', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15']
Found 2 numeric columns: ['V2', 'V16']


## Insert missing values using jenga

In [4]:
# One corruption per target column: each replaces a fraction of 0.5 of the
# column's values with NaN using the 'MCAR' missingness setting
# (presumably "missing completely at random" -- confirm against jenga docs).
missing_values = [
    MissingValues(column=column, fraction=0.5, na_value=np.nan, missingness='MCAR')
    for column in ('V2', 'V4', 'V15')
]

## Mode imputation

In [5]:
# Evaluate the mode imputer on the corrupted task and print the per-column report.
# NOTE(review): this cell passes 10 to evaluate() while the KNN, Forest and GAIN
# cells pass 5 (presumably the number of repetitions) -- confirm the difference
# is intentional before comparing results across imputers.
(
    Evaluator(task, missing_values, ModeImputer(seed=seed))
    .evaluate(10)
    .report()
)

Evaluation result contains 3 target columns: V2, V4, V15
All are in a round-robin fashion imputed and performances are as follows:

Target Column: V2
            train         test
MAE     29.644429    28.885080
MSE   1405.066363  1327.984169
RMSE    37.480881    36.429985

Target Column: V4
                train      test
F1_micro     0.825243  0.845760
F1_macro     0.452124  0.458207
F1_weighted  0.746237  0.775105

Target Column: V15
                train      test
F1_micro     0.117432  0.116431
F1_macro     0.013135  0.013025
F1_weighted  0.024701  0.024463



## KNN imputation

In [6]:
# Evaluate the KNN imputer on the same task and corruptions, then print the
# per-column report (argument 5 is presumably the number of repetitions -- confirm).
(
    Evaluator(task, missing_values, KNNImputer(seed=seed))
    .evaluate(5)
    .report()
)

Evaluation result contains 3 target columns: V2, V4, V15
All are in a round-robin fashion imputed and performances are as follows:

Target Column: V2
            train         test
MAE     30.905878    29.830459
MSE   1524.203410  1430.744608
RMSE    39.037683    37.815496

Target Column: V4
                train      test
F1_micro     0.929656  0.944170
F1_macro     0.870220  0.891768
F1_weighted  0.927796  0.943277

Target Column: V15
                train      test
F1_micro     0.524537  0.515548
F1_macro     0.426727  0.419469
F1_weighted  0.518370  0.508862



## Forest imputation

In [7]:
# Evaluate the forest imputer on the same task and corruptions, then print the
# per-column report (argument 5 is presumably the number of repetitions -- confirm).
(
    Evaluator(task, missing_values, ForestImputer(seed=seed))
    .evaluate(5)
    .report()
)

Evaluation result contains 3 target columns: V2, V4, V15
All are in a round-robin fashion imputed and performances are as follows:

Target Column: V2
            train         test
MAE     27.985317    27.595843
MSE   1248.860013  1234.804603
RMSE    35.336940    35.130570

Target Column: V4
                train      test
F1_micro     0.948279  0.961484
F1_macro     0.906490  0.924909
F1_weighted  0.947228  0.960232

Target Column: V15
                train      test
F1_micro     0.560812  0.566078
F1_macro     0.469335  0.474492
F1_weighted  0.556048  0.561442



## GAIN imputation

In [5]:
# Evaluate the GAIN imputer on the same task and corruptions, then print the
# per-column report. The imputer receives the column count of the training data
# and an empty dict (presumably input dimension and hyperparameters -- confirm
# against the GAINImputer signature).
# NOTE(review): the recorded execution counter of this cell is out of sequence
# with the preceding cells; re-run the notebook top to bottom on a fresh kernel
# to confirm the recorded output.
(
    Evaluator(task, missing_values, GAINImputer(task.train_data.shape[1], {}, seed=seed))
    .evaluate(5)
    .report()
)

Evaluation result contains 3 target columns: V2, V4, V15
All are in a round-robin fashion imputed and performances are as follows:

Target Column: V2
            train         test
MAE     34.163356    34.117636
MSE   1910.745171  1881.236243
RMSE    43.709042    43.353753

Target Column: V4
                train      test
F1_micro     0.877670  0.878445
F1_macro     0.698892  0.687975
F1_weighted  0.853193  0.852072

Target Column: V15
                train      test
F1_micro     0.059753  0.072792
F1_macro     0.041935  0.051820
F1_weighted  0.052225  0.061596

