In [1]:
import random

import numpy as np
import pandas as pd

from jenga.tasks.openml import OpenMLTask
from jenga.corruptions.generic import MissingValues

from data_imputation_paper.imputation.simple import ModeImputer
from data_imputation_paper.imputation.ml import KNNImputer, ForestImputer
from data_imputation_paper.imputation.generative import GAINImputer
from data_imputation_paper.evaluation import Evaluator

## Make Deterministic

In [2]:
# Single seed shared by the task and every imputer configured below.
seed = 42

# Passing `seed` around only helps code that accepts it. Also seed the global
# generators so that library code drawing from `random` / `numpy.random`
# without a seed argument is reproducible on Restart & Run All
# (presumably this includes jenga's missing-value sampling — TODO confirm).
random.seed(seed)
np.random.seed(seed)

## Create example task

In [3]:
# Build the jenga task for OpenML dataset 4552, forwarding the shared seed.
task = OpenMLTask(openml_id=4552, seed=seed)

# Abort early if the dataset already has gaps: per the error message below,
# the evaluation needs a complete ground truth to compare against.
if task.contains_missing_values():
    raise ValueError("This would distort the evaluation because we wouldn't have a full ground truth.")

Found 14 categorical columns: ['V1', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15']
Found 2 numeric columns: ['V2', 'V16']


## Insert missing values using jenga

In [4]:
# One corruption per target column; all three share the same settings:
# half of the values are replaced by NaN using the 'MCAR' missingness pattern.
missing_values = [
    MissingValues(column=column_name, fraction=0.5, na_value=np.nan, missingness='MCAR')
    for column_name in ('V2', 'V4', 'V15')
]

## Mode imputation

In [5]:
# Keyword arguments handed to the Evaluator together with the ModeImputer class.
arguments = dict(seed=seed)

# Evaluate with 10 repetitions and print the per-column report.
(
    Evaluator(task, missing_values, ModeImputer, arguments)
    .evaluate(10)
    .report()
)

Evaluation result contains 3 target columns: V2, V4, V15
All are in a round-robin fashion imputed and performances are as follows:

Target Column: V2
            train         test
MAE     29.554985    30.403800
MSE   1413.742635  1506.081031
RMSE    37.599769    38.808260

Target Column: V4
                train      test
F1_micro     0.840688  0.849823
F1_macro     0.456725  0.459408
F1_weighted  0.767927  0.780831

Target Column: V15
                train      test
F1_micro     0.112533  0.127208
F1_macro     0.012644  0.014107
F1_weighted  0.022766  0.028712



## KNN imputation

In [6]:
# Keyword arguments handed to the Evaluator together with the KNNImputer class.
arguments = dict(seed=seed)

# Evaluate with 5 repetitions (note: fewer than the mode baseline's 10)
# and print the per-column report.
(
    Evaluator(task, missing_values, KNNImputer, arguments)
    .evaluate(5)
    .report()
)

Evaluation result contains 3 target columns: V2, V4, V15
All are in a round-robin fashion imputed and performances are as follows:

Target Column: V2
            train         test
MAE     30.389409    32.458657
MSE   1524.339806  1729.086643
RMSE    39.042795    41.582288

Target Column: V4
                train      test
F1_micro     0.938658  0.936396
F1_macro     0.881044  0.872933
F1_weighted  0.937453  0.935761

Target Column: V15
                train      test
F1_micro     0.518535  0.485866
F1_macro     0.432207  0.384204
F1_weighted  0.511027  0.473731



## Forest imputation

In [7]:
# Keyword arguments handed to the Evaluator together with the ForestImputer class.
arguments = dict(seed=seed)

# Evaluate with 5 repetitions and print the per-column report.
(
    Evaluator(task, missing_values, ForestImputer, arguments)
    .evaluate(5)
    .report()
)

Evaluation result contains 3 target columns: V2, V4, V15
All are in a round-robin fashion imputed and performances are as follows:

Target Column: V2
            train         test
MAE     28.093496    29.143869
MSE   1286.917883  1351.793330
RMSE    35.873530    36.765395

Target Column: V4
                train      test
F1_micro     0.953663  0.946996
F1_macro     0.913213  0.897166
F1_weighted  0.952800  0.945985

Target Column: V15
                train      test
F1_micro     0.552074  0.544170
F1_macro     0.462944  0.444690
F1_weighted  0.544525  0.536913



## GAIN imputation

In [8]:
# GAIN is additionally given the number of columns of the training data,
# alongside the shared seed.
arguments = dict(
    num_data_columns=task.train_data.shape[1],
    seed=seed,
)

# Evaluate with 3 repetitions (the fewest of all imputers in this notebook)
# and print the per-column report.
(
    Evaluator(task, missing_values, GAINImputer, arguments)
    .evaluate(3)
    .report()
)

Evaluation result contains 3 target columns: V2, V4, V15
All are in a round-robin fashion imputed and performances are as follows:

Target Column: V2
            train         test
MAE     33.522245    33.531434
MSE   1819.102196  1898.661046
RMSE    42.633235    43.573267

Target Column: V4
                train      test
F1_micro     0.849515  0.858657
F1_macro     0.603442  0.652388
F1_weighted  0.811604  0.826487

Target Column: V15
                train      test
F1_micro     0.062665  0.088339
F1_macro     0.039868  0.052819
F1_weighted  0.055257  0.078645

