In [1]:
from jenga.tasks.openml import OpenMLTask
from jenga.corruptions.generic import MissingValues
    
import pandas as pd
import numpy as np

from data_imputation_paper.imputation.simple import ModeImputer
from data_imputation_paper.imputation.ml import KNNImputer, ForestImputer
from data_imputation_paper.imputation.generative import GAINImputer
from data_imputation_paper.evaluation import Evaluator

## Make Deterministic

In [2]:
# Notebook-wide random seed; passed as `seed=seed` to the Mode, KNN and
# Forest imputers in the cells below.
seed: int = 42

## Create example task

In [3]:
# Build the task from OpenML dataset 4552. The notebook-wide `seed` is reused
# here (instead of repeating the literal 42) so that changing the seed in one
# place affects the task and the imputers consistently.
task = OpenMLTask(seed=seed, openml_id=4552)

# The evaluation corrupts the data itself and compares imputed values against
# the original ones, so the dataset must be complete before we start.
if task.contains_missing_values():
    raise ValueError("This would distort the evaluation because we wouldn't have a full ground truth.")

Found 14 categorical columns: ['V1', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15']
Found 2 numeric columns: ['V2', 'V16']


## Insert missing values using jenga

In [4]:
# One corruption per target column: V2 is reported as numeric and V4 / V15 as
# categorical by the task loader. Every corruption uses the same settings:
# fraction 0.5 of the column replaced by NaN with 'MCAR' missingness.
missing_values = [
    MissingValues(column=target_column, fraction=0.5, na_value=np.nan, missingness='MCAR')
    for target_column in ('V2', 'V4', 'V15')
]

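## Create Evaluator

Each section below passes the task, the missing-value corruptions, and one imputer to `Evaluator`, runs `evaluate(n)`, and shows the resulting scores for the train and test splits.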

## Mode imputation

In [5]:
# Evaluate the mode imputer on all three corrupted columns.
# NOTE(review): this cell calls evaluate(10) while the KNN and Forest cells
# call evaluate(5) -- confirm the difference is intentional.
(
    Evaluator(task, missing_values, ModeImputer(seed=seed))
    .evaluate(10)
    .report()
)

Evaluation result contains 3 target columns: V2, V4, V15
All are in a round-robin fashion imputed and performances are as follows:

Target Column: V2
           train        test
MAE    16.887964   16.843340
MSE   785.731773  779.331354
RMSE   28.028850   27.910313

Target Column: V4
                train      test
F1_micro     0.903796  0.900226
F1_macro     0.772001  0.773260
F1_weighted  0.887095  0.883218

Target Column: V15
                train      test
F1_micro     0.496484  0.495812
F1_macro     0.574888  0.572334
F1_weighted  0.564149  0.566652



## KNN imputation

In [6]:
# Evaluate the KNN imputer on the same task and corruptions, then print the
# per-column report.
(
    Evaluator(task, missing_values, KNNImputer(seed=seed))
    .evaluate(5)
    .report()
)

Evaluation result contains 3 target columns: V2, V4, V15
All are in a round-robin fashion imputed and performances are as follows:

Target Column: V2
            train         test
MAE     30.327096    30.649187
MSE   1450.408523  1457.419845
RMSE    38.082157    38.161743

Target Column: V4
                train      test
F1_micro     0.936275  0.949823
F1_macro     0.877481  0.906966
F1_weighted  0.934614  0.948433

Target Column: V15
                train      test
F1_micro     0.506973  0.535336
F1_macro     0.405233  0.441314
F1_weighted  0.498939  0.529527



## Forest imputation

In [7]:
# Evaluate the forest imputer on the same task and corruptions, then print the
# per-column report.
(
    Evaluator(task, missing_values, ForestImputer(seed=seed))
    .evaluate(5)
    .report()
)

Evaluation result contains 3 target columns: V2, V4, V15
All are in a round-robin fashion imputed and performances are as follows:

Target Column: V2
            train         test
MAE     27.750770    27.639080
MSE   1231.435511  1226.041531
RMSE    35.089002    35.009387

Target Column: V4
                train      test
F1_micro     0.950132  0.964664
F1_macro     0.903445  0.934303
F1_weighted  0.949048  0.963818

Target Column: V15
                train      test
F1_micro     0.550309  0.578445
F1_macro     0.468850  0.491315
F1_weighted  0.546476  0.572767



## GAIN imputation

In [11]:
# Evaluate the GAIN imputer on the numeric corruption and display the raw
# result table (`.result`) rather than the printed report.
# NOTE(review): `numerical_missing` is not defined in any visible cell and the
# execution count jumps from 7 to 11 -- presumably it came from a deleted
# cell; define it before this cell so Restart & Run All works.
# NOTE(review): unlike the other imputers, no seed is passed to GAINImputer --
# confirm whether it accepts one.
(
    Evaluator(task, numerical_missing, GAINImputer(task.train_data.shape[1], {}))
    .evaluate(5)
    .result
)

100%|██████████| 5/5 [00:23<00:00,  4.73s/it]


Unnamed: 0,train,test
MAE,20.293379,19.270836
MSE,1441.331601,1277.050672
RMSE,37.963308,35.727391


In [12]:
# Evaluate the GAIN imputer on the categorical corruption and display the raw
# result table (`.result`) rather than the printed report.
# NOTE(review): `categorical_missing` is not defined in any visible cell --
# presumably it came from a deleted cell; define it before this cell so
# Restart & Run All works.
# NOTE(review): unlike the other imputers, no seed is passed to GAINImputer --
# confirm whether it accepts one.
(
    Evaluator(task, categorical_missing, GAINImputer(task.train_data.shape[1], {}))
    .evaluate(5)
    .result
)

100%|██████████| 5/5 [00:23<00:00,  4.69s/it]


Unnamed: 0,train,test
F1_micro,0.504501,0.503266
F1_macro,0.604533,0.59984
F1_weighted,0.653611,0.655279
