In [1]:
from jenga.tasks.openml import OpenMLTask
from jenga.corruptions.generic import MissingValues
    
import pandas as pd
import numpy as np

from data_imputation_paper.imputation.simple import ModeImputer
from data_imputation_paper.imputation.ml import KNNImputer, ForestImputer
from data_imputation_paper.evaluation import Evaluator

## Make Deterministic

In [2]:
# Seed NumPy's global (legacy) random state so that anything drawing from
# `np.random` yields the same numbers on every fresh "Restart & Run All".
# NOTE(review): whether jenga and the imputers draw from this global state or
# keep their own generators is not visible in this notebook — confirm.
np.random.seed(seed=42)

## Create example tasks

In [3]:
# Load the OpenML dataset with id 4552 as a jenga task, using a fixed seed.
task = OpenMLTask(
    openml_id=4552,
    seed=42,
)

# Abort early if the raw data already has gaps: as the error message states,
# the evaluation needs a complete ground truth.
if task.contains_missing_values():
    raise ValueError(
        "This would distort the evaluation because we wouldn't have a full ground truth."
    )

Found 14 categorical columns: ['V1', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15']
Found 2 numeric columns: ['V2', 'V16']


## Define missing-value corruptions using jenga

In [4]:
# Two corruption specs that differ only in the target column: 'V2' is one of
# the numeric columns and 'V4' one of the categorical columns reported when the
# task was loaded. Both use fraction=0.5, NaN as the placeholder value and
# 'MCAR' missingness (presumably "missing completely at random" — confirm
# against jenga). No dataset is passed in here; the specs are handed to the
# Evaluator cells further down.
numerical_missing, categorical_missing = (
    MissingValues(column=target_column, fraction=0.5, na_value=np.nan, missingness='MCAR')
    for target_column in ('V2', 'V4')
)

## Create Evaluator

An Evaluator repeatedly:
1. inserts missing values into the dataset
2. fits the imputer
3. evaluates the train and test performance of the imputation

It then returns the mean evaluation result.

## Mode Imputation

In [5]:
# Mode imputation with the `numerical_missing` corruption: 10 evaluation
# rounds, then display the resulting train/test metric table.
(
    Evaluator(task, numerical_missing, ModeImputer())
    .evaluate(10)
    .result
)

100%|██████████| 10/10 [00:01<00:00,  8.77it/s]


Unnamed: 0,train,test
MAE,14.924881,14.53182
MSE,712.367137,642.044099
RMSE,26.686727,25.333848


In [6]:
# Mode imputation with the `categorical_missing` corruption: 10 evaluation
# rounds, then display the resulting train/test metric table.
(
    Evaluator(task, categorical_missing, ModeImputer())
    .evaluate(10)
    .result
)

100%|██████████| 10/10 [00:02<00:00,  4.66it/s]


Unnamed: 0,train,test
F1_micro,0.914497,0.922154
F1_macro,0.808785,0.814134
F1_weighted,0.902334,0.911054


## KNN imputation

In [7]:
# KNN imputation with the `numerical_missing` corruption, then display the
# resulting train/test metric table.
# Note: 5 evaluation rounds here, versus 10 in the mode-imputation cells.
(
    Evaluator(task, numerical_missing, KNNImputer())
    .evaluate(5)
    .result
)

100%|██████████| 5/5 [00:13<00:00,  2.63s/it]


Unnamed: 0,train,test
MAE,12.161624,14.661147
MSE,475.178058,671.830178
RMSE,21.797059,25.909427


In [8]:
# KNN imputation with the `categorical_missing` corruption, then display the
# resulting train/test metric table.
# Note: 5 evaluation rounds here, versus 10 in the mode-imputation cells.
(
    Evaluator(task, categorical_missing, KNNImputer())
    .evaluate(5)
    .result
)

100%|██████████| 5/5 [00:10<00:00,  2.16s/it]


Unnamed: 0,train,test
F1_micro,0.98301,0.975463
F1_macro,0.969736,0.953387
F1_weighted,0.982924,0.975329


## Forest imputation

In [9]:
# Forest imputation with the `numerical_missing` corruption, then display the
# resulting train/test metric table.
# Note: 5 evaluation rounds here, versus 10 in the mode-imputation cells.
(
    Evaluator(task, numerical_missing, ForestImputer())
    .evaluate(5)
    .result
)

100%|██████████| 5/5 [01:51<00:00, 22.29s/it]


Unnamed: 0,train,test
MAE,7.553982,13.497333
MSE,211.850108,555.3201
RMSE,14.55475,23.560234


In [10]:
# Forest imputation with the `categorical_missing` corruption, then display
# the resulting train/test metric table.
# Note: 5 evaluation rounds here, versus 10 in the mode-imputation cells.
(
    Evaluator(task, categorical_missing, ForestImputer())
    .evaluate(5)
    .result
)

100%|██████████| 5/5 [00:13<00:00,  2.73s/it]


Unnamed: 0,train,test
F1_micro,1.0,0.983054
F1_macro,1.0,0.967882
F1_weighted,1.0,0.982981
