In [1]:
from jenga.tasks.openml import OpenMLTask
from jenga.corruptions.generic import MissingValues
    
import pandas as pd
import numpy as np

from data_imputation_paper.imputation.simple import ModeImputer
from data_imputation_paper.imputation.ml import KNNImputer, ForestImputer
from data_imputation_paper.imputation.generative import GAINImputer
from data_imputation_paper.evaluation import Evaluator

## Make Deterministic

In [2]:
# Seed NumPy's legacy global RNG so code that draws from `np.random` is repeatable.
# NOTE(review): libraries that keep their own RNG state are not covered by this
# call — confirm whether the imputers used below need separate seeding.
np.random.seed(seed=42)

## Create example tasks

In [3]:
# Build the task for OpenML dataset id 4552 with a fixed seed.
task = OpenMLTask(openml_id=4552, seed=42)

# Imputed values are scored against the original ones, so the dataset must be
# complete before any missing values are injected.
if task.contains_missing_values():
    raise ValueError("This would distort the evaluation because we wouldn't have a full ground truth.")

Found 14 categorical columns: ['V1', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15']
Found 2 numeric columns: ['V2', 'V16']


## Insert missing values using jenga

In [4]:
# Both corruptions share the same settings and differ only in the target column:
# NaN is used as the missing marker with the 'MCAR' missingness pattern.
# NOTE(review): fraction=0.5 presumably means half of the rows are corrupted — confirm in jenga.
mcar_half_kwargs = dict(fraction=0.5, na_value=np.nan, missingness='MCAR')

# 'V2' is reported as a numeric column and 'V15' as a categorical column by the task.
numerical_missing = MissingValues(column='V2', **mcar_half_kwargs)
categorical_missing = MissingValues(column='V15', **mcar_half_kwargs)

## Create Evaluator

Evaluators repeatedly:
1. insert missing values into the dataset
2. fit the imputer
3. evaluate the train and test performance of the imputation

Finally, the evaluator returns the mean evaluation result across all repetitions.

## Mode Imputation

In [5]:
# Mode imputation on the numeric target (`numerical_missing`): 10 evaluation
# repetitions; the cell displays the resulting train/test metrics table.
(
    Evaluator(task, numerical_missing, ModeImputer())
    .evaluate(10)
    .result
)

100%|██████████| 10/10 [00:00<00:00, 20.97it/s]


Unnamed: 0,train,test
MAE,14.924881,14.53182
MSE,712.367137,642.044099
RMSE,26.686727,25.333848


In [6]:
# Mode imputation on the categorical target (`categorical_missing`): 10 evaluation
# repetitions; the cell displays the resulting train/test metrics table.
(
    Evaluator(task, categorical_missing, ModeImputer())
    .evaluate(10)
    .result
)

100%|██████████| 10/10 [00:01<00:00,  8.56it/s]


Unnamed: 0,train,test
F1_micro,0.562489,0.548014
F1_macro,0.647225,0.63112
F1_weighted,0.627793,0.624847


## KNN imputation

In [7]:
# KNN imputation on the numeric target (`numerical_missing`): 5 evaluation
# repetitions; the cell displays the resulting train/test metrics table.
(
    Evaluator(task, numerical_missing, KNNImputer())
    .evaluate(5)
    .result
)

100%|██████████| 5/5 [00:07<00:00,  1.56s/it]


Unnamed: 0,train,test
MAE,12.161624,14.661147
MSE,475.178058,671.830178
RMSE,21.797059,25.909427


In [8]:
# KNN imputation on the categorical target (`categorical_missing`): 5 evaluation
# repetitions; the cell displays the resulting train/test metrics table.
(
    Evaluator(task, categorical_missing, KNNImputer())
    .evaluate(5)
    .result
)

100%|██████████| 5/5 [00:05<00:00,  1.14s/it]


Unnamed: 0,train,test
F1_micro,0.851192,0.777405
F1_macro,0.824286,0.747663
F1_weighted,0.850853,0.77711


## Forest imputation

In [9]:
# Forest imputation on the numeric target (`numerical_missing`): 5 evaluation
# repetitions; the cell displays the resulting train/test metrics table.
(
    Evaluator(task, numerical_missing, ForestImputer())
    .evaluate(5)
    .result
)

100%|██████████| 5/5 [00:35<00:00,  7.01s/it]


Unnamed: 0,train,test
MAE,7.553982,13.497333
MSE,211.850108,555.3201
RMSE,14.55475,23.560234


In [10]:
# Forest imputation on the categorical target (`categorical_missing`): 5 evaluation
# repetitions; the cell displays the resulting train/test metrics table.
(
    Evaluator(task, categorical_missing, ForestImputer())
    .evaluate(5)
    .result
)

100%|██████████| 5/5 [00:09<00:00,  1.90s/it]


Unnamed: 0,train,test
F1_micro,1.0,0.799823
F1_macro,1.0,0.752065
F1_weighted,1.0,0.798613


## GAIN imputation

In [11]:
# GAIN imputation on the numeric target (`numerical_missing`): 5 evaluation repetitions.
# The imputer receives the column count of the training data and an empty dict
# (presumably hyperparameter overrides — TODO confirm against GAINImputer).
(
    Evaluator(task, numerical_missing, GAINImputer(task.train_data.shape[1], {}))
    .evaluate(5)
    .result
)

100%|██████████| 5/5 [00:23<00:00,  4.73s/it]


Unnamed: 0,train,test
MAE,20.293379,19.270836
MSE,1441.331601,1277.050672
RMSE,37.963308,35.727391


In [12]:
# GAIN imputation on the categorical target (`categorical_missing`): 5 evaluation repetitions.
# The imputer receives the column count of the training data and an empty dict
# (presumably hyperparameter overrides — TODO confirm against GAINImputer).
(
    Evaluator(task, categorical_missing, GAINImputer(task.train_data.shape[1], {}))
    .evaluate(5)
    .result
)

100%|██████████| 5/5 [00:23<00:00,  4.69s/it]


Unnamed: 0,train,test
F1_micro,0.504501,0.503266
F1_macro,0.604533,0.59984
F1_weighted,0.653611,0.655279
