In [1]:
from jenga.tasks.openml import OpenMLTask
from jenga.corruptions.generic import MissingValues
    
import pandas as pd
import numpy as np

from data_imputation_paper.imputation.simple import ModeImputer
from data_imputation_paper.imputation.ml import KNNImputer, ForestImputer
from data_imputation_paper.imputation.generative import GAINImputer
from data_imputation_paper.evaluation import Evaluator

## Make Deterministic

In [2]:
# One seed shared by the task and every imputer constructed in this notebook.
seed: int = 42

## Create example tasks

In [3]:
# Load the OpenML dataset with id 4552 as the example task, seeded like everything else.
task = OpenMLTask(openml_id=4552, seed=seed)

# The evaluation compares imputed values against the original ones, so the
# untouched dataset must be complete before any values are removed.
already_incomplete = task.contains_missing_values()
if already_incomplete:
    raise ValueError("This would distort the evaluation because we wouldn't have a full ground truth.")

Found 14 categorical columns: ['V1', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15']
Found 2 numeric columns: ['V2', 'V16']


## Insert missing values using jenga

In [4]:
# Both corruptions share the same settings: half of the column's values are
# replaced by NaN using the 'MCAR' missingness pattern.
# NOTE(review): no seed is passed to these corruptions — confirm whether jenga
# draws the affected rows deterministically.
mcar_half_settings = dict(fraction=0.5, na_value=np.nan, missingness='MCAR')

# 'V2' is one of the two numeric columns, 'V15' one of the categorical ones.
numerical_missing = MissingValues(column='V2', **mcar_half_settings)
categorical_missing = MissingValues(column='V15', **mcar_half_settings)

## Create Evaluator

An evaluator repeatedly:
1. inserts missing values into the dataset
2. fits the imputer
3. evaluates the train and test performance of the imputation

It then returns the mean evaluation result.

## Mode Imputation

In [5]:
# Mode imputation of the numeric column, evaluated over 10 repetitions.
(
    Evaluator(task, numerical_missing, ModeImputer(seed=seed))
    .evaluate(10)
    .result
)

100%|██████████| 10/10 [00:00<00:00, 21.56it/s]


Unnamed: 0,train,test
MAE,29.757084,28.869137
MSE,1397.18498,1322.106566
RMSE,37.374339,36.350029


In [6]:
# Mode imputation of the categorical column, evaluated over 10 repetitions.
(
    Evaluator(task, categorical_missing, ModeImputer(seed=seed))
    .evaluate(10)
    .result
)

100%|██████████| 10/10 [00:00<00:00, 12.36it/s]


Unnamed: 0,train,test
F1_micro,0.116284,0.116608
F1_macro,0.01302,0.013137
F1_weighted,0.024244,0.024632


## KNN imputation

In [7]:
# KNN imputation of the numeric column, evaluated over 5 repetitions.
(
    Evaluator(task, numerical_missing, KNNImputer(seed=seed))
    .evaluate(5)
    .result
)

100%|██████████| 5/5 [00:03<00:00,  1.28it/s]


Unnamed: 0,train,test
MAE,29.534634,28.638587
MSE,1368.808681,1320.02106
RMSE,36.996664,36.32218


In [8]:
# KNN imputation of the categorical column, evaluated over 5 repetitions.
(
    Evaluator(task, categorical_missing, KNNImputer(seed=seed))
    .evaluate(5)
    .result
)

100%|██████████| 5/5 [00:02<00:00,  2.47it/s]


Unnamed: 0,train,test
F1_micro,0.433363,0.422615
F1_macro,0.339083,0.324596
F1_weighted,0.424716,0.413473


## Forest imputation

In [9]:
# Forest-based imputation of the numeric column, evaluated over 5 repetitions.
(
    Evaluator(task, numerical_missing, ForestImputer(seed=seed))
    .evaluate(5)
    .result
)

100%|██████████| 5/5 [00:03<00:00,  1.57it/s]


Unnamed: 0,train,test
MAE,28.43751,27.693932
MSE,1275.207265,1240.503221
RMSE,35.704974,35.212303


In [10]:
# Forest-based imputation of the categorical column, evaluated over 5 repetitions.
(
    Evaluator(task, categorical_missing, ForestImputer(seed=seed))
    .evaluate(5)
    .result
)

100%|██████████| 5/5 [00:03<00:00,  1.48it/s]


Unnamed: 0,train,test
F1_micro,0.548985,0.534982
F1_macro,0.45624,0.434275
F1_weighted,0.545328,0.529917


## GAIN imputation

In [11]:
# GAIN imputation of the numeric column, evaluated over 5 repetitions.
# The imputer receives the number of columns in the training data and an empty
# dict (presumably hyperparameter overrides — TODO confirm against GAINImputer).
(
    Evaluator(
        task,
        numerical_missing,
        GAINImputer(task.train_data.shape[1], {}, seed=seed),
    )
    .evaluate(5)
    .result
)

100%|██████████| 5/5 [00:23<00:00,  4.61s/it]


Unnamed: 0,train,test
MAE,30.422496,30.198055
MSE,1455.489499,1434.4856
RMSE,38.136465,37.868507


In [12]:
# GAIN imputation of the categorical column, evaluated over 5 repetitions.
# The imputer receives the number of columns in the training data and an empty
# dict (presumably hyperparameter overrides — TODO confirm against GAINImputer).
(
    Evaluator(
        task,
        categorical_missing,
        GAINImputer(task.train_data.shape[1], {}, seed=seed),
    )
    .evaluate(5)
    .result
)

100%|██████████| 5/5 [00:23<00:00,  4.80s/it]


Unnamed: 0,train,test
F1_micro,0.093822,0.093993
F1_macro,0.030916,0.031007
F1_weighted,0.040712,0.042787
