In [1]:
from jenga.tasks.openml import OpenMLTask
from jenga.corruptions.generic import MissingValues

import pandas as pd
import numpy as np

from data_imputation_paper.imputation import SKLearnModeImputer
from data_imputation_paper.evaluation import Evaluator

## Make deterministic

In [2]:
# Seed NumPy's legacy global random number generator so reruns draw the same values.
np.random.seed(seed=42)

## Create example tasks

In [3]:
# Load the two example datasets from OpenML with a fixed seed. According to the
# cell output, 4135 has only categorical columns and 1471 only numeric columns.
categorical_task = OpenMLTask(openml_id=4135, seed=42)
numerical_task = OpenMLTask(openml_id=1471, seed=42)

# Abort if either dataset already contains missing values: per the error
# message, the evaluation needs a complete ground truth.
if any(task.contains_missing_values() for task in (categorical_task, numerical_task)):
    raise ValueError("This would distort the evaluation because we wouln't have a full ground truth.")

Found 9 categorical columns: ['RESOURCE', 'MGR_ID', 'ROLE_ROLLUP_1', 'ROLE_ROLLUP_2', 'ROLE_DEPTNAME', 'ROLE_TITLE', 'ROLE_FAMILY_DESC', 'ROLE_FAMILY', 'ROLE_CODE']
Found 0 numeric columns: []
Found 0 categorical columns: []
Found 14 numeric columns: ['V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14']


## Insert missing values using jenga

In [4]:
# Both corruptions share the same settings: NaN as the placeholder, the MCAR
# (missing completely at random) pattern, and fraction=0.5
# (presumably half of the rows — TODO confirm against jenga).
mcar_settings = {"fraction": 0.5, "na_value": np.nan, "missingness": "MCAR"}

# One corruption per example task: a numeric column and a categorical column.
V1_missing = MissingValues(column="V1", **mcar_settings)
role_family_missing = MissingValues(column="ROLE_FAMILY", **mcar_settings)

## Create Imputer

In [5]:
# Single imputer instance; it is passed to both evaluators created further down.
# NOTE(review): the class name suggests mode (most-frequent value) imputation —
# confirm in data_imputation_paper.imputation.
imputer = SKLearnModeImputer()

## Create Evaluator

An Evaluator repeatedly:
1. inserts missing values into the dataset
2. fits the imputer
3. evaluates the train and test performance of the imputation

It then returns the mean evaluation result.

In [6]:
# Combine the numeric task, the V1 corruption, and the imputer into one evaluator.
evaluator_numerical_task = Evaluator(numerical_task, V1_missing, imputer)

In [7]:
# Run the evaluation with 10 as the first argument (the progress bar in the
# output counts 10/10 rounds). Named arguments can be fed to the imputer's fit
# and transform interfaces through fit_kwargs and transform_kwargs.
numerical_result = evaluator_numerical_task.evaluate(10, fit_kwargs={"verbose": 1}, transform_kwargs={})

100%|██████████| 10/10 [00:00<00:00, 66.50it/s]


In [8]:
# Display the result table: MAE, MSE, and RMSE for train and test.
numerical_result.result

Unnamed: 0,train,test
MAE,32.69029,20.454129
MSE,3880758.0,3596.661244
RMSE,1413.556,57.60863


In [9]:
# Evaluator for the categorical task: corrupts ROLE_FAMILY and reuses the same imputer instance.
evaluator_categorical_task = Evaluator(categorical_task, role_family_missing, imputer)

In [10]:
# Again 10 rounds, this time without passing fit_kwargs or transform_kwargs.
categorical_result = evaluator_categorical_task.evaluate(10)

100%|██████████| 10/10 [00:07<00:00,  1.30it/s]


In [11]:
# Display the result table: micro, macro, and weighted F1 for train and test.
categorical_result.result

Unnamed: 0,train,test
F1_micro,0.668873,0.664434
F1_macro,0.650679,0.634385
F1_weighted,0.667989,0.664097
