In [1]:
from jenga.tasks.openml import OpenMLTask
from jenga.corruptions.generic import MissingValues
    
import pandas as pd
import numpy as np

from data_imputation_paper.imputation import SKLearnModeImputer, SKLearnIterativeImputer
from data_imputation_paper.evaluation import Evaluator

## Make deterministic

In [2]:
# Seed NumPy's global random number generator so that re-running the notebook
# from a fresh kernel starts from the same random state.
np.random.seed(seed=42)

## Create example tasks

In [3]:
# Load OpenML dataset 4552 as a jenga task; the task receives its own seed.
task = OpenMLTask(openml_id=4552, seed=42)

# Missing values are inserted artificially later on, so the dataset must be
# complete to begin with -- otherwise there is no full ground truth to score
# the imputed values against.
if task.contains_missing_values():
    raise ValueError(
        "This would distort the evaluation because we wouln't have a full ground truth."
    )

Found 14 categorical columns: ['V1', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15']
Found 2 numeric columns: ['V2', 'V16']


## Insert missing values using jenga

In [4]:
# One jenga corruption per target column. All three share the same settings:
# fraction=0.5, NaN as the placeholder for a removed value, and the 'MCAR'
# missingness pattern.

# V2 was reported as a numeric column when the task was loaded.
numerical_missing = MissingValues(
    column='V2',
    missingness='MCAR',
    fraction=0.5,
    na_value=np.nan,
)

# V3 was reported as a categorical column when the task was loaded.
categorical_missing = MissingValues(
    column='V3',
    missingness='MCAR',
    fraction=0.5,
    na_value=np.nan,
)

# V4 was also reported as categorical.
# NOTE(review): the variable name suggests V4 holds string categories -- confirm.
categorical_as_string_missing = MissingValues(
    column='V4',
    missingness='MCAR',
    fraction=0.5,
    na_value=np.nan,
)

## Create Evaluator

An `Evaluator` repeatedly:
1. inserts missing values into the dataset
2. fits the imputer
3. evaluates the train and test performance of the imputation

Then it returns the mean evaluation result.

## Examples for mode imputation

In [5]:
# Mode imputation evaluated on the numeric column (V2 corruption).
mode_evaluator_numerical = Evaluator(
    task,
    numerical_missing,
    SKLearnModeImputer(),
)

In [6]:
# Run the evaluation (the progress bar shows 10 iterations for the argument 10)
# and display the resulting train/test metric table.
mode_evaluator_numerical.evaluate(10).result

100%|██████████| 10/10 [00:00<00:00, 58.18it/s]


Unnamed: 0,train,test
MAE,14.924881,14.53182
MSE,712.367137,642.044099
RMSE,26.686727,25.333848


In [7]:
# Mode imputation evaluated on the categorical column (V3 corruption).
mode_evaluator_categorical = Evaluator(
    task,
    categorical_missing,
    SKLearnModeImputer(),
)

In [8]:
# Run the evaluation (10 iterations per the progress bar) and display the
# resulting train/test metric table.
mode_evaluator_categorical.evaluate(10).result

100%|██████████| 10/10 [00:01<00:00,  7.65it/s]


Unnamed: 0,train,test
F1_micro,0.840666,0.840512
F1_macro,0.779786,0.776612
F1_weighted,0.822196,0.821427


In [9]:
# Mode imputation evaluated on the V4 corruption (the "categorical as string" case).
mode_evaluator_cat_as_string = Evaluator(
    task,
    categorical_as_string_missing,
    SKLearnModeImputer(),
)

In [10]:
# Run the evaluation (10 iterations per the progress bar) and display the
# resulting train/test metric table.
mode_evaluator_cat_as_string.evaluate(10).result

100%|██████████| 10/10 [00:01<00:00,  7.86it/s]


Unnamed: 0,train,test
F1_micro,0.9141,0.922154
F1_macro,0.807635,0.814039
F1_weighted,0.901798,0.911025


## MICE imputation

In [11]:
# MICE imputation of the numeric column (V2 corruption), with the data
# encoded ordinally.
evaluator_mice_numerical_ordinal = Evaluator(
    task,
    numerical_missing,
    SKLearnIterativeImputer(
        strategy="mice",
        data_encoding_type="ordinal",
    ),
)

In [12]:
# Run the evaluation (10 iterations per the progress bar) and display the
# resulting train/test metric table.
evaluator_mice_numerical_ordinal.evaluate(10).result

100%|██████████| 10/10 [00:02<00:00,  3.76it/s]


Unnamed: 0,train,test
MAE,26.857458,25.954016
MSE,2153.424228,1972.734951
RMSE,46.403699,44.409269


In [13]:
# MICE imputation of the numeric column (V2 corruption), with the data
# one-hot encoded.
evaluator_mice_numerical = Evaluator(
    task,
    numerical_missing,
    SKLearnIterativeImputer(
        strategy="mice",
        data_encoding_type="one-hot",
    ),
)

In [14]:
# Run the evaluation (10 iterations per the progress bar) and display the
# resulting train/test metric table. The recorded run took noticeably longer
# per iteration than the ordinal-encoded variant.
evaluator_mice_numerical.evaluate(10).result

100%|██████████| 10/10 [00:33<00:00,  3.34s/it]


Unnamed: 0,train,test
MAE,26.870653,26.220388
MSE,2159.661871,2013.49947
RMSE,46.470321,44.868159


## missForest imputation

In [15]:
# missForest imputation of the numeric column (V2 corruption), with the data
# encoded ordinally.
evaluator_forest_numerical_ordinal = Evaluator(
    task,
    numerical_missing,
    SKLearnIterativeImputer(
        strategy="missforest",
        data_encoding_type="ordinal",
        # estimator_args lets us feed arguments to the estimator's constructor
        estimator_args={"n_estimators": 20},
    ),
)

In [16]:
# Run the evaluation (10 iterations per the progress bar) and display the
# resulting train/test metric table.
evaluator_forest_numerical_ordinal.evaluate(10).result

100%|██████████| 10/10 [00:25<00:00,  2.55s/it]


Unnamed: 0,train,test
MAE,26.94466,26.122771
MSE,2167.929656,2000.316593
RMSE,46.559215,44.714573


In [17]:
# missForest imputation of the numeric column (V2 corruption).
# data_encoding_type is omitted on purpose: one-hot encoding is the default.
evaluator_forest_numerical = Evaluator(
    task,
    numerical_missing,
    SKLearnIterativeImputer(
        strategy="missforest",
        # estimator_args lets us feed arguments to the estimator's constructor
        estimator_args={"n_estimators": 20},
    ),
)

In [None]:
# Run the evaluation and display the resulting train/test metric table.
# NOTE(review): the saved output stops at 7/10 iterations and this cell has no
# execution count, so the run appears not to have finished -- re-run it to
# completion before relying on any result from it.
evaluator_forest_numerical.evaluate(10).result

 70%|███████   | 7/10 [02:31<01:05, 21.72s/it]

## Missing Values in Categorical Columns and `sklearn`'s `IterativeImputer`

`sklearn`'s `IterativeImputer` can't be used for categorical columns at the moment.

**There are two types of categorical values with different difficulties:**
1. Strings: We need to encode these values to process them
2. Numerical: Both estimators (`BayesianRidge` and `RandomForestRegressor`) will treat the imputation problem as regression instead of classification.

**To 1.:** Using `OrdinalEncoder` to encode the column basically turns it into a column of type 2. Using `OneHotEncoder` has the disadvantage that `sklearn`'s imputer can't find the missing values anymore, because it only searches for `np.nan` or another given placeholder value (at least as far as I know).

**To 2.:** Could not find a solution yet ...


With `sklearn` v0.24, which is not published yet, we can use `OrdinalEncoder` with the arguments `handle_unknown='use_encoded_value'` and `unknown_value=np.nan` to encode the categories and preserve the missing values.

In [None]:
# MICE imputation of the categorical column (V3 corruption).
# data_encoding_type is not given, so the imputer's default encoding applies.
evaluator_mice_categorical = Evaluator(
    task,
    categorical_missing,
    SKLearnIterativeImputer(strategy="mice"),
)

In [None]:
# Run the evaluation with argument 10 and display its `result`.
# NOTE(review): this cell has no execution count or saved output, and the
# notebook's prose states that iterative imputation of categorical columns is
# not supported at the moment -- confirm this runs before relying on it.
evaluator_mice_categorical.evaluate(10).result

In [None]:
# MICE imputation of the categorical column (V3 corruption), with the data
# encoded ordinally.
evaluator_mice_categorical_ordinal = Evaluator(
    task,
    categorical_missing,
    SKLearnIterativeImputer(
        strategy="mice",
        data_encoding_type="ordinal",
    ),
)

In [None]:
# Run the evaluation with argument 10 and display its `result`.
# NOTE(review): this cell has no execution count or saved output, and the
# notebook's prose states that iterative imputation of categorical columns is
# not supported at the moment -- confirm this runs before relying on it.
evaluator_mice_categorical_ordinal.evaluate(10).result

In [None]:
# MICE imputation for the V4 corruption (the "categorical as string" case).
# data_encoding_type is not given, so the imputer's default encoding applies.
evaluator_mice_cat_as_string = Evaluator(
    task,
    categorical_as_string_missing,
    SKLearnIterativeImputer(strategy="mice"),
)

In [None]:
# Run the evaluation with argument 10 and display its `result`.
# NOTE(review): this cell has no execution count or saved output, and the
# notebook's prose states that iterative imputation of categorical columns is
# not supported at the moment -- confirm this runs before relying on it.
evaluator_mice_cat_as_string.evaluate(10).result

In [None]:
# missForest imputation of the categorical column (V3 corruption).
# Neither data_encoding_type nor estimator_args is given, so defaults apply.
evaluator_forest_categorical = Evaluator(
    task,
    categorical_missing,
    SKLearnIterativeImputer(strategy="missforest"),
)

In [None]:
# Run the evaluation with argument 10 and display its `result`.
# NOTE(review): this cell has no execution count or saved output, and the
# notebook's prose states that iterative imputation of categorical columns is
# not supported at the moment -- confirm this runs before relying on it.
evaluator_forest_categorical.evaluate(10).result

In [None]:
# missForest imputation for the V4 corruption (the "categorical as string" case).
# Neither data_encoding_type nor estimator_args is given, so defaults apply.
evaluator_forest_cat_as_string = Evaluator(
    task,
    categorical_as_string_missing,
    SKLearnIterativeImputer(strategy="missforest"),
)

In [None]:
# Run the evaluation with argument 10 and display its `result`.
# NOTE(review): this cell has no execution count or saved output, and the
# notebook's prose states that iterative imputation of categorical columns is
# not supported at the moment -- confirm this runs before relying on it.
evaluator_forest_cat_as_string.evaluate(10).result