In [1]:
%load_ext rpy2.ipython

In [None]:
import pandas as pd
import numpy as np
import pickle_utils as pu
%R library("ggplot2")
%R install.packages("mice")
%R library("mice")
%R install.packages("lattice")
%R library("lattice")
%R install.packages("VIM")
%R library("VIM")
%R install.packages("missForest")
%R library("missForest")
%R install.packages("doParallel")
%R library("doParallel")
%R install.packages("foreach")
%R library("foreach")
%R install.packages("mlbench")
%R library("mlbench")

from rpy2.ipython.rmagic import converter

NUMBER_IMPUTATIONS = 5

# Functions

In [134]:
def normalise(dataset):
    "Normalise dataset to be between 0 and 1"
    dataset -= dataset.min()
    dataset /= dataset.max()
    return dataset

def make_random_missing(dataset_):
    "Make dataset miss at random as described in paper"
    dataset = dataset_.copy()
    missing = np.random.rand(len(dataset)) < 0.2
    for i in np.nonzero(missing)[0]:
        r = np.arange(dataset.shape[1])
        np.random.shuffle(r)
        dataset.values[i,r[:(r.shape[0]//2)]] = np.nan
    return dataset

def rmse_sum(missing_df, full_df, imputed_dfs):
    mask_missing = np.isnan(missing_df.values)
    original_df = full_df.values
    multiple_imputed_df = list((df.values if hasattr(df, 'values') else df) for df in imputed_dfs)
    print(original_df.shape, multiple_imputed_df[0].shape)
    "RMSE_sum as in Multiple Imputation Using Deep Denoising Autoencoders (Gondara & Wang 2017)"
    assert original_df.shape == multiple_imputed_df[0].shape, "data set shape not matching"
    sq_diff = (original_df-multiple_imputed_df)**2
    sq_diff[:,~mask_missing] = 0
    per_attribute_rmse = np.mean(np.sum(sq_diff, axis=1), axis=0)**.5
    assert per_attribute_rmse.shape == (original_df.shape[1],)
    return np.sum(per_attribute_rmse)

# Load and ampute dataset

In [107]:
%%R -o dataset
data(Ionosphere)
#data(BostonHousing2)
#data(BreastCancer)
#datasets <- list(as.data.frame(BostonHousing2), as.data.frame(BreastCancer), as.data.frame(Ionosphere))
dataset <- as.data.frame(Ionosphere)

In [None]:
#dataset = list(map(converter.ri2py, datasets))[2]

In [115]:
full_df = normalise(dataset.drop("Class", axis=1).drop("V2", axis=1).applymap(float))
missing_df = make_random_missing(full_df)

# MICE imputation

In [None]:
%%R -i missing_df -i NUMBER_IMPUTATIONS -o mice_imputed_df_list
mice_imputed_object <- mice(missing_df, m=NUMBER_IMPUTATIONS, maxit=50, meth='pmm', seed=500)
mice_imputed_df_list = lapply(1:NUMBER_IMPUTATIONS, function(v) return(complete(mice_imputed_object, v)))

In [112]:
mice_imputed_dfs = list(converter.ri2py(df) for df in mice_imputed_df_list)
rmse_sum(missing_df, full_df, mice_imputed_dfs)

52.614553029623565

# MissForest imputation

In [None]:
%%R -o missforest_imputed_df
mice_imputed_object <- mice(missing_df, m=NUMBER_IMPUTATIONS, maxit=50, meth='pmm', seed=500)
mice_imputed_df_list = lapply(1:NUMBER_IMPUTATIONS, function(v) return(complete(mice_imputed_object, v)))

In [112]:
rmse_sum(missing_df, full_df, [missforest_imputed_df])

52.614553029623565

# Deep Denoising Autoencoder imputation

In [131]:
pu.dump((np.isnan(missing_df.values), full_df.values), "datasets/ionosphere.pkl.gz")

In [161]:
nn_imputed_data = pu.load("vae_iterate.pkl.gz")
rmse_sum(missing_df, full_df, nn_imputed_data)

(351, 33) (351, 33)


130.51970446672618