In [None]:
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
from sklearn.datasets import load_breast_cancer

In [None]:
# Load the breast-cancer dataset and wrap its feature matrix in a DataFrame.
# The raw sklearn Bunch keeps its own name: binding both the Bunch and the
# DataFrame to `data` made the conversion step fail when it was re-run.
cancer_bunch = load_breast_cancer()
data = pd.DataFrame(cancer_bunch['data'], columns=cancer_bunch['feature_names'])

In [None]:
# Show the column labels of the frame (taken from the dataset's feature names).
data.columns

In [None]:
# Add a two-level string column derived from 'mean radius':
# rows with a radius above 15 are labelled 'large', all others 'small'.
radius_is_large = data['mean radius'] > 15
data['category'] = np.where(radius_is_large, 'large', 'small')

In [None]:
# Work on a copy so the complete frame `data` stays untouched; it is used
# later as the ground truth when checking the imputed values.
data_na = data.copy()

In [None]:
# Mask 30% of the rows in each target column, choosing the rows independently
# per column.
#
# Only the chosen column is written. Copying whole sampled rows back from the
# complete `data` frame would restore values that were masked for an earlier
# column, leaving those columns with far fewer than 30% missing entries.
na_rng = np.random.RandomState(3)  # seeded so the missingness pattern is reproducible
n_missing = int(.3 * data_na.shape[0])

for col in ['mean texture', 'worst texture', 'category']:
    # Passing the same RandomState object advances it on every call, so each
    # column gets a different (but repeatable) set of rows.
    na_rows = data_na.sample(n=n_missing, random_state=na_rng).index
    data_na.loc[na_rows, col] = np.nan

In [None]:
# Display the frame after the masking step.
data_na

In [None]:
# Share of missing values in each masked column. The masking step targets
# 30% of the rows per column, so this is a quick check that it did what was
# intended (this cell previously just displayed the frame a second time).
data_na[['mean texture', 'worst texture', 'category']].isna().mean()

In [None]:
from missensemble import MissEnsemble

In [None]:
# Configure the imputer for the three masked columns: 'category' is declared
# as categorical, the two texture columns as numerical. `ordinal_vars` is not
# passed. random_state is fixed so the fit can be repeated.
# NOTE(review): exact meaning of n_iter / n_estimators / ens_method is defined
# by the missensemble package — confirm against its documentation.
estimator = MissEnsemble(
    n_iter=100,
    categorical_vars=['category'],
    numerical_vars=['mean texture', 'worst texture'],
    n_estimators=1000,
    ens_method='forest',
    print_criteria=True,
    random_state=3,
)

In [None]:
# Count how many columns of each dtype the frame passed to the imputer has.
data_na.dtypes.value_counts()

In [None]:
# Fit the imputer on the masked frame and keep the transformed result;
# `out` is compared against the complete `data` frame further down.
out = estimator.fit_transform(data_na)

In [None]:
# Inspect the `criteria_` attribute of the estimator after fitting.
estimator.criteria_


In [None]:
# Scatter plot of the values held in `criteria_`.
# NOTE(review): the argument is passed positionally; which seaborn parameter
# it binds to depends on the installed seaborn version — confirm.
sns.scatterplot(estimator.criteria_)

In [None]:
# Same information via the estimator's own plotting helper, with
# `plot_final` switched off.
estimator.plot_criteria(plot_final=False)

In [None]:
# Compare the imputed 'mean texture' values with the originals from the
# complete frame, using the 'std_diff' error type shown as a histogram.
estimator.check_imputation_fit(
    var_name='mean texture',
    true_values=data['mean texture'],
    error_type='std_diff',
    plot_type='hist',
)

In [None]:
# Compare the imputed 'worst texture' values with the originals from the
# complete frame, using the 'std_diff' error type shown as a histogram.
estimator.check_imputation_fit(
    var_name='worst texture',
    true_values=data['worst texture'],
    error_type='std_diff',
    plot_type='hist',
)

In [None]:
# Accuracy of the categorical imputation: restrict to the rows whose
# 'category' was masked, then report the proportion of rows where the imputed
# label matches (True) or differs from (False) the original label.
masked_rows = data_na.index[data_na['category'].isna()]
imputed_labels = out.loc[masked_rows, 'category']
true_labels = data.loc[masked_rows, 'category']
(imputed_labels == true_labels).value_counts(dropna=False, normalize=True)


# Speed Checks

In [None]:
# Speed check: time fit_transform on progressively longer prefixes of data_na.
#
# The loop is expensive, so it is opt-in via RUN_SPEED_CHECK. `times` and `i`
# are always defined so the cells that read them still run on a fresh kernel
# (Restart & Run All) even when the benchmark is skipped.
RUN_SPEED_CHECK = False  # set to True to actually run the benchmark

n_sim = 1000
times = np.zeros(n_sim)  # times[k] = seconds taken by run k; index 0 is never filled
i = 0                    # loop counter; stays 0 when the benchmark has not run

if RUN_SPEED_CHECK:
    for i in range(1, n_sim):
        # Separate name so the benchmark does not overwrite `estimator` / `out`
        # from the imputation section above.
        # NOTE(review): categorical_vars is not passed here although data_na
        # still contains the string column 'category' — confirm MissEnsemble
        # accepts that, or drop the column before timing.
        speed_estimator = MissEnsemble(n_iter=100,
                                       numerical_vars=['mean texture', 'worst texture'],
                                       n_estimators=50, ens_method='forest',
                                       print_criteria=False,
                                       random_state=3)
        # .loc slices by label and includes both ends: rows 0 .. 4+i.
        t0 = time.perf_counter()
        speed_estimator.fit_transform(data_na.loc[:4 + i])
        times[i] = time.perf_counter() - t0
    

In [None]:
# Per-run timings in seconds. Depends on the speed-check cell above having
# been run, since that is the only place `times` is filled in.
times

In [None]:
# Plot the recorded timings. Index 0 is skipped because the speed-check loop
# starts at 1; the slice stops before `i`.
# NOTE(review): stopping before `i` presumably drops an unfinished iteration
# when the loop was interrupted — confirm this is intended for complete runs.
sns.scatterplot(times[1:i])

In [None]:
# The same slice of timings as raw numbers (indices 1 up to, not including, i).
times[1:i]

In [None]:
# Current value of the speed-check loop counter, i.e. how far the loop got.
i