In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


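# In this example we will investigate different imputation techniques:

- imputation by the constant value 0
- imputation by the mean value of each feature, combined with a missingness-indicator auxiliary variable
- k-nearest-neighbor imputation
- iterative imputation

Note: in the code below every imputer is created with `add_indicator=True`, so the missingness indicator is appended in all four cases, not only for mean imputation.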

In [2]:
# Read the parquet dataset into `df`; the path is relative, so it is resolved
# against the kernel's current working directory.
df = pd.read_parquet(
    path='brook_mvp_train_deputed_total_revenues_abt.parquet',
)

In [13]:
# Show the loaded frame as the cell output (the rendered table is truncated
# to the leading/trailing rows and columns).
df

Unnamed: 0,masked_company_id,state,city,year_founded,num_employees,reporting_template_type_name,mode_period_end_month,m8_accounts_payable_total,m8_accounts_receivable_long_term,m8_accounts_receivable_total,...,m1_dkl3,m1_dkl5,m1_dkk4,m1_dkn3,m1_dkn5,m1_dkn8,m1_dkm7,m1_e8pu,target_year,target_total_revenues
0,5023,,St. Albans,2016.0,,Standard,12,,,,...,2.600000,6.875000,1.991667,3.000000,3.758333,3.791667,-2.008333,-0.075000,2019,113919.0
1,5024,,Leicestershire,2009.0,3.0,,3,0.0,0.0,2166.0,...,-0.933333,1.108333,-0.616667,1.333333,3.716667,1.250000,-3.583333,0.150000,2018,96321.0
2,5025,Essex,Ilford,1960.0,,Standard,3,0.0,0.0,0.0,...,-0.933333,1.108333,-0.616667,1.333333,3.716667,1.250000,-3.583333,0.150000,2018,97461.0
3,5025,Essex,Ilford,1960.0,,Standard,3,0.0,0.0,617073.0,...,3.916667,4.891667,2.958333,2.333333,5.241667,3.516667,0.833333,0.016667,2019,95291.0
4,5025,Essex,Ilford,1960.0,,Standard,3,240446.0,0.0,522009.0,...,2.275000,6.316667,1.775000,2.675000,3.425000,3.933333,-1.458333,-0.050000,2020,93056.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21312,5020,Greater London,London,1989.0,10.0,,3,0.0,0.0,0.0,...,2.275000,6.316667,1.775000,2.675000,3.425000,3.933333,-1.458333,-0.050000,2020,122952.0
21313,5021,West Midlands,Brierley Hill,1986.0,27.0,,3,0.0,0.0,0.0,...,-0.933333,1.108333,-0.616667,1.333333,3.716667,1.250000,-3.583333,0.150000,2018,581299.0
21314,5021,West Midlands,Brierley Hill,1986.0,27.0,,3,104.0,0.0,0.0,...,3.916667,4.891667,2.958333,2.333333,5.241667,3.516667,0.833333,0.016667,2019,556601.0
21315,5021,West Midlands,Brierley Hill,1986.0,27.0,,3,28871.0,0.0,0.0,...,2.275000,6.316667,1.775000,2.675000,3.425000,3.933333,-1.458333,-0.050000,2020,562695.0


In [15]:
# Remove the company identifier and the state/city columns before modelling.
# Re-assigning the result (instead of inplace=True) and passing errors='ignore'
# keeps this cell idempotent: with the in-place form, executing the cell a
# second time raised KeyError because the columns were already gone.
df = df.drop(columns=['masked_company_id', 'state', 'city'], errors='ignore')

In [17]:
# One-hot encode the two listed columns; every other column is passed through
# unchanged. The encoded source columns are replaced by their dummy columns.
df = pd.get_dummies(
    data=df,
    columns=['mode_period_end_month', 'reporting_template_type_name'],
)

In [19]:
# Name of the column to predict, kept as a list so that df[target] yields a
# one-column DataFrame.
target = ['target_total_revenues']

# Every remaining column is a feature. The list is built in the frame's own
# column order: the previous set difference produced an order that depends on
# string hashing, so the column order handed to the estimator could change
# between kernel sessions and make results hard to reproduce.
features = [column for column in df.columns if column not in target]

In [10]:

from sklearn.metrics import make_scorer
def mean_of_sqrt_of_quartiles_nmae(y, y_pred, n_bins=10):
    """Mean over target quantile bins of sqrt(normalised mean absolute error).

    The true targets are split into ``n_bins`` equal-frequency bins with
    ``pd.qcut``. For each bin the mean absolute error is divided by the mean
    absolute true value of that bin (NMAE); the square roots of the per-bin
    NMAE values are then averaged.

    Despite the name, the default is 10 bins (deciles), not 4.

    Args:
        y: True targets. Either a DataFrame holding a
            ``'target_total_revenues'`` column, or any 1-D / column-vector
            array-like (Series, ndarray, list).
        y_pred: Predictions, array-like with one value per row of ``y``;
            flattened with ``ravel``.
        n_bins: Number of quantile bins passed to ``pd.qcut``.

    Returns:
        The averaged value as a float; lower is better.

    Raises:
        ValueError: Propagated from ``pd.qcut`` when the bin edges are not
            unique (for example with many identical target values).
    """
    if isinstance(y, pd.DataFrame):
        y = y['target_total_revenues']
    # Fresh 0..n-1 indexes on both series so the boolean bin masks line up.
    y_true = pd.Series(np.asarray(y, dtype=float).ravel())
    y_hat = pd.Series(np.asarray(y_pred, dtype=float).ravel())

    def nmae(true_y, predicted_y):
        # Computed with numpy/pandas only: the previous version called
        # mean_absolute_error (and math.sqrt below) without importing them,
        # which raised NameError on the first call.
        abs_error = np.abs(true_y.to_numpy() - predicted_y.to_numpy())
        return abs_error.mean() / true_y.abs().mean()

    bin_of_row = pd.qcut(y_true, q=n_bins)

    return np.mean([
        np.sqrt(nmae(y_true[bin_of_row == interval], y_hat[bin_of_row == interval]))
        for interval in bin_of_row.unique()
    ])
    

# Wrap the metric as a scikit-learn scorer. The metric is an error (lower is
# better), hence greater_is_better=False: scikit-learn then sign-flips the
# value, so cross-validation scores are negative and closer to 0 is better.
custom_scorer = make_scorer(
    mean_of_sqrt_of_quartiles_nmae,
    greater_is_better=False,
)



In [3]:
# To use the experimental IterativeImputer, we need to explicitly ask for it
# (this import must come before IterativeImputer is imported):
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline

# Seeded legacy generator; none of the cells in this section read it.
rng = np.random.RandomState(0)

# Number of cross-validation folds used by get_scores_for_imputer.
N_SPLITS = 5

# Shared downstream model for every imputation strategy. n_jobs=-1 builds the
# trees on all available cores: the recorded run of the zero-imputation
# benchmark was interrupted while fitting this final estimator on one core.
# random_state=0 still fixes the seeds used for the individual trees.
regressor = RandomForestRegressor(random_state=0, n_jobs=-1)

In [4]:
def get_scores_for_imputer(imputer, X_missing, y_missing):
    """Cross-validate ``imputer`` chained with the shared ``regressor``.

    Args:
        imputer: Transformer placed as the first pipeline step.
        X_missing: Feature data, possibly containing missing values.
        y_missing: Targets passed through to ``cross_val_score``.

    Returns:
        The array of per-fold scores produced by ``cross_val_score`` with
        ``custom_scorer`` and ``N_SPLITS`` folds.
    """
    pipeline = make_pipeline(imputer, regressor)
    return cross_val_score(
        pipeline,
        X_missing,
        y_missing,
        cv=N_SPLITS,
        scoring=custom_scorer,
    )

In [5]:
def get_impute_zero_score(X_missing, y_missing):
    """Score constant-zero imputation.

    NaN entries are replaced by 0 and missing-value indicator columns are
    appended (``add_indicator=True``).

    Returns:
        Tuple ``(mean, std)`` of the scores from ``get_scores_for_imputer``.
    """
    zero_imputer = SimpleImputer(
        strategy='constant',
        fill_value=0,
        missing_values=np.nan,
        add_indicator=True,
    )
    fold_scores = get_scores_for_imputer(zero_imputer, X_missing, y_missing)
    return fold_scores.mean(), fold_scores.std()


In [6]:
def get_impute_knn_score(X_missing, y_missing):
    """Score k-nearest-neighbour imputation.

    Uses ``KNNImputer`` with its default neighbour settings, treating NaN as
    missing and appending missing-value indicator columns.

    Returns:
        Tuple ``(mean, std)`` of the scores from ``get_scores_for_imputer``.
    """
    knn_imputer = KNNImputer(add_indicator=True, missing_values=np.nan)
    fold_scores = get_scores_for_imputer(knn_imputer, X_missing, y_missing)
    return fold_scores.mean(), fold_scores.std()


In [7]:
def get_impute_mean(X_missing, y_missing):
    """Score per-feature mean imputation.

    NaN entries are replaced by the mean strategy of ``SimpleImputer`` and
    missing-value indicator columns are appended.

    Returns:
        Tuple ``(mean, std)`` of the scores from ``get_scores_for_imputer``.
    """
    mean_imputer = SimpleImputer(
        strategy="mean",
        add_indicator=True,
        missing_values=np.nan,
    )
    fold_scores = get_scores_for_imputer(mean_imputer, X_missing, y_missing)
    return fold_scores.mean(), fold_scores.std()


In [8]:
def get_impute_iterative(X_missing, y_missing):
    """Score iterative imputation.

    Uses ``IterativeImputer`` restricted to 5 nearest features per imputed
    column, sampling from the posterior with a fixed ``random_state`` and
    appending missing-value indicator columns.

    Returns:
        Tuple ``(mean, std)`` of the scores from ``get_scores_for_imputer``.
    """
    iterative_imputer = IterativeImputer(
        n_nearest_features=5,
        sample_posterior=True,
        random_state=0,
        add_indicator=True,
        missing_values=np.nan,
    )
    fold_scores = get_scores_for_imputer(iterative_imputer, X_missing, y_missing)
    return fold_scores.mean(), fold_scores.std()


In [20]:
# Zero-imputation baseline. Because `target` is a list, df[target] is a
# one-column DataFrame, which is the form the custom scorer indexes by
# column name.
get_impute_zero_score(
    X_missing=df[features],
    y_missing=df[target],
)

  self._final_estimator.fit(Xt, y, **fit_params_last_step)


KeyboardInterrupt: 