# Tabular Playground Series - June 2022
Hans Elliott

**Task: Imputation on large dataset**  
**Criterion: RMSE**    
Models:
- KNN (parallelized with Pool)
- "Missing Forest", i.e. iterative imputer with RF regressor (parallelized with Pool)  
- XGBoost with GPU acceleration
- XGBoost + mean imputation - score: 0.94892 rmse

In [None]:
import os
import numpy as np
import pandas as pd
# for knn
from sklearn.impute import KNNImputer
# for missing forest
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor
# XG Boost
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error as MSE
# Neural Net
import tensorflow as tf
# misc
from tqdm import tqdm
import multiprocessing
from multiprocessing import Pool

In [None]:
multiprocessing.cpu_count() ##number of CPUs reported for this session (author's note: using GPUs)

Competition: https://www.kaggle.com/competitions/tabular-playground-series-jun-2022  
EDA/Simple Baseline Notebook: https://www.kaggle.com/code/hanselliott/tabularjun2022-eda-simplerulebaseline?scriptVersionId=99652696 

In [None]:
# Load the competition data and the sample submission. The submission's
# 'row-col' column is what the extraction cells further down parse to decide
# which imputed cells to report.
raw_data = pd.read_csv("../input/tabular-playground-series-jun-2022/data.csv")
sample_sub = pd.read_csv("../input/tabular-playground-series-jun-2022/sample_submission.csv")
sample_sub.head()

In [None]:
# (n_rows, n_columns) of the raw data
raw_data.shape

**Imputing missing values, all numeric variables**

In [None]:
# Per-column summary statistics (count, mean, std, min, quartiles, max)
raw_data.describe()

In [None]:
# Work on a frame without 'row_id' (presumably just a row identifier, not a
# feature -- confirm against the competition data description).
train = raw_data.drop(labels='row_id',axis=1)
train.shape

In [None]:
# isna().sum() counts NaNs per column; the second sum() totals them over all columns
print("Total missing vals: ", train.isna().sum().sum() )

In [None]:
# NO F2s are missing
# (info() prints the non-null count and dtype of every column, which is where the claim above can be checked)
train.info()

# Parallel K = 5 Nearest Neighbors Imputation Function

In [None]:
def knn_impute(df):
    """
    Fills the missing values of df with sklearn's KNNImputer, using the 5 nearest
    neighbors weighted by distance.
    Returns whatever fit_transform returns (an array, not a DataFrame - it is
    converted back further down in the notebook).
    """
    knn = KNNImputer(weights="distance", n_neighbors=5)  ##sklearn
    return knn.fit_transform(df)

def multiprocess_knn(df_list, processes=4):
    """
    Runs knn_impute on every sub-df of df_list in a pool of worker processes.

    df_list:   iterable of sub-dataframes, each imputed independently of the others.
    processes: number of worker processes (defaults to the original hard-coded 4).

    Returns a list with one knn_impute result per sub-df, in the same order as df_list.
    """
    # The context manager shuts the workers down even if one of the imputations
    # raises; map() blocks until every result is in, so nothing is lost on exit.
    with Pool(processes=processes) as pool:  ##multiprocessing
        return pool.map(knn_impute, df_list)

# Parallel Missing Forest Imputation Function

In [None]:
# define missing forest imputation fns based on sklearn's iterative imputer
# rf_estimator is the regressor handed to IterativeImputer inside miss_forest().
rf_estimator = RandomForestRegressor(n_estimators=50,
                                    criterion="squared_error", #closest option to RMSE (the challenge's criterion)
                                    n_jobs=1)  # one job per forest; presumably because multi_missforest() already runs several imputers in parallel

def miss_forest(df):
    """
    "Missing forest" imputation: sklearn's IterativeImputer driven by the
    module-level rf_estimator, limited to 3 iterations.
    Returns the output of fit_transform on df.
    """
    settings = dict(
        estimator=rf_estimator,
        max_iter=3,
        min_value=-15,  ##based on EDA, values shouldnt exceed this range
        max_value=15,
        verbose=2,
    )
    return IterativeImputer(**settings).fit_transform(df)


# function to parallelize it across sub-dfs
def multi_missforest(df_list, processes=4):
    """
    Runs miss_forest on every sub-df of df_list in a pool of worker processes.

    df_list:   iterable of sub-dataframes, each imputed independently of the others.
    processes: number of worker processes (defaults to the original hard-coded 4).

    Returns a list with one miss_forest result per sub-df, in the same order as df_list.
    """
    # The context manager shuts the workers down even if one of the imputations
    # raises; map() blocks until every result is in, so nothing is lost on exit.
    with Pool(processes=processes) as pool:
        return pool.map(miss_forest, df_list)

# XGBoost Imputation Function

In [None]:
def xgboost_impute(df):
    """
    Imputes the F_1, F_3 and F_4 columns of df one at a time with XGBoost and returns
    the imputed copy (the df passed in is not modified).

    For the current column col:
    - the rows where col is missing form the "test" subset (what we want to impute),
    - the rows where col is present form the training subset, with col as the label,
    - all other columns are the features (X_train / X_test).
    An XGBoost model is fit on X_train and y_train and predicted onto X_test, and the
    predictions are written into the missing cells of col.

    Columns are processed in order on the same working copy, so values imputed for an
    earlier column are part of the features seen by the models of later columns.
    A selected column with no missing values is skipped (there is nothing to predict).
    The RMSE that is printed is measured on the training rows, not on held-out data.
    """
    df = df.copy()
    cols_w_miss = df.filter(regex = "F_1|F_3|F_4").columns ##no F2 features have missing vals
    for col in tqdm(cols_w_miss):
        #Data
        missing = df[col].isnull()           ##computed once per column and reused for every subset
        if not missing.any():
            continue                         ##empty test subset - nothing to impute in this col
        features = df.drop(col, axis=1)      ##every column except the one being imputed
        x_train = features[~missing]
        x_test = features[missing]
        y_train = df.loc[~missing, col]      ##the samples in col w/out missing vals
        #Model
        xgb = XGBRegressor(tree_method='gpu_hist',
                           predictor= "gpu_predictor",
                           eta = 0.3,
                           max_depth = 3,
                           n_estimators = 750)
        xgb.fit(x_train, y_train)
        #RMSE Score:
        train_pred = xgb.predict(x_train)
        rmse = np.sqrt(MSE(y_train, train_pred))
        print("Column: ", col, "| Train RMSE: ", rmse)
        #Replace the missing vals with predictions
        df.loc[missing, col] = xgb.predict(x_test)
    return df

# Imputation
### Create sub-dataframes for parallel imputation

In [None]:
## Break data into subsets to allow for multiprocessing (faster imputation)
## Ten consecutive 100000-row slices of train, each an independent copy
df_list = [train[lo:lo + 100000].copy() for lo in range(0, 1000000, 100000)]
sub1, sub2, sub3, sub4, sub5, sub6, sub7, sub8, sub9, sub10 = df_list

In [None]:
def split_df(n, df):
    """
    Splits df into n consecutive sub-dfs and adds them to a list. Stored sequentially
    (in order of index), each one an independent copy.

    Every row ends up in exactly one sub-df: when len(df) is not a multiple of n, the
    first len(df) % n sub-dfs get one extra row instead of the leftover rows being
    dropped. When len(df) is a multiple of n all sub-dfs have len(df)//n rows.

    Raises ValueError if n is smaller than 1.
    """
    if n < 1:
        raise ValueError("n must be a positive integer")

    base_size, leftover = divmod(len(df), n)

    df_list = []
    start_i = 0
    for i in range(n):
        # the first `leftover` chunks absorb the rows that do not divide evenly
        end_i = start_i + base_size + (1 if i < leftover else 0)
        df_list.append(df[start_i:end_i].copy())
        start_i = end_i

    return df_list

In [None]:
# Rebuild df_list with the helper (10 consecutive chunks of train); this rebinds
# the hand-written list from the earlier cell.
df_list = split_df(10, train)
len(df_list)

## KNN Imputation

In [None]:
# Execute KNN Algorithm (in parallel)
# results_list holds one imputer output per sub-df, in the same order as df_list
results_list = multiprocess_knn(df_list)
            # results_list = [df.pipe(knn_impute) for df in df_list] ##for non parallel

**Public Score: 1.31252**

## Missing Forest Imputation

In [None]:
# Execute MissForest Algorithm (in parallel)
# NOTE: this rebinds results_list, so it replaces the KNN output if that cell was run first
results_list = multi_missforest(df_list)

## XGBoost Imputation
Don't want to split up the df here since XGBoost automatically computes in parallel and I am using `gpu_hist`/`gpu_predictor` to take advantage of the GPU accelerator.

In [None]:
# Execute XGBoost on full df
# (xgboost_impute works on a copy, so train keeps its missing values)
results = xgboost_impute(train)

In [None]:
#save results to csv for backup
#index=False: only the feature columns are written, not the row index
results.to_csv("xgb_results.csv", index=False)

In [None]:
#Takes ~1hr 30 mins to impute using the XGBoost methods (w GPUs) so I stored the results externally as backup
#NOTE: this rebinds results to the stored copy, replacing anything computed in this session
results = pd.read_csv("../input/xgboost-results1/xgb_results.csv")

In [None]:
# Sanity check on the imputed frame: NaN count left over, its shape, and a preview
print("Missing values remaining: ", results.isna().sum().sum())
print("results.shape: ", results.shape)
results.head()

**Public Score: 1.05612**

## Format/Prep Results
**(Ignore for XGBoost)**

In [None]:
# Sanity check on the parallel imputers' output: one entry per sub-df is expected
# (10 with the split used above), then peek at the first entry
print("Is len(results_list) 10?: ", len(results_list))
results_list[0]

In [None]:
## Convert imputer output to DataFrame since sklearn imputer outputs np array
## (every entry of results_list is replaced in place by a DataFrame labelled with train's column names)
colnames = train.columns.tolist()
results_list[:] = [pd.DataFrame(arr, columns=colnames) for arr in results_list]

In [None]:
##concatenate list of dfs into one df and renumber the rows 0..n-1 in one step.
##ignore_index=True discards the per-chunk indices instead of keeping them as an
##extra 'index' column (which a bare reset_index() would add to the features).
results = pd.concat(results_list, ignore_index=True)

In [None]:
# Should print 0 once every sub-df has been imputed and stitched back together
print("Missing values remaining: ", results.isna().sum().sum())

# Extract Imputations ("predictions") for Submission  
This method is somewhat time consuming given the size of the data. Could potentially use the `split_df` fn to parallelize the replacement though. 

In [None]:
submission = sample_sub.copy()
# Every 'row-col' key has the form "<row label>-<column name>". Split all keys at
# once and look every cell up in a single array operation instead of one scalar
# .loc/.iloc round trip per submission row.
keys = submission['row-col'].str.split('-')
row_labels = keys.str[0].astype(int)
col_labels = keys.str[1]
# Translate labels into positions in results; get_indexer marks unknown labels with -1
row_pos = results.index.get_indexer(row_labels)
col_pos = results.columns.get_indexer(col_labels)
if (row_pos < 0).any() or (col_pos < 0).any():
    # a bare -1 would silently read the last row/column, so fail loudly like .loc does
    raise KeyError("submission references a row or column that is not in results")
# second column of the submission receives the imputed values
submission.iloc[:, 1] = results.to_numpy()[row_pos, col_pos]
submission

In [None]:
# index=False: write only the submission's own columns, not the DataFrame index
submission.to_csv('submission.csv', index=False)

---

# Another attempt
Following the methodology of some popular notebooks, I'm going to use mean imputations for the F_1 and F_3 columns. I'll keep the XGBoost imputation for the F_4 columns since it seemed to perform well on those.  
My EDA showed that F_1 and F_3 features rigidly follow normal distributions (mean 0, sd 1, min -5, max 5).  
The F_4 features are less uniform. 

In [None]:
##XGB results from above (Saved externally to avoid reimputing)
##Kept under its own name so it can be combined with the mean imputations below
xgbresults = pd.read_csv("../input/xgboost-results1/xgb_results.csv")

In [None]:
# Column groups, by feature-name prefix:
#   f1col -> mean impute
#   f2col -> no missing vals
#   f3col -> mean impute
#   f4col -> use xgboost imputes
f1col, f2col, f3col, f4col = (
    [name for name in train.columns if name.startswith(prefix)]
    for prefix in ('F_1', 'F_2', 'F_3', 'F_4')
)

In [None]:
##Mean impute F_1, F_3
##(each missing value is replaced by the mean of its own column; F_4 is left untouched here)
means = train.copy()
mean_cols = f1col + f3col
column_means = means[mean_cols].mean()
means[mean_cols] = means[mean_cols].fillna(column_means)
print("NAs left in df to be imputed: ", means.isna().sum().sum() )
print("NAs left in F_1, F_3: ", means[mean_cols].isna().sum().sum())

In [None]:
# Index imputations into submission df
# Every 'row-col' key has the form "<row label>-<column name>". The keys are split
# once, then each source frame is read with a single array lookup instead of one
# scalar .loc/.iloc round trip per submission row.
submission2 = sample_sub.copy()
keys = submission2['row-col'].str.split('-')
row_labels = keys.str[0].astype(int)
col_labels = keys.str[1]
use_xgb = col_labels.isin(f4col).to_numpy()  ##True where the key points at an F_4 column

imputed_vals = np.empty(len(submission2), dtype=float)
# F_4 cells come from the xgboost imputation, everything else (F_1, F_3) from the mean imputation
for source, mask in ((means, ~use_xgb), (xgbresults, use_xgb)):
    # Translate labels into positions in this frame; get_indexer marks unknown labels with -1
    row_pos = source.index.get_indexer(row_labels[mask])
    col_pos = source.columns.get_indexer(col_labels[mask])
    if (row_pos < 0).any() or (col_pos < 0).any():
        # a bare -1 would silently read the last row/column, so fail loudly like .loc does
        raise KeyError("submission references a row or column that is not in the imputed data")
    imputed_vals[mask] = source.to_numpy()[row_pos, col_pos]

# second column of the submission receives the imputed values
submission2.iloc[:, 1] = imputed_vals
submission2

In [None]:
# index=False: write only the submission's own columns, not the DataFrame index
submission2.to_csv('submission2.csv', index=False)

**Public Score: 0.94892**

# Notes
Neither sklearn imputer is fast on data of this size. KNN is slightly faster than the iterative RF imputer; increasing K might improve its accuracy, but it would also slow the algorithm down.  
Iterative imputation with RF is quite slow on this size of data.  
The non-parametric methods are definitely useful considering the features (F_1,...,F_4) are generated from different distributions (shown in EDA).    
Implementing them in parallel helps a bit with speed but also reduces the amount of data used to impute in each sub-df.
Implementing the XGBoost algorithm is advantageous for many reasons.
- For one, XGBoost is known for performing very well on tabular data.
- Also, it can easily be implemented in parallel (it automatically uses all available threads) and can capitalize on Kaggle's GPU access.
- For `tree_method`, setting `'gpu_hist'` is recommended for higher performance on large datasets. `'gpu_hist'` is a GPU implementation of the hist algorithm, which aims to speed up training by binning input values into buckets (like a histogram). Basically, it reduces the number of unique values for each feature. And since XGBoost offers GPU support and Kaggle offers GPU access, it's super easy to implement. 
- `'gpu_predictor'` is used when `tree_method` is set to gpu_hist, per the docs.

---
NN concept

In [None]:
# Small fully-connected regressor: predicts one column from the remaining 79.
# The same L1+L2 penalty is applied to the kernels of both hidden layers.
kernel_reg = tf.keras.regularizers.L1L2(l1=0.01, l2=0.01)

model = tf.keras.models.Sequential([
    tf.keras.layers.Input(shape=(79,)), ##minus the col being imputed; shape is given as a tuple, the form Keras documents
    # NOTE(review): this layer is never adapt()-ed and gets no mean/variance
    # arguments -- confirm it actually standardizes the inputs as intended.
    tf.keras.layers.Normalization(axis=-1),
    #tf.keras.layers.Dense(750, activation='relu', kernel_regularizer=kernel_reg),
    #tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(128, activation='relu', kernel_regularizer=kernel_reg),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(64, activation='relu', kernel_regularizer=kernel_reg),
    tf.keras.layers.Dense(1) #activation='linear')
])
model.summary()

optim = tf.keras.optimizers.Adam(learning_rate=0.001, clipnorm=1)

# Trained on MSE; RMSE (the challenge's criterion) is reported as a metric
model.compile(optimizer=optim,
             loss='mse',
             metrics=[tf.keras.metrics.RootMeanSquaredError()]
             )


#fn for resetting params to randomly initialized
def reset_mod(model, weights):
    """Load weights back into model (used with mod_weights to undo any training)."""
    model.set_weights(weights)


# Snapshot of the untrained weights, taken before any fit() call
mod_weights = model.get_weights()

In [None]:
def nn_impute(df):
    """
    Imputes the F_3 and F_4 columns of df one at a time with the module-level neural
    net `model` and returns the imputed copy (the df passed in is not modified).

    For the current column col:
    - the rows where col is missing form the "test" subset (what we want to impute),
    - the rows where col is present form the training subset, with col as the label,
    - all other columns are the features (X_train / X_test).
    The model is reset to its initial weights, fit on X_train and y_train for 5 epochs,
    and predicted onto X_test; the predictions are written into the missing cells of col.

    Missing values in the feature columns are replaced by that column's mean before
    fitting, because the network has no handling for NaN inputs. This only affects the
    features fed to the model; the cells of df itself are filled by predictions only.
    Columns are processed in order on the same working copy, so values imputed for an
    earlier column are part of the features seen when later columns are fit.
    A selected column with no missing values is skipped (there is nothing to predict).
    """
    df = df.copy()
    #Impute certain cols
    cols_w_miss = df.filter(regex = "F_3|F_4").columns ##no F2 features have missing vals
    for col in tqdm(cols_w_miss):
        # Data
        missing = df[col].isnull()           ##computed once per column and reused for every subset
        if not missing.any():
            continue                         ##empty test subset - nothing to impute in this col
        features = df.drop(col, axis=1)      ##every column except the one being imputed
        features = features.fillna(features.mean())  ##stand-in values so no NaN reaches the network
        x_train = features[~missing]
        x_test = features[missing]
        y_train = df.loc[~missing, col]      ##the samples in col w/out missing vals

        # Model
        # Each column is a different regression target, so start from the initial
        # weights every time instead of continuing from the previous column's fit.
        reset_mod(model, mod_weights)
        model.fit(x_train, y_train, epochs=5)
        # Replace the missing vals with predictions
        # (predict returns one column of shape (n, 1); flatten it to match the n target cells)
        df.loc[missing, col] = model.predict(x_test).ravel()
    return df