# 55 XGBoosts with Mean Imputation


In [None]:
import numpy as np
import pandas as pd
from pathlib import Path
from tqdm import tqdm
import matplotlib.pyplot as plt
from datetime import datetime

from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

import xgboost


In [None]:
# Locate the competition files and load them.
# `data` is indexed by `row_id`; `submission` is indexed by its `row-col` key.
input_path = Path('/kaggle/input/tabular-playground-series-jun-2022/')
data_csv = input_path.joinpath('data.csv')
submission_csv = input_path.joinpath('sample_submission.csv')

data = pd.read_csv(data_csv, index_col='row_id')
submission = pd.read_csv(submission_csv, index_col='row-col')

#  Observe Data

In [None]:
# Preview the first five rows of the raw table.
data.head(n=5)

##  See how many NaNs appear in each row and column

In [None]:
# Number of missing entries in every row (summing across the columns).
data.isna().sum(axis="columns")

In [None]:
# Number of missing entries per column; only the first five columns are shown.
data.isna().sum(axis="index").head(n=5)

In [None]:
# Plot how the missing values are distributed:
# left  - histogram of the number of NaNs per row,
# right - bar chart of the number of NaNs per column.
fig, axs = plt.subplots(1, 2)

# Build the null mask once; the original re-scanned the frame for every use.
null_mask = data.isnull()
nans_per_row = null_mask.sum(axis=1)
nans_per_col = null_mask.sum()

# One unit-wide bin per count value. The bin edges must run 0..max+1:
# matplotlib's last bin is closed on both sides, so with edges 0..max the
# rows with `max` NaNs would be merged into the same bin as `max - 1`.
axs[0].hist(nans_per_row, bins=range(int(nans_per_row.max()) + 2), density=True)
axs[0].set_xlabel("Num of nans in each rows")
axs[0].set_ylabel("Ratio")

axs[1].bar(list(nans_per_col.index), nans_per_col)
axs[1].set_xlabel("Col name")
# Rotate the existing tick labels (the column names). Re-labelling with
# `get_xticks()` would replace the names with their numeric positions.
axs[1].tick_params(axis='x', labelrotation=90)
axs[1].set_ylabel("Num of Nans in each cols")

plt.tight_layout()

plt.show()

### Column names with missing values
* The F_1_x, F_3_x and F_4_x columns have missing values.
* The F_2_x columns are complete.

In [None]:
# Collect the names of all columns that contain at least one NaN.
# The per-column flags are computed once; evaluating `data.isnull().any()`
# inside the comprehension would rescan the whole frame for every column.
nan_flags = data.isnull().any()
cols_with_nan = [col for col in data.columns if nan_flags[col]]

print(len(cols_with_nan))
print(cols_with_nan)

### See data types
* F_1_x, F_3_x and F_4_x are float64. (They have missing values!)
* F_2_x are int64. (They are NaN-free!)

In [None]:
# dtypes of every column whose name begins with 'F_1'.
data.loc[:, [name for name in data.columns if name.startswith('F_1')]].dtypes

In [None]:
# dtypes of every column whose name begins with 'F_3'.
data.loc[:, [name for name in data.columns if name.startswith('F_3')]].dtypes

In [None]:
# dtypes of every column whose name begins with 'F_4'.
data.loc[:, [name for name in data.columns if name.startswith('F_4')]].dtypes

In [None]:
# dtypes of every column whose name begins with 'F_2'.
data.loc[:, [name for name in data.columns if name.startswith('F_2')]].dtypes

### Are F_2_x categorical?
* Probably not: each column ranges from 0 up to a maximum of roughly 11–17.
* One-hot encoding won't be a good idea.

In [None]:
# Smallest value found in each column whose name begins with 'F_2'.
data.loc[:, [name for name in data.columns if name.startswith('F_2')]].min()

In [None]:
# Largest value found in each column whose name begins with 'F_2'.
data.loc[:, [name for name in data.columns if name.startswith('F_2')]].max()

#  Use SimpleImputer to fill in missing values

In [None]:
# Work on an independent (deep) copy so the raw `data`, NaNs included, stays intact.
train_data = data.copy(deep=True)

In [None]:
# Imputer configured to treat np.nan as missing and to use the 'mean' strategy.
imp = SimpleImputer(missing_values=np.nan, strategy='mean')

# Fit the imputer on the full table and write the transformed values back in
# place; slice assignment keeps the existing index and column labels of
# `train_data`.
train_data[:] = imp.fit_transform(train_data)

In [None]:
# Preview the first five rows of the imputed table.
train_data.head(n=5)

#  Train 55 XGBoost models

##  Define Training Function

In [None]:
def define_model():
    """Build a fresh, unfitted regressor for one target column.

    Returns an ``xgboost.XGBRegressor`` with 500 estimators configured for
    GPU training and GPU prediction.  An earlier variant of this notebook
    used ``RandomForestRegressor(n_estimators=150, random_state=0)`` here.
    """
    # NOTE(review): newer XGBoost releases reportedly replace 'gpu_hist' /
    # 'gpu_predictor' with a `device` parameter - confirm against the
    # installed version before upgrading.
    settings = {
        'n_estimators': 500,
        'tree_method': 'gpu_hist',
        'predictor': "gpu_predictor",
    }
    return xgboost.XGBRegressor(**settings)


def train_model(imputed_data, target_col_name, train_ratio=1.):
    """Fit a model that predicts one column from all the other columns.

    Only rows whose target value is present in the raw, module-level ``data``
    frame are used, so the model never learns from imputed targets.

    Args:
        imputed_data: NaN-free frame, row-aligned with the global ``data``.
        target_col_name: Name of the column to predict.
        train_ratio: Fraction of the usable rows, taken from the top of the
            frame, that is used for fitting.

    Returns:
        The fitted model created by ``define_model()``.
    """
    # Rows where the target was actually observed in the raw data.
    observed = data[target_col_name].notnull()
    usable = imputed_data[observed]

    # Keep only the leading `train_ratio` share of those rows.
    n_fit = int(observed.sum() * train_ratio)
    features = usable.drop([target_col_name], axis=1)[:n_fit]
    target = usable[target_col_name][:n_fit]

    regressor = define_model()
    regressor.fit(features, target)
    return regressor

###  Experiment with small sample

In [None]:
# Benchmark on a single column ('F_1_0'): how do validation RMSE and fit time
# scale with the number of training rows?
trnnums = [100, 500, 1000, 5000, 10000, 20000, 50000, 1000000]
vldnum = 5000
rmses = []
training_seconds = []

# data preprocessing (identical for every sample size, so done once)
test_col_name = 'F_1_0'
good_rows = ~data[test_col_name].isnull()
good_data = train_data[good_rows]

test_x_train = good_data.drop([test_col_name], axis=1)
test_y_train = good_data[test_col_name]

# Hold out the last `vldnum` rows strictly for validation. Training slices are
# taken from the remaining rows only, so even the largest `trnnum` (which
# simply uses everything available) can never include validation rows.
x_fit, y_fit = test_x_train[:-vldnum], test_y_train[:-vldnum]
x_valid, y_valid = test_x_train[-vldnum:], test_y_train[-vldnum:]

for trnnum in trnnums:
    model = define_model()

    # train
    print("fitting %7d," % trnnum, end='\t')
    dt_start = datetime.now()
    model.fit(x_fit[:trnnum], y_fit[:trnnum])
    dt_final = datetime.now()
    total_seconds = (dt_final - dt_start).total_seconds()
    print("%10f sec took. Evaluating RMSE..." % total_seconds)

    # evaluate
    # RMSE is computed as sqrt(MSE) rather than with `squared=False`, which
    # newer scikit-learn releases deprecate and then remove.
    rmse = np.sqrt(mean_squared_error(y_valid, model.predict(x_valid)))

    # save result
    training_seconds.append(total_seconds)
    rmses.append(rmse)

### Plot experimental results vs. sample size

In [None]:
# Two panels over the same x axis (requested training sample count):
# RMSE on the left, fit time on the right.
fig, axs = plt.subplots(1, 2)
rmse_ax, time_ax = axs

rmse_ax.plot(trnnums, rmses, marker=".", markersize=10, color='black')
time_ax.plot(trnnums, training_seconds, marker='.', color='black')

# Both panels share the x label and a grid; only the y label differs.
for panel, y_label in ((rmse_ax, "RMSEs"), (time_ax, "Elapsed Time [sec]")):
    panel.set_xlabel("Training Sample Number")
    panel.set_ylabel(y_label)
    panel.grid()

plt.tight_layout()

plt.show()

###  Test with 1,000 samples

In [None]:
# Quick sanity run: for every second NaN-containing column, fit on the first
# `trnnum` usable rows and report the RMSE on the last `vldnum` usable rows.
trnnum = 1000
vldnum = 5000
rmses = []
training_seconds = []
# Compute the per-column NaN flags once instead of once per column.
nan_flags = data.isnull().any()
cols_with_nan = [col for col in data.columns if nan_flags[col]]
col_indices = []
for col_idx, test_col_name in enumerate(cols_with_nan):
    # Only odd positions are evaluated, to halve the runtime.
    if col_idx % 2 == 0:
        continue

    # data preprocessing: keep rows whose target was observed in the raw data
    good_rows = ~data[test_col_name].isnull()
    good_data = train_data[good_rows]

    test_x_train = good_data.drop([test_col_name], axis=1)
    test_y_train = good_data[test_col_name]

    model = define_model()

    # train
    print("fitting " + test_col_name, end='\t')
    dt_start = datetime.now()
    model.fit(test_x_train[:trnnum], test_y_train[:trnnum])
    dt_final = datetime.now()
    total_seconds = (dt_final - dt_start).total_seconds()
    print("%10f sec took. Evaluating RMSE..." % total_seconds, end='\t')

    # evaluate
    # RMSE is computed as sqrt(MSE) rather than with `squared=False`, which
    # newer scikit-learn releases deprecate and then remove.
    rmse = np.sqrt(mean_squared_error(test_y_train[-vldnum:],
                                      model.predict(test_x_train[-vldnum:])))
    print("RMSE: ", rmse)

    # save result
    col_indices.append(col_idx)
    training_seconds.append(total_seconds)
    rmses.append(rmse)

# to numpy
col_indices = np.array(col_indices)
training_seconds = np.array(training_seconds)
rmses = np.array(rmses)

###  Plot results per column

In [None]:
# Per-column RMSE (left) and fit time (right). Each panel also shows the mean
# as a red line and a shaded band of one standard deviation around it.
fig, axs = plt.subplots(1, 2)

panel_specs = (
    (axs[0], rmses, "RMSEs", {'markersize': 10}),
    (axs[1], training_seconds, "Elapsed Time [sec]", {}),
)
for panel, series, y_label, line_kwargs in panel_specs:
    center = np.mean(series)
    spread = np.std(series)

    panel.plot(col_indices, series, marker=".", color='black', **line_kwargs)
    panel.axhline(center, color='red')
    panel.axhspan(center - spread, center + spread, color='red', alpha=0.1)

    panel.set_xlabel("Column index")
    panel.set_ylabel(y_label)
    panel.grid()

# Headline numbers: average RMSE and the total fit time in whole seconds.
mean_rmse = np.mean(rmses)
total_secs = np.sum(training_seconds).astype('int')
plt.suptitle("Mean RMSE: %.4f\nTime: %d secs" % (mean_rmse, total_secs))
plt.tight_layout()

plt.show()

### Train All

In [None]:
# Fit one model per NaN-containing column, using every usable row.
train_ratio = 1.0

# Compute the per-column NaN flags once instead of once per column.
nan_flags = data.isnull().any()
cols_with_nan = [col for col in data.columns if nan_flags[col]]

# Maps column name -> fitted model that predicts that column.
models = dict()
for col_name in tqdm(cols_with_nan):
    models[col_name] = train_model(train_data, col_name, train_ratio=train_ratio)

#  Predict

In [None]:
# Start from the raw data and overwrite only the missing cells with the
# prediction of the model trained for that column.
pred_data = data.copy()
for col in tqdm(cols_with_nan):
    # Compute the mask once and reuse it for both the row labels and the
    # feature selection.
    missing = data[col].isnull()
    col_idx = data.index[missing]

    # Select the affected rows first and only then drop the target column,
    # so the column is never dropped from a copy of the full table.
    features = train_data[missing].drop([col], axis=1)
    col_pred = models[col].predict(features)
    pred_data.loc[col_idx, col] = col_pred

## Use `row-col` from the sample submission to find the imputed values

In [None]:
# Each submission key has the form '<row_id>-<column name>'. Split all keys in
# one vectorised pass, then fill the values one *column* at a time. A scalar
# `.loc` assignment per submission row is orders of magnitude slower.
keys = submission.index.to_series().str.split('-', expand=True)
rows = keys[0].astype(int)   # row_id part, matching pred_data's integer index
cols = keys[1]               # column-name part

for col, row_ids in tqdm(rows.groupby(cols)):
    # `row_ids.index` holds the submission keys of this column and its values
    # hold the matching row ids, so both sides are in the same order.
    submission.loc[row_ids.index, 'value'] = (
        pred_data.loc[row_ids.to_numpy(), col].to_numpy()
    )

submission.to_csv('55XGBoosts_meanimputer.csv')