In [None]:
from pathlib import Path
import h2o
from h2o.estimators import H2OXGBoostEstimator
from tqdm import tqdm
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import numpy as np 
import pandas as pd 

import warnings
warnings.filterwarnings('ignore')

# Directory that holds the competition files.
input_path = Path('/kaggle/input/tabular-playground-series-jun-2022/')
# Table whose missing cells are imputed in the sections below.
dataset = pd.read_csv(input_path / 'data.csv')
# Read the sample submission through `input_path` too, so both files are
# located the same way instead of mixing an absolute and a relative path.
sample_submission = pd.read_csv(input_path / 'sample_submission.csv')

In [None]:
# Initialise the H2O connection used by the H2OFrame / H2O estimator cells below.
h2o.init()

# Goal of This Notebook

XGBoost is a great, robust tool for predicting tabular data. However, because of its convergence speed, it is sometimes not preferred (at least at first glance). Yet there are a couple of ways to speed up convergence. One of them is to enable histogram-based split finding, as in the LightGBM algorithm.

Iterative Imputer is a great way to handle missing data. However, convergence is very slow when it is used with any tree-based algorithm. We could use linear models if the features were linearly correlated, but the Pearson correlations show that this is not the case.

One idea is using Iterative Imputer with XGBoost algorithm while enabling gpu_hist. In this [notebook](https://www.kaggle.com/code/hiro5299834/tps-jun-2022-iterativeimputer-baseline), you can see how it is applied.

Another idea is using h2o.xgboost instead of the regular one. The claim is that h2o.xgboost is significantly faster both in CPU and GPU (for the full experiment and conclusion you can visit [here](https://sefiks.com/2019/11/07/why-you-should-build-xgboost-models-within-h2o/#:~:text=To%20sum%20up%2C%20h2o%20distribution,for%20a%20large%20data%20set.)).

* "Besides, training lasts 204 seconds in h2o when GPU is enabled whereas regular XGBoost cannot handle memory if GPU is enabled and this causes the kernel to die."

* "To sum up, h2o distribution is 1.6 times faster than the regular xgboost on CPU. Besides, building a model on a GPU can be run on just h2o for a large data set."

Regular XGBoost supports GPU under the tree method *gpu_hist*.

H2O supports GPU under several tree methods. For large datasets, it uses the *approx* tree method, which creates bins in every iteration. Optionally, the *hist* tree method can be used.

In this notebook, I tested the second claim to see whether it is true or not. Hope you'll enjoy it!

## 1. h2o.XGBoost (tree_method = approx) 

In [None]:
# Convert the pandas table into an H2OFrame for the H2O XGBoost run.
dataset_h2o = h2o.H2OFrame(dataset)

# Hyperparameters passed to H2OXGBoostEstimator in this section.
# No `tree_method` is set here, so the choice is left to H2O.
params = dict(
    ntrees=250,
    booster='gbtree',
    backend='gpu',
    reg_lambda=0.5013016642587416,
    reg_alpha=0.48576060322334563,
    colsample_bytree=0.9,
    subsample=1.0,
    learn_rate=0.1,
    max_depth=9,
    min_child_weight=3,
    stopping_metric='RMSE',
)

In [None]:
%%time
pred_list = {}

features = dataset_h2o.columns[1:]
for feat in tqdm(features):
    if dataset_h2o[feat].isna().any():
        missing_rows = dataset_h2o[feat].isna()
        train_data = dataset_h2o.drop(["row_id"],axis = 1)[~missing_rows,:]
        test_data = dataset_h2o.drop(["row_id", feat], axis = 1)[missing_rows,:]
        model = H2OXGBoostEstimator(**params)
        model.train(y = feat, training_frame = train_data)
        preds = model.predict(test_data).as_data_frame().values.ravel()
        pred_list[feat] = preds

In [None]:
# Pull the frame back into pandas; from here on `dataset_h2o` is a pandas
# DataFrame, not an H2OFrame.
dataset_h2o = dataset_h2o.as_data_frame()
# Write each column's predictions into the cells that are still null.
for col, val in pred_list.items():
    null_mask = dataset_h2o[col].isnull()
    dataset_h2o.loc[null_mask, col] = val

In [None]:
# Build the submission file. Each "row-col" id is split on "-" into a row
# label and a column name, which are then looked up in the imputed table.
row_cols = sample_submission["row-col"].str.split("-", expand = True)
relevant_rows = row_cols[0].astype(np.int32).values
relevant_columns = row_cols[1].values
# `.at` is the scalar accessor, so it suits one-cell-at-a-time lookups better
# than `.loc`; the comprehension replaces the manual append loop.
predictions = [
    dataset_h2o.at[row, col]
    for row, col in zip(relevant_rows, relevant_columns)
]
submission = pd.DataFrame({
    "row-col": sample_submission["row-col"],
    "value": predictions,
})
submission.to_csv("submission.csv", index = False)

# Score: 0.93

## 2. Regular XGBoost (gpu-hist)

In [None]:
# Separate pandas copy for the regular-XGBoost run; `dataset` itself is reused
# later to build a fresh H2OFrame.
dataset_xgb = dataset.copy()

In [None]:
# Hyperparameters for xgb.XGBRegressor, mirroring the H2O settings above.
# The keys use XGBoost's own parameter names. H2O-style names such as
# `learn_rate`, `missing_values` and `stopping_metric` are not XGBoost
# parameters, so values passed under them would not take effect (for example,
# the learning rate would stay at XGBoost's default instead of 0.1).
params = {
    'n_estimators': 250,
    'booster': 'gbtree',
    'reg_lambda': 0.5013016642587416,
    'reg_alpha': 0.48576060322334563,
    'colsample_bytree': 0.9,
    'missing': np.nan,          # value treated as "missing" in the input
    'subsample': 1.0,
    'learning_rate': 0.1,       # XGBoost name for H2O's `learn_rate`
    'max_depth': 9,
    'min_child_weight': 3,
    'eval_metric': 'rmse',      # XGBoost counterpart of H2O's `stopping_metric`
    'tree_method': 'gpu_hist',
    'verbosity' : 0 }

In [None]:
%%time
pred_list = {}
for feat in tqdm(features):
    if dataset_xgb[feat].isnull().any():
        missing_rows = np.where(dataset_xgb.loc[:,feat].isnull())[0]
        non_missing_rows = np.where(dataset_xgb.loc[:,feat].notnull())[0]
        train_data = dataset_xgb.drop(["row_id"],axis = 1).iloc[non_missing_rows,:].reset_index(drop = True)
        test_data = dataset_xgb.drop(["row_id", feat], axis = 1).iloc[missing_rows,:].reset_index(drop = True)
        X = train_data.drop(feat, axis = 1)
        y = train_data[feat]
        model = xgb.XGBRegressor(**params)
        model.fit(X,y)
        preds = model.predict(test_data)
        pred_list[feat] = preds

In [None]:
# Write each column's XGBoost predictions into the cells that are still null.
for col, val in pred_list.items():
    null_mask = dataset_xgb[col].isnull()
    dataset_xgb.loc[null_mask, col] = val

## 3. h2o.XGBoost (tree_method = hist)

In [None]:
# Rebuild the H2OFrame from the original pandas table for the `hist` run.
dataset_h2o = h2o.H2OFrame(dataset)

# Hyperparameters passed to H2OXGBoostEstimator in this section; the tree
# method is set to `hist` explicitly.
params = dict(
    ntrees=250,
    booster='gbtree',
    backend='gpu',
    reg_lambda=0.5013016642587416,
    reg_alpha=0.48576060322334563,
    colsample_bytree=0.9,
    tree_method='hist',
    subsample=1.0,
    learn_rate=0.1,
    max_depth=9,
    min_child_weight=3,
    stopping_metric='RMSE',
)

In [None]:
%%time
pred_list = {}

features = dataset_h2o.columns[1:]
for feat in tqdm(features):
    if dataset_h2o[feat].isna().any():
        missing_rows = dataset_h2o[feat].isna()
        train_data = dataset_h2o.drop(["row_id"],axis = 1)[~missing_rows,:]
        test_data = dataset_h2o.drop(["row_id", feat], axis = 1)[missing_rows,:]
        model = H2OXGBoostEstimator(**params)
        model.train(y = feat, training_frame = train_data)
        preds = model.predict(test_data).as_data_frame().values.ravel()
        pred_list[feat] = preds

## Dataset Conversion Time

In [None]:
#%%time
#a = dataset_h2o.as_data_frame()

# Results

* The total run time of h2o.XGBoost was 1.04 and 1.05 hours for the approx and hist tree methods respectively, whereas regular XGBoost with the same hyperparameters took 17 min. There is a big difference between regular XGBoost and h2o.XGBoost.

* One thing that I noticed during training was that converting a Pandas DataFrame to an H2OFrame takes a long time. Because of that, I suspected that some of the difference might be caused by converting an H2OFrame to a Pandas DataFrame, so I decided to measure the total time of this process. One conversion takes approximately 14 seconds. There are 55 columns with missing values, so the total conversion time was approximately 13 min. The difference between the two algorithms cannot be explained by conversion time.

## Other Inspirational Kernels

* https://www.kaggle.com/code/cv13j0/tps-jun22-nn-multivariate-feature-imputation/notebook
* https://www.kaggle.com/code/hiro5299834/tps-jun-2022-iterativeimputer-baseline

## References

* https://sefiks.com/2019/11/07/why-you-should-build-xgboost-models-within-h2o/#:~:text=To%20sum%20up%2C%20h2o%20distribution,for%20a%20large%20data%20set.
* https://xgboost.readthedocs.io/en/stable/gpu/index.html
* https://docs.h2o.ai/h2o/latest-stable/h2o-docs/data-science/xgboost.html


Thanks...