https://www.hackerearth.com/challenges/competitive/get-a-room-ml-hackathon/machine-learning/identify-the-habitability-score-of-a-property-12-464aae3e/

In [1]:
# imports

import os
from pathlib import Path

import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt

In [2]:
dataset_path = Path("/home/tharun/projects/data_science_competitions/datasets/he_habitability/")
print([x.name for x in dataset_path.iterdir()])

['train_v2.csv', 'test_xgb_blend_final.csv', 'train.csv', 'train_v1.csv', 'subm_v1.csv', 'train_xgb_blend_final.csv', 'subm_v2.csv', 'sample_submission.csv', 'subm_v3.csv', 'test.csv', 'test_v1.csv']


In [5]:
results_path = Path("/home/tharun/projects/data_science_competitions/hacker_earth/habitability_score_prediction/blending_results")
[x.name for x in results_path.iterdir()]

['xgb_valid_pred_3.csv',
 'xgb_valid_pred_2.csv',
 'xgb_test_pred_3.csv',
 'xgb_test_pred_1.csv',
 'xgb_test_pred_2.csv',
 'xgb_blend_final_with_lr.csv',
 'xgb_valid_pred_1.csv']

In [51]:
train_df = pd.read_csv(dataset_path/"train_xgb_blend_final.csv")
test_df = pd.read_csv(dataset_path/"test_xgb_blend_final.csv")
sample_submission_df = pd.read_csv(dataset_path/"sample_submission.csv")

print(train_df.shape, test_df.shape)

(39500, 19) (10500, 17)


## utils

In [10]:
def make_sub_file(test_ids, preds):
    return pd.DataFrame.from_dict({"Property_ID": test_ids, "Habitability_score": preds})

## data preprocessing

In [19]:
useful_features = ["pred_1", "pred_2", "pred_3"]
useful_features

['pred_1', 'pred_2', 'pred_3']

## Optimizing the blend

In [52]:
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
import optuna
from sklearn.metrics import r2_score
import warnings

In [53]:
warnings.filterwarnings("ignore")

### RandomForestRegressor

In [66]:
def objective(trial):
    max_depth = trial.suggest_int("max_depth", 2, 9)
    min_samples_split = trial.suggest_int("min_samples_split", 2, 11)
    max_features = trial.suggest_float("max_features", 0.2, 1.)
    min_samples_leaf = trial.suggest_int("min_samples_leaf", 1, 5)
    bootstrap = trial.suggest_categorical("bootstrap", ["True", "False"])
    
    valid_scores = []
    for fold in range(5):
        xtrain = train_df[train_df.kfold != fold].reset_index(drop=True)
        xvalid = train_df[train_df.kfold == fold].reset_index(drop=True)
        
        ytrain = xtrain.Habitability_score
        yvalid = xvalid.Habitability_score
        
        xtest = test_df[useful_features]
        xtrain = xtrain[useful_features]
        xvalid = xvalid[useful_features]
        
        model = RandomForestRegressor(bootstrap=bootstrap,
                                      max_features=max_features,
                                      min_samples_leaf=min_samples_leaf,
                                      max_depth=max_depth,
                                      min_samples_split=min_samples_split,
                                      n_estimators=10000,
                                      n_jobs=-1,
                                      random_state=13)
        model.fit(xtrain, ytrain)
        valid_preds = model.predict(xvalid)
        score = max(0, 100 * r2_score(yvalid, valid_preds))
        valid_scores.append(score)
    return np.mean(valid_scores)

In [68]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=100)

[32m[I 2022-08-14 12:58:26,558][0m A new study created in memory with name: no-name-b888a648-fd82-4a3a-ad8a-0506691851cf[0m
[32m[I 2022-08-14 12:59:26,496][0m Trial 0 finished with value: 81.53934818028878 and parameters: {'max_depth': 3, 'min_samples_split': 7, 'max_features': 0.8644331388712838, 'min_samples_leaf': 3, 'bootstrap': 'False'}. Best is trial 0 with value: 81.53934818028878.[0m
[32m[I 2022-08-14 13:00:43,499][0m Trial 1 finished with value: 82.8765141851965 and parameters: {'max_depth': 5, 'min_samples_split': 2, 'max_features': 0.8291321369860194, 'min_samples_leaf': 3, 'bootstrap': 'False'}. Best is trial 1 with value: 82.8765141851965.[0m
[32m[I 2022-08-14 13:02:00,586][0m Trial 2 finished with value: 82.90026950845177 and parameters: {'max_depth': 8, 'min_samples_split': 10, 'max_features': 0.440814295459153, 'min_samples_leaf': 3, 'bootstrap': 'True'}. Best is trial 2 with value: 82.90026950845177.[0m
[32m[I 2022-08-14 13:03:41,067][0m Trial 3 finished 

In [69]:
test_scores = []
for fold in range(5):
    xtrain = train_df[train_df.kfold != fold].reset_index(drop=True)
    xvalid = train_df[train_df.kfold == fold].reset_index(drop=True)

    ytrain = xtrain.Habitability_score
    yvalid = xvalid.Habitability_score

    xtest = test_df[useful_features]
    xtrain = xtrain[useful_features]
    xvalid = xvalid[useful_features]

    model = RandomForestRegressor(**study.best_params,
                                  n_estimators=10000,
                                  n_jobs=-1,
                                  random_state=13)
    model.fit(xtrain, ytrain)
    valid_preds = model.predict(xvalid)
    print(f"fold: {fold}, score: {max(0, 100 * r2_score(yvalid, valid_preds))}")
    test_scores.append(model.predict(xtest))
    
test_scores = np.mean(np.column_stack(test_scores), axis=1)
subm_df = make_sub_file(test_df.Property_ID, test_scores)
subm_df.head()

fold: 0, score: 83.2187004773493
fold: 1, score: 82.7539929780565
fold: 2, score: 83.72177686638112
fold: 3, score: 82.12842831127232
fold: 4, score: 82.69688979383182


Unnamed: 0,Property_ID,Habitability_score
0,0x6e93,29.60389
1,0x8787,80.601629
2,0x6c17,67.516084
3,0x9dbd,72.282627
4,0xbfde,77.691985


In [70]:
subm_df.to_csv(results_path/"xgb_blend_final_with_rfr.csv", index=False)

### XGBRegressor

In [71]:
def objective(trial):
    valid_scores = []
    
    learning_rate = trial.suggest_float("learning_rate", 1e-2, 0.3)
    max_depth = trial.suggest_int("max_depth", 3, 11)
    reg_lambda = trial.suggest_float("reg_lambda", 1e-8, 100.)
    reg_alpha = trial.suggest_float("reg_alpha", 1e-8, 100.)
    subsample = trial.suggest_float("subsample", 0.1, 1.)
    colsample_bytree = trial.suggest_float("colsample_bytree", 0.1, 1.)
    
    for fold in range(5):
        xtrain = train_df[train_df.kfold != fold].reset_index(drop=True)
        xvalid = train_df[train_df.kfold == fold].reset_index(drop=True)
        ytrain = xtrain.Habitability_score
        yvalid = xvalid.Habitability_score
        
        xtrain = xtrain[useful_features]
        xvalid = xvalid[useful_features]
        
        model = XGBRegressor(random_state=13,
                             learning_rate=learning_rate,
                             reg_lambda=reg_lambda,
                             reg_alpha=reg_alpha,
                             subsample=subsample,
                             colsample_bytree=colsample_bytree,
                             max_depth=max_depth,
                             n_estimators=7000,
                             gpu_id=0,
                             predictor="gpu_predictor",
                             tree_method="gpu_hist",
                            )
        model.fit(xtrain, ytrain, eval_set=[(xvalid, yvalid)], early_stopping_rounds=300, verbose=False)
        valid_preds = model.predict(xvalid)
        valid_scores.append(max(0, r2_score(yvalid, valid_preds)))
    return np.mean(valid_scores)

In [72]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=500)

[32m[I 2022-08-15 10:53:45,284][0m A new study created in memory with name: no-name-c13b4a16-5dcc-4fe7-ad41-3a4836c4d7e5[0m
[32m[I 2022-08-15 10:53:50,481][0m Trial 0 finished with value: 0.8285475516432206 and parameters: {'learning_rate': 0.1052361386209996, 'max_depth': 7, 'reg_lambda': 96.28966458530262, 'reg_alpha': 81.94102310945172, 'subsample': 0.4828168068277682, 'colsample_bytree': 0.9089106749033717}. Best is trial 0 with value: 0.8285475516432206.[0m
[32m[I 2022-08-15 10:53:52,692][0m Trial 1 finished with value: 0.8267180360053666 and parameters: {'learning_rate': 0.191667012270999, 'max_depth': 4, 'reg_lambda': 11.541120959705996, 'reg_alpha': 19.629188899945937, 'subsample': 0.10553459238591721, 'colsample_bytree': 0.6549655123701772}. Best is trial 0 with value: 0.8285475516432206.[0m
[32m[I 2022-08-15 10:53:55,736][0m Trial 2 finished with value: 0.8265547623812278 and parameters: {'learning_rate': 0.274688246008104, 'max_depth': 6, 'reg_lambda': 5.985914729

XGBoostError: [11:22:26] /home/conda/feedstock_root/build_artifacts/xgboost-split_1645117836726/work/src/tree/updater_gpu_hist.cu:770: Exception in gpu_hist: [11:22:26] /home/conda/feedstock_root/build_artifacts/xgboost-split_1645117836726/work/src/common/device_helpers.cuh:132: NCCL failure :unhandled system error /home/conda/feedstock_root/build_artifacts/xgboost-split_1645117836726/work/src/common/device_helpers.cuh(905)
Stack trace:
  [bt] (0) /home/tharun/.conda/envs/torch_env/lib/libxgboost.so(+0x1d0f1f) [0x7fe9613f0f1f]
  [bt] (1) /home/tharun/.conda/envs/torch_env/lib/libxgboost.so(dh::ThrowOnNcclError(ncclResult_t, char const*, int)+0x36f) [0x7fe961661e3f]
  [bt] (2) /home/tharun/.conda/envs/torch_env/lib/libxgboost.so(dh::AllReducer::Init(int)+0x523) [0x7fe9616613b3]
  [bt] (3) /home/tharun/.conda/envs/torch_env/lib/libxgboost.so(xgboost::tree::GPUHistMakerSpecialised<xgboost::detail::GradientPairInternal<double> >::InitDataOnce(xgboost::DMatrix*)+0x132) [0x7fe9618acae2]
  [bt] (4) /home/tharun/.conda/envs/torch_env/lib/libxgboost.so(xgboost::tree::GPUHistMakerSpecialised<xgboost::detail::GradientPairInternal<double> >::Update(xgboost::HostDeviceVector<xgboost::detail::GradientPairInternal<float> >*, xgboost::DMatrix*, std::vector<xgboost::RegTree*, std::allocator<xgboost::RegTree*> > const&)+0x210) [0x7fe9618babf0]
  [bt] (5) /home/tharun/.conda/envs/torch_env/lib/libxgboost.so(xgboost::gbm::GBTree::BoostNewTrees(xgboost::HostDeviceVector<xgboost::detail::GradientPairInternal<float> >*, xgboost::DMatrix*, int, std::vector<std::unique_ptr<xgboost::RegTree, std::default_delete<xgboost::RegTree> >, std::allocator<std::unique_ptr<xgboost::RegTree, std::default_delete<xgboost::RegTree> > > >*)+0xa34) [0x7fe9614c2904]
  [bt] (6) /home/tharun/.conda/envs/torch_env/lib/libxgboost.so(xgboost::gbm::GBTree::DoBoost(xgboost::DMatrix*, xgboost::HostDeviceVector<xgboost::detail::GradientPairInternal<float> >*, xgboost::PredictionCacheEntry*)+0x358) [0x7fe9614c5e98]
  [bt] (7) /home/tharun/.conda/envs/torch_env/lib/libxgboost.so(+0x2c297a) [0x7fe9614e297a]
  [bt] (8) /home/tharun/.conda/envs/torch_env/lib/libxgboost.so(XGBoosterUpdateOneIter+0x7b) [0x7fe9613cfedb]



Stack trace:
  [bt] (0) /home/tharun/.conda/envs/torch_env/lib/libxgboost.so(+0x677df4) [0x7fe961897df4]
  [bt] (1) /home/tharun/.conda/envs/torch_env/lib/libxgboost.so(xgboost::tree::GPUHistMakerSpecialised<xgboost::detail::GradientPairInternal<double> >::Update(xgboost::HostDeviceVector<xgboost::detail::GradientPairInternal<float> >*, xgboost::DMatrix*, std::vector<xgboost::RegTree*, std::allocator<xgboost::RegTree*> > const&)+0x963) [0x7fe9618bb343]
  [bt] (2) /home/tharun/.conda/envs/torch_env/lib/libxgboost.so(xgboost::gbm::GBTree::BoostNewTrees(xgboost::HostDeviceVector<xgboost::detail::GradientPairInternal<float> >*, xgboost::DMatrix*, int, std::vector<std::unique_ptr<xgboost::RegTree, std::default_delete<xgboost::RegTree> >, std::allocator<std::unique_ptr<xgboost::RegTree, std::default_delete<xgboost::RegTree> > > >*)+0xa34) [0x7fe9614c2904]
  [bt] (3) /home/tharun/.conda/envs/torch_env/lib/libxgboost.so(xgboost::gbm::GBTree::DoBoost(xgboost::DMatrix*, xgboost::HostDeviceVector<xgboost::detail::GradientPairInternal<float> >*, xgboost::PredictionCacheEntry*)+0x358) [0x7fe9614c5e98]
  [bt] (4) /home/tharun/.conda/envs/torch_env/lib/libxgboost.so(+0x2c297a) [0x7fe9614e297a]
  [bt] (5) /home/tharun/.conda/envs/torch_env/lib/libxgboost.so(XGBoosterUpdateOneIter+0x7b) [0x7fe9613cfedb]
  [bt] (6) /home/tharun/.conda/envs/torch_env/lib/python3.10/lib-dynload/../../libffi.so.7(+0x69dd) [0x7fe9cf25c9dd]
  [bt] (7) /home/tharun/.conda/envs/torch_env/lib/python3.10/lib-dynload/../../libffi.so.7(+0x6067) [0x7fe9cf25c067]
  [bt] (8) /home/tharun/.conda/envs/torch_env/lib/python3.10/lib-dynload/_ctypes.cpython-310-x86_64-linux-gnu.so(+0x13a25) [0x7fe9cf275a25]



In [74]:
study.best_trial

FrozenTrial(number=411, values=[0.829123710361723], datetime_start=datetime.datetime(2022, 8, 15, 11, 18, 56, 528935), datetime_complete=datetime.datetime(2022, 8, 15, 11, 18, 59, 845287), params={'learning_rate': 0.018587181298248073, 'max_depth': 3, 'reg_lambda': 5.9693592762503105, 'reg_alpha': 14.744979871436142, 'subsample': 0.20345267419565888, 'colsample_bytree': 0.881565098234934}, distributions={'learning_rate': UniformDistribution(high=0.3, low=0.01), 'max_depth': IntUniformDistribution(high=11, low=3, step=1), 'reg_lambda': UniformDistribution(high=100.0, low=1e-08), 'reg_alpha': UniformDistribution(high=100.0, low=1e-08), 'subsample': UniformDistribution(high=1.0, low=0.1), 'colsample_bytree': UniformDistribution(high=1.0, low=0.1)}, user_attrs={}, system_attrs={}, intermediate_values={}, trial_id=411, state=TrialState.COMPLETE, value=None)

In [75]:
test_scores = []
for fold in range(5):
    xtrain = train_df[train_df.kfold != fold].reset_index(drop=True)
    xvalid = train_df[train_df.kfold == fold].reset_index(drop=True)

    ytrain = xtrain.Habitability_score
    yvalid = xvalid.Habitability_score

    xtest = test_df[useful_features]
    xtrain = xtrain[useful_features]
    xvalid = xvalid[useful_features]

    model = XGBRegressor(**study.best_params,
                         n_estimators=7000,
                         random_state=13,
                         gpu_id=0,
                         predictor="gpu_predictor",
                         tree_method="gpu_hist"
                        )
    model.fit(xtrain, ytrain, eval_set=[(xvalid, yvalid)], early_stopping_rounds=300, verbose=False)
    valid_preds = model.predict(xvalid)
    print(f"fold: {fold}, score: {max(0, 100 * r2_score(yvalid, valid_preds))}")
    test_scores.append(model.predict(xtest))
    
test_scores = np.mean(np.column_stack(test_scores), axis=1)
subm_df = make_sub_file(test_df.Property_ID, test_scores)
subm_df.head()

XGBoostError: [11:33:56] /home/conda/feedstock_root/build_artifacts/xgboost-split_1645117836726/work/src/tree/updater_gpu_hist.cu:770: Exception in gpu_hist: [11:33:56] /home/conda/feedstock_root/build_artifacts/xgboost-split_1645117836726/work/src/common/device_helpers.cuh:132: NCCL failure :unhandled system error /home/conda/feedstock_root/build_artifacts/xgboost-split_1645117836726/work/src/common/device_helpers.cuh(905)
Stack trace:
  [bt] (0) /home/tharun/.conda/envs/torch_env/lib/libxgboost.so(+0x1d0f1f) [0x7fe9613f0f1f]
  [bt] (1) /home/tharun/.conda/envs/torch_env/lib/libxgboost.so(dh::ThrowOnNcclError(ncclResult_t, char const*, int)+0x36f) [0x7fe961661e3f]
  [bt] (2) /home/tharun/.conda/envs/torch_env/lib/libxgboost.so(dh::AllReducer::Init(int)+0x523) [0x7fe9616613b3]
  [bt] (3) /home/tharun/.conda/envs/torch_env/lib/libxgboost.so(xgboost::tree::GPUHistMakerSpecialised<xgboost::detail::GradientPairInternal<double> >::InitDataOnce(xgboost::DMatrix*)+0x132) [0x7fe9618acae2]
  [bt] (4) /home/tharun/.conda/envs/torch_env/lib/libxgboost.so(xgboost::tree::GPUHistMakerSpecialised<xgboost::detail::GradientPairInternal<double> >::Update(xgboost::HostDeviceVector<xgboost::detail::GradientPairInternal<float> >*, xgboost::DMatrix*, std::vector<xgboost::RegTree*, std::allocator<xgboost::RegTree*> > const&)+0x210) [0x7fe9618babf0]
  [bt] (5) /home/tharun/.conda/envs/torch_env/lib/libxgboost.so(xgboost::gbm::GBTree::BoostNewTrees(xgboost::HostDeviceVector<xgboost::detail::GradientPairInternal<float> >*, xgboost::DMatrix*, int, std::vector<std::unique_ptr<xgboost::RegTree, std::default_delete<xgboost::RegTree> >, std::allocator<std::unique_ptr<xgboost::RegTree, std::default_delete<xgboost::RegTree> > > >*)+0xa34) [0x7fe9614c2904]
  [bt] (6) /home/tharun/.conda/envs/torch_env/lib/libxgboost.so(xgboost::gbm::GBTree::DoBoost(xgboost::DMatrix*, xgboost::HostDeviceVector<xgboost::detail::GradientPairInternal<float> >*, xgboost::PredictionCacheEntry*)+0x358) [0x7fe9614c5e98]
  [bt] (7) /home/tharun/.conda/envs/torch_env/lib/libxgboost.so(+0x2c297a) [0x7fe9614e297a]
  [bt] (8) /home/tharun/.conda/envs/torch_env/lib/libxgboost.so(XGBoosterUpdateOneIter+0x7b) [0x7fe9613cfedb]



Stack trace:
  [bt] (0) /home/tharun/.conda/envs/torch_env/lib/libxgboost.so(+0x677df4) [0x7fe961897df4]
  [bt] (1) /home/tharun/.conda/envs/torch_env/lib/libxgboost.so(xgboost::tree::GPUHistMakerSpecialised<xgboost::detail::GradientPairInternal<double> >::Update(xgboost::HostDeviceVector<xgboost::detail::GradientPairInternal<float> >*, xgboost::DMatrix*, std::vector<xgboost::RegTree*, std::allocator<xgboost::RegTree*> > const&)+0x963) [0x7fe9618bb343]
  [bt] (2) /home/tharun/.conda/envs/torch_env/lib/libxgboost.so(xgboost::gbm::GBTree::BoostNewTrees(xgboost::HostDeviceVector<xgboost::detail::GradientPairInternal<float> >*, xgboost::DMatrix*, int, std::vector<std::unique_ptr<xgboost::RegTree, std::default_delete<xgboost::RegTree> >, std::allocator<std::unique_ptr<xgboost::RegTree, std::default_delete<xgboost::RegTree> > > >*)+0xa34) [0x7fe9614c2904]
  [bt] (3) /home/tharun/.conda/envs/torch_env/lib/libxgboost.so(xgboost::gbm::GBTree::DoBoost(xgboost::DMatrix*, xgboost::HostDeviceVector<xgboost::detail::GradientPairInternal<float> >*, xgboost::PredictionCacheEntry*)+0x358) [0x7fe9614c5e98]
  [bt] (4) /home/tharun/.conda/envs/torch_env/lib/libxgboost.so(+0x2c297a) [0x7fe9614e297a]
  [bt] (5) /home/tharun/.conda/envs/torch_env/lib/libxgboost.so(XGBoosterUpdateOneIter+0x7b) [0x7fe9613cfedb]
  [bt] (6) /home/tharun/.conda/envs/torch_env/lib/python3.10/lib-dynload/../../libffi.so.7(+0x69dd) [0x7fe9cf25c9dd]
  [bt] (7) /home/tharun/.conda/envs/torch_env/lib/python3.10/lib-dynload/../../libffi.so.7(+0x6067) [0x7fe9cf25c067]
  [bt] (8) /home/tharun/.conda/envs/torch_env/lib/python3.10/lib-dynload/_ctypes.cpython-310-x86_64-linux-gnu.so(+0x13a25) [0x7fe9cf275a25]



In [None]:
subm_df.to_csv(results_path/"xgb_blend_final_with_xgb.csv", index=False)