## Libraries

In [43]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.metrics import mean_squared_error

## Data Ingestion

In [4]:
!mkdir bike-sharing-dataset
!wget -P bike-sharing-dataset https://archive.ics.uci.edu/ml/machine-learning-databases/00275/Bike-Sharing-Dataset.zip
!tar -zxvf bike-sharing-dataset/Bike-Sharing-Dataset.zip -C bike-sharing-dataset/

--2022-05-01 20:02:01--  https://archive.ics.uci.edu/ml/machine-learning-databases/00275/Bike-Sharing-Dataset.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 279992 (273K) [application/x-httpd-php]
Saving to: 'bike-sharing-dataset/Bike-Sharing-Dataset.zip'

     0K .......... .......... .......... .......... .......... 18%  139K 2s
    50K .......... .......... .......... .......... .......... 36%  133K 1s
   100K .......... .......... .......... .......... .......... 54% 11.3M 1s
   150K .......... .......... .......... .......... .......... 73%  280K 0s
   200K .......... .......... .......... .......... .......... 91% 3.69M 0s
   250K .......... .......... ...                             100% 29.4M=0.9s

2022-05-01 20:02:04 (293 KB/s) - 'bike-sharing-dataset/Bike-Sharing-Dataset.zip' saved [279992/279992]

x Readme

In [5]:
df_raw = pd.read_csv('bike-sharing-dataset/day.csv')

## Data Cleaning

In [6]:
df_raw = df_raw.drop(columns=['dteday'])

## Train-test split

In [7]:
X = df_raw.iloc[:, :-1]
y = df_raw.iloc[:, -1]

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2)

## Gradient Boosting Model from Scratch

### First tree

In [13]:
tree_1 = DecisionTreeRegressor(max_depth=2, random_state=2)
tree_1.fit(X_train, y_train)
y_train_pred = tree_1.predict(X_train)
y2_train = y_train - y_train_pred

### Second tree

In [14]:
tree_2 = DecisionTreeRegressor(max_depth=2, random_state=2)
tree_2.fit(X_train, y2_train)
y2_train_pred = tree_2.predict(X_train)
y3_train = y2_train - y2_train_pred

### Third tree

In [16]:
tree_3 = DecisionTreeRegressor(max_depth=2, random_state=2)
tree_3.fit(X_train, y3_train);

### Results

In [17]:
y1_pred = tree_1.predict(X_test)
y2_pred = tree_2.predict(X_test)
y3_pred = tree_3.predict(X_test)

In [18]:
y_pred = y1_pred + y2_pred + y3_pred

Tree 1

In [21]:
mean_squared_error(y_test, y1_pred)**0.5

914.615152637184

Tree 2

In [22]:
mean_squared_error(y_test, y2_pred)**0.5

4888.852579397962

Tree 3

In [23]:
mean_squared_error(y_test, y3_pred)**0.5

5077.739428116499

Ensemble

In [20]:
mean_squared_error(y_test, y_pred)**0.5

505.62637078946926

## GradientBoostingRegressor from sklearn

Reproducing the results above.

In [26]:
gbr = GradientBoostingRegressor(max_depth=2, n_estimators=3, random_state=2, learning_rate=1.0)
gbr.fit(X_train, y_train)
y_pred = gbr.predict(X_test)
mean_squared_error(y_test, y_pred)**0.5

505.62637078946875

Results with 30 estimators.

In [27]:
gbr = GradientBoostingRegressor(max_depth=2, n_estimators=30, random_state=2, learning_rate=1.0)
gbr.fit(X_train, y_train)
y_pred = gbr.predict(X_test)
mean_squared_error(y_test, y_pred)**0.5

219.62831847757047

Results with 300 estimators.

In [28]:
gbr = GradientBoostingRegressor(max_depth=2, n_estimators=300, random_state=2, learning_rate=1.0)
gbr.fit(X_train, y_train)
y_pred = gbr.predict(X_test)
mean_squared_error(y_test, y_pred)**0.5

192.6698160666761

Results with 3000 estimators. (worse results)

In [29]:
gbr = GradientBoostingRegressor(max_depth=2, n_estimators=3000, random_state=2, learning_rate=1.0)
gbr.fit(X_train, y_train)
y_pred = gbr.predict(X_test)
mean_squared_error(y_test, y_pred)**0.5

194.098879182156

Results with 3000 estimators, but default learning_rate

In [30]:
gbr = GradientBoostingRegressor(max_depth=2, n_estimators=3000, random_state=2)
gbr.fit(X_train, y_train)
y_pred = gbr.predict(X_test)
mean_squared_error(y_test, y_pred)**0.5

94.90085398362392

## Hyperparameters

### learning_rate

In [36]:
learning_rate_values = np.arange(0.1, 1.1, 0.1)

for lr in learning_rate_values:
    gbr = GradientBoostingRegressor(max_depth=2, n_estimators=300, random_state=2, learning_rate=lr)
    gbr.fit(X_train, y_train)
    y_pred = gbr.predict(X_test)
    rmse = mean_squared_error(y_test, y_pred)**0.5
    print('Learning rate: {:.3f} RMSE: {:.3f}'.format(lr, rmse))

Learning rate: 0.100 RMSE: 109.430
Learning rate: 0.200 RMSE: 97.276
Learning rate: 0.300 RMSE: 130.219
Learning rate: 0.400 RMSE: 128.424
Learning rate: 0.500 RMSE: 167.942
Learning rate: 0.600 RMSE: 156.558
Learning rate: 0.700 RMSE: 134.542
Learning rate: 0.800 RMSE: 120.627
Learning rate: 0.900 RMSE: 139.483
Learning rate: 1.000 RMSE: 192.670


## max_depth

In [41]:
depths = [None, 1, 2, 3, 4]

for depth in depths:
    gbr = GradientBoostingRegressor(max_depth=depth, n_estimators=300, random_state=2, learning_rate=0.2)
    gbr.fit(X_train, y_train)
    y_pred = gbr.predict(X_test)
    rmse = mean_squared_error(y_test, y_pred)**0.5
    print('Max Depth:', depth, ', Score:', rmse)

Max Depth: None , Score: 214.8307991962222
Max Depth: 1 , Score: 139.31555126709327
Max Depth: 2 , Score: 97.27603855685629
Max Depth: 3 , Score: 138.31768577845503
Max Depth: 4 , Score: 122.75612942520434


### subsample

In [42]:
samples = [1, 0.9, 0.8, 0.7, 0.6, 0.5]

for sample in samples:
    gbr = GradientBoostingRegressor(max_depth=2, n_estimators=300, subsample=sample, random_state=2, learning_rate=.2)
    gbr.fit(X_train, y_train)
    y_pred = gbr.predict(X_test)
    rmse = mean_squared_error(y_test, y_pred)**0.5
    print('Subsample:', sample, ', Score:', rmse)

Subsample: 1 , Score: 97.27603855685629
Subsample: 0.9 , Score: 114.66071038389832
Subsample: 0.8 , Score: 116.8070546058355
Subsample: 0.7 , Score: 109.25722905883788
Subsample: 0.6 , Score: 122.2876603652623
Subsample: 0.5 , Score: 115.80226843747677


## Tuning

In [72]:
gbr = GradientBoostingRegressor(max_depth=2, n_estimators=300, subsample=sample, random_state=2, learning_rate=.2)

In [73]:
params={'subsample':[0.65, 0.7, 0.75, 1],
        'n_estimators':[30, 300, 500, 1000, 2000, 3000],
        'learning_rate':[0.05, 0.075, 0.1, 0.2]}

In [74]:
%%time
rand_reg = GridSearchCV(gbr, params, scoring='neg_mean_squared_error', cv=5, n_jobs=-1)
rand_reg.fit(X_train, y_train)

CPU times: total: 2.8 s
Wall time: 51.9 s


GridSearchCV(cv=5,
             estimator=GradientBoostingRegressor(learning_rate=0.2, max_depth=2,
                                                 n_estimators=300,
                                                 random_state=2,
                                                 subsample=0.5),
             n_jobs=-1,
             param_grid={'learning_rate': [0.05, 0.075, 0.1, 0.2],
                         'n_estimators': [30, 300, 500, 1000, 2000, 3000],
                         'subsample': [0.65, 0.7, 0.75, 1]},
             scoring='neg_mean_squared_error')

In [75]:
best_model = rand_reg.best_estimator_
best_params = rand_reg.best_params_
best_score = np.sqrt(-rand_reg.best_score_)

In [78]:
best_model

GradientBoostingRegressor(learning_rate=0.05, max_depth=2, n_estimators=3000,
                          random_state=2, subsample=0.7)

In [80]:
best_params

{'learning_rate': 0.05, 'n_estimators': 3000, 'subsample': 0.7}

In [77]:
print("Training score: {:.3f}".format(best_score))
y_pred = best_model.predict(X_test)
rmse_test = mean_squared_error(y_test, y_pred)**0.5
print('Test set score: {:.3f}'.format(rmse_test))

Training score: 94.426
Test set score: 80.781


## XGBoosting Regressor

In [79]:
from xgboost import XGBRegressor

In [None]:
params={'subsample':[0.65, 0.7, 0.75, 1],
        'n_estimators':[30, 300, 500, 1000, 2000, 3000],
        'eta':[0.05, 0.075, 0.1, 0.2]}

In [86]:
xg_reg = XGBRegressor(learning_rate=0.05, max_depth=2, n_estimators=3000,
                          random_state=2, subsample=0.7)
xg_reg.fit(X_train, y_train)
y_pred = xg_reg.predict(X_test)
rmse_test = mean_squared_error(y_test, y_pred)**0.5
print('Test set score: {:.3f}'.format(rmse_test))

Test set score: 87.575


In [87]:
%%time
rand_reg = GridSearchCV(xg_reg, params, scoring='neg_mean_squared_error', cv=5, n_jobs=-1)
rand_reg.fit(X_train, y_train)

CPU times: total: 13 s
Wall time: 35.4 s


GridSearchCV(cv=5,
             estimator=XGBRegressor(base_score=0.5, booster='gbtree',
                                    callbacks=None, colsample_bylevel=1,
                                    colsample_bynode=1, colsample_bytree=1,
                                    early_stopping_rounds=None,
                                    enable_categorical=False, eval_metric=None,
                                    gamma=0, gpu_id=-1, grow_policy='depthwise',
                                    importance_type=None,
                                    interaction_constraints='',
                                    learning_rate=0.05, max_bin=256,
                                    max_cat_to_one...
                                    max_depth=2, max_leaves=0,
                                    min_child_weight=1, missing=nan,
                                    monotone_constraints='()',
                                    n_estimators=3000, n_jobs=0,
                                

In [88]:
best_model = rand_reg.best_estimator_
best_params = rand_reg.best_params_
best_score = np.sqrt(-rand_reg.best_score_)

In [89]:
best_model

XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
             early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
             importance_type=None, interaction_constraints='',
             learning_rate=0.05, max_bin=256, max_cat_to_onehot=4,
             max_delta_step=0, max_depth=2, max_leaves=0, min_child_weight=1,
             missing=nan, monotone_constraints='()', n_estimators=3000,
             n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=2,
             reg_alpha=0, reg_lambda=1, ...)

In [91]:
best_params

{'learning_rate': 0.05, 'n_estimators': 3000, 'subsample': 0.7}

In [90]:
print("Training score: {:.3f}".format(best_score))
y_pred = best_model.predict(X_test)
rmse_test = mean_squared_error(y_test, y_pred)**0.5
print('Test set score: {:.3f}'.format(rmse_test))

Training score: 96.755
Test set score: 87.575
