# Random Forest Regression

## Libraries

In [14]:
import numpy as np
import pandas as pd

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

## Data ingestion

In [2]:
!mkdir bike-sharing-dataset
!wget -P bike-sharing-dataset https://archive.ics.uci.edu/ml/machine-learning-databases/00275/Bike-Sharing-Dataset.zip
!tar -zxvf bike-sharing-dataset/Bike-Sharing-Dataset.zip -C bike-sharing-dataset/

--2022-04-29 15:24:34--  https://archive.ics.uci.edu/ml/machine-learning-databases/00275/Bike-Sharing-Dataset.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 279992 (273K) [application/x-httpd-php]
Saving to: 'bike-sharing-dataset/Bike-Sharing-Dataset.zip'

     0K .......... .......... .......... .......... .......... 18% 69.9K 3s
    50K .......... .......... .......... .......... .......... 36%  279K 2s
   100K .......... .......... .......... .......... .......... 54% 6.58M 1s
   150K .......... .......... .......... .......... .......... 73%  290K 0s
   200K .......... .......... .......... .......... .......... 91% 5.34M 0s
   250K .......... .......... ...                             100% 29.0M=1.1s

2022-04-29 15:24:36 (252 KB/s) - 'bike-sharing-dataset/Bike-Sharing-Dataset.zip' saved [279992/279992]

x Readme

In [36]:
df_raw = pd.read_csv('bike-sharing-dataset/day.csv')

## Data cleaning

In [37]:
df_raw = df_raw.drop(columns=['dteday'])

In [38]:
X = df_raw.iloc[:, :-1]
y = df_raw.iloc[:, -1]

## Modeling

In [10]:
rf = RandomForestRegressor(n_estimators=10, random_state=2, n_jobs=-1)

In [11]:
scores = cross_val_score(rf, X, y, scoring='neg_mean_squared_error', cv=10)

In [12]:
rmse = np.sqrt(-scores)
print('RMSE:', np.round(rmse, 3))
print('RMSE mean:', np.round(rmse.mean(), 3))

RMSE: [136.59  186.361 126.808  97.801 107.66  127.78  184.832 191.071 247.867
 304.205]
RMSE mean: 171.097


## Case

In [43]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2)

In [44]:
def randomized_search_reg(params, runs=16, reg=RandomForestRegressor(random_state=2, n_jobs=-1)):
    rand_reg = RandomizedSearchCV(reg, params, n_iter=runs, scoring='neg_mean_squared_error', cv=10, n_jobs=-1, random_state=2)
    rand_reg.fit(X_train, y_train)
    best_model = rand_reg.best_estimator_
    best_params = rand_reg.best_params_
    print('Best Params:', best_params)
    best_score = np.sqrt(-rand_reg.best_score_)
    print('Training score: {:.2f}'.format(best_score))
    y_pred = best_model.predict(X_test)
    rmse_test = mean_squared_error(y_test, y_pred)**0.5
    print('Test set score: {:.2f}'.format(rmse_test))

In [45]:
randomized_search_reg(params=
                      {'min_weight_fraction_leaf': [0.0, 0.0025, 0.005, 0.0075, 0.01, 0.05],
                       'min_samples_split': [2, 0.01, 0.02, 0.03, 0.04, 0.06, 0.08, 0.1],
                       'min_samples_leaf': [1,2,4,6,8,10,20,30],
                       'min_impurity_decrease': [0.0, 0.01, 0.05, 0.10, 0.15, 0.2],
                       'max_leaf_nodes': [10, 15, 20, 25, 30, 35, 40, 45, 50, None],
                       'max_features': ['auto', 0.8, 0.7, 0.6, 0.5, 0.4],
                       'max_depth':[None,2,4,6,8,10,20]}
                     )

Best Params: {'min_weight_fraction_leaf': 0.0, 'min_samples_split': 0.03, 'min_samples_leaf': 6, 'min_impurity_decrease': 0.05, 'max_leaf_nodes': 25, 'max_features': 0.7, 'max_depth': None}
Training score: 227.96
Test set score: 266.33


In [46]:
randomized_search_reg(params={'min_samples_leaf':
[1,2,4,6,8,10,20,30],'min_impurity_decrease':[0.0,
0.01, 0.05, 0.10, 0.15, 0.2],'max_features':['auto',
0.8, 0.7, 0.6, 0.5, 0.4],'max_depth':
[None,4,6,8,10,12,15,20]}, runs=20)

Best Params: {'min_samples_leaf': 4, 'min_impurity_decrease': 0.1, 'max_features': 0.8, 'max_depth': 8}
Training score: 162.49
Test set score: 195.77


In [47]:
randomized_search_reg(params={'min_samples_leaf':
[1,2,3,4,5,6], 'min_impurity_decrease':[0.0, 0.01,
0.05, 0.08, 0.10, 0.12, 0.15], 'max_features':['auto',
0.8, 0.7, 0.6, 0.5, 0.4],'max_depth':
[None,8,10,12,14,16,18,20]})

Best Params: {'min_samples_leaf': 1, 'min_impurity_decrease': 0.05, 'max_features': 0.7, 'max_depth': 18}
Training score: 153.35
Test set score: 176.60


In [48]:
randomized_search_reg(params={'min_samples_leaf':
[1,2,4,6,8,10,20,30], 'min_impurity_decrease':[0.0,
0.01, 0.05, 0.10, 0.15, 0.2], 'max_features':['auto',
0.8, 0.7, 0.6, 0.5, 0.4],'max_depth':
[None,4,6,8,10,12,15,20],'n_estimators':[100]},
runs=30)

Best Params: {'n_estimators': 100, 'min_samples_leaf': 2, 'min_impurity_decrease': 0.15, 'max_features': 0.8, 'max_depth': 12}
Training score: 140.43
Test set score: 171.65


In [50]:
rf = RandomForestRegressor(n_estimators=100, min_samples_leaf=2, min_impurity_decrease=0.15, max_features=0.8, max_depth=12, n_jobs=-1, random_state=2)
scores = cross_val_score(rf, X, y, scoring='neg_mean_squared_error', cv=10)
rmse = np.sqrt(-scores)

print('RMSE:', np.round(rmse, 3))
print('RMSE mean: %0.3f' % (rmse.mean()))

RMSE: [117.451 148.425 111.988  88.424  87.747 151.587 211.674 157.06  292.634
 369.519]
RMSE mean: 173.651
