## Libraries

In [32]:
import numpy as np
import pandas as pd

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeRegressor

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

## Data Ingestion

In [10]:
!mkdir bike-sharing-dataset
!wget -P bike-sharing-dataset https://archive.ics.uci.edu/ml/machine-learning-databases/00275/Bike-Sharing-Dataset.zip
!tar -zxvf bike-sharing-dataset/Bike-Sharing-Dataset.zip -C bike-sharing-dataset/

A subdirectory or file bike-sharing-dataset already exists.
--2022-04-24 23:35:51--  https://archive.ics.uci.edu/ml/machine-learning-databases/00275/Bike-Sharing-Dataset.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 279992 (273K) [application/x-httpd-php]
Saving to: 'bike-sharing-dataset/Bike-Sharing-Dataset.zip.1'

     0K .......... .......... .......... .......... .......... 18%  125K 2s
    50K .......... .......... .......... .......... .......... 36%  249K 1s
   100K .......... .......... .......... .......... .......... 54% 12.4M 0s
   150K .......... .......... .......... .......... .......... 73% 1.18M 0s
   200K .......... .......... .......... .......... .......... 91%  312K 0s
   250K .......... .......... ...                             100% 31.5M=0.8s

2022-04-24 23:35:53 (339 KB/s) - 'bike-sharing-dat

In [11]:
df_raw = pd.read_csv('bike-sharing-dataset/day.csv')

## Data Cleaning

In [20]:
df_raw = df_raw.drop(columns=['dteday'])

## Modeling

### Split the dataset

In [21]:
X = df_raw.iloc[:, :-1]
y = df_raw.iloc[:, -1]

### DecisionTreeRegressor

In [27]:
reg = DecisionTreeRegressor(random_state=2)
scores = cross_val_score(reg, X, y, scoring='neg_mean_squared_error', cv=5)
rmse = np.sqrt(-scores)
print('RMSE mean: {:.2f}'.format(rmse.mean()))

RMSE mean: 329.50


In [39]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2)

reg = DecisionTreeRegressor(random_state=2)
reg.fit(X_train, y_train)
y_pred = reg.predict(X_train)

reg_mse = mean_squared_error(y_train, y_pred)
reg_rmse = np.sqrt(reg_mse)
reg_rmse

0.0

In [40]:
y_pred = grid_reg.predict(X_test)

reg_mse = mean_squared_error(y_test, y_pred)
reg_rmse = np.sqrt(reg_mse)
reg_rmse

226.48173909186696

## Hyperparameters

In [75]:
params = {'max_depth': [None, 2, 3, 4, 5, 6, 8, 10, 20],
          'min_samples_leaf': [1, 2, 4, 6, 8, 10, 20, 30]
          
          }

reg = DecisionTreeRegressor(random_state=2)
grid_reg = GridSearchCV(reg, params, scoring='neg_mean_squared_error', cv=5, n_jobs=-1)
grid_reg.fit(X_train, y_train)
y_pred = grid_reg.predict(X_train)

reg_mse = mean_squared_error(y_train, y_pred)
reg_rmse = np.sqrt(reg_mse)
reg_rmse

55.73480339072197

In [73]:
best_score = np.sqrt(-grid_reg.best_score_)
print("Validation score: {:.3f}".format(best_score))

Validation score: 232.315


In [74]:
y_pred = grid_reg.predict(X_test)

reg_mse = mean_squared_error(y_test, y_pred)
reg_rmse = np.sqrt(reg_mse)
reg_rmse

217.97289733096662