# Decision Tree Regression on Bike Sharing Dataset

Predict the hourly or daily bike rental count based on the environmental and seasonal settings recorded in the dataset.

In [None]:
# Imports
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

# Section A - Exploratory Data Analysis

In [None]:
# Read the hourly bike-sharing records into a DataFrame, then print a schema
# summary (column names, non-null counts, dtypes) as a first sanity check.
hour_df = pd.read_csv(filepath_or_buffer="../input/bike-sharing-dataset/hour.csv")
hour_df.info()

**Columns Information**

- instant: record index
- dteday : date
- season : season (1:spring, 2:summer, 3:fall, 4:winter)
- yr : year (0: 2011, 1:2012)
- mnth : month ( 1 to 12)
- hr : hour (0 to 23)
- holiday : whether the day is a holiday or not (extracted from http://dchr.dc.gov/page/holiday-schedule)
- weekday : day of the week
- workingday : 1 if the day is neither a weekend nor a holiday, otherwise 0.
- weathersit : 
	- 1: Clear, Few clouds, Partly cloudy
	- 2: Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist
	- 3: Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds
	- 4: Heavy Rain + Ice Pellets + Thunderstorm + Mist, Snow + Fog
- temp : Normalized temperature in Celsius. The values are divided by 41 (max)
- atemp: Normalized feeling temperature in Celsius. The values are divided by 50 (max)
- hum: Normalized humidity. The values are divided by 100 (max)
- windspeed: Normalized wind speed. The values are divided by 67 (max)
- casual: count of casual users
- registered: count of registered users
- cnt: count of total rental bikes including both casual and registered


In [None]:
# Replace the terse source column names with descriptive ones.
# The renamed frame is assigned back to `hour_df` rather than mutated with
# `inplace=True`: in-place mutation brings no performance benefit, prevents
# method chaining, and makes notebook state harder to reason about.
hour_df = hour_df.rename(columns={'instant': 'rec_id',
                                  'dteday': 'datetime',
                                  'holiday': 'is_holiday',
                                  'workingday': 'is_workingday',
                                  'weathersit': 'weather_situation',
                                  'hum': 'humidity',
                                  'mnth': 'month',
                                  'cnt': 'total_count',
                                  'hr': 'hour',
                                  'yr': 'year'})

In [None]:
# Preview the first rows (pandas shows five by default) to eyeball the values.
hour_df.head()

In [None]:
# Dimensions of the frame as (number of rows, number of columns).
hour_df.shape

In [None]:
# Number of missing values in each column.
hour_df.isna().sum()

In [None]:
# Number of rows that exactly repeat an earlier row.
hour_df.duplicated().sum()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
hour_df.describe()

Since we do not yet know which columns matter for predicting the rental count, we start by considering all of them.

In [None]:
# List the column names available for plotting and modelling.
hour_df.columns

In [None]:
# Scatter-plot matrix: every column of the frame plotted against every other
# column, as a first visual look at how the variables relate to one another.
sns.pairplot(data=hour_df, kind='scatter',
             x_vars=hour_df.columns, y_vars=hour_df.columns)
plt.tight_layout()

In [None]:
# Single row of scatter plots: each column of the frame on the x-axis against
# the target, `total_count`, on the y-axis.
sns.pairplot(data=hour_df, kind='scatter',
             x_vars=hour_df.columns, y_vars=['total_count'])
plt.tight_layout()

From the scatter plots above, it is evident that
* `total_count` is directly related to the number of casual and registered users (it includes both).
* `atemp` and `temp` are correlated with each other.

This can be confirmed with heatmap and correlation coefficients.

In [None]:
# Heatmap of the pairwise correlation coefficients between the numeric columns.
# Only numeric columns are passed to `corr()`: the frame also holds the date
# column as strings, and pandas >= 2.0 raises on non-numeric columns instead of
# silently dropping them as older versions did.
numeric_df = hour_df.select_dtypes(include='number')
fig, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(numeric_df.corr(), annot=True, ax=ax)
plt.show()

# Section B

Feature Engineering

In [None]:
# Re-list the column names as a reference for choosing the model features.
hour_df.columns

In [None]:
# Feature matrix and target.
# `casual` and `registered` are deliberately NOT used as features: the target
# `total_count` is the total of those two columns, so including them leaks the
# answer into the inputs and yields a meaningless, near-perfect score. They are
# also unknown at prediction time, since they are outcomes, not conditions.
X = hour_df[['season', 'year', 'month', 'hour', 'is_holiday',
             'weekday', 'is_workingday', 'weather_situation', 'temp',
             'humidity', 'windspeed']]
y = hour_df['total_count']

# Section C

Model Fit

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation. `random_state` pins the shuffle so
# the split, and every metric computed from it, is identical on each run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Fit a regression tree with default hyper-parameters on the training split.
# `random_state` is fixed because the tree builder permutes features at each
# split, so ties between equally good splits could otherwise be resolved
# differently from run to run.
model = DecisionTreeRegressor(random_state=42)
model.fit(X_train, y_train)

In [None]:
# Predict rental counts for the held-out test features.
model_predictions = model.predict(X_test)

In [None]:
# For a regressor, `score` reports the coefficient of determination (R^2) on the given data.
print(model.score(X_test, y_test))

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, explained_variance_score, r2_score

# Evaluate the predictions on the held-out test split.
# MSE is computed once and reused. RMSE is derived as its square root rather
# than via `mean_squared_error(..., squared=False)`, because the `squared`
# argument is deprecated in scikit-learn 1.4 and removed in 1.6.
mse = mean_squared_error(y_test, model_predictions)
rmse = np.sqrt(mse)

print('MAE: ', mean_absolute_error(y_test, model_predictions))
print('MSE: ', mse)
print('EVS: ', explained_variance_score(y_test, model_predictions))
print('R2 Score: ', r2_score(y_test, model_predictions))
print('RMSE: ', rmse)

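An R2 of 1 indicates that the regression predictions fit the test data perfectly. Treat a score this close to 1 with suspicion, though: `total_count` is the total of `casual` and `registered`, so whenever those two columns are among the features the tree can reproduce the target almost exactly, and the score reflects target leakage rather than genuine predictive power.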

In [None]:
from sklearn import tree

# Print the learned decision rules as indented text.
# Passing the training column names labels each split with the real feature
# name instead of an anonymous `feature_N` placeholder.
text_representation = tree.export_text(model, feature_names=list(X.columns))
print(text_representation)

In [None]:
# Optional: draw the fitted decision tree.
# Rendering is off by default because drawing a large tree is slow; set
# PLOT_TREE to True to produce the figure. This replaces code that was parked
# inside a bare string literal, which the notebook echoed back as cell output.
PLOT_TREE = False

if PLOT_TREE:
    fig, ax = plt.subplots(figsize=(25, 20))
    tree.plot_tree(model,
                   # Take the names from X so the labels always match the
                   # columns the model was actually trained on.
                   feature_names=list(X.columns),
                   # Only the top levels are drawn; a full-depth tree is unreadable.
                   max_depth=3,
                   # `class_names` is omitted: it applies to classifiers only.
                   filled=True,
                   ax=ax)
    plt.show()