In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of each file.
for root, _, files in os.walk('/kaggle/input'):
    for path in (os.path.join(root, entry) for entry in files):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Import the necessary modules

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# Remove the row/column display limits so DataFrames are shown without truncation.
pd.options.display.max_rows = None
pd.options.display.max_columns = None


In [None]:
# Read the Kaggle training CSV into a DataFrame (no CSV column is used as the index)

train = pd.read_csv(
    filepath_or_buffer='/kaggle/input/bike-sharing-demand/train.csv',
    index_col=False,
)


In [None]:
# Metadata from Kaggle

# datetime - hourly date + timestamp  
# season -  1 = spring, 2 = summer, 3 = fall, 4 = winter 
# holiday - whether the day is considered a holiday
# workingday - whether the day is neither a weekend nor holiday
# weather - 1: Clear, Few clouds, Partly cloudy
#           2: Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist
#           3: Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds
#           4: Heavy Rain + Ice Pellets + Thunderstorm + Mist, Snow + Fog
# temp - temperature in Celsius
# atemp - "feels like" temperature in Celsius
# humidity - relative humidity
# windspeed - wind speed
# casual - number of non-registered user rentals initiated
# registered - number of registered user rentals initiated
# count - number of total rentals


In [None]:
# Understand the data: preview the first five rows

train.head(n=5)

In [None]:
# Understand the data: column dtypes and non-null counts

train.info()

# Observation: at a glance, there appear to be no missing values


In [None]:
# Understand the data: flag every column that contains at least one value equal to 0

train.eq(0).any()

# Observations:
# - zeros appear in several columns where a zero is plausible
# - the only column where a zero would be questionable is 'count'
# - casual and registered contain zeros, but the total count does not, which is fine

In [None]:
# Understand the data: smallest value in each column

train.min(axis=0)

# Observation: no unusual or negative values where none are expected (e.g. the count columns)


In [None]:
# Understand the data: first timestamp, last timestamp and number of distinct timestamps

print(
    train['datetime'].min(),
    train['datetime'].max(),
    train['datetime'].nunique(),
    sep='\n',
)

# Observations:
# - the data runs from 1 Jan 2011 to 19 Dec 2012
# - there are no duplicate datetime entries


In [None]:
# Understand the data

# although all data (except datetime) are numerical/continuous, 
# season, holiday, workingday, weather are in fact categorical
# holiday and workingday are already encoded into 0/1 binary
# season and weather are encoded in ordinal fashion - is it better to convert this to one-hot encoding?


In [None]:
# Understand the data: frequency of each level of the categorical columns

for categorical_column in ('holiday', 'workingday', 'season'):
    print(train[categorical_column].value_counts(), "\n")
print(train['weather'].value_counts())

# Observation: no unexpected values for season and weather (all match the metadata)


In [None]:
# Visualize / preliminary observation: total rentals against each continuous weather variable

sns.pairplot(
    data=train,
    x_vars=['temp', 'atemp', 'humidity', 'windspeed'],
    y_vars='count',
    diag_kind=None,
    height=7,
    aspect=0.7,
)

# Observations:
# - at a glance, there is no strong pattern between temperature, humidity or wind speed and total rentals
# - rentals are more frequent when temperature and humidity are moderate (neither too low nor too high)
# - wind speed and rentals appear to be inversely related


In [None]:
# Visualize / preliminary observation: total rentals against each categorical variable

sns.pairplot(
    data=train,
    x_vars=['weather', 'season', 'holiday', 'workingday'],
    y_vars='count',
    diag_kind=None,
    height=7,
    aspect=0.7,
)

# Observations:
# - rental counts are higher when the weather is favourable (clear / few clouds) and almost nil when it is rainy/stormy/snowy
# - rental counts are higher in fall/winter, which seems to contradict the weather observation above
# - likewise, rental counts are higher on non-holidays / working days


In [None]:
# Add date and time columns for additional level of detail

import datetime  # not referenced in this cell; left in place in case other cells rely on it

# Parse the timestamp column once and derive every calendar feature from the
# same parsed Series, instead of re-parsing the column for each feature.
train_timestamps = pd.to_datetime(train['datetime'])

train['year_month'] = train_timestamps.dt.to_period('M')
train['year'] = train_timestamps.dt.year
train['month'] = train_timestamps.dt.month
train['hour'] = train_timestamps.dt.hour


display(train.head())
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() would add a stray "None" line to the output.
train.info()

In [None]:
# Investigate the season column (the season observation above looked contradictory):
# total rentals for every (season, month) combination

train.groupby(by=['season', 'month'])['count'].agg('sum')

# Observation: the season labels do not appear to match the months they cover

In [None]:
# Visualize / preliminary observation: total rentals per month, one line per year

sns.lineplot(
    data=train,
    x='month',
    y='count',
    hue='year',
    estimator='sum',
    ci=None,
)

# Observations:
# - rentals grew in 2012 compared with 2011
# - rentals generally peak in the warmer summer and early-fall months

In [None]:
# Visualize / preliminary observation: total rentals for each year-month period

plt.figure(figsize=(20, 8))
sns.barplot(
    data=train,
    x='year_month',
    y='count',
    estimator=sum,
    ci=None,
    color='lightgreen',
)

# Observations:
# - rentals grew in 2012 compared with 2011
# - rentals generally peak in the warmer summer and early-fall months


In [None]:
# Visualize / preliminary observation: total rentals for each hour of the day, split by year

plt.figure(figsize=(20, 8))
sns.barplot(
    data=train,
    x='hour',
    y='count',
    hue='year',
    estimator=sum,
    ci=None,
)

# Observation: rental counts peak at 8 am and at 5-6 pm


In [None]:
# Check correlation between variables

# Restrict to numeric columns before correlating: 'datetime' holds strings and
# 'year_month' holds periods, and pandas >= 2.0 raises on such columns instead
# of silently dropping them.
corrmat = train.select_dtypes(include='number').corr()

plt.figure(figsize=(12,9))
sns.heatmap(corrmat, annot=True, fmt='.2f')

# features with the highest correlation with count = temp, atemp, humidity, hour
# ignore casual / registered -> high correlation because both add up to count
# even though windspeed, holiday, workingday, month and weather do not show high correlation,
# there is some relationship with count; it is just not linear, as seen in the earlier visualizations

# features with high collinearity:
# season/month
# temp/atemp
# humidity/weather

# interesting to observe that casual rental is more correlated with temperature (temp/atemp) --> rent on a whim depending on temperature
# and registered rental is more correlated with hour --> rent for regular use at similar timing



In [None]:
# Re-check columns and dtypes now that year_month, year, month and hour have been added
train.info()

In [None]:
# Build the feature table (x_train) and the target (y_train).
# Columns left out of the features:
# - datetime / year_month: replaced by the derived year, month and hour columns,
#   which are kept because results vary by month/hour and grow year on year
# - season: labelled wrongly and also linked to month
# - temp: correlated with atemp
# - casual, registered: they add up to count
# - count: the target itself

x_train = train.drop(
    columns=['datetime', 'season', 'temp', 'casual', 'registered', 'count', 'year_month'],
)

y_train = train['count']

print(x_train.columns)


In [None]:
# Put the columns in a fixed order; the preprocessing step selects columns by position

x_train = x_train[
    [
        'atemp', 'humidity', 'windspeed', 'hour', 'month',
        'holiday', 'workingday', 'year', 'weather',
    ]
]
print(x_train.columns)



In [None]:
# Preprocessing pipeline:
# - no imputation step, because no missing values were found
# - stage 1 standardises the numerical columns (those that are not encoded later)
# - stage 2 one-hot encodes two columns selected by position
# Columns are addressed by position, so the column order of the input matters.


from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Stage 1: scale positions 0-4 (atemp, humidity, windspeed, hour, month in the
# rearranged feature table); every other column is passed through unchanged.
scaling_transformer = make_column_transformer(
    (StandardScaler(), list(range(5))),
    remainder='passthrough',
)

# Stage 2: runs on the output of stage 1 and one-hot encodes positions 7 and 8,
# which correspond to year and weather in the rearranged feature table.
# Categories not seen during fitting are ignored rather than raising.
encoding_transformer = make_column_transformer(
    (OneHotEncoder(handle_unknown='ignore'), [7, 8]),
    remainder='passthrough',
)

# Chain the two stages into the single preprocessing object used by the models.
full_transformer = make_pipeline(scaling_transformer, encoding_transformer)

In [None]:
# Define formula to get prediction scores

from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.metrics import mean_squared_error, r2_score

def get_prediction_scores(x_train, y_train, model=None):
    """Fit preprocessing + model on the training data and print in-sample scores.

    The pipeline is fitted on ``x_train``/``y_train`` and evaluated on the very
    same data, so the printed values describe training fit only.

    Parameters
    ----------
    x_train : feature table accepted by the module-level ``full_transformer``.
    y_train : target values matching the rows of ``x_train``.
    model : estimator placed after the preprocessing step. It is passed to the
        pipeline as-is (not cloned here). When omitted, a new
        ``LinearRegression()`` is created for this call, so no estimator
        instance is shared between calls.

    Returns
    -------
    None. The train RMSE and R^2 are printed.
    """
    from sklearn.base import clone

    # A default of ``LinearRegression()`` in the signature would be built once
    # at definition time and then re-fitted by every call that relies on it.
    if model is None:
        model = LinearRegression()

    # Work on an unfitted copy of the shared preprocessing object so that
    # fitting here does not alter the module-level instance that other
    # pipelines also wrap.
    full_pipeline = make_pipeline(clone(full_transformer), model)

    full_pipeline.fit(x_train, y_train)

    y_pred_train = full_pipeline.predict(x_train)

    print(f'Train rMSE: {np.sqrt(mean_squared_error(y_train, y_pred_train))}')
    print(f'Train r2: {r2_score(y_train, y_pred_train)}')
 

In [None]:
# Model 1: linear regression - print training-set scores
model = LinearRegression()
get_prediction_scores(x_train=x_train, y_train=y_train, model=model)


In [None]:
# Model 2: ridge regression (alpha=1) - print training-set scores
model = Ridge(alpha=1)
get_prediction_scores(x_train=x_train, y_train=y_train, model=model)


In [None]:
# Model 3: lasso regression (alpha=1) - print training-set scores
model = Lasso(alpha=1)
get_prediction_scores(x_train=x_train, y_train=y_train, model=model)


In [None]:
# Model 4: elastic net (alpha=1) - print training-set scores
model = ElasticNet(alpha=1)
get_prediction_scores(x_train=x_train, y_train=y_train, model=model)


In [None]:
# Model 5: decision tree limited to depth 10 - print training-set scores

from sklearn.tree import DecisionTreeRegressor

model = DecisionTreeRegressor(max_depth=10)
get_prediction_scores(x_train=x_train, y_train=y_train, model=model)


In [None]:
# Model 6: random forest with trees limited to depth 10 - print training-set scores

from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor(max_depth=10)
get_prediction_scores(x_train=x_train, y_train=y_train, model=model)


In [None]:
# The random forest regressor seems to give the best scores of the models tried,
# so it is cross-validated on the training data (10 folds)

from sklearn.model_selection import cross_val_score

chosen_pipeline = make_pipeline(
    full_transformer,
    RandomForestRegressor(max_depth=10),
)

cv_score = cross_val_score(estimator=chosen_pipeline, X=x_train, y=y_train, cv=10)

print(f'cv_score: {cv_score}')
print(f'mean cv_score: {np.mean(cv_score)}')  # same as cv_score.mean()
print(f'variance cv_score: {np.var(cv_score)}')  # same as cv_score.var()

# Observation: results are quite consistent across folds, with low variance


In [None]:
# We shall now try to fine-tune the hyperparameters using cross-validation and grid search

from sklearn.model_selection import GridSearchCV

# Show the pipeline's step names; the parameter-grid keys are written as '<step name>__<parameter>'
chosen_pipeline.named_steps


In [None]:
# Candidate tree depths for the random-forest step, compared with 5-fold cross-validation
param = {
    'randomforestregressor__max_depth': [5, 8, 10],
}

grid_search = GridSearchCV(estimator=chosen_pipeline, param_grid=param, cv=5)
grid_search.fit(X=x_train, y=y_train)



In [None]:
grid_search.best_estimator_

# Per the original run, the selected model is RandomForestRegressor with max_depth=10;
# this configuration is used below to predict on the test data


In [None]:
# Read the Kaggle test CSV into a DataFrame (no CSV column is used as the index)

test = pd.read_csv(
    filepath_or_buffer='/kaggle/input/bike-sharing-demand/test.csv',
    index_col=False,
)



In [None]:
# Perform the same exploratory data analysis as for the train data

display(test.head())
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() would add a stray "None" line to the output.
test.info()

# Observation: likewise, at a glance there appear to be no missing values

In [None]:
# Same check as for the train data: flag every column that contains at least one 0

test.eq(0).any()

# Observation: no unusual or unexpected zero values => ok


In [None]:
# Same check as for the train data: smallest value in each column

test.min(axis=0)

# Observation: again, no unusual or negative values where none are expected


In [None]:
# Same check as for the train data: first timestamp, last timestamp, number of distinct timestamps

print(
    test['datetime'].min(),
    test['datetime'].max(),
    test['datetime'].nunique(),
    sep='\n',
)

# Observations:
# - the data runs from 20 Jan 2011 to 31 Dec 2012, consistent with the data description
#   (test data covers the 20th to the end of each month)
# - there are no duplicate datetime entries

In [None]:
# Same check as for the train data: frequency of each level of the categorical columns

for categorical_column in ('holiday', 'workingday', 'season'):
    print(test[categorical_column].value_counts(), "\n")
print(test['weather'].value_counts())

# Observation: likewise, no unexpected values for season and weather (all match the metadata)

In [None]:
# Add date and time columns for additional level of detail

# Parse the timestamp column once and derive every calendar feature from the
# same parsed Series, instead of re-parsing the column for each feature.
test_timestamps = pd.to_datetime(test['datetime'])

test['year_month'] = test_timestamps.dt.to_period('M')
test['year'] = test_timestamps.dt.year
test['month'] = test_timestamps.dt.month
test['hour'] = test_timestamps.dt.hour


display(test.head())
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() would add a stray "None" line to the output.
test.info()

In [None]:
# Build the test feature table by removing the columns that are not used as features

x_test = test.drop(columns=['datetime', 'season', 'temp', 'year_month'])


print(x_test.columns)

In [None]:
# Put the columns in a fixed order; the preprocessing step selects columns by position

x_test = x_test[
    [
        'atemp', 'humidity', 'windspeed', 'hour', 'month',
        'holiday', 'workingday', 'year', 'weather',
    ]
]
print(x_test.columns)

In [None]:
# Final look at the feature columns of the test table before predicting
x_test.columns

In [None]:
# now that we have chosen the model (RandomForestRegressor with max_depth=10)
# we shall use this to predict y

def get_predicted_y(x_train, y_train, x_test, model=None):
    """Fit preprocessing + model on the training data and write the submission file.

    The fitted pipeline predicts on ``x_test``; the predictions are paired with
    the ``datetime`` column of the module-level ``test`` DataFrame and written
    to ``submission.csv`` without the index.

    Parameters
    ----------
    x_train : feature table accepted by the module-level ``full_transformer``.
    y_train : target values matching the rows of ``x_train``.
    x_test : feature table to predict on; its rows are expected to be in the
        same order as the rows of ``test``.
    model : estimator placed after the preprocessing step. It is passed to the
        pipeline as-is (not cloned here). When omitted, a new
        ``RandomForestRegressor(max_depth=10)`` is created for this call, so no
        estimator instance is shared between calls.

    Returns
    -------
    None. The result is the ``submission.csv`` file.
    """
    from sklearn.base import clone

    # A default estimator in the signature would be built once at definition
    # time and then re-fitted by every call that relies on it.
    if model is None:
        model = RandomForestRegressor(max_depth=10)

    # Work on an unfitted copy of the shared preprocessing object so that
    # fitting here does not alter the module-level instance.
    new_pipeline = make_pipeline(clone(full_transformer), model)

    new_pipeline.fit(x_train, y_train)

    predicted_counts = new_pipeline.predict(x_test)

    # Pair timestamps and predictions by position instead of by index label:
    # a length mismatch then raises immediately rather than producing NaN rows.
    submission = pd.DataFrame({
        'datetime': test['datetime'].to_numpy(),
        'count': predicted_counts,
    })

    submission.to_csv('submission.csv', index=False)



In [None]:
# Fit the chosen random forest on the full training data and write submission.csv
model = RandomForestRegressor(max_depth=10)
get_predicted_y(x_train=x_train, y_train=y_train, x_test=x_test, model=model)
