In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

# Ignore  the warnings
import warnings
warnings.filterwarnings('always')
warnings.filterwarnings('ignore')

#Import libraries

#Import libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import math
import seaborn as sns 
from sklearn.metrics import mean_squared_error, mean_squared_log_error
from datetime import datetime
import statsmodels.formula.api as sm

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Print the full path of every file found anywhere below the Kaggle input directory.
input_paths = (
    os.path.join(folder, file_name)
    for folder, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **Read Training and Test Data**

In [None]:
# Read the competition CSVs, then keep independent copies (`df`, `test_df`) of the frames
# exactly as they were loaded.
train_data = pd.read_csv(r'../input/bike-sharing-demand/train.csv')
test_data = pd.read_csv(r'../input/bike-sharing-demand/test.csv')
df, test_df = train_data.copy(), test_data.copy()
df.head()

In [None]:
#Describe dataset
# Summary statistics of the training frame, shown as the cell output.
train_data.describe() 

# # A. Data Preprocessing

In [None]:
# True if at least one cell anywhere in the training frame is missing
train_data.isna().to_numpy().any()

In [None]:
# Data Exploration
sns.barplot(x='season', y='count', data=train_data)

In [None]:
sns.barplot(x='weather', y='count', data=train_data)

In [None]:
train_data[['count', 'holiday']].groupby(['holiday'], as_index = True).mean().sort_values(by = 'count')

In [None]:
train_data[['count', 'season']].groupby(['season'], as_index = True).mean().sort_values(by = 'count')

In [None]:
# The 'datetime' column is split into separate calendar columns (hour, day of week, month,
# year) that the models can use directly. The timestamps are parsed once and the
# vectorised accessors are used instead of re-parsing and looping for every new column.
timestamps = pd.DatetimeIndex(train_data.datetime)
train_data["hour"] = timestamps.hour
train_data["day"] = timestamps.dayofweek  # day of the week, not day of the month
train_data["month"] = timestamps.month
train_data['year'] = timestamps.year

In [None]:
#Box plot
# 3x2 grid: the first panel shows 'count' on its own, every other panel shows 'count'
# grouped by one column. Each entry pairs the grouping column with the axis labels/title.
fig, axes = plt.subplots(nrows=3, ncols=2)
fig.set_size_inches(15, 15)

panel_specs = [
    (None, dict(ylabel='Count', title="Box Plot On Count")),
    ("month", dict(xlabel='Month', ylabel='Count', title="Box Plot On Count Across Months")),
    ("weather", dict(xlabel='Weather Situation', ylabel='Count', title="Box Plot On Count Across Weather Situations")),
    ("workingday", dict(xlabel='Working Day', ylabel='Count', title="Box Plot On Count Across Working Day")),
    ("hour", dict(xlabel='Hour Of The Day', ylabel='Count', title="Box Plot On Count Across Hour Of The Day")),
    ("temp", dict(xlabel='Temperature', ylabel='Count', title="Box Plot On Count Across Temperature")),
]

# axes.ravel() walks the grid row by row, matching the order of panel_specs.
for panel, (group_col, labels) in zip(axes.ravel(), panel_specs):
    sns.boxplot(data=train_data, y="count", x=group_col, orient="v", ax=panel)
    panel.set(**labels)

In [None]:
# The datetime column is no longer needed: its information now lives in the
# hour / day / month / year columns created above.
train_data = train_data.drop(columns='datetime')
 

In [None]:
# Pairwise scatter plots of all remaining columns, to eyeball correlations
sns.pairplot(data=train_data)

In [None]:
# 'holiday' is removed because it is highly correlated with the 'workingday' column
train_data = train_data.drop(columns='holiday')
 

In [None]:
# Dropping atemp column as it is highly correlated to the 'temp' column
# NOTE(review): the frame returned by drop() is only displayed here and never assigned back,
# so 'atemp' is still a column of train_data after this cell. Confirm whether it is meant to
# stay in the feature set; if not, the test-set preparation must be changed at the same time.
train_data.drop(columns='atemp')

In [None]:
# Only two years occur (2011 and 2012), so they are encoded as 0 and 1 respectively.
year_codes = {2011:0, 2012:1}
train_data['year'] = train_data['year'].map(year_codes)


In [None]:
# Check how 'count' relates to the sum of the 'casual' and 'registered' columns

riders_total = train_data['casual'].add(train_data['registered'])
plt.scatter(x=riders_total, y=train_data['count'])
plt.show()

In [None]:
# Remove the 'registered' and 'casual' columns from the frame
train_data = train_data.drop(columns=['registered', 'casual'])

# # B. Applying Machine Learning Models

In [None]:
X, y = train_data.iloc[:, :], train_data['count']

In [None]:
X = X.drop('count',axis=1)

In [None]:
# Feature scaling is done with scikit-learn's StandardScaler (a hand-written min-max
# normaliser was sketched here earlier and is no longer used).
from sklearn.preprocessing import StandardScaler
scl = StandardScaler()

In [None]:
#X = scl.fit_transform(X)
#y = scl.fit_transform(y)

In [None]:
from sklearn.model_selection import  train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [None]:

X_train = scl.fit_transform(X_train)
X_test = scl.transform(X_test)

In [None]:
print(X_train.shape)
print(X_test.shape)

In [None]:
print(y_train.shape)
print(y_test.shape)

# # C. Ridge Regression

In [None]:
from sklearn.linear_model import Ridge
reg5 = Ridge(alpha=0.05, normalize=True)
reg5.fit(X_train,y_train)
reg5.score(X_train,y_train)

In [None]:
Ridge = reg5.predict(X_test)
Ridge

In [None]:
print(reg5.intercept_)
print(reg5.coef_)

In [None]:
sns.regplot(y_test,Ridge)
plt.title('Residual Analysis - Ridge_Regression')
plt.xlabel('Observed')
plt.ylabel('Residual')
plt.show()

In [None]:
from sklearn import metrics
print("MAE:", metrics.mean_absolute_error(y_test,Ridge))
print('MSE:', metrics.mean_squared_error(y_test, Ridge))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, Ridge)))

In [None]:
# grid search hyperparameters for ridge regression
from numpy import arange
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedKFold
from sklearn.linear_model import Ridge


In [None]:
# Grid search over the Ridge penalty strength `alpha`, evaluated with repeated 10-fold CV
# on the training split.
model = Ridge()
# evaluation scheme (number of repeats is left at the library default)
cv = RepeatedKFold(n_splits=10, random_state=1)
# candidate values 0.00, 0.01, ..., 0.99
grid = {'alpha': arange(0, 1, 0.01)}
search = GridSearchCV(model, grid, scoring='neg_mean_squared_error', cv=cv)
# perform the search
results = search.fit(X_train, y_train)
# The scorer is 'neg_mean_squared_error', so best_score_ is the negated MSE;
# flip the sign so that the number printed under "MSE" really is the MSE.
print('MSE: %.3f' % -results.best_score_)
print('Config: %s' % results.best_params_)

# # D. Lasso Regression


In [None]:
from sklearn.linear_model import Lasso
reg6 = Lasso(alpha=0.3, normalize=True)
reg6.fit(X_train,y_train)
reg6.score(X_train,y_train)

In [None]:
Lasso = reg6.predict(X_test)
Lasso

In [None]:
print(reg6.intercept_)
print(reg6.coef_)

In [None]:
sns.regplot(y_test,Lasso)
plt.title('Residual Analysis - Lasso Regression')
plt.xlabel('Observed')
plt.ylabel('Residual')
plt.show()

In [None]:
print("MAE:", metrics.mean_absolute_error(y_test,Lasso))
print('MSE:', metrics.mean_squared_error(y_test, Lasso))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, Lasso)))

In [None]:
# 10 fold CV
from sklearn.linear_model import LassoCV
# evaluation scheme: repeated 10-fold CV (number of repeats left at the library default)
cv = RepeatedKFold(n_splits=10, random_state=1)
# Candidate penalties 0.01, 0.02, ..., 0.99. The leading 0 of the original grid is skipped:
# the Lasso documentation advises against alpha = 0 for numerical reasons (it is plain
# least squares, which the coordinate-descent solver handles poorly).
lasso_alphas = arange(0, 1, 0.01)[1:]
model = LassoCV(alphas=lasso_alphas, cv=cv)
# fit model
model.fit(X_train, y_train)
# summarize chosen configuration
print('alpha: %f' % model.alpha_)

# # E. Regression Tree

In [None]:
from sklearn.tree import DecisionTreeRegressor
reg4 = DecisionTreeRegressor()
reg4.fit(X_train,y_train)
reg4.score(X_train,y_train)

In [None]:
Dec_Tree = reg4.predict(X_test)
Dec_Tree

In [None]:
sns.regplot(y_test,Dec_Tree)
plt.title('Residual Analysis - Decision Tree Regression')
plt.xlabel('Observed')
plt.ylabel('Residual')
plt.show()

In [None]:
print("MAE:", metrics.mean_absolute_error(y_test,Dec_Tree ))
print('MSE:', metrics.mean_squared_error(y_test, Dec_Tree))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, Dec_Tree)))

In [None]:
# 10 Fold
# Cross-validated search over the tree depth (3..19), scored by negated MSE
parameters = {'max_depth': range(3, 20)}
clf = GridSearchCV(
    estimator=reg4,
    param_grid=parameters,
    scoring='neg_mean_squared_error',
    cv=10,
)
clf.fit(X_train, y_train)
tree_model = clf.best_estimator_
print(clf.best_score_, clf.best_params_)

# # F. Decision Tree with Pruning

In [None]:
# Pruning the Tree
from sklearn.metrics import mean_squared_error, r2_score
# Minimum observations at the internal node approach
regtree2 = DecisionTreeRegressor(min_samples_split = 3)
regtree2.fit(X_train, y_train)

In [None]:
# Prediction
test_pred2 = regtree2.predict(X_test)
train_pred2 = regtree2.predict(X_train)

In [None]:
# Error on test dataset
mean_squared_error(y_test, test_pred2)
r2_score(y_test, test_pred2)

In [None]:
# Error on train dataset
mean_squared_error(y_train, train_pred2)
r2_score(y_train, train_pred2)

In [None]:
## Minimum observations at the leaf node approach
# (fixed seed so that the fitted tree and its metrics are repeatable)
regtree3 = DecisionTreeRegressor(min_samples_leaf = 3, random_state=1)
regtree3.fit(X_train, y_train)

# Prediction
test_pred3 = regtree3.predict(X_test)
train_pred3 = regtree3.predict(X_train)

# All four metrics are printed explicitly: a cell only echoes its last expression, so the
# bare metric calls used before were computed and then silently discarded.
# measure of error on test dataset
print('MSE test: %.3f' % mean_squared_error(y_test, test_pred3))
print('R2 test: %.3f' % r2_score(y_test, test_pred3))

# measure of error on train dataset
print('MSE train: %.3f' % mean_squared_error(y_train, train_pred3))
print('R2 train: %.3f' % r2_score(y_train, train_pred3))

In [None]:
# 10 Fold DT pruning with leaf node apporoach
parameters = {'max_depth':range(3,20)}
clf = GridSearchCV(regtree3, parameters,scoring='neg_mean_squared_error' ,cv=10)
clf.fit(X=X_train, y=y_train)
tree_model = clf.best_estimator_
print (clf.best_score_, clf.best_params_)

In [None]:
# 10 Fold DT pruning internal node approach
parameters = {'max_depth':range(3,20)}
clf = GridSearchCV(regtree2, parameters,scoring='neg_mean_squared_error', cv=10)
clf.fit(X=X_train, y=y_train)
tree_model = clf.best_estimator_
print (clf.best_score_, clf.best_params_)

# # G. Random Forest

In [None]:

from sklearn.ensemble import RandomForestRegressor
# 400-tree forest with a fixed seed, trained on all available cores.
# `criterion` is left at its default (squared error): the legacy spelling 'mse' is rejected
# by current scikit-learn, while the default works on old and new releases alike.
forest = RandomForestRegressor(n_estimators = 400, random_state=1, n_jobs=-1)
forest.fit(X_train, y_train)
# Predictions for both splits, used by the metric and plot cells that follow
y_train_pred = forest.predict(X_train)
y_test_pred = forest.predict(X_test)


In [None]:
from sklearn.metrics import mean_squared_error, r2_score
# Root Mean Squared Log Error (RMSLE) is the accuracy criterion for this problem;
# it is computed here as the RMSE between log(actual + 1) and log(predicted + 1).
rmsle_report = [
    ('RMSLE train: %.3f', y_train, y_train_pred),
    ('RMSLE test: %.3f', y_test, y_test_pred),
]
for line_format, actual, predicted in rmsle_report:
    log_actual = np.log(actual + 1)
    log_predicted = np.log(predicted + 1)
    print(line_format % np.sqrt(mean_squared_error(log_actual, log_predicted)))

r2_report = [
    ('R2 train: %.3f', y_train, y_train_pred),
    ('R2 test: %.3f', y_test, y_test_pred),
]
for line_format, actual, predicted in r2_report:
    print(line_format % r2_score(actual, predicted))

In [None]:
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from numpy import mean
from numpy import std
# Repeated 10-fold cross-validation (3 repeats) of the configured forest on the training
# split. The evaluated estimator is `forest`; the unused, default-configured
# RandomForestRegressor that used to be created here has been removed.
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(forest, X_train, y_train, scoring='neg_mean_squared_error', cv=cv, n_jobs=-1, error_score='raise')
# The scorer returns negated MSE values; flip the sign of the mean so the figure reported
# as "MSE" is the actual (positive) mean squared error. The spread is sign-independent.
print('MSE: %.3f (%.3f)' % (-mean(n_scores), std(n_scores)))


In [None]:
# Predicted vs. observed values on the hold-out split. x/y are passed by keyword because
# recent seaborn versions no longer accept them positionally.
sns.regplot(x=y_test, y=y_test_pred)
plt.title('Observed vs Predicted - Random Forest Regression')
plt.xlabel('Observed')
plt.ylabel('Predicted')
plt.show()

Random Forest gives the best result among the models in sections A-H.

# # H. Gradient Boosting

In [None]:
import xgboost as xgb
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.ensemble import GradientBoostingRegressor

In [None]:
model1 = GradientBoostingRegressor()

In [None]:
# fit the model on the whole dataset
model1.fit(X_train, y_train)

In [None]:
# define the evaluation procedure
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
# evaluate the model
n_scores = cross_val_score(model1, X, y, scoring='neg_mean_squared_error', cv=cv, n_jobs=-1)
# report performance
print('MSE: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))

# # I. Optimal Model

*Random Forest with 10-fold cross-validation gave the lowest error and the highest accuracy of all the models in sections A-H.

A similar approach to the one used on the train/validation split can be applied to the given test data.*

In [None]:
test_data.head()

In [None]:
test_data.isnull().values.any()  # checking missing entries

Similarly, convert datetime into hour, day of week, month and year.

In [None]:
test_data["hour"] = [t.hour for t in pd.DatetimeIndex(test_data.datetime)]
test_data["day"] = [t.dayofweek for t in pd.DatetimeIndex(test_data.datetime)]
test_data["month"] = [t.month for t in pd.DatetimeIndex(test_data.datetime)]
test_data['year'] = [t.year for t in pd.DatetimeIndex(test_data.datetime)]
test_data['year'] = test_data['year'].map({2011:0, 2012:1})

In [None]:
test_data =test_data.drop('atemp',axis=1) 


In [None]:
X_test=test_data.iloc[:,1:]

In [None]:
X_test = scl.transform(X_test)

In [None]:
y_test=forest.predict(X_test) # Random Forest 

In [None]:
y_test

In [None]:
y_test = pd.DataFrame(y_test)

In [None]:
df_final = test_data


In [None]:
df_final['count'] = np.round(y_test)

In [None]:
df_final = df_final.drop(['season', 'workingday','weather', 'holiday',
                            'temp', 'humidity', 'windspeed', 'hour', 'day', 'month', 'year'], axis=1)

In [None]:
df_final.head()

In [None]:
df_final.to_csv('submission.csv', index=False)
