In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

# Ignore  the warnings
import warnings
warnings.filterwarnings('always')
warnings.filterwarnings('ignore')

#Import libraries

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt 
from matplotlib import style
import math
import seaborn as sns 
import missingno as msno
from datetime import datetime
import statsmodels.formula.api as sm

#import the necessary modelling algos.

#classifiaction.
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC,SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

#regression
from sklearn.linear_model import LinearRegression,Ridge,Lasso,RidgeCV,LassoCV
from sklearn.ensemble import RandomForestRegressor,BaggingRegressor,GradientBoostingRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor

#model selection
from sklearn.model_selection import train_test_split,cross_validate
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

#evaluation metrics
from sklearn.metrics import mean_squared_log_error,mean_squared_error, r2_score,mean_absolute_error # for regression
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score  # for classification


# Plot configuration.
# Render matplotlib figures inline, directly below the cell that creates them.
%matplotlib inline  
# Apply the 'fivethirtyeight' matplotlib style sheet, then seaborn's
# 'whitegrid' style with seaborn color codes enabled.
style.use('fivethirtyeight')
sns.set(style='whitegrid',color_codes=True)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Print the full path of every file found under the Kaggle input directory.
for root_dir, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(root_dir, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Read Training and Test Data

In [None]:
# Read the raw competition files; the originals stay untouched and all later
# processing happens on the copies `df` (train) and `test_df` (test).
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (r'../input/bike-sharing-demand/train.csv',
                     r'../input/bike-sharing-demand/test.csv')
)
df, test_df = train_data.copy(), test_data.copy()
df.head()

# **Identify Unique Columns**

In [None]:
# Show the distinct column labels of the training frame.
df.columns.unique()

# **A SHORT DESCRIPTION OF THE FEATURES**

datetime - hourly date + timestamp

season - 1 = spring, 2 = summer, 3 = fall, 4 = winter

holiday - whether the day is considered a holiday

workingday - whether the day is neither a weekend nor holiday

weather -

1: Clear, Few clouds, Partly cloudy, Partly cloudy

2: Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist

3: Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds

4: Heavy Rain + Ice Pellets + Thunderstorm + Mist, Snow + Fog

temp - temperature in Celsius

atemp - "feels like" temperature in Celsius

humidity - relative humidity

windspeed - wind speed

casual - number of non-registered user rentals initiated

registered - number of registered user rentals initiated

count - number of total rentals

In [None]:
# Print a summary of the training frame (columns, non-null counts, dtypes).
df.info()

*APART FROM THE datetime TIMESTAMP, ALL THE VARIABLES (FEATURES) ARE NUMERIC, AND THE TARGET VARIABLE THAT WE HAVE TO PREDICT IS THE count VARIABLE. HENCE THIS IS A TYPICAL EXAMPLE OF A **REGRESSION PROBLEM**, AS THE count VARIABLE IS A CONTINUOUSLY VARYING NUMERIC QUANTITY.*

In [None]:
# Summary statistics of the numeric columns.
df.describe() 

# **A. Data Preprocessing**
check Null values

In [None]:
# Missing-value check. Both results go through display(): a bare expression
# that is not the last line of a cell produces no output, so the overall flag
# was previously computed but never shown.
display(df.isnull().values.any())

display(df.isnull().sum())  # a zero count in every column implies no null values, so no imputation is needed

no missing value.

# **Data Exploration**

In [None]:
# Start with `season`: number of rows recorded for each season code.
df['season'].value_counts()

In [None]:
# Mean rental count per season (bar height) with seaborn's default error bars.
sns.barplot(data=df, x='season', y='count')

In [None]:
# Number of rows per season. `factorplot` and its `size` argument are
# deprecated names; `catplot` with `height` is the supported equivalent.
sns.catplot(x='season',data=df,kind='count',height=5,aspect=1.5)

In [None]:
#holiday
# display() is needed for the counts: only the last expression of a cell is
# echoed automatically, so the value counts were previously never shown.
display(df.holiday.value_counts())
sns.barplot(x='holiday', y='count', data=df)
# `catplot`/`height` replace the deprecated `factorplot`/`size`.
sns.catplot(x='holiday',data=df,kind='count',height=5,aspect=1) # majority of data is for non holiday days.

In [None]:
#workingday
# display() is needed for the counts: only the last expression of a cell is
# echoed automatically, so the value counts were previously never shown.
display(df.workingday.value_counts())
sns.barplot(x='workingday', y='count', data=df)
# `catplot`/`height` replace the deprecated `factorplot`/`size`.
sns.catplot(x='workingday',data=df,kind='count',height=5,aspect=1) # majority of data is for working days.

In [None]:
#weather
# Count rows per weather category. The original counted `workingday` here
# (copy-paste slip) and never displayed the result.
display(df.weather.value_counts())
sns.barplot(x='weather', y='count', data=df)
# `catplot`/`height` replace the deprecated `factorplot`/`size`.
sns.catplot(x='weather',data=df,kind='count',height=5,aspect=1) # distribution of rows across the weather categories.

In [None]:
# Summary statistics of the numeric columns, shown again before the outlier analysis.
df.describe()

In [None]:
# Box plots of the continuous columns, to eyeball their spread and outliers.
boxplot_columns = ['temp', 'atemp', 'humidity', 'windspeed',
                   'casual', 'registered', 'count']
sns.boxplot(data=df[boxplot_columns])
fig = plt.gcf()
fig.set_size_inches(10, 10)

In [None]:
# Keep only rows whose `count` lies within 3 standard deviations of the mean.
# The filter is computed from the untouched `train_data` (which `df` was copied
# from), so re-running this cell does not trim the data a second time, and the
# result is an explicit copy so later column assignments act on an
# independent frame rather than on a slice.
raw_count = train_data["count"]
within_3_std = np.abs(raw_count - raw_count.mean()) <= (3 * raw_count.std())
dfWithoutOutliers = train_data[within_3_std].copy()
display("Shape Of the dataframe before Ouliers: ",train_data.shape)
display("Shape Of the dataframe after Ouliers: ",dfWithoutOutliers.shape)
df =dfWithoutOutliers

In [None]:
# The continuous variables can also be visualized with histograms.
df.temp.unique()
fig, axes = plt.subplots(2, 2)
# One panel per variable, filled row by row.
hist_columns = ("temp", "atemp", "windspeed", "humidity")
for panel, column in zip(axes.ravel(), hist_columns):
    panel.hist(x=column, data=df, edgecolor="black", linewidth=2, color='#ff4125')
    panel.set_title("Variation of " + column)
fig.set_size_inches(10, 10)

# **B. Data Wrangling**

In [None]:
# Correlation matrix.
# Only numeric/boolean columns are correlated: `datetime` is still text at
# this point, and recent pandas raises on it instead of silently skipping it.
cor_mat = df.select_dtypes(include=['number', 'bool']).corr()
# Explicit boolean mask hiding the strict upper triangle (True = hidden).
# The previous mask reused the correlation values themselves, so whether a
# cell was hidden depended on its value being non-zero.
mask = np.triu(np.ones(cor_mat.shape, dtype=bool), k=1)
fig = plt.gcf()
fig.set_size_inches(30, 12)
sns.heatmap(data=cor_mat, mask=mask, square=True, annot=True, cbar=True)

INFERENCES FROM THE ABOVE HEATMAP--
The self-correlation, i.e. the correlation of a feature with itself, is equal to 1, as expected.

temp and atemp are highly related as expected.

humidity is inversely related to count, as expected: when the weather is humid, people are less inclined to travel by bike.

also note that casual and working day are highly inversely related as you would expect.

Also note that count and holiday are highly inversely related as you would expect.

Also note that temp(or atemp) highly effects the count.

Also note that weather and count are highly inversely related. This is because, in our data, a higher weather value (from 1 to 4) means worse weather, so fewer people rent bikes.

registered/casual and count are highly related which indicates that most of the bikes that are rented are registered.

similarly we can draw some more inferences like weather and humidity and so on... .

FEATURE ENGINEERING AND GET SOME NEW FEATURES AND DROP SOME USELESS OR LESS RELEVANT FEATURES

In [None]:
# One-hot encode `season` into season_<value> indicator columns, for the
# training frame and then the test frame.
train_season_dummies = pd.get_dummies(df.season, prefix='season')
df = pd.concat((df, train_season_dummies), axis='columns')
df.head()
season = pd.get_dummies(test_df.season, prefix='season')
test_df = pd.concat((test_df, season), axis='columns')
test_df.head()

In [None]:
# One-hot encode `weather` into weather_<value> indicator columns, for the
# training frame and then the test frame.
# NOTE(review): the two frames are encoded independently, so a category
# missing from one of them yields different columns — confirm both frames
# contain every weather value.
train_weather_dummies = pd.get_dummies(df.weather, prefix='weather')
df = pd.concat((df, train_weather_dummies), axis='columns')
df.head()
weather = pd.get_dummies(test_df.weather, prefix='weather')
test_df = pd.concat((test_df, weather), axis='columns')
test_df.head()

In [None]:
# The raw `season` and `weather` columns are redundant once their one-hot
# encodings exist, so remove them from both frames.
encoded_columns = ['season', 'weather']
df = df.drop(columns=encoded_columns)
df.head()
test_df = test_df.drop(columns=encoded_columns)
test_df.head()


# NOTE: `casual` and `registered` are not wanted as features either, but they are kept for now.

In [None]:
# Derive calendar features from the timestamp. The column is parsed once and
# the vectorized DatetimeIndex attributes are used, instead of re-parsing it
# four times and looping over the timestamps in Python.
train_times = pd.DatetimeIndex(df.datetime)
df["hour"] = train_times.hour
df["day"] = train_times.dayofweek  # day of week, Monday=0 ... Sunday=6
df["month"] = train_times.month
df['year'] = train_times.year
# Encode the year as 0 (2011) / 1 (2012); any other year maps to NaN.
df['year'] = df['year'].map({2011:0, 2012:1})
df.head()

In [None]:
# Derive the same calendar features for the test frame. The column is parsed
# once and the vectorized DatetimeIndex attributes are used, instead of
# re-parsing it four times and looping over the timestamps in Python.
test_times = pd.DatetimeIndex(test_df.datetime)
test_df["hour"] = test_times.hour
test_df["day"] = test_times.dayofweek  # day of week, Monday=0 ... Sunday=6
test_df["month"] = test_times.month
test_df['year'] = test_times.year
# Encode the year as 0 (2011) / 1 (2012); any other year maps to NaN.
test_df['year'] = test_df['year'].map({2011:0, 2012:1})
test_df.head()

In [None]:
# The raw timestamp is no longer needed in the training frame once the
# hour/day/month/year features exist.
df = df.drop(columns='datetime')
df.head()

# Find Correlation for NEW FEATURES

In [None]:
# Correlation matrix including the engineered features (numeric and boolean
# indicator columns).
cor_mat = df.select_dtypes(include=['number', 'bool']).corr()
# Explicit boolean mask hiding the strict upper triangle (True = hidden).
# The previous mask reused the correlation values themselves, so whether a
# cell was hidden depended on its value being non-zero.
mask = np.triu(np.ones(cor_mat.shape, dtype=bool), k=1)
fig = plt.gcf()
fig.set_size_inches(30, 12)
sns.heatmap(data=cor_mat, mask=mask, square=True, annot=True, cbar=True)

In [None]:
# Remove `casual` and `registered` from the training frame.
# NOTE(review): the test frame is never shown to contain them and the column
# description says they sum to `count` — presumably why they are dropped; confirm.
df = df.drop(columns=['casual', 'registered'])
df.head()

*COUNT VARIATION WITH DIFFERENT FEATURES*

In [None]:
# Mean `count` per value of each calendar feature.
# `catplot`/`height` replace the deprecated `factorplot`/`size`.
sns.catplot(x="hour",y="count",data=df,kind='bar',height=5,aspect=1.5)
# note that the time of day affects whether people take a bike: fewer bikes are used at night, and many are used for the office commute

sns.catplot(x="day",y='count',kind='bar',data=df,height=5,aspect=1)
# note that the day of the week has a smaller effect on whether people take a bike.

sns.catplot(x="month",y="count",data=df,kind='bar',height=5,aspect=1.5)
# note that the month determines the season, which affects whether people take a bike (climate conditions such as rainy, hazy etc.).

sns.catplot(x="year",y="count",data=df,kind='bar',height=5,aspect=1.5)
# 0 for 2011 and 1 for 2012. Hence demand has increased over the years.

In [None]:
# Scatter plot of `count` against the (continuous) temperature.
plt.scatter(x="temp",y="count",data=df,color='green')

# The raw scatter is hard to read. A better way is to convert `temp` into
# intervals (bins) and treat it like a discrete variable.

df_temp=df.copy()
df_temp.temp.describe()
# Bin index = whole degrees // 5, i.e. bin k covers [5k, 5k+5) degrees.
df_temp['temp_bin']=np.floor(df_temp['temp'])//5
df_temp['temp_bin'].unique()
# Mean count per temperature bin.
# `catplot` replaces the deprecated `factorplot`.
sns.catplot(x="temp_bin",y="count",data=df_temp,kind='bar')
# demand is highest for bins 6 and 7, i.e. temperatures of 30-35 (bin 6) and 35-40 (bin 7).

# **DATA MODELLING**

In [None]:
# Preview the final modelling frame.
df.head()

In [None]:
# Group the column names by their dtype.
df.columns.to_series().groupby(df.dtypes).groups

In [None]:
# dtype of every column in the modelling frame.
df.dtypes

# Applying Machine Learning Models

In [None]:
# Target is `count`; every other column of the frame is a feature.
y = df['count']
X = df.drop(columns='count')
X.head()

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import  train_test_split

# Hold out 20% of the rows for testing (fixed seed for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Standardize the features: the scaler is fitted on the training split only
# and then applied unchanged to the test split. The target is left unscaled.
scl = StandardScaler()
X_train = scl.fit_transform(X_train)
X_test = scl.transform(X_test)

for split in (X_train, X_test, y_train, y_test):
    print(split.shape)

# **C. Ridge Regression**

In [None]:
# Regularization strengths to sweep: 100 values spaced evenly on a log scale,
# descending from 0.5 * 10**10 to 0.5 * 10**-2.
alphas = 0.5 * np.power(10, np.linspace(10, -2, 100))
alphas

In [None]:
# Fit one ridge model per alpha, collecting the coefficient vectors and
# printing the *training* R2 next to the *test* MSE/RMSE for each alpha.
# NOTE(review): `normalize` was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this cell needs an older scikit-learn — confirm the target environment.
coefs = []
for alpha_value in alphas:
    model = Ridge(alpha=alpha_value, normalize=True).fit(X_train, y_train)
    coefs.append(model.coef_)
    score = model.score(X_train, y_train)
    pred2 = model.predict(X_test)
    mse = mean_squared_error(y_test, pred2)
    print("Alpha:{0:.6f}, R2:{1:.3f}, MSE:{2:.2f}, RMSE:{3:.2f}".format(alpha_value, score, mse, np.sqrt(mse)))


In [None]:
# Shape of the collected coefficients: one row per alpha, one column per feature.
np.shape(coefs)

In [None]:
# Coefficient paths: each line is one feature's weight as alpha varies
# (alpha on a log-scaled x axis).
ax = plt.gca()
ax.plot(alphas, coefs)
ax.set_xscale('log')
ax.axis('tight')
ax.set_xlabel('alpha')
ax.set_ylabel('weights')

In [None]:
# Refit ridge at the alpha picked from the sweep (recorded sweep line:
# Alpha:0.015269, R2:0.396, MSE:19938.35, RMSE:141.20).
# NOTE(review): `normalize` was removed in scikit-learn 1.2 — confirm the
# target environment.
ridge_mod=Ridge(alpha=0.015269, normalize=True).fit(X_train,y_train)
pred2 = ridge_mod.predict(X_test)
# Score the model fitted in this cell. The original scored the leftover
# `model` from the sweep loop (a different alpha) and took the square root
# of the loop's last `mse`, so R2 and RMSE described another model.
score = ridge_mod.score(X_test,y_test)
mse = mean_squared_error(y_test,pred2)
Rmse = np.sqrt(mse)
print("R2:{0:.3f}, MSE:{1:.2f}, RMSE:{2:.2f}"
   .format(score, mse, Rmse))
print(pd.Series(ridge_mod.coef_, index = X.columns)) # coefficients, labelled by feature
print(mse)          # test MSE


In [None]:
# Let RidgeCV choose alpha from the same grid (scored by negative MSE) and
# compare its parameters with the manually chosen ridge model.
# Recorded sweep line for reference: Alpha:0.015269, R2:0.396, MSE:19938.35, RMSE:141.20
ridge_cv = RidgeCV(alphas=alphas, scoring='neg_mean_squared_error', normalize=True, store_cv_values=True)
ridge_cv_mod = ridge_cv.fit(X_train, y_train)
print(ridge_cv_mod.alpha_)  # the original author noted 0.01 here

# Intercept and coefficients: manual ridge first, then the CV-selected one.
for fitted in (ridge_mod, ridge_cv_mod):
    print(fitted.intercept_)
    print(fitted.coef_)

In [None]:
# Observed vs. predicted counts for the ridge model, with a fitted line.
# x/y are passed by keyword (recent seaborn rejects positional data vectors),
# and the labels now name what is plotted: predictions, not residuals.
sns.regplot(x=y_test, y=pred2)
plt.title('Observed vs Predicted - Ridge_Regression')
plt.xlabel('Observed')
plt.ylabel('Predicted')
plt.show()

In [None]:
from sklearn import metrics

# Test-set error of the ridge predictions.
ridge_test_mse = metrics.mean_squared_error(y_test, pred2)
print("MAE:", metrics.mean_absolute_error(y_test,pred2))
print('MSE:', ridge_test_mse)
print('RMSE:', np.sqrt(ridge_test_mse))

*The 10-Fold Cross-Validation Error 2*

In [None]:
# grid search hyperparameters for ridge regression
from numpy import arange
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedKFold
from sklearn.linear_model import Ridge

In [None]:
# Grid-search ridge's alpha with repeated 10-fold cross-validation.
model = Ridge()
# n_repeats is left at RepeatedKFold's default.
cv = RepeatedKFold(n_splits=10, random_state=1)
# Candidate alphas: 0.00, 0.01, ..., 0.99.
grid = {'alpha': arange(0, 1, 0.01)}
search = GridSearchCV(model, grid, scoring='neg_mean_squared_error', cv=cv)
results = search.fit(X_train, y_train)
# best_score_ is the *negated* MSE (scorers are maximized), so flip the sign
# to report a real, positive MSE.
print('MSE: %.3f' % -results.best_score_)
print('Config: %s' % results.best_params_)


# **D. Lasso Regression**

In [None]:
# Recreate the same 80/20 split and standardization used for ridge
# (same seed, scaler refitted on the training split only).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

X_train, X_test = scl.fit_transform(X_train), scl.transform(X_test)

for split in (X_train, X_test, y_train, y_test):
    print(split.shape)

In [None]:
# Fit one lasso model per alpha, collecting the coefficient vectors and
# printing the *training* R2 next to the *test* MSE/RMSE for each alpha.
# NOTE(review): `normalize` was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this cell needs an older scikit-learn — confirm the target environment.
coefs = []
for alpha_value in alphas:
    Lmodel = Lasso(alpha=alpha_value, normalize=True)
    Lmodel.fit(X_train, y_train)
    coefs.append(Lmodel.coef_)
    Lscore = Lmodel.score(X_train, y_train)
    Lpred2 = Lmodel.predict(X_test)
    mse = mean_squared_error(y_test, Lpred2)
    print("Alpha:{0:.6f}, R2:{1:.3f}, MSE:{2:.2f}, RMSE:{3:.2f}".format(alpha_value, Lscore, mse, np.sqrt(mse)))



In [None]:
from sklearn.linear_model import LassoCV
from sklearn.datasets import make_regression

# Let LassoCV pick alpha with 20-fold cross-validation over its own alpha grid.
# Recorded sweep line for reference: Alpha:0.005000, R2:0.396, MSE:19959.38, RMSE:141.28
lasso_cv = lasso_cv_mod = LassoCV(cv=20).fit(X_train, y_train)
print(lasso_cv_mod.alpha_)



In [None]:
# Intercept and coefficients of `Lmodel`, i.e. the last model fitted in the
# alpha sweep (not the LassoCV model).
print(Lmodel.intercept_, Lmodel.coef_, sep='\n')

In [None]:
# Shape of the collected lasso coefficients: one row per alpha, one column per feature.
np.shape(coefs)



In [None]:
# Lasso coefficient paths (recorded sweep line:
# Alpha:0.005000, R2:0.400, MSE:19959.38, RMSE:141.28).
# NOTE(review): the x values are `alphas*2`, not the alphas the models were
# fitted with — confirm this rescaling is intentional.

ax = plt.gca()
ax.plot(alphas*2, coefs)
ax.set_xscale('log')
ax.axis('tight')
ax.set_xlabel('alpha')
ax.set_ylabel('weights')

# **Cross Validation Lasso**

In [None]:
# Recreate the same 80/20 split and standardization once more
# (same seed, scaler refitted on the training split only).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

X_train, X_test = scl.fit_transform(X_train), scl.transform(X_test)

for split in (X_train, X_test, y_train, y_test):
    print(split.shape)

In [None]:
# Evaluate a lasso regression model on the dataset.
from numpy import mean
from numpy import std
from numpy import absolute
from sklearn.model_selection import cross_val_score

# Refit lasso at the alpha picked from the sweep (recorded sweep line:
# Alpha:0.005000, R2:0.396, MSE:19959.38, RMSE:141.28).
# NOTE(review): `normalize` was removed in scikit-learn 1.2 — confirm the
# target environment.
Lmodel = Lasso(alpha=0.005000, normalize=True)
Lmodel.fit(X_train,y_train)
# The stray `coefs.append(...)` was dropped: it grew the sweep's coefficient
# list by one row on every run of this cell and nothing reads it afterwards.
Lscore = Lmodel.score(X_train,y_train)
Lpred2 = Lmodel.predict(X_test)
mse = mean_squared_error(y_test, Lpred2)


# Repeated 10-fold cross-validation (3 repeats, fixed seed).
Lcv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
# Candidate alphas: 0.00, 0.01, ..., 0.99.
grid = {'alpha': arange(0, 1, 0.01)}
search = GridSearchCV(Lmodel, grid, scoring='neg_mean_absolute_error', cv=Lcv, n_jobs=-1)
# Cross-validated score of the fixed-alpha model.
scores = cross_val_score(Lmodel, X_train, y_train, scoring='neg_mean_absolute_error', cv=Lcv, n_jobs=-1)

# Grid search over alpha.
results = search.fit(X_train, y_train)
# best_score_ is the *negated* MAE (scorers are maximized), so flip the sign
# to report a real, positive MAE.
print('MAE: %.3f' % -results.best_score_)
print('Config: %s' % results.best_params_)

# The per-fold scores are negated MAEs as well; make them positive.
scores = absolute(scores)
print('Mean MAE: %.3f (%.3f)' % (mean(scores), std(scores)))

In [None]:
# Observed vs. predicted counts for the lasso model, with a fitted line.
# x/y are passed by keyword (recent seaborn rejects positional data vectors),
# and the labels now name what is plotted: predictions, not residuals.
sns.regplot(x=y_test, y=Lpred2)
plt.title('Observed vs Predicted - Lasso Regression')
plt.xlabel('Observed')
plt.ylabel('Predicted')
plt.show()

In [None]:
# Test-set error of the lasso predictions.
lasso_test_mse = metrics.mean_squared_error(y_test, Lpred2)
print("MAE:", metrics.mean_absolute_error(y_test,Lpred2))
print('MSE:', lasso_test_mse)
print('RMSE:', np.sqrt(lasso_test_mse))

In [None]:
from sklearn.linear_model import LassoCV

# LassoCV over alphas 0.00 ... 0.99 with repeated 10-fold cross-validation
# (n_repeats left at RepeatedKFold's default).
cv = RepeatedKFold(n_splits=10, random_state=1)
model = LassoCV(alphas=arange(0, 1, 0.01), cv=cv).fit(X_train, y_train)
# Report the alpha selected by cross-validation.
print('alpha: %f' % model.alpha_)

# **E1. Regression Tree**

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Unconstrained regression tree (no random_state set); the displayed value is
# its R2 on the *training* split.
reg4 = DecisionTreeRegressor().fit(X_train, y_train)
reg4.score(X_train, y_train)

In [None]:
# Test-set predictions of the unconstrained regression tree.
Dec_Tree = reg4.predict(X_test)
Dec_Tree

In [None]:
# Observed vs. predicted counts for the regression tree, with a fitted line.
# x/y are passed by keyword (recent seaborn rejects positional data vectors),
# and the labels now name what is plotted: predictions, not residuals.
sns.regplot(x=y_test, y=Dec_Tree)
plt.title('Observed vs Predicted - Decision Tree Regression')
plt.xlabel('Observed')
plt.ylabel('Predicted')
plt.show()

In [None]:
# Test-set error of the regression-tree predictions.
tree_test_mse = metrics.mean_squared_error(y_test, Dec_Tree)
print("MAE:", metrics.mean_absolute_error(y_test,Dec_Tree ))
print('MSE:', tree_test_mse)
print('RMSE:', np.sqrt(tree_test_mse))

In [None]:
# 10-fold grid search over max_depth 3..19 for the unconstrained tree.
# The printed best score is the negated MSE used by the scorer.
parameters = {'max_depth': range(3, 20)}
clf = GridSearchCV(reg4, parameters, scoring='neg_mean_squared_error', cv=10).fit(X_train, y_train)
tree_model = clf.best_estimator_
print(clf.best_score_, clf.best_params_)

# **E2. Decision Tree with Pruning**

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

# Pruning, approach 1: require at least 3 samples in an internal node before
# it may be split.
regtree2 = DecisionTreeRegressor(min_samples_split=3)
regtree2.fit(X_train, y_train)

In [None]:
# Predictions of the min_samples_split tree on the test and training splits.
test_pred2, train_pred2 = regtree2.predict(X_test), regtree2.predict(X_train)

In [None]:
# Error on the test dataset. Both metrics are printed: as bare expressions
# only the last one (R2) was displayed and the MSE was silently discarded.
print('MSE test: %.3f' % mean_squared_error(y_test, test_pred2))
print('R2 test: %.3f' % r2_score(y_test, test_pred2))

In [None]:
# Error on the training dataset. Both metrics are printed: as bare expressions
# only the last one (R2) was displayed and the MSE was silently discarded.
print('MSE train: %.3f' % mean_squared_error(y_train, train_pred2))
print('R2 train: %.3f' % r2_score(y_train, train_pred2))

In [None]:
# Pruning, approach 2: require at least 3 samples in every leaf.
regtree3 = DecisionTreeRegressor(min_samples_leaf = 3)
regtree3.fit(X_train, y_train)

# Predictions on both splits.
test_pred3 = regtree3.predict(X_test)
train_pred3 = regtree3.predict(X_train)

# All four metrics are printed: as bare expressions only the very last one
# (training R2) was displayed and the other three were silently discarded.
# Error on the test dataset.
print('MSE test: %.3f' % mean_squared_error(y_test, test_pred3))
print('R2 test: %.3f' % r2_score(y_test, test_pred3))

# Error on the training dataset.
print('MSE train: %.3f' % mean_squared_error(y_train, train_pred3))
print('R2 train: %.3f' % r2_score(y_train, train_pred3))

In [None]:
# 10-fold grid search over max_depth 3..19 for the min_samples_leaf tree.
# The printed best score is the negated MSE used by the scorer.
parameters = {'max_depth': range(3, 20)}
clf = GridSearchCV(regtree3, parameters, scoring='neg_mean_squared_error', cv=10).fit(X_train, y_train)
tree_model = clf.best_estimator_
print(clf.best_score_, clf.best_params_)

In [None]:
# 10-fold grid search over max_depth 3..19 for the min_samples_split tree.
# The printed best score is the negated MSE used by the scorer.
parameters = {'max_depth': range(3, 20)}
clf = GridSearchCV(regtree2, parameters, scoring='neg_mean_squared_error', cv=10).fit(X_train, y_train)
tree_model = clf.best_estimator_
print(clf.best_score_, clf.best_params_)

# **F. Random Forest**

In [None]:
from sklearn.ensemble import RandomForestRegressor
# 400-tree random forest (fixed seed, all cores). The explicit
# `criterion='mse'` was dropped: squared error is already the default
# criterion, and the 'mse' spelling is rejected by newer scikit-learn, so
# relying on the default works on old and new versions alike.
forest = RandomForestRegressor(n_estimators = 400, random_state=1, n_jobs=-1)
forest.fit(X_train, y_train)
# Predictions on both splits, used by the evaluation cells.
y_train_pred = forest.predict(X_train)
y_test_pred = forest.predict(X_test)

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

# Root Mean Squared Log Error (RMSLE) is the accuracy criterion for this
# problem: the RMSE between log(actual + 1) and log(predicted + 1).
log_train_true, log_train_pred = np.log(y_train + 1), np.log(y_train_pred + 1)
log_test_true, log_test_pred = np.log(y_test + 1), np.log(y_test_pred + 1)
print('RMSLE train: %.3f' % np.sqrt(mean_squared_error(log_train_true, log_train_pred)))
print('RMSLE test: %.3f' % np.sqrt(mean_squared_error(log_test_true, log_test_pred)))
print('R2 train: %.3f' % r2_score(y_train, y_train_pred))
print('R2 test: %.3f' % r2_score(y_test, y_test_pred))

In [None]:
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from numpy import mean
from numpy import std
# Cross-validate the 400-tree `forest` with repeated 10-fold CV (3 repeats).
# The unused `model = RandomForestRegressor()` was removed: it was never
# passed to cross_val_score, which has always evaluated `forest`.
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(forest, X_train, y_train, scoring='neg_mean_squared_error', cv=cv, n_jobs=-1, error_score='raise')
# The scorer returns *negated* MSE values; flip the sign of the mean so the
# reported MSE is positive (the standard deviation is unaffected).
print('MSE: %.3f (%.3f)' % (-mean(n_scores), std(n_scores)))

In [None]:
# Observed vs. predicted counts for the random forest, with a fitted line.
# x/y are passed by keyword (recent seaborn rejects positional data vectors),
# and the labels now name what is plotted: predictions, not residuals.
sns.regplot(x=y_test, y=y_test_pred)
plt.title('Observed vs Predicted - Random Forest Regression')
plt.xlabel('Observed')
plt.ylabel('Predicted')
plt.show()

Random Forest gives the best result among the models in sections A-H.

# **G. Gradient Boosting**

In [None]:
import xgboost as xgb
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.ensemble import GradientBoostingRegressor

In [None]:
# Gradient-boosted trees: 250 stages of depth-7 trees, learning rate 0.1,
# 90% row subsampling per stage, fixed seed, verbose progress output.
# The explicit `loss='ls'` was dropped: least squares is already the default
# loss, and the 'ls' spelling is rejected by newer scikit-learn, so relying
# on the default works on old and new versions alike.
model1 = GradientBoostingRegressor(n_estimators=250, learning_rate=0.1, max_depth=7,subsample=0.9, random_state=42, verbose=2).fit(X_train, y_train)

In [None]:
# Fit the model on the training split (X_train, y_train) — not on the whole
# dataset.
# NOTE(review): model1 was already fitted on the same data in the cell that
# constructs it, so this call appears to repeat that work.
model1.fit(X_train, y_train)

In [None]:
# Cross-validate the gradient boosting model with repeated 10-fold CV
# (3 repeats, fixed seed).
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(model1, X_train, y_train, scoring='neg_mean_squared_error', cv=cv, n_jobs=-1)
# The scorer returns *negated* MSE values; flip the sign of the mean so the
# reported MSE is positive (the standard deviation is unaffected).
print('MSE: %.3f (%.3f)' % (-mean(n_scores), std(n_scores)))


# **H. Optimal Model**

Random Forest with 10-fold cross-validation gave the lowest error and the highest accuracy among all the models in sections A-H.
A similar approach, training on the training split and checking against the validation split, can be applied to the given test data.

# **Compute the MSE for the Test Data & Compare with the CV Error**

In [None]:
# Summary statistics of the final modelling frame.
df.describe()

In [None]:
# Fresh 75/25 split taken straight from `df` (fixed seed). Unlike the earlier
# splits, the features are NOT standardized here.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns='count'), df['count'], test_size=0.25, random_state=42
)

In [None]:
# Fit four off-the-shelf regressors with default settings and collect their
# test-set RMSLE and MSE for comparison.
models=[RandomForestRegressor(),AdaBoostRegressor(),BaggingRegressor(),KNeighborsRegressor()]
model_names=['RandomForestRegressor','AdaBoostRegressor','BaggingRegressor','KNeighborsRegressor']
rmsle=[]
mse=[]
# Iterate over the estimators directly instead of indexing by position.
for clf in models:
    clf.fit(X_train,y_train)
    test_pred=clf.predict(X_test)
    # Metrics take (y_true, y_pred); the arguments were previously swapped.
    # Both metrics are symmetric, so the values are unchanged.
    rmsle.append(np.sqrt(mean_squared_log_error(y_test,test_pred)))
    mse.append(mean_squared_error(y_test,test_pred))
# One entry per model, in the same order as `model_names`.
d={'Modelling Algo':model_names,'RMSLE':rmsle,'MSE':mse}
d
    

In [None]:
# Tabulate the per-model RMSLE and MSE collected in `d`.
rmsle_frame=pd.DataFrame(d)
rmsle_frame

In [None]:
# Bar charts of RMSLE and MSE per model.
# `catplot`/`height` replace the deprecated `factorplot`/`size`.
sns.catplot(x='Modelling Algo',y='RMSLE',data=rmsle_frame,kind='bar',height=5,aspect=2)
sns.catplot(x='Modelling Algo',y='MSE',data=rmsle_frame,kind='bar',height=5,aspect=2)

In [None]:
# Tune the random forest regressor: 1000 trees, all cores, searching over
# the number of features considered at each split.
no_of_test=[1000]
# `None` (use all features) replaces the former "auto" option, which meant
# the same thing for regressors and is rejected by newer scikit-learn.
params_dict={'n_estimators':no_of_test,'n_jobs':[-1],'max_features':[None,'sqrt','log2']}
clf_rf=GridSearchCV(estimator=RandomForestRegressor(),param_grid=params_dict,scoring='neg_mean_squared_log_error')
clf_rf.fit(X_train,y_train)
pred=clf_rf.predict(X_test)
# Test RMSLE of the best forest; arguments are in (y_true, y_pred) order.
print((np.sqrt(mean_squared_log_error(y_test,pred))))

In [None]:
# Parameter combination selected by the grid search.
clf_rf.best_params_

THE RANDOM FOREST REGRESSOR GIVES THE LEAST RMSLE. HENCE WE USE IT TO MAKE PREDICTIONS.

In [None]:
# Predict on the competition test set and write the submission file.
# The test features are aligned to the training feature columns (same names,
# same order). The train and test frames were one-hot encoded independently,
# so a category missing from the test data would otherwise leave the model
# with a different column layout; such a missing indicator is filled with 0.
train_feature_columns = df.drop('count', axis=1).columns
test_features = test_df.drop('datetime',axis=1).reindex(columns=train_feature_columns, fill_value=0)
pred=clf_rf.predict(test_features)
# Pair each prediction with the original timestamp from the untouched test data.
d={'datetime':test_data['datetime'],'count':pred}
ans=pd.DataFrame(d)
ans.to_csv('submission.csv',index=False) # saving to a csv file for predictions.