In [None]:
import pandas as pd
import calendar
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.special import boxcox, inv_boxcox
from datetime import datetime
from numpy import arange
from pandas import read_csv
from sklearn.linear_model import Ridge,Lasso
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor


# Load the training split of the bike-sharing data and preview its first six rows.
train_csv_path = '/kaggle/input/bike-sharing-demand/train.csv'
train_df = pd.read_csv(train_csv_path)
train_df.head(n=6)

In [None]:
# Figure: distribution of the raw 'count' column.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases;
# histplot/displot are the documented replacements.
sns.distplot(a=train_df['count'])
plt.show()


In [None]:
# Figure: distribution of 'count' after a natural-log transform.
# The transform overwrites 'count' in place, so every later cell works on
# log(count); re-running this cell applies the log a second time.
train_df['count'] = np.log(train_df['count'])
sns.distplot(a=train_df['count'])
plt.show()
print(train_df['count'])
train_df.shape

In [None]:
# Drop outliers: keep only the rows whose 'count' value lies strictly below
# the 99th percentile of that column.
count_p99 = np.percentile(train_df['count'].values, 99)
below_p99 = train_df['count'] < count_p99
train_df = train_df.loc[below_p99]
train_df.shape

In [None]:
# Visualize how many rows fall into each level of the categorical variables
# (one bar chart per variable, arranged in a 2x2 grid).
cat_names = ['season', 'holiday', 'workingday', 'weather']
for i, name in enumerate(cat_names, start=1):
    plt.subplot(2, 2, i)
    # The column must be passed as x=...: in seaborn >= 0.12 every argument
    # other than `data` is keyword-only, so a positional column name collides
    # with data= and raises a TypeError.
    sns.countplot(x=name, data=train_df)
plt.show()

In [None]:
# Visualize the spread of the continuous variables
# (one horizontal box plot per variable, arranged in a 2x2 grid).
cont_names = ['temp', 'atemp', 'humidity', 'windspeed']
for i, name in enumerate(cont_names, start=1):
    plt.subplot(2, 2, i)
    # The column must be passed as x=...: in seaborn >= 0.12 every argument
    # other than `data` is keyword-only, so a positional column name collides
    # with data= and raises a TypeError.
    sns.boxplot(x=name, data=train_df)
plt.show()
# Windspeed seems to be skewed

In [None]:
# Split the 'datetime' attribute into calendar features and drop the columns
# judged unwanted by the earlier analysis.
new_df = train_df.copy(deep=True)
# Parse the timestamp strings once; the .dt accessors below then derive every
# feature from the parsed column instead of re-parsing each row in Python.
new_df['datetime'] = pd.to_datetime(new_df['datetime'], format='%Y-%m-%d %H:%M:%S')
new_df['day'] = new_df['datetime'].dt.day_name()  # English weekday name, e.g. 'Monday'
new_df['month'] = new_df['datetime'].dt.month
new_df['hour'] = new_df['datetime'].dt.hour
new_df['year'] = new_df['datetime'].dt.year
# drop() already returns a new frame, so no separate copy of new_df is needed.
final_df = new_df.drop(columns=['datetime', 'temp', 'casual', 'registered'])
final_df.head()

In [None]:
# Add dummy variables for the categorical variables and drop the source columns.
# drop_first=True omits the first level of every variable; the dummy columns are
# appended after the remaining columns in the order listed here.
dummy_prefixes = {
    'weather': 'w',
    'year': 'y',
    'month': 'm',
    'hour': 'h',
    'season': 's',
    'day': 'd',
}
final_df = pd.get_dummies(
    final_df,
    columns=list(dummy_prefixes),
    prefix=dummy_prefixes,
    drop_first=True,
)

In [None]:
# Print the column names grouped by dtype, then preview the encoded frame.
columns_by_dtype = final_df.columns.to_series().groupby(final_df.dtypes).groups
print(columns_by_dtype)
final_df.head()

In [None]:
# Initialize the training set: X holds every column except the target, Y holds
# the 'count' column (log-transformed by the earlier cell).
# The target is selected by name rather than by position so that a change in
# the column layout cannot silently pick the wrong column.
X = final_df.drop(columns='count').to_numpy(dtype=float)
Y = final_df['count'].to_numpy()

In [None]:
# Ridge regression: tune alpha by grid search, scored with negative MSE under
# 10-fold cross-validation repeated 3 times.
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
# Candidate alphas: 0.00, 0.01, ..., 0.99
grid = {'alpha': arange(0, 1, 0.01)}
search = GridSearchCV(
    estimator=Ridge(),
    param_grid=grid,
    scoring='neg_mean_squared_error',
    cv=cv,
    n_jobs=-1,
)
results = search.fit(X, Y)
# Report the best (negated) MSE and the alpha that achieved it.
print('MSE (NEGATIVE): %.3f' % results.best_score_)
print('Config: %s' % results.best_params_)

In [None]:
# Lasso regression: tune alpha by grid search, scored with negative MSE under
# 10-fold cross-validation repeated 3 times.
model = Lasso()
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
# Candidate alphas: 0.01, 0.02, ..., 0.99.
# alpha=0 is deliberately excluded: scikit-learn advises against it for Lasso
# (the coordinate-descent solver does not converge well without a penalty).
grid = {'alpha': arange(1, 100) / 100.0}
search = GridSearchCV(model, grid, scoring='neg_mean_squared_error', cv=cv, n_jobs=-1)
results = search.fit(X, Y)
# Report the best (negated) MSE and the alpha that achieved it.
print('MSE (NEGATIVE): %.3f' % results.best_score_)
print('Config: %s' % results.best_params_)

In [None]:
# Decision tree: compare split criteria, scored with negative MSE under
# 10-fold cross-validation repeated 3 times.
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
dtm = DecisionTreeRegressor(random_state=42)
# "squared_error"/"absolute_error" are the current names of the criteria that
# used to be called "mse"/"mae"; the old aliases were removed in scikit-learn 1.2
# and are rejected there.
param_grid = {"criterion": ["squared_error", "absolute_error"]}
# n_jobs=-1 runs the folds in parallel, matching the Ridge/Lasso searches.
search = GridSearchCV(dtm, param_grid, scoring='neg_mean_squared_error', cv=cv, n_jobs=-1)

results = search.fit(X, Y)
# Report the best (negated) MSE and the criterion that achieved it.
print('MSE (NEGATIVE): %.3f' % results.best_score_)
print('Config: %s' % results.best_params_)

In [None]:
# Decision tree with depth limiting ("pruning"): search over split criterion
# and max_depth, scored with negative MSE under 10-fold CV repeated 3 times.
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
dtm = DecisionTreeRegressor(random_state=42)
# "squared_error"/"absolute_error" are the current names of the criteria that
# used to be called "mse"/"mae"; the old aliases were removed in scikit-learn 1.2
# and are rejected there.
param_grid = {
    "criterion": ["squared_error", "absolute_error"],
    "max_depth": [2, 6, 8],
}
# n_jobs=-1 runs the folds in parallel, matching the Ridge/Lasso searches.
search = GridSearchCV(dtm, param_grid, scoring='neg_mean_squared_error', cv=cv, n_jobs=-1)

results = search.fit(X, Y)
# Report the best (negated) MSE and the configuration that achieved it.
print('MSE (NEGATIVE): %.3f' % results.best_score_)
print('Config: %s' % results.best_params_)

In [None]:
# Random forest: search over max_depth with 1000 trees, scored with negative
# MSE under 10-fold cross-validation repeated 3 times.
grid = {
    'n_estimators': [1000],
    'max_depth': [125, 150, 175],
}
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
# Seed the forest so repeated runs of this cell give the same scores
# (same seed as the decision-tree cells).
model = RandomForestRegressor(random_state=42)

# n_jobs=-1 runs the folds in parallel, matching the Ridge/Lasso searches.
search = GridSearchCV(model, param_grid=grid, scoring='neg_mean_squared_error', cv=cv, n_jobs=-1)
results = search.fit(X, Y)
# Report the best (negated) MSE and the configuration that achieved it.
print('MSE (NEGATIVE): %.3f' % results.best_score_)
print('Config: %s' % results.best_params_)

In [None]:
def grid_search():
    """Grid-search the learning rate of a gradient-boosting regressor.

    Fits on the notebook-level ``X`` and ``Y`` arrays. The search tries
    learning rates 0.001, 0.01 and 0.1 with 4000 estimators of depth 4,
    scored with negative mean squared error under 10-fold cross-validation
    repeated 3 times, using all available cores.

    Prints the best score (a negated MSE, so closer to zero is better)
    followed by the best parameter set. Returns None.
    """
    from sklearn.ensemble import GradientBoostingRegressor
    print ('lets go')

    # Seeded so that repeated runs produce the same scores.
    model = GradientBoostingRegressor(random_state=42)
    # define the grid of values to search
    grid = {
        'n_estimators': [4000],
        'learning_rate': [0.001, 0.01, 0.1],
        'max_depth': [4],
    }

    cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
    # define the grid search procedure; the local is named `search` so it does
    # not shadow this function's own name
    search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='neg_mean_squared_error')
    search = search.fit(X, Y)
    best_neg_mse = search.best_score_
    best_parameters = search.best_params_
    print (best_neg_mse)
    print (best_parameters)

grid_search()

In [None]:
# Final model: gradient boosting with fixed hyper-parameters, fitted on the
# full training arrays.
from sklearn.ensemble import GradientBoostingRegressor
# random_state is fixed so that refitting yields the same model and therefore
# the same submission.
rgr = GradientBoostingRegressor(learning_rate=0.1, n_estimators=4000, max_depth=4, random_state=42)
rgr.fit(X, Y)

In [None]:
# Load the test split and apply the same feature engineering as the training
# data: calendar features from 'datetime', then one-hot encoding.
test_df = pd.read_csv('/kaggle/input/bike-sharing-demand/test.csv')
# Parse the timestamp strings once; the .dt accessors below then derive every
# feature from the parsed column instead of re-parsing each row in Python.
test_df['datetime'] = pd.to_datetime(test_df['datetime'], format='%Y-%m-%d %H:%M:%S')
test_df['day'] = test_df['datetime'].dt.day_name()  # English weekday name, e.g. 'Monday'
test_df['month'] = test_df['datetime'].dt.month
test_df['hour'] = test_df['datetime'].dt.hour
test_df['year'] = test_df['datetime'].dt.year
test_df = test_df.drop(columns=['datetime', 'temp'])

# Add dummy variables for the categorical variables and drop the source columns.
# The dummy columns are appended in the order listed here, with the same
# prefixes as the training cell.
# NOTE(review): drop_first=True is applied to the test data on its own, so the
# dummy columns only line up with training if both splits contain the same
# levels of every variable.
test_df = pd.get_dummies(
    test_df,
    columns=['weather', 'year', 'month', 'hour', 'season', 'day'],
    prefix=['w', 'y', 'm', 'h', 's', 'd'],
    drop_first=True,
)

In [None]:
# Predict on the test features and write the submission file.
sample_submission = pd.read_csv('../input/bike-sharing-demand/sampleSubmission.csv')

# Feed the model the test columns by name, in exactly the order used for
# training (every final_df column except the target), instead of relying on
# the two frames happening to share a column layout.
feature_cols = [col for col in final_df.columns if col != 'count']
missing_cols = [col for col in feature_cols if col not in test_df.columns]
if missing_cols:
    raise ValueError('test_df is missing training feature columns: %s' % missing_cols)
X_test = test_df[feature_cols].to_numpy(dtype=float)

y_output = rgr.predict(X_test)
# The model was fitted on log(count), so exponentiate to get back to counts.
# 'datetime' is written first, then 'count'
# (presumably the sample file's column order; verify against sampleSubmission.csv).
op = pd.DataFrame({
    'datetime': sample_submission['datetime'],
    'count': np.exp(y_output),
})
op.to_csv('finalSubmission.csv', index=False)