## Importing Necessary Libraries

In [None]:
import pandas as pd
import pickle
import numpy as np
import re
from sklearn import linear_model


import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.ensemble import RandomForestRegressor
from sklearn import linear_model
from sklearn import ensemble 
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import BaggingRegressor


from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold
import xgboost as xgb
from sklearn.model_selection import train_test_split


from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score, KFold

from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.kernel_ridge import KernelRidge
from sklearn.ensemble import RandomForestRegressor
from sklearn import preprocessing

import os
import warnings
warnings.filterwarnings('ignore') 
from pandas_profiling import ProfileReport

## Understanding the Data Set

In [None]:
# Load the train and test CSV files; the `datetime` column is parsed into
# timestamps at read time so the `.dt` accessor can be used on it later.
train=pd.read_csv('../input/bike-sharing-demand/train.csv', parse_dates=['datetime'])
test=pd.read_csv('../input/bike-sharing-demand/test.csv', parse_dates=['datetime'])

In [None]:
# Quick look at the training frame: dimensions, first rows, column dtypes.
for overview in (train.shape, train.head(), train.dtypes):
    print(overview)

In [None]:
# Build a profiling report for the training frame; leaving `profile` as the
# cell's last expression makes the notebook display it.
profile = ProfileReport(train, title="Pandas Profiling Report")
profile

In [None]:
# Export the profiling report as an HTML file in the working directory.
profile.to_file("train_profiling_report.html")

In [None]:
# Shape and columns of the test frame, followed by the columns that exist
# only in the training frame.
print(test.shape)
print(test.columns)
train_only_columns = [name for name in train.columns if name not in test.columns]
print(train_only_columns)

In [None]:
# Drop `casual` and `registered` from the training frame (presumably because
# they are not available in the test frame -- see the column diff printed above).
train = train.drop(columns=['casual', 'registered'])
train.head()

In [None]:
def rmsle(y_log, y0_log):
    """Return the root-mean-square difference between two sets of values.

    No logarithm is taken here: the function yields an RMSLE only when both
    arguments have already been log-transformed by the caller.
    """
    residual = y_log - y0_log
    return np.sqrt(np.mean(residual ** 2))

## EDA & Feature Engineering

In [None]:
# Remove outliers: keep only rows whose `count` lies within three standard
# deviations of the mean.
# `.copy()` makes the filtered result an independent frame, so the in-place
# column assignments done on `train` afterwards do not hit pandas'
# SettingWithCopy handling for slices of another frame.
count_deviation = np.abs(train["count"] - train["count"].mean())
train = train[count_deviation <= 3 * train["count"].std()].copy()

In [None]:
# Derive calendar parts from `datetime` and bucket working-day hours into
# "rush" / "day" / "night". Rows on non-working days keep a missing label.
for df in (train, test):

    df['year'] = df['datetime'].dt.year
    df['month'] = df['datetime'].dt.month
    df['day'] = df['datetime'].dt.day
    df['hr'] = df['datetime'].dt.hour

    workday = df.workingday == 1

    # Start from an object-dtype column of NaN: writing strings into a float
    # column relies on implicit upcasting, which pandas has deprecated.
    df['hr_categori'] = pd.Series(np.nan, index=df.index, dtype=object)
    df.loc[workday & (df.hr >= 7) & (df.hr <= 9), 'hr_categori'] = "rush"
    df.loc[workday & (df.hr >= 16) & (df.hr <= 19), 'hr_categori'] = "rush"
    df.loc[workday & (df.hr > 9) & (df.hr < 16), 'hr_categori'] = "day"
    # `&` binds tighter than `|`, so the two hour conditions must be grouped
    # explicitly; without the parentheses every hour > 19 was labelled
    # "night" regardless of `workingday`.
    df.loc[workday & ((df.hr < 7) | (df.hr > 19)), 'hr_categori'] = "night"
  

#### Hour

In [None]:
# Bar chart of `count` per hour of day. `factorplot` was renamed to `catplot`
# (and its `size` argument to `height`) in seaborn 0.9.
sns.catplot(x="hr", y="count", data=train, kind='bar', height=5, aspect=1.5)

#### Month

In [None]:
# Bar chart of `count` per month. `factorplot` was renamed to `catplot`
# (and its `size` argument to `height`) in seaborn 0.9.
sns.catplot(x="month", y="count", data=train, kind='bar', height=5, aspect=1.5)

#### Year

In [None]:
# Bar chart of `count` per year. `factorplot` was renamed to `catplot`
# (and its `size` argument to `height`) in seaborn 0.9.
sns.catplot(x="year", y="count", data=train, kind='bar', height=5, aspect=1.5)

#### Temp

In [None]:
# Plot `count` against temperature grouped into bins of width 3, working on
# a copy so `train` itself gets no helper column.
new_df = train.copy()

new_df['temp_bin'] = np.floor(new_df['temp']) // 3

# `factorplot` was renamed to `catplot` in seaborn 0.9.
sns.catplot(x="temp_bin", y="count", data=new_df, kind='bar')

#### Humidity

In [None]:
# Plot `count` against humidity grouped into bins of width 5, working on a
# copy so `train` itself gets no helper column.
new_df = train.copy()
new_df['humidity_bin'] = np.floor(new_df['humidity']) // 5

# `factorplot` was renamed to `catplot` in seaborn 0.9.
sns.catplot(x="humidity_bin", y="count", data=new_df, kind='bar')

In [None]:
# Bucket humidity into 'low' (< 15), 'normal' (15 up to but excluding 85)
# and 'high' (>= 85).
for df in (train, test):
    # Object-dtype NaN column: writing strings into a float column relies on
    # implicit upcasting, which pandas has deprecated.
    df['humi_categori'] = pd.Series(np.nan, index=df.index, dtype=object)
    df.loc[df.humidity < 15, 'humi_categori'] = 'low'
    # The ranges used to overlap at exactly 85 ('normal' written first, then
    # overwritten by 'high'); they are now disjoint with the same outcome.
    df.loc[(df.humidity >= 15) & (df.humidity < 85), 'humi_categori'] = 'normal'
    df.loc[df.humidity >= 85, 'humi_categori'] = 'high'

#### Windspeed

In [None]:
# Show the training rows whose windspeed is 48 or more.
strong_wind = train['windspeed'] >= 48
train[strong_wind]

In [None]:
# Plot `count` against windspeed grouped into bins of width 3, working on a
# copy so `train` itself gets no helper column.
new_df = train.copy()

new_df['windspeed_bin'] = np.floor(new_df['windspeed']) // 3
# `factorplot` was renamed to `catplot` in seaborn 0.9.
sns.catplot(x="windspeed_bin", y="count", data=new_df, kind='bar')

In [None]:
# Bucket windspeed into 'normal' (< 15) and 'high' (>= 15).
for df in (train, test):
    # Object-dtype NaN column: writing strings into a float column relies on
    # implicit upcasting, which pandas has deprecated.
    df['wind_categori'] = pd.Series(np.nan, index=df.index, dtype=object)
    df.loc[df.windspeed < 15, 'wind_categori'] = 'normal'
    df.loc[df.windspeed >= 15, 'wind_categori'] = 'high'

#### Convert categorical variable into dummy variables

In [None]:
# One-hot encode each categorical column for both frames. The default
# `prefix_sep` of pandas.get_dummies is already '_', so only the prefix is given.
weather_train, weather_test = (
    pd.get_dummies(frame['weather'], prefix='wea') for frame in (train, test)
)

season_train, season_test = (
    pd.get_dummies(frame['season'], prefix='sea') for frame in (train, test)
)

year_train, year_test = (
    pd.get_dummies(frame['year'], prefix='year') for frame in (train, test)
)

wind_categori_train, wind_categori_test = (
    pd.get_dummies(frame['wind_categori'], prefix='wind_cate') for frame in (train, test)
)

humi_categori_train, humi_categori_test = (
    pd.get_dummies(frame['humi_categori'], prefix='humi_cate') for frame in (train, test)
)

hr_categori_train, hr_categori_test = (
    pd.get_dummies(frame['hr_categori'], prefix='hr_cate') for frame in (train, test)
)

#### Train/Test Set Integration

In [None]:
# Attach every dummy-encoded block to the training frame, column-wise.
train_pieces = [
    train,
    weather_train,
    season_train,
    year_train,
    wind_categori_train,
    humi_categori_train,
    hr_categori_train,
]
Train_Master = pd.concat(train_pieces, axis='columns')

In [None]:
# Attach every dummy-encoded block to the test frame, column-wise.
test_pieces = [
    test,
    weather_test,
    season_test,
    year_test,
    wind_categori_test,
    humi_categori_test,
    hr_categori_test,
]
Test_Master = pd.concat(test_pieces, axis='columns')

In [None]:
# Columns that are now represented by dummy columns, plus the raw timestamp.
replaced_columns = ['weather', 'season', 'year', 'datetime',
                    'hr_categori', 'humi_categori', 'wind_categori']
Train_Master = Train_Master.drop(columns=replaced_columns)
Test_Master = Test_Master.drop(columns=replaced_columns)

# Hold out 30% of the labelled rows for local evaluation.
# NOTE(review): no random_state is passed, so the split differs between runs.
Train, Test = train_test_split(Train_Master, test_size=0.3)

In [None]:
# Split the local train/hold-out frames into features and target. The
# `*_log` targets are log1p(count); they are what the models are fitted and
# scored against.
X_train = Train.drop(['count'], axis=1)
Y_train = Train['count']
# Vectorised log1p over the whole column instead of a per-element lambda.
Y_train_log = np.log1p(Train['count'])

X_test = Test.drop(['count'], axis=1)
Y_test = Test["count"]
Y_test_log = np.log1p(Test["count"])

# Re-number the raw targets from 0; note that this turns them into
# single-column DataFrames (the old index column is dropped right after).
Y_test = Y_test.reset_index().drop('index',axis = 1)
Y_train = Y_train.reset_index().drop('index',axis = 1)

In [None]:
# Columns and shape of both master frames, then the columns present in
# Train_Master but absent from Test_Master.
for master in (Train_Master, Test_Master):
    print(master.columns, master.shape)
missing_from_test = [name for name in Train_Master.columns if name not in Test_Master.columns]
print(missing_from_test)

## Auto Tuning

In [None]:
def lasso_autotune(X,y):
    """Grid-search a Lasso regressor over `alpha` and return the best one.

    Runs a 5-fold GridSearchCV scored by negative mean squared error, prints
    the full CV results, the best parameters and score, and the error of the
    refit best estimator on the same (X, y) it was fitted on, as computed by
    `rmsle`.

    Parameters
    ----------
    X : feature matrix passed to `GridSearchCV.fit`.
    y : target passed to `GridSearchCV.fit`.

    Returns
    -------
    The refit `best_estimator_` of the search.
    """
    param_test = {
        'max_iter': [5000],
        'alpha': [1e-10, 1e-8, 1e-4, 1e-2, 1, 5, 10, 100, 500, 1000],
    }

    # `iid` is no longer passed: GridSearchCV dropped that argument in
    # scikit-learn 0.24 and passing it raises TypeError there.
    # NOTE(review): `normalize=True` is itself deprecated since scikit-learn
    # 1.0 and removed in 1.2; replacing it needs an explicit scaling step and
    # re-tuned alphas, so it is left untouched here.
    gsearch = GridSearchCV(
        Lasso(fit_intercept=True, normalize=True, selection='random'),
        param_grid=param_test,
        scoring='neg_mean_squared_error',
        refit=True,
        cv=5,
    )
    gsearch.fit(X, y)
    print('____________________________________________')
    print('tune lasso')
    print('____________________________________________')
    print(gsearch.cv_results_)
    print(gsearch.best_params_)
    print(gsearch.best_score_)
    # In-sample error of the refit best estimator.
    print(rmsle(gsearch.predict(X), y))

    return gsearch.best_estimator_

In [None]:
def ridge_autotune(X,y):
    """Grid-search a Ridge regressor over `alpha` and `solver`; return the best one.

    Runs a 5-fold GridSearchCV scored by negative mean squared error, prints
    the full CV results, the best parameters and score, and the error of the
    refit best estimator on the same (X, y) it was fitted on, as computed by
    `rmsle`.

    Parameters
    ----------
    X : feature matrix passed to `GridSearchCV.fit`.
    y : target passed to `GridSearchCV.fit`.

    Returns
    -------
    The refit `best_estimator_` of the search.
    """
    param_test = {
        'max_iter': [5000],
        'alpha': [1e-10, 1e-8, 1e-4, 1e-2, 1, 5, 10, 100],
        'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'],
    }

    # `iid` is no longer passed: GridSearchCV dropped that argument in
    # scikit-learn 0.24 and passing it raises TypeError there.
    gsearch = GridSearchCV(
        Ridge(),
        param_grid=param_test,
        scoring='neg_mean_squared_error',
        refit=True,
        cv=5,
    )
    gsearch.fit(X, y)
    print('____________________________________________')
    print('tune ridge')
    print('____________________________________________')
    print(gsearch.cv_results_)
    print(gsearch.best_params_)
    print(gsearch.best_score_)
    # In-sample error of the refit best estimator.
    print(rmsle(gsearch.predict(X), y))

    return gsearch.best_estimator_

In [None]:
def random_forest_autotune(X,y):
    """Grid-search a RandomForestRegressor and return the best one.

    Searches `max_depth`, `max_features` and `n_estimators` with a 5-fold
    GridSearchCV scored by negative mean squared error, prints the full CV
    results, the best parameters and score, and the error of the refit best
    estimator on the same (X, y) it was fitted on, as computed by `rmsle`.

    Parameters
    ----------
    X : feature matrix passed to `GridSearchCV.fit`.
    y : target passed to `GridSearchCV.fit`.

    Returns
    -------
    The refit `best_estimator_` of the search.
    """
    param_test = {
        'max_depth': [9, 13, 17],
        # 1.0 (use all features) is what 'auto' meant for regressors; the
        # string alias was deprecated in scikit-learn 1.1 and later removed.
        'max_features': [1.0, 'sqrt'],
        'n_estimators': [500, 1000, 1500],
    }

    # `iid` is no longer passed: GridSearchCV dropped that argument in
    # scikit-learn 0.24 and passing it raises TypeError there.
    gsearch = GridSearchCV(
        RandomForestRegressor(),
        param_grid=param_test,
        scoring='neg_mean_squared_error',
        refit=True,
        cv=5,
    )
    gsearch.fit(X, y)
    print('____________________________________________')
    print('tune random forest')
    print('____________________________________________')
    print(gsearch.cv_results_)
    print(gsearch.best_params_)
    print(gsearch.best_score_)
    # In-sample error of the refit best estimator.
    print(rmsle(gsearch.predict(X), y))

    return gsearch.best_estimator_

In [None]:
def xgb_autotune(X,y):
    """Grid-search an XGBRegressor and return the best one.

    `objective`, `n_estimators`, `gamma`, `colsample_bytree` and `subsample`
    are fixed; `max_depth`, `reg_alpha` and `learning_rate` are searched with
    a 5-fold GridSearchCV scored by negative mean squared error. Prints the
    full CV results, the best parameters and score, and the error of the
    refit best estimator on the same (X, y) it was fitted on, as computed by
    `rmsle`.

    Parameters
    ----------
    X : feature matrix passed to `GridSearchCV.fit`.
    y : target passed to `GridSearchCV.fit`.

    Returns
    -------
    The refit `best_estimator_` of the search.
    """
    # Settings shared by every candidate. `learning_rate` and `reg_alpha` are
    # not listed here because the grid below sets them for every candidate.
    fixed_params = {
        'objective': 'reg:squarederror',
        'n_estimators': 1000,
        'gamma': 0,
        'colsample_bytree': 0.5,
        'subsample': 0.8,
    }

    param_test1 = {
        'max_depth': [7, 13, 15],
        'reg_alpha': [0.1, 1, 10, 100],
        'learning_rate': [0.05, 0.1],
    }

    # `iid` is no longer passed: GridSearchCV dropped that argument in
    # scikit-learn 0.24 and passing it raises TypeError there.
    gsearch = GridSearchCV(
        xgb.XGBRegressor(**fixed_params),
        param_grid=param_test1,
        scoring='neg_mean_squared_error',
        cv=5,
    )
    gsearch.fit(X, y)
    print('____________________________________________')
    # The banner used to claim 'max_depth, min_child_weight', which is not
    # what the grid searches; it now follows the other tuners' wording.
    print('tune xgb')
    print('____________________________________________')
    print(gsearch.cv_results_)
    print(gsearch.best_params_)
    print(gsearch.best_score_)
    # In-sample error of the refit best estimator.
    print(rmsle(gsearch.predict(X), y))
    return gsearch.best_estimator_

## Fitting 1st layer models

In [None]:
# Tune and fit the four first-layer models on the log1p target. Despite the
# `_param` suffix, each variable holds the refit best estimator returned by
# its tuner, not a parameter dict.
lasso_param = lasso_autotune(X_train,Y_train_log)
ridge_param = ridge_autotune(X_train,Y_train_log)
xgb_param = xgb_autotune(X_train,Y_train_log)
random_forest_param = random_forest_autotune(X_train,Y_train_log)

In [None]:
# Score every first-layer model on the local hold-out split (log1p target).
model_names = ['lasso', 'ridge', 'xgb', 'random forest']
fitted_models = [lasso_param, ridge_param, xgb_param, random_forest_param]
rmsles = []

for model_name, fitted_model in zip(model_names, fitted_models):
    # Predict once and reuse the score for both the list and the printout.
    score = rmsle(fitted_model.predict(X_test), Y_test_log)
    rmsles.append(score)

    print('='*15)
    print(model_name + ' test score')
    print(score)
    print('='*15)

# The summary must hold the collected scores (`rmsles`); it previously stored
# the `rmsle` function object itself.
d = {'Modelling Algo': model_names, 'RMSLE': rmsles}
print(d)

## Fitting 2nd layer model

In [None]:
# Empty (all-NaN) frame with one column per first-layer model and one row per
# training sample; the columns are filled with predictions afterwards.
predict_met = pd.DataFrame(
    np.nan,
    index=pd.RangeIndex(X_train.shape[0]),
    columns=['lasso', 'ridge', 'xgb', 'random forest'],
)

In [None]:
# Fill the second-layer training features with each first-layer model's
# predictions on X_train.
# NOTE(review): these are predictions on the rows the models were fitted on,
# not out-of-fold predictions -- confirm this is intended for the stack.
train_layer_models = (
    ('lasso', lasso_param),
    ('ridge', ridge_param),
    ('xgb', xgb_param),
    ('random forest', random_forest_param),
)
for column, fitted in train_layer_models:
    predict_met[column] = fitted.predict(X_train)

predict_met.head()

In [None]:
# Second layer: tune an XGBoost regressor on the first-layer predictions,
# still against the log1p target.
ensenble_xgb = xgb_autotune(predict_met, Y_train_log)

In [None]:
# Second-layer features for the local hold-out split: one column per
# first-layer model, filled with that model's predictions on X_test.
test_met = pd.DataFrame(
    np.nan,
    index=pd.RangeIndex(X_test.shape[0]),
    columns=['lasso', 'ridge', 'xgb', 'random forest'],
)

holdout_layer_models = (
    ('lasso', lasso_param),
    ('ridge', ridge_param),
    ('xgb', xgb_param),
    ('random forest', random_forest_param),
)
for column, fitted in holdout_layer_models:
    test_met[column] = fitted.predict(X_test)

In [None]:
# Score the second-layer model on the hold-out split, using the first-layer
# hold-out predictions as its inputs.
print('='*15)
print('outer layer test score')
print(rmsle(ensenble_xgb.predict(test_met),Y_test_log))
print('='*15)

## Fitting Test Set

In [None]:
# Give Test_Master exactly the training feature columns, in the same order.
# Any dummy column that does not exist in the test data is created and filled
# with 0. Hard-coding `humi_cate_low = 0` would overwrite genuine values if the
# test data ever produced that column, and would not cover other missing ones.
Test_Master = Test_Master.reindex(columns=X_train.columns, fill_value=0)

In [None]:
# Second-layer features for the real test set: one column per first-layer
# model, filled with that model's predictions on Test_Master.
result_met = pd.DataFrame(
    np.nan,
    index=pd.RangeIndex(Test_Master.shape[0]),
    columns=['lasso', 'ridge', 'xgb', 'random forest'],
)
submission_layer_models = (
    ('lasso', lasso_param),
    ('ridge', ridge_param),
    ('xgb', xgb_param),
    ('random forest', random_forest_param),
)
for column, fitted in submission_layer_models:
    result_met[column] = fitted.predict(Test_Master)

result_met.head()

In [None]:
# Final prediction from the second-layer model. The models were fitted on
# log1p(count), so this output is on that log scale.
pred_log = ensenble_xgb.predict(result_met)

## Submit to competition

In [None]:
# Undo the log1p transform applied to the training target.
pred = np.expm1(pred_log)

submission = pd.DataFrame({'datetime': test['datetime'], 'count': pred})

# Replace non-positive predictions with 1. A boolean-mask assignment does
# this in one vectorised step instead of a Python call per row.
submission.loc[submission['count'] <= 0, 'count'] = 1
submission.to_csv("submission.csv", index=False)

In [None]:
# Display the submission frame that was written to submission.csv.
submission