In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found there.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from  xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor,BaggingRegressor,GradientBoostingRegressor
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import train_test_split,cross_validate
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV,StratifiedKFold
from sklearn.tree import DecisionTreeRegressor
import time

In [None]:
# Read the training and test CSVs of the bike-sharing-demand dataset.
train_csv = '../input/bike-sharing-demand/train.csv'
test_csv = '../input/bike-sharing-demand/test.csv'
df, test_data = (pd.read_csv(csv_path) for csv_path in (train_csv, test_csv))
df

In [None]:
# Print column dtypes and non-null counts of the training frame (quick missing-value check).
df.info()

***No missing Data***

# Datetime handling

In [None]:
# Parse the timestamp strings and derive calendar features from them.
# The format is left for pandas to infer instead of hard-coding a 'T' date/time
# separator: together with errors='coerce', a separator mismatch does not raise
# but can quietly turn every timestamp (and each derived feature) into NaT/NaN.
# NOTE(review): confirm which separator the CSV actually uses.
df['datetime'] = pd.to_datetime(df['datetime'], errors='coerce')

# One numeric column per calendar component of the parsed timestamp.
timestamps = df['datetime'].dt
df['Year'] = timestamps.year
df['Month'] = timestamps.month
df['Day'] = timestamps.day
df['Hour'] = timestamps.hour

df

### Adding WeekDays

In [None]:
# Name of the day of the week for every parsed timestamp.
df['Weekday'] = df['datetime'].dt.day_name()
df

In [None]:
# The raw timestamp is no longer needed once its components have been extracted.
df = df.drop(columns=['datetime'])

In [None]:
# Display the current state of the training frame.
df

In [None]:
# Count how many rows fall into each distinct value of the Hour column.
df['Hour'].value_counts()

### Data Encoding

In [None]:
# Replace weekday names with integer codes (pd.factorize numbers labels in order of first appearance).
weekday_codes, _weekday_labels = pd.factorize(df['Weekday'])
df['Weekday'] = weekday_codes
df.info()

In [None]:
# Flag every row as daytime (6 <= Hour <= 18, both ends inclusive) or night.
# The two bounds have to hold together; combining them with `or` is true for
# every possible hour and made the whole column a constant "Day".
is_daytime = df['Hour'].between(6, 18)

# Fixed integer encoding (Day -> 0, Night -> 1) rather than pd.factorize, whose
# codes depend on which label happens to appear first in the frame. The test
# frame must be encoded with the same rule for the feature to be usable.
df['DayorNight'] = np.where(is_daytime, 0, 1)
df

# Correlation

In [None]:

# Pairwise correlation between all columns of the training frame.
cor_mat = df.corr()

# Boolean mask that hides the strict upper triangle, so each pair is drawn once
# (lower triangle plus diagonal). Building the mask from the correlation values
# themselves would un-hide any upper-triangle cell whose correlation is exactly 0.
mask = np.triu(np.ones(cor_mat.shape, dtype=bool), k=1)

fig, ax = plt.subplots(figsize=(30, 12))
sns.heatmap(data=cor_mat, mask=mask, square=True, annot=True, cbar=True, ax=ax);

#### `workingday` is highly correlated with `Weekday`, `registered` and `casual`, so I will drop them

In [None]:
# Remove the columns singled out after inspecting the correlation heatmap.
df = df.drop(columns=['Weekday', 'registered', 'casual'])

# Data Splitting

In [None]:
# Model the target on a log1p scale and plot its distribution.
Y = np.log1p(df['count'])

fig, ax = plt.subplots(figsize=(10, 8))
sns.histplot(Y, ax=ax)
ax.set_title("Bike Count");

In [None]:
# Feature matrix: every remaining column except the target.
X = df.drop(columns=['count'])
X

In [None]:
# Hold out 20% of the rows for validation; the fixed random_state makes the split repeatable.
split_parts = train_test_split(
    X,
    Y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)
X_train, X_valid, y_train, y_valid = split_parts
X_train.info()

# Grid Search

In [None]:
import xgboost as xgb
from sklearn.ensemble import ExtraTreesRegressor

# Candidate regressors, all with default hyper-parameters, and their display names
# (the two lists are index-aligned).
models = [
    RandomForestRegressor(),
    AdaBoostRegressor(),
    BaggingRegressor(),
    SVR(),
    LinearRegression(),
    DecisionTreeRegressor(),
    XGBRegressor(),
    ExtraTreesRegressor(),
    GradientBoostingRegressor(),
]
model_names = [
    'RandomForestRegressor',
    'AdaBoostRegressor',
    'BaggingRegressor',
    'SVR',
    'LinearRegression',
    'DecisionTreeRegressor',
    'XGBRegressor',
    'ExtraTreesRegressor',
    'GradientBoostingRegressor',
]

# Validation score per model, in the same order as `model_names`. Not named
# `rmsle` so it cannot clash with the `rmsle` metric function defined further down.
rmsle_scores = []

for name, reg in zip(model_names, models):
    reg.fit(X_train, y_train)
    print("model_name : ", name)
    print(reg.get_params())

    valid_pred = reg.predict(X_valid)
    # A negative log-scale prediction corresponds to a negative bike count;
    # floor it at zero instead of mirroring it with abs().
    valid_pred = np.clip(valid_pred, 0, None)
    # The target is already log1p(count), so the RMSLE of the counts is simply
    # the RMSE on this scale. Passing these values to mean_squared_log_error
    # would take the logarithm a second time.
    rmsle_scores.append(np.sqrt(np.mean((valid_pred - y_valid) ** 2)))

d = {'Modelling Algo': model_names, 'RMSLE': rmsle_scores}
d

In [None]:
# Tabulate the per-model scores collected in `d` (one row per model).
rmsle_frame = pd.DataFrame.from_dict(d)
rmsle_frame

In [None]:
# Horizontal bar chart of the validation RMSLE of each model.
# `sns.factorplot` and its `size` argument no longer exist in current seaborn;
# `sns.catplot` with `height` is the direct replacement.
sns.catplot(y='Modelling Algo', x='RMSLE', data=rmsle_frame, kind='bar', height=5, aspect=2);

### The best model is the Bagging Regressor or XGBoost

# Building the Model

In [None]:
def rmsle(y_pred, y_true):
    """Root mean squared logarithmic error between predictions and targets.

    Computes sqrt(mean((log(1 + y_pred) - log(1 + y_true)) ** 2)). The metric is
    symmetric in its two arguments.

    Parameters
    ----------
    y_pred, y_true : array-like
        Values of equal length, compared element by element in positional order.
        Plain Python lists are accepted as well as NumPy arrays / pandas Series.
        Entries are expected to be greater than -1 so that the logarithm is defined.

    Returns
    -------
    numpy.float64
        The RMSLE; 0.0 for identical inputs.
    """
    # asarray lets list inputs work (list + 1 is a TypeError) and makes the
    # comparison positional; log1p is the numerically accurate form of log(1 + x).
    log_pred = np.log1p(np.asarray(y_pred, dtype=float))
    log_true = np.log1p(np.asarray(y_true, dtype=float))
    return np.sqrt(np.mean((log_pred - log_true) ** 2))

from sklearn.metrics import make_scorer


def _rmsle_on_log_targets(y_true_log, y_pred_log):
    """RMSLE of the bike counts, given targets and predictions on the log1p scale.

    The models in this notebook are fitted on log1p(count), so both arguments
    arrive log-transformed. They are mapped back to counts with expm1 before the
    metric is applied; feeding them to `rmsle` directly would take the logarithm twice.
    """
    return rmsle(np.expm1(y_pred_log), np.expm1(y_true_log))


# greater_is_better=False: sklearn negates the value so that a larger score is
# always better during model selection.
myScorer = make_scorer(_rmsle_on_log_targets, greater_is_better=False)

In [None]:
# Ensemble sizes to try for the bagging regressor.
param_grid = {'n_estimators': [50, 100, 150, 300, 500, 40]}

# Shuffled 5-fold split, in line with the shuffled hold-out split used earlier.
# StratifiedKFold stratifies on class labels and cannot split a continuous
# target, and the splitter defined here was previously never handed to the search.
kfold = KFold(n_splits=5, shuffle=True, random_state=1)

best = GridSearchCV(
    estimator=BaggingRegressor(DecisionTreeRegressor(min_samples_split=10)),
    param_grid=param_grid,
    scoring=myScorer,
    cv=kfold,
    n_jobs=None,
    verbose=0,
)
best.fit(X, Y)

# The scorer was built with greater_is_better=False, so best_score_ is the
# negated error; flip the sign back for reporting.
print(-best.best_score_, best.best_estimator_, best.best_params_)

# Testing

In [None]:
# Print column dtypes and non-null counts of the test frame.
test_data.info()

In [None]:
# Parse the test timestamps and derive the same calendar features as for training.
# The format is left for pandas to infer instead of hard-coding a 'T' date/time
# separator: together with errors='coerce', a separator mismatch does not raise
# but can quietly turn every timestamp (and each derived feature) into NaT/NaN.
# NOTE(review): confirm which separator the CSV actually uses.
test_data['datetime'] = pd.to_datetime(test_data['datetime'], errors='coerce')

# One numeric column per calendar component of the parsed timestamp.
test_timestamps = test_data['datetime'].dt
test_data['Year'] = test_timestamps.year
test_data['Month'] = test_timestamps.month
test_data['Day'] = test_timestamps.day
test_data['Hour'] = test_timestamps.hour

In [None]:
# Flag every test row as daytime (6 <= Hour <= 18, both ends inclusive) or night.
# The two bounds have to hold together; combining them with `or` is true for
# every possible hour and made the whole column a constant "Day".
test_is_daytime = test_data['Hour'].between(6, 18)

# Fixed integer encoding (Day -> 0, Night -> 1) rather than pd.factorize, whose
# codes depend on which label happens to appear first in the frame. The training
# frame must be encoded with the same rule for the feature to be usable.
test_data['DayorNight'] = np.where(test_is_daytime, 0, 1)
test_data

In [None]:

# Model input: the test frame without the raw timestamp column.
df_copy = test_data.drop(columns=['datetime'])

# The search was fitted on log1p(count): undo the transform with expm1 and
# round to whole bikes.
log_scale_pred = best.predict(df_copy)
pred = np.round(np.expm1(log_scale_pred)).astype(int)
#df_copy

# Submission File Generation

In [None]:
# Two-column submission: the test timestamps next to the predicted counts.
output = test_data[['datetime']].assign(count=pred)
output.to_csv('submission.csv', index=False)

print("Your submission was successfully saved!")