# Importing

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import os
import sklearn
from sklearn.model_selection import train_test_split
%matplotlib inline
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [None]:
# Locate the competition files and read the train and test splits.
datapath = "../input/bike-sharing-demand"
train_df, test_df = (
    pd.read_csv(os.path.join(datapath, csv_name))
    for csv_name in ("train.csv", "test.csv")
)
# Keep the raw test timestamps: the datetime column is dropped from test_df
# later, but the submission file still needs it.
date = test_df["datetime"]

# Extract date features from the datetime column

In [None]:
# Parse the datetime column and derive calendar features from it.
train_df['datetime'] = pd.to_datetime(train_df['datetime'])
train_stamps = train_df['datetime']

train_df['day'] = train_stamps.dt.day_name()
train_df['month'] = train_stamps.dt.month_name()
train_df['year'] = train_stamps.dt.year
train_df['hour'] = train_stamps.dt.hour
train_df['dayofweek'] = train_stamps.dt.dayofweek
# `Series.dt.weekofyear` was removed in pandas 2.0; `isocalendar().week` is
# its replacement. It returns a nullable UInt32 column, so cast to a plain
# integer dtype to keep the feature matrix purely NumPy-backed.
train_df['weekofyear'] = train_stamps.dt.isocalendar().week.astype('int64')

# The raw timestamp is no longer needed once its parts are extracted.
train_df = train_df.drop('datetime', axis=1)

In [None]:
# Preview the first rows to confirm the new calendar columns were added.
train_df.head(5)

# EDA

In [None]:
# Summary statistics of the numeric columns, transposed so each feature is a row.
display(train_df.describe().T)

In [None]:
# Column dtypes and non-null counts. `DataFrame.info()` prints its report and
# returns None, so it is called directly rather than wrapped in `display`,
# which would only add a stray "None" to the output.
train_df.info()

In [None]:
# Pairwise scatter plots of the weather-related features against the target
# ("count"), with kernel-density estimates on the diagonal.
sns.pairplot(train_df[["season","weather","temp","atemp","humidity","windspeed","count"]], diag_kind="kde")

In [None]:
# Count fully duplicated rows (rows identical to an earlier row in every column).
train_df.duplicated().sum()

In [None]:
# Split the columns of interest into two frames: the weather-related
# categoricals and the calendar features derived from the timestamp.
weather_cols = ["season", "weather"]
date_cols = ['day', 'month', 'year', 'hour', 'dayofweek', 'weekofyear']
weather_df = train_df[weather_cols]
date_df = train_df[date_cols]

In [None]:
# Mean rental count per category for each weather-related feature, one panel
# per feature. The grid is sized from the number of columns so no slot is left
# empty, and x/y are passed by keyword because current seaborn releases no
# longer accept them positionally.
fig, axes = plt.subplots(1, len(weather_df.columns), figsize=(14, 5), squeeze=False)
for ax, col in zip(axes.ravel(), weather_df.columns):
    sns.barplot(x=weather_df[col], y=train_df["count"], ax=ax)
    ax.set_title(f"count by {col}")
plt.show()

In [None]:
# Mean rental count per value of each calendar feature, one panel per feature.
# The grid is sized from the number of columns, and x/y are passed by keyword
# because current seaborn releases no longer accept them positionally.
panels_per_row = 3
n_panels = len(date_df.columns)
n_rows = -(-n_panels // panels_per_row)  # ceiling division
fig, axes = plt.subplots(n_rows, panels_per_row, figsize=(25, 10), squeeze=False)
for ax, col in zip(axes.ravel(), date_df.columns):
    sns.barplot(x=date_df[col], y=train_df["count"], ax=ax)
    ax.set_title(f"count by {col}")
# Hide any grid slots left over when the column count does not fill the grid.
for unused_ax in axes.ravel()[n_panels:]:
    unused_ax.set_visible(False)
plt.show()


# Feature engineering

In [None]:
# Distribution of the temperature column, inspected before bucketing it.
train_df["temp"].describe()

In [None]:
# Bucket temperature into coded ranges:
#   temp <= 10 -> 0, (10, 18] -> 2, (18, 25] -> 3, (25, 32] -> 4, temp > 32 -> 1
# `pd.cut` uses right-closed intervals by default, which matches the `<=`
# boundaries above. This replaces a row-by-row loop that assigned through
# chained indexing (`df[col][i] = v`), which raises SettingWithCopyWarning and
# does not write back to the frame at all when pandas copy-on-write is active.
# The result is cast to float to keep the dtype the column had before.
train_df["temp_range"] = pd.cut(
    train_df["temp"],
    bins=[-np.inf, 10, 18, 25, 32, np.inf],
    labels=[0, 2, 3, 4, 1],
).astype(float)

In [None]:
# Total rentals per temperature bucket. Only "count" is aggregated: summing
# the bucket codes themselves would produce a meaningless column.
train_df.groupby("temp_range")[["count"]].sum()

In [None]:
# Boolean hour-of-day flags derived from the "hour" column.
rush_hours = [8, 17, 18, 19, 20, 21]
low_hours = [0, 1, 2, 3, 4]
train_hours = train_df['hour']

train_df['RushHour'] = train_hours.isin(rush_hours)
train_df['lowHour'] = train_hours.isin(low_hours)
# True for hours 7 through 20 inclusive.
train_df['DayorNight'] = (train_hours >= 7) & (train_hours <= 20)

In [None]:
# Re-check dtypes after adding the engineered columns. `DataFrame.info()`
# prints its report and returns None, so it is called directly rather than
# wrapped in `display`, which would only add a stray "None" to the output.
train_df.info()

In [None]:
# Replace the day and month name strings with integer category codes.
for name_col in ("day", "month"):
    train_df[name_col] = train_df[name_col].astype('category').cat.codes

In [None]:
# Preview the frame after encoding to check the engineered columns.
train_df.head(10)

In [None]:
# Pairwise correlation of every column, drawn as an annotated heatmap.
corr_martix = train_df.corr()
fig, ax = plt.subplots(figsize=(15, 10))
ax.set_title('Correlation between Features', size=18)
sns.heatmap(
    corr_martix,
    linewidths=0.01,
    cmap="Blues",
    annot=True,
    ax=ax,
)
plt.show()

# Drop features

In [None]:
# Pull the target columns out before they are dropped from the feature frame:
# Y is the total count, Y1 and Y2 are the casual and registered columns.
Y  = train_df['count']
Y1 = train_df['casual']
Y2 = train_df['registered']

In [None]:
# Remove the three target columns together with "year" and "atemp", leaving
# only the model's input features.
dropped_cols = ["year", "atemp", "count", "casual", "registered"]
train_df = train_df.drop(columns=dropped_cols)

# Test data

In [None]:
# Give the test set the same engineered columns as the training frame.

# Parse the datetime column and derive calendar features from it.
test_df['datetime'] = pd.to_datetime(test_df['datetime'])
test_stamps = test_df['datetime']
test_df['day'] = test_stamps.dt.day_name()
test_df['month'] = test_stamps.dt.month_name()
test_df['year'] = test_stamps.dt.year
test_df['hour'] = test_stamps.dt.hour
test_df['dayofweek'] = test_stamps.dt.dayofweek
# `Series.dt.weekofyear` was removed in pandas 2.0; `isocalendar().week` is
# its replacement. Cast from nullable UInt32 to a plain integer dtype.
test_df['weekofyear'] = test_stamps.dt.isocalendar().week.astype('int64')
test_df = test_df.drop('datetime', axis=1)
############################################################
# Bucket temperature into coded ranges:
#   temp <= 10 -> 0, (10, 18] -> 2, (18, 25] -> 3, (25, 32] -> 4, temp > 32 -> 1
# `pd.cut` uses right-closed intervals by default, matching the `<=` bounds.
# This replaces a row-by-row loop that assigned through chained indexing
# (`df[col][i] = v`), which raises SettingWithCopyWarning and does not write
# back to the frame when pandas copy-on-write is active.
test_df["temp_range"] = pd.cut(
    test_df["temp"],
    bins=[-np.inf, 10, 18, 25, 32, np.inf],
    labels=[0, 2, 3, 4, 1],
).astype(float)
############################################################
# Boolean hour-of-day flags.
test_df['RushHour'] = test_df['hour'].isin([8, 17, 18, 19, 20, 21])
test_df['lowHour'] = test_df['hour'].isin([0, 1, 2, 3, 4])
test_df['DayorNight'] = (test_df['hour'] >= 7) & (test_df['hour'] <= 20)
############################################################
# Replace the day and month name strings with integer category codes.
test_df["day"] = test_df["day"].astype('category').cat.codes
test_df["month"] = test_df["month"].astype('category').cat.codes
############################################################
# Drop the columns that were also removed from the training features.
test_df = test_df.drop(["year", "atemp"], axis=1)

In [None]:
# Sanity check: both frames should report the same number of feature columns.
print(train_df.shape, test_df.shape)

# Model

In [None]:
# Hold out 5% of the training rows for validation; the fixed random_state
# makes the split repeatable across runs.
x_train, x_val, y_train, y_val = train_test_split(train_df , Y , test_size = 0.05, random_state = 29)

In [None]:
# evaluation metric
from math import sqrt
def rmsle(y_pred , y_actual):
    """Root mean squared logarithmic error between predictions and targets.

    Parameters
    ----------
    y_pred : array-like
        Predicted values (expected to be greater than -1).
    y_actual : array-like
        True values, same length as ``y_pred``.

    Returns
    -------
    float
        ``sqrt(mean((log1p(y_pred) - log1p(y_actual)) ** 2))``.
    """
    # Convert to plain arrays so the comparison is position-wise. Subtracting
    # two pandas Series with different indexes would align on labels instead
    # and yield NaNs that a NaN-skipping sum then silently ignores.
    pred = np.asarray(y_pred, dtype=float)
    actual = np.asarray(y_actual, dtype=float)
    squared_log_error = (np.log1p(pred) - np.log1p(actual)) ** 2
    return float(np.sqrt(squared_log_error.mean()))

from sklearn.metrics import make_scorer
# Wrap rmsle as a scikit-learn scorer. greater_is_better=False marks it as a
# loss, so the scorer reports the negated error (higher is still better).
# NOTE(review): scikit-learn invokes the metric as f(y_true, y_pred), the
# reverse of rmsle's (y_pred, y_actual) parameter order. That looks harmless
# because rmsle squares the difference and is symmetric in its arguments.
myScorer = make_scorer(rmsle, greater_is_better=False)

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split,StratifiedKFold,GridSearchCV
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor,BaggingRegressor,ExtraTreesRegressor,VotingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import VotingRegressor
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import mean_squared_log_error

from sklearn.preprocessing import MaxAbsScaler,PowerTransformer,MinMaxScaler,RobustScaler

from xgboost import XGBRegressor

from scipy import stats

In [None]:
HistGradient = HistGradientBoostingRegressor()

# Single-point grid: GridSearchCV is used here to get 10-fold CV scores for
# one fixed configuration.
param = {
    'max_iter': [115],
    'max_depth': [11],
    'max_leaf_nodes': [15],
    'max_bins': [150],
}


def _rmse_on_log_scale(y_true_log, y_pred_log):
    """RMSE between values that are already on the log1p scale.

    The model below is fit on ``log1p(count)``, so a plain RMSE on its
    outputs equals the RMSLE on the original count scale. Scoring with a
    metric that applies the log again would report a log-of-log error.
    """
    diff = np.asarray(y_pred_log, dtype=float) - np.asarray(y_true_log, dtype=float)
    return float(np.sqrt(np.mean(diff ** 2)))


# greater_is_better=False: the scorer reports the negated error.
log_scale_scorer = make_scorer(_rmse_on_log_scale, greater_is_better=False)

gridSearch_HistGradient = GridSearchCV(
    HistGradient, param, scoring=log_scale_scorer, cv=10, verbose=3
)
# Train on log1p(count); predictions must be inverted with expm1.
gridSearch_HistGradient.fit(x_train, np.log1p(y_train))

best_HistGradient = gridSearch_HistGradient.best_estimator_
# NOTE: despite its name, this is the estimator's default score (R^2 for
# regressors) on the *training* split, measured on the log1p scale. It is not
# a held-out test score.
bestHistGradient_testScore = best_HistGradient.score(x_train, np.log1p(y_train))

In [None]:
# Hyper-parameters of the best configuration found by the grid search.
gridSearch_HistGradient.best_params_

In [None]:
# Score of the refit best estimator. Despite the "testScore" name, it was
# computed on x_train with log1p targets, so it is a training-set score.
bestHistGradient_testScore

In [None]:
# The model was fit on log1p(count), so invert with expm1 to get validation
# predictions back on the original count scale.
pred=np.expm1(best_HistGradient.predict(x_val))

In [None]:
# RMSLE of the held-out validation split, on the original count scale.
print(rmsle(pred,y_val))

# Submission

In [None]:
# Inspect the engineered test features before predicting on them.
test_df

In [None]:
# Predict on the test features and invert the log1p target transform with
# expm1 to recover counts on the original scale.
predictions = np.expm1(best_HistGradient.predict(test_df))

In [None]:
# Pair each prediction with the test timestamp saved in `date` at load time.
# NOTE(review): this rebinds `predictions` from the prediction array to a
# DataFrame, so re-running this cell without first re-running the prediction
# cell would feed the DataFrame back in as the 'count' values.
predictions = pd.DataFrame({'datetime':date,
                       'count': predictions})

In [None]:
# Write the submission file to the working directory without the row index.
predictions.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")

In [None]:
# Preview the first rows of the submission frame.
predictions.head(10)