In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Enumerate every file Kaggle has mounted under the input directory, one path per line.
input_paths = (
    os.path.join(folder, entry)
    for folder, _, entries in os.walk('/kaggle/input')
    for entry in entries
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### Import Packages for Visualization

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

### Read train data set

In [None]:
# Folder holding the competition CSV files inside the Kaggle input mount.
datasetpath = '/kaggle/input/bike-sharing-demand/'

# Read the training split and report its (rows, columns) shape.
train_csv_path = os.path.join(datasetpath, 'train.csv')
df = pd.read_csv(train_csv_path)

print("The shape of the dataset is {}.".format(df.shape))

In [None]:
# Preview the first rows of the training frame.
df.head()

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Summary statistics of the numeric columns.
df.describe()

In [None]:
# Number of distinct values in each column.
df.nunique()

In [None]:
# Parse the timestamp once, then expand it into separate calendar features.
df["datetime"] = pd.to_datetime(df["datetime"])
timestamp = df["datetime"].dt
for calendar_part in ("year", "month", "day", "hour"):
    df[calendar_part] = getattr(timestamp, calendar_part)
df["DayName"] = timestamp.day_name()

# The raw timestamp is dropped now that its components are separate columns.
df = df.drop(columns="datetime")
df.head()

Remove outlier rows where humidity (%) equals zero. <br>
[The concept of zero percent relative humidity — air completely devoid of water vapor — is intriguing, but given Earth’s climate and weather conditions, it’s an impossibility.](https://wgntv.com/weather/can-the-relative-humidity-ever-be-zero-percent/)

In [None]:
# A relative humidity of exactly 0% is treated as an outlier, so those rows are removed.
humidity_filter = df['humidity'].eq(0)
print("sum of rows where humidity equals zero:", humidity_filter.sum())
df = df.drop(index=df.index[humidity_filter])

In [None]:
# Columns drawn as box plots (rather than scatter plots) against `count` in the per-feature plots.
lst_objects = ['season', 'holiday', 'workingday', 'weather', 'DayName']

In [None]:
# Plot every feature against the target `count`, one panel per feature.
# The panel list is derived from the frame itself, so the cell keeps working when
# columns are added or removed instead of relying on hard-coded axis/column counts.
plot_columns = [name for name in df.columns if name != 'count']

# Discrete-valued columns get a box plot; everything else gets a scatter plot.
boxplot_columns = set(lst_objects) | {'year', 'month', 'day', 'hour'}

# 80 / 15 inches per panel reproduces the original 10x80 canvas for 15 features.
fig, axes = plt.subplots(nrows=len(plot_columns), ncols=1,
                         figsize=(10, len(plot_columns) * 80 / 15), squeeze=False)

for ax, name in zip(axes.ravel(), plot_columns):
    if name in boxplot_columns:
        sns.boxplot(data=df, x=name, y='count', ax=ax)
    else:
        sns.scatterplot(data=df, x=name, y='count', ax=ax)

Add the part of the day as indicator features (Morning [5 am to 12 pm], Afternoon [12 pm to 5 pm], Evening [5 pm to 9 pm], Night [9 pm to 4 am]). <br>
[source](https://www.learnersdictionary.com/qa/parts-of-the-day-early-morning-late-morning-etc)

In [None]:
# One 0/1 indicator column per part of the day, derived from the integer hour.
hour = df['hour']
part_of_day_masks = {
    'Morning': (hour >= 5) & (hour <= 12),
    'Afternoon': (hour > 12) & (hour <= 17),
    'Evening': (hour > 17) & (hour <= 21),
    'Night': (hour > 21) | (hour < 5),
}
for part_name, part_mask in part_of_day_masks.items():
    df[part_name] = np.where(part_mask, 1, 0)
df

In [None]:
# One-hot encode the weekday name and swap the text column for the indicator columns.
encoded_col = pd.get_dummies(df['DayName'])
df = df.drop(columns='DayName').join(encoded_col)

In [None]:
# Heatmap of absolute pairwise correlations between all columns.
corr = df.corr().abs()
corr_arr = corr.values
feature_labels = corr.columns

fig, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(corr_arr, xticklabels=feature_labels, yticklabels=feature_labels,
            annot=True, cmap="Blues", ax=ax);

In [None]:
# Histograms of the four continuous weather features on a single 2x2 grid.
# The grid is created in one call: the previous `plt.subplots()` + `plt.subplot(22x)`
# combination left an extra full-figure Axes underneath the four panels.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(10, 10))
for ax, column in zip(axes.flat, ['temp', 'atemp', 'humidity', 'windspeed']):
    sns.histplot(df, x=column, ax=ax)

In [None]:
# Display the frame as it stands before splitting.
df

### Splitting data for training

In [None]:
from sklearn.model_selection import train_test_split

# Columns removed from the feature matrices: `count` is the target, and
# `casual` / `registered` are dropped alongside it.
non_feature_cols = ['casual', 'registered', 'count']

# Hold out 20% of the rows for validation (fixed seed for a repeatable split).
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Features and log(count + 1) targets for each split.
X_train, X_val = (part.drop(columns=non_feature_cols) for part in (train_df, val_df))
y_train, y_val = (np.log(part['count'] + 1) for part in (train_df, val_df))

# Full data set, used for cross validation (k-folds or any other splitting scheme).
# https://neptune.ai/blog/cross-validation-in-machine-learning-how-to-do-it-right
X = df.drop(columns=non_feature_cols)
y = np.log(df['count'] + 1)

In [None]:
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import BayesianRidge
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import StackingRegressor

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

import time

# Seed shared by every estimator that has a random component, so the benchmark
# table is reproducible between runs (the folds below are already seeded).
SEED = 42

# Candidate regressors, in the same order as their display names.
models = [
    XGBRegressor(random_state=SEED),
    RandomForestRegressor(random_state=SEED),
    ExtraTreesRegressor(random_state=SEED),
    AdaBoostRegressor(random_state=SEED),
    BaggingRegressor(random_state=SEED),
    SVR(),
    KNeighborsRegressor(),
    LinearRegression(),
    BayesianRidge(),
    DecisionTreeRegressor(random_state=SEED),
]
model_names = ['XgboostReg','RandomForestReg','ExtraTressReg','AdaBoostReg','BaggingReg',
               'SVR','KNeighborsReg', 'LinearReg', 'BayesianRidge', 'DescisionTreeReg']
rmsle = []
time_lst = []

kfold = KFold(n_splits=5, shuffle=True, random_state=42)

for name, regressor in zip(model_names, models):
    # Time a 5-fold cross validation of each model on the full data set.
    print("Start fitting {}".format(name))
    start = time.time()

    score = cross_val_score(regressor, X, y, scoring="neg_root_mean_squared_error", cv=kfold)

    elp_time = time.time() - start
    time_lst.append(elp_time)
    print("end fitting {} in {} sec\n".format(name, elp_time))

    # The scorer returns negated RMSE; since y is log(count + 1), its magnitude is the RMSLE.
    rmsle.append(abs(score.mean()))

d = {'Modelling Algorithms': model_names, 'RMSLE': rmsle, 'Time(s)': time_lst}

# Results table, best (lowest RMSLE) model first.
rmsle_frame = pd.DataFrame(d)
rmsle_frame.sort_values(by='RMSLE')

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from xgboost import XGBRegressor

# NOTE(review): 'gpu_hist' presumably requires a GPU-enabled runtime, and `n_jobs=-1`
# below runs the candidate fits in parallel — confirm the environment supports both.
model = XGBRegressor(tree_method = 'gpu_hist')

# Search space: number of boosting rounds x maximum tree depth.
param_grid = {
    'n_estimators': list(range(50, 300, 50)),
    'max_depth': [3, 5, 7, 9],
}
print(param_grid)

kfold = KFold(n_splits=5, shuffle=True, random_state=42)

# Exhaustive search scored by (negated) RMSE on the log-transformed target;
# train scores are kept so over-fitting can be inspected afterwards.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring="neg_root_mean_squared_error",
    cv=kfold,
    n_jobs=-1,
    verbose=1,
    return_train_score=True,
)

grid_result = grid_search.fit(X, y)

best_rmse = abs(grid_result.best_score_)
print("Best: %f using %s" % (best_rmse, grid_result.best_params_))

# Per-candidate cross-validation results as a table.
results = pd.DataFrame(grid_search.cv_results_)

In [None]:
print("reviewing test and train score.")
# Mean and spread of the validation and training scores for the winning parameter set.
score_columns = ["mean_test_score", "std_test_score", "mean_train_score", "std_train_score"]
is_best_candidate = results["params"] == grid_result.best_params_
results.loc[is_best_candidate, score_columns].head()

### Evaluating on test data

In [None]:
# Folder holding the competition CSV files inside the Kaggle input mount.
datasetpath = '/kaggle/input/bike-sharing-demand/'

# Read the test split and report its (rows, columns) shape.
test_csv_path = os.path.join(datasetpath, 'test.csv')
df_test = pd.read_csv(test_csv_path)

print("The shape of the dataset is {}.".format(df_test.shape))

In [None]:
# Preview the first rows of the test frame.
df_test.head()

Check for nulls in test data

In [None]:
# Number of missing values in each column of the test frame.
df_test.isnull().sum()

Check whether any rows in the test data have humidity (%) equal to zero

In [None]:
# Count (without removing) the test rows that report a humidity of 0%.
humidity_filter = df_test['humidity'].eq(0)
print("sum of rows where humidity equals zero:", humidity_filter.sum())

Preprocessing Data

In [None]:
def PreprocessingData(_df):
    """Return a copy of ``_df`` with the engineered calendar features added.

    The input must contain a ``datetime`` column that ``pd.to_datetime`` can
    parse. The input frame itself is not modified.

    Added columns, in order:
      * ``year``, ``month``, ``day``, ``hour`` taken from the timestamp;
      * ``Morning``, ``Afternoon``, ``Evening``, ``Night`` as 0/1 part-of-day flags;
      * one indicator column per weekday name, always all seven and always in
        alphabetical order, even when some weekdays do not occur in the input.

    The ``datetime`` column is kept, converted to a datetime dtype.
    """
    # Alphabetical order is what `pd.get_dummies` yields for plain strings when all
    # seven weekdays are present, so the column layout is unchanged in that case.
    weekday_columns = ['Friday', 'Monday', 'Saturday', 'Sunday',
                       'Thursday', 'Tuesday', 'Wednesday']

    df = _df.copy()

    # Extract datetime-related information.
    df["datetime"] = pd.to_datetime(df["datetime"])
    df['year'] = df['datetime'].dt.year
    df['month'] = df['datetime'].dt.month
    df['day'] = df['datetime'].dt.day
    df['hour'] = df['datetime'].dt.hour
    df['DayName'] = df['datetime'].dt.day_name()

    # Part-of-day indicator features derived from the integer hour.
    hour = df['hour']
    df['Morning'] = np.where((5 <= hour) & (hour <= 12), 1, 0)
    df['Afternoon'] = np.where((12 < hour) & (hour <= 17), 1, 0)
    df['Evening'] = np.where((17 < hour) & (hour <= 21), 1, 0)
    df['Night'] = np.where((21 < hour) | (hour < 5), 1, 0)

    # One-hot encode the weekday against a fixed category list, so the output
    # schema does not depend on which weekdays happen to appear in the data.
    day_name = df['DayName'].astype(pd.CategoricalDtype(categories=weekday_columns))
    encoded_col = pd.get_dummies(day_name)
    # Use plain string labels rather than the CategoricalIndex get_dummies returns.
    encoded_col.columns = list(encoded_col.columns)
    df = df.join(encoded_col).drop(columns='DayName')

    return df

In [None]:
# Print the shape before and after preprocessing to show how many columns were added.
print(df_test.shape)
df_test = PreprocessingData(df_test)
print(df_test.shape)

In [None]:
# Display the preprocessed test frame.
df_test

In [None]:
# Select exactly the columns the model was trained on, in training order.
# This leaves out `datetime` (and `count` once this cell has been run), so the
# cell can be re-run safely, and it fails loudly if a training feature is missing.
X_test = df_test[X.columns]

# Predict log(count + 1) using the refitted best estimator of the grid search.
y_test_predicted = grid_search.predict(X_test)

# Invert the log(count + 1) transform: count = round(e^prediction - 1).
# A negative prediction would map to a negative count, so clip at zero.
predicted_count = np.round(np.expm1(y_test_predicted))
df_test['count'] = np.clip(predicted_count, 0, None).astype(int)

df_test.head()

In [None]:
# Write the timestamp and predicted count columns to submission.csv, without the row index.
df_test[['datetime', 'count']].to_csv('submission.csv', index=False)