In [None]:
import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# Importing libraries

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime
import warnings
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
from scipy.stats import skew, kurtosis
import math
pd.options.mode.chained_assignment = None
warnings.filterwarnings('ignore', category = DeprecationWarning)
%matplotlib inline

Read the CSV file with the train data for EDA (Exploratory Data Analysis)

In [None]:
# Load the training CSV into the frame used by the exploratory cells below.
bike_data = pd.read_csv('/kaggle/input/bike-sharing-demand/train.csv')

In [None]:
# Preview the first rows to see which columns are available.
bike_data.head()

Column Types

In [None]:
# Column dtypes and non-null counts as loaded from the CSV.
bike_data.info()

Check for Missing Values

In [None]:
# Number of missing entries per column (column-wise sum is the default axis).
bike_data.isnull().sum()

In [None]:
# Parse the 'datetime' column so that date parts can be extracted from it below.
bike_data['datetime'] = pd.to_datetime(bike_data['datetime'])

In [None]:
# Re-check the dtypes after the datetime conversion.
bike_data.info()

Get the year (for the honors), month, day, hour from datetime

In [None]:
# Build the DatetimeIndex once and copy each calendar component into its own column.
parts = pd.DatetimeIndex(bike_data['datetime'])
for part in ('year', 'month', 'day', 'hour'):
    bike_data[part] = getattr(parts, part)

Correlation matrix for the dataset. As expected, month and season are highly correlated, so one of them will be dropped for the model; the same goes for temp and atemp. Finally, registered and casual are highly correlated with the dependent variable, so we will drop them too, as keeping them would cause leakage.

In [None]:
# Pairwise correlation of the numeric columns.
# numeric_only=True is passed explicitly: since pandas 2.0 corr() no longer
# drops non-numeric columns by default, so the parsed 'datetime' column would
# otherwise change the result (or raise) depending on the pandas version.
corrMatrix = bike_data.corr(numeric_only = True)
plt.figure(figsize = (14,14))
sns.heatmap(corrMatrix, annot = True)
plt.title('Correlation matrix of the numeric columns')
plt.show()

Check for outliers in the dependent variable with a box plot

In [None]:
# Vertical box plot of the target column to show its extreme values.
count_axes = sns.boxplot(y = 'count', data = bike_data, orient = 'v')
count_axes.set_ylabel('Count')
count_axes.set_title('Box Plot On Count')
plt.show()

We remove the rows whose count is more than 3 standard deviations away from the mean

In [None]:
# Keep only the rows whose count lies within three standard deviations of the mean.
count_mean = bike_data['count'].mean()
count_std = bike_data['count'].std()
within_three_sigma = (bike_data['count'] - count_mean).abs() <= 3 * count_std
bike_data_No_outliers = bike_data[within_three_sigma]

We got rid of 147 entries

In [None]:
# (rows, columns) remaining after the outlier filter.
bike_data_No_outliers.shape

Check the distribution of the target value

In [None]:
# Bias-corrected sample skewness, then kurtosis, of the filtered target.
filtered_count = bike_data_No_outliers['count']
for statistic in (skew, kurtosis):
    print(statistic(filtered_count, bias = False))

In [None]:
# Histogram with a KDE overlay of the filtered target.
sns.set_theme(style = 'darkgrid')
count_grid = sns.displot(data = bike_data_No_outliers, x = 'count', kde = True)
count_grid.set(ylabel = 'amount', xlabel = 'count')
plt.show()

As we see, it is far from normally distributed, so we take the log of the values to bring the distribution closer to normal

In [None]:
# Histogram with a KDE overlay of the log-transformed target.
sns.set_theme(style = 'darkgrid')
ax = sns.displot(bike_data_No_outliers, x = np.log(bike_data_No_outliers['count']), kde = True)
# The plotted quantity is log(count), so the x axis is labelled accordingly.
ax.set(xlabel = 'log(count)', ylabel = 'amount')
plt.show()

In [None]:
# Bias-corrected skewness, then kurtosis, of the log-transformed target.
log_filtered_count = np.log(bike_data_No_outliers['count'])
for statistic in (skew, kurtosis):
    print(statistic(log_filtered_count, bias = False))

In [None]:
# Mean rentals per month (left) and per hour of day (right).
sns.set_theme(style = 'darkgrid')
# set_palette() activates the palette; color_palette() only returns it, so the
# original bare call had no effect. It must follow set_theme(), which resets
# the palette to its default.
sns.set_palette('pastel')
fig,(ax2, ax3) = plt.subplots(ncols = 2)
fig.set_size_inches(14, 5)
ax2 = sns.barplot(data = bike_data_No_outliers, x = 'month', y = 'count', alpha = .6, ax = ax2)
ax2.set(xlabel = 'month', ylabel = 'amount', title = 'Rental per month')
ax3 = sns.barplot(data = bike_data_No_outliers, x = 'hour', y = 'count', alpha = .6, ax = ax3)
ax3.set(xlabel = 'hour', ylabel = 'amount', title = 'Rental per hour')
plt.show()

As we see above, most rentals are during the day and the worst month is January

# Time to start building our model

In [None]:
# Load fresh copies of the train and test CSVs for modelling.
train = pd.read_csv('/kaggle/input/bike-sharing-demand/train.csv')
test = pd.read_csv('/kaggle/input/bike-sharing-demand/test.csv')

In [None]:
# (rows, columns) of the raw train and test frames, in that order.
for frame in (train, test):
    print(frame.shape)

In [None]:
# Preview the test columns.
test.head()

In [None]:
# Preview the train columns for comparison with the test preview.
train.head()

We combine the datasets

In [None]:
# Stack train on top of test under one fresh 0..n-1 index so both receive the same transforms.
bike_data_full = pd.concat([train, test], ignore_index = True)

As we see, the registered and casual columns are not present in the test dataset, so we will drop them. (Even if they were present, we would drop them because of their high correlation with the dependent variable.)

In [None]:
# (rows, columns) of the combined frame before any columns are dropped.
bike_data_full.shape

In [None]:
# Remove the two columns that the test set lacks.
bike_data_full = bike_data_full.drop(columns = ['casual', 'registered'])

We will drop temp as it is highly correlated with atemp (In my opinion, atemp is more representative than temp) and season as it is correlated with month

In [None]:
# Remove 'temp' and 'season', keeping 'atemp' and the later-derived 'month' instead.
bike_data_full = bike_data_full.drop(columns = ['temp', 'season'])

We proceed with the transformations that we made earlier regarding only the train dataset to the whole dataset

In [None]:
# Parse the timestamps, then copy each calendar component into its own column.
bike_data_full['datetime'] = pd.to_datetime(bike_data_full['datetime'])
full_parts = pd.DatetimeIndex(bike_data_full['datetime'])
for part in ('year', 'month', 'day', 'hour'):
    bike_data_full[part] = getattr(full_parts, part)

In [None]:
# (rows, columns) after adding the year/month/day/hour columns.
bike_data_full.shape

We have some columns that have discrete values, and we will use get_dummies to one-hot encode them to assist the training

In [None]:
# Columns to be one-hot encoded; cast them all to 'category' in a single astype call.
dummie_features = ['holiday', 'workingday', 'weather', 'day', 'month', 'year', 'hour']
bike_data_full = bike_data_full.astype({feature: 'category' for feature in dummie_features})

In [None]:
# One-hot encode the categorical columns and append the indicator columns on the shared row index.
dummies = pd.get_dummies(bike_data_full[dummie_features])
bike_data_full = bike_data_full.join(dummies)

We drop the dummified columns

In [None]:
# The original categorical columns are now redundant with their indicator columns.
bike_data_full = bike_data_full.drop(columns = dummie_features)

In [None]:
#bike_data_full.info()

Re-separate the datasets

In [None]:
# Rows with a known 'count' came from train; rows where it is missing came from test.
has_target = bike_data_full['count'].notna()
train_final = bike_data_full[has_target].sort_values(by = ['datetime'])
test_final = bike_data_full[~has_target].sort_values(by = ['datetime'])
# Predictions are produced in test_final's (sorted) row order, so the submission
# timestamps must follow that order as well. The combined frame was built as
# concat((train, test)) with a fresh 0..n-1 index, so a test_final index label
# minus len(train) is that row's position in `test`. When test.csv is already
# chronological this equals test['datetime'] unchanged.
datetimecol = test['datetime'].iloc[test_final.index - len(train)].reset_index(drop = True)

In [None]:
# (rows, columns) of the re-separated train and test frames, in that order.
for frame in (train_final, test_final):
    print(frame.shape)

Now we can drop datetime as well

In [None]:
# 'datetime' is removed from both frames; test_final also loses 'count', which is all-missing there.
train_final = train_final.drop(columns = ['datetime'])
test_final = test_final.drop(columns = ['datetime', 'count'])

We drop the outliers

In [None]:
# Keep only the training rows whose count lies within three standard deviations of the mean.
train_count_mean = train_final['count'].mean()
train_count_std = train_final['count'].std()
keep_rows = (train_final['count'] - train_count_mean).abs() <= 3 * train_count_std
train_final = train_final[keep_rows]

Checking if everything goes as planned

In [None]:
#print(train_final.shape)
#print(test_final.shape)

In [None]:
#test_final.info()

We separate the count column from the train_final dataset and log-transform it to form the target

In [None]:
# Target: natural log of the rental count. Features: every remaining column.
y = np.log(train_final['count'])
X = train_final.drop(columns = 'count')

We will do a Randomized grid search for XGBRegressor

In [None]:
# Candidate values sampled by the randomized hyper-parameter search.
params = {
        'min_child_weight': [1, 5, 6, 10],
        'gamma': [0.5, 1, 1.5, 2, 5, 7],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5],
        # The original list ended in `0,1` (a comma typo), which offered the
        # two learning rates 0 and 1 instead of the intended 0.1.
        'learning_rate': [0.01, 0.02, 0.05, 0.1],
        'n_estimators' : [500, 750, 1000, 1500]
        }
# Base regressor handed to the search; each individual model is limited to one thread.
xgb = XGBRegressor(nthread = 1)

We will use this timer

In [None]:
def timer(start_time = None):
    """Start a stopwatch or report the time elapsed since one was started.

    Called without a (truthy) ``start_time``, returns the current
    ``datetime`` to be kept as the start mark. Called with a start mark,
    prints the elapsed hours, minutes and seconds and returns ``None``.
    """
    if start_time:
        elapsed_seconds = (datetime.now() - start_time).total_seconds()
        hours, remainder = divmod(elapsed_seconds, 3600)
        minutes, seconds = divmod(remainder, 60)
        print('\n Time taken: %i hours %i minutes and %s seconds.' % (hours, minutes, round(seconds, 2)))
        return None
    return datetime.now()

In [None]:
# Search budget: cross-validation folds and the number of sampled parameter combinations.
folds = 5
param_comb = 10

# Shuffled K-fold splitter and randomized search, both seeded for repeatable runs.
Kfold = KFold(n_splits = folds, shuffle = True, random_state = 7)
random_search = RandomizedSearchCV(
    estimator = xgb,
    param_distributions = params,
    n_iter = param_comb,
    scoring = 'neg_mean_squared_error',
    n_jobs = 4,
    cv = Kfold,
    verbose = 3,
    random_state = 7,
)

# Time the whole fit.
start_time = timer()
random_search.fit(X, y)
timer(start_time)

In [None]:
# Summarise the randomized search.
# The search is scored with 'neg_mean_squared_error', so best_score_ is the
# negated mean cross-validated MSE on the log-transformed target. The earlier
# "normalized gini" label and the `* 2 - 1` rescaling do not apply to that
# metric and are replaced by the MSE and its square root.
results = pd.DataFrame(random_search.cv_results_)
print('\n All results:')
print(results[['params', 'mean_test_score', 'std_test_score', 'rank_test_score']].sort_values('rank_test_score'))
print('\n Best estimator:')
print(random_search.best_estimator_)
best_cv_mse = -random_search.best_score_
print('\n Best cross-validated MSE (log scale) for %d-fold search with %d parameter combinations:' % (folds, param_comb))
print(best_cv_mse)
print('\n Best cross-validated RMSE (log scale):')
print(math.sqrt(best_cv_mse))
print('\n Best hyperparameters:')
print(random_search.best_params_)
#results.to_csv('xgb-random-grid-search-results-01.csv', index = False) #uncomment for extracting the results to a csv file

In [None]:
# In-sample check: RMSE between the refit best model's predictions and the
# log-transformed training target. It is measured on the rows the model was
# fitted on, so it is optimistic compared with the cross-validated score.
# (`math` is already imported in the imports cell; the duplicate import is dropped.)
predictions = random_search.predict(X)
# mean_squared_error takes (y_true, y_pred); the value is the same either way.
print('Training RMSE (log scale) : ' + str(math.sqrt(mean_squared_error(y, predictions))))
# Log-scale predictions for the test rows, in test_final's row order.
preds = random_search.predict(test_final)

In [None]:

# exp() undoes the log transform applied to the training target; values are floored at 0.
predicted_counts = [max(0, value) for value in np.exp(preds)]
submission = pd.DataFrame({'datetime': datetimecol, 'count': predicted_counts})
submission.to_csv('submission.csv', index = False)