**Objective** - To share what I learned about implementing forecasting with the XGBoost algorithm.

**Audience**: Anyone who would like to learn simple prediction using XGBoost can refer to this notebook.

In [None]:
##List of Libraries which we will be using in this exercise

import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()
from sklearn.metrics import mean_squared_error, mean_absolute_error
import xgboost as xgb

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subfolders, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(folder, file_name))

In [None]:
# Load the train and test CSVs. The first column is used as the index and the
# `datetime` column is parsed into timestamps.
df = pd.read_csv(
    "/kaggle/input/bike-sharing-demand/train.csv",
    index_col=0,
    parse_dates=['datetime'],
)
df_test = pd.read_csv(
    '/kaggle/input/bike-sharing-demand/test.csv',
    index_col=0,
    parse_dates=['datetime'],
)

In [None]:
# Preview the first rows of the training frame.
df.head()

In [None]:
# Summary statistics, transposed so that each column is shown as a row.
df.describe().T

In [None]:
def add_features(df):
    """Add calendar features derived from the frame's index.

    The columns ``year``, ``month``, ``day``, ``dayofweek`` and ``hour`` are
    written onto ``df`` in place, so callers that ignore the return value keep
    working exactly as before.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index exposes the ``year``, ``month``, ``day``,
        ``dayofweek`` and ``hour`` attributes (e.g. a ``DatetimeIndex``).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned so the call can be chained or
        assigned.
    """
    timestamps = df.index
    for part in ('year', 'month', 'day', 'dayofweek', 'hour'):
        df[part] = getattr(timestamps, part)
    return df

In [None]:
# Derive the calendar columns on both the training and the test frame,
# then preview the enriched training data.
for frame in (df, df_test):
    add_features(frame)
df.head()

In [None]:
# Rental count over the whole training period.
rental_counts = df['count']
plt.figure(figsize=(16,5))
rental_counts.plot()
plt.show()

In [None]:
# Zoom into Feb-Apr 2012 to look for gaps in the series.
gap_window = df.loc['2012-02':'2012-04', 'count']
plt.figure(figsize=(16,5))
plt.title('Rental Count - Gaps')
gap_window.plot()
plt.show()

In [None]:
# A single day of rentals, to show the hourly pattern.
# Rows are selected with .loc: picking rows by a date string through plain
# df[...] was deprecated and then removed from pandas (KeyError on 2.x).
plt.figure(figsize=(16,5))
plt.title('Rental Count - Hourly Trend')
df.loc['2011-01-01', 'count'].plot()
plt.show()

In [None]:
# One month of rentals (January 2011).
# Rows are selected with .loc: picking rows by a date string through plain
# df[...] was deprecated and then removed from pandas (KeyError on 2.x).
plt.figure(figsize=(16,5))
plt.title('Jan 2011 Rentals (1 month)')
df.loc['2011-01', 'count'].plot()
plt.show()

In [None]:
# Compare 2011 and 2012 rentals on one chart.
# Each year is selected with .loc: picking rows by a date string through plain
# df[...] was deprecated and then removed from pandas (KeyError on 2.x).
plt.figure(figsize=(16,5))
for year in ('2011', '2012'):
    plt.plot(df.loc[year, 'count'], label=year)
plt.title('2011 and 2012 Rentals (Year to Year)')
plt.xlabel('Date')
plt.ylabel('Rental Count')
plt.legend()
plt.show()

In [None]:
# Scatter of temperature against rental count.
plt.scatter(x=df['temp'], y=df['count'])
plt.title('Temperature vs Count')
plt.xlabel('Temperature')
plt.ylabel('Count')
plt.grid(True)
plt.show()

In [None]:
    # Correlation of every column with the rental count, as a bar chart.
    # NOTE(review): this cell is indented one level; it presumably only runs
    # because IPython strips common leading indentation - consider dedenting.
    count_correlation = df.corr()['count']
    plt.figure(figsize=(16,5))
    count_correlation.plot(kind='bar')
    plt.show()

In [None]:
# Split plan: 70% of the rows for training, 30% for validation.
# Shuffle the dataset first (fixed seed so the split is reproducible).
np.random.seed(5)
shuffled_index = df.index.tolist()
np.random.shuffle(shuffled_index)
df = df.loc[shuffled_index]

In [None]:
# Row counts: total, training share (70%, rounded down) and the remainder.
rows = len(df)
train = int(.7 * rows)
test = rows - train
rows, train, test

In [None]:
# Target column first, followed by the model features.
columns = [
    'count',
    'season', 'holiday', 'workingday', 'weather',
    'temp', 'atemp', 'humidity', 'windspeed',
    'year', 'month', 'day', 'dayofweek', 'hour',
]

In [None]:
# Prepare Training Set & Validation
df_train=df.iloc[:train].reset_index()
df_validation=df.iloc[train:].reset_index()


In [None]:
# Features are every entry of `columns` except the target, in the same order.
feature_columns = [name for name in columns if name != 'count']

X_train = df_train[feature_columns]
y_train = df_train[['count']]

X_validation = df_validation[feature_columns]
y_validation = df_validation[['count']]

In [None]:
# XGBoost parameter reference:
#   https://xgboost.readthedocs.io/en/stable/parameter.html
# 150 boosting rounds with trees of depth 5; everything else stays at the
# library defaults.
regressor = xgb.XGBRegressor(n_estimators=150, max_depth=5)

In [None]:
# Display the configured estimator.
regressor

In [None]:
# Train and track the metric on both sets: the first eval_set entry is the
# training data, the second the validation data.
eval_sets = [(X_train, y_train), (X_validation, y_validation)]
regressor.fit(X_train, y_train, eval_set=eval_sets)

In [None]:
# Per-round evaluation history; one x value per recorded RMSE entry.
eval_result = regressor.evals_result()
n_rounds = len(eval_result['validation_0']['rmse'])
training_rounds = range(n_rounds)
print(training_rounds)

In [None]:
# RMSE per boosting round for the two evaluation sets passed to fit().
plt.figure(figsize=(8,4))
for eval_key, curve_label in (('validation_0', 'Training Error'),
                              ('validation_1', 'Validation Error')):
    plt.scatter(x=training_rounds, y=eval_result[eval_key]['rmse'], label=curve_label)
plt.grid(True)
plt.title('Training Vs Validation Error')
plt.xlabel('Iteration')
plt.ylabel('RMSE')
plt.legend()
plt.show()

In [None]:
# Feature importance chart for the fitted model.
xgb.plot_importance(regressor)
plt.show()

In [None]:

# NOTE: despite the names, X_test / y_test here are the validation rows
# (same feature columns, in the same order, as X_train).
X_test = df_validation[list(X_train.columns)]
y_test = df_validation['count']

In [None]:
# Predict counts for X_test (at this point the validation features).
result = regressor.predict(X_test)

In [None]:
# Keep the predictions next to the actual counts for comparison.
df_validation['count_predicted']=result

In [None]:
# Preview actual and predicted counts side by side.
df_validation.head()

In [None]:
# Summary statistics of the predictions (the min shows whether any are negative).
df_validation['count_predicted'].describe()

In [None]:
# Rows where the model predicted a negative count.
df_validation[df_validation['count_predicted'] < 0].head()

In [None]:
# Distribution of the predicted counts.
df_validation['count_predicted'].hist()
plt.title('Predicted Count Histogram')
plt.show()

In [None]:
def adjust_count(x):
    """Clamp a predicted count at zero.

    Negative values are replaced by ``0``; any other value is returned as is.
    """
    return 0 if x < 0 else x

In [None]:
# Replace negative predictions with 0 (see adjust_count).
df_validation['count_predicted'] = df_validation['count_predicted'].map(adjust_count)

In [None]:
# Sanity check: this selection should now be empty.
df_validation[df_validation['count_predicted'] < 0]

In [None]:
# Actual vs. predicted counts, zoomed to validation samples 100-150.
for series_name, legend_label in (('count', 'Actual'), ('count_predicted', 'Predicted')):
    plt.plot(df_validation[series_name], label=legend_label)
plt.xlim([100,150])
plt.title('Validation Dataset - Predicted Vs. Actual')
plt.xlabel('Sample')
plt.ylabel('Count')
plt.legend()
plt.show()

In [None]:
# Over- and under-prediction should be roughly balanced.
# Validation residuals: actual count minus predicted count.
residuals = df_validation['count'] - df_validation['count_predicted']

plt.hist(residuals)
plt.axvline(color='r')  # reference line at x = 0 (axvline's default position)
plt.title('Residuals Distribution')
plt.xlabel('Actual - Predicted')
plt.ylabel('Count')
plt.grid(True)
plt.show()

In [None]:
# Share of validation rows that are under- vs. over-estimated.
# A positive residual (actual > predicted) is an under-estimate; every other
# row, including exact hits, is counted as an over-estimate.
# Boolean means are used instead of value_counts()[True] / [False], which
# raises KeyError when all residuals fall on the same side.
is_under = residuals > 0
print(' Under Estimation: {0:0.2f}'.format(is_under.mean()))
print(' Over  Estimation: {0:0.2f}'.format((~is_under).mean()))

In [None]:
# RMSE on the validation set: square root of the mean squared error.
validation_mse = mean_squared_error(df_validation['count'], df_validation['count_predicted'])
print("RMSE: {0:0.2f}".format(validation_mse ** .5))

In [None]:
# Reference: Katerina Malahova, Khor SoonHin
# https://www.slideshare.net/KhorSoonHin/rmsle-cost-function
def compute_rmsle(y_true, y_pred):
    """Return the Root Mean Squared Logarithmic Error of ``y_pred`` vs ``y_true``.

    RMSLE = sqrt(mean((log1p(y_pred) - log1p(y_true)) ** 2))

    Both inputs are flattened to 1-D before they are compared, so a
    single-column DataFrame / (n, 1) array and a flat (n,) array are paired
    element by element instead of being broadcast into an (n, n) matrix,
    which would silently produce a wrong score.

    Parameters
    ----------
    y_true : array-like
        Actual values.
    y_pred : array-like
        Predicted values. ``log1p`` is not defined for values <= -1, so
        negative predictions should be clipped before calling.

    Returns
    -------
    float
        The RMSLE.

    Raises
    ------
    ValueError
        Raised by NumPy when the flattened inputs have different lengths that
        cannot be broadcast against each other.
    """
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()

    log_diff = np.log1p(y_pred) - np.log1p(y_true)
    return np.average(log_diff ** 2) ** .5

In [None]:
# RMSLE on the validation set (predictions were clipped at 0 earlier).
print("RMSLE: {0}".format(compute_rmsle(df_validation['count'],df_validation['count_predicted'])))

In [None]:
# Move the index (expected to be `datetime`) into a regular column; it is
# written to the submission file later. Guarded so that re-running the cell
# does not call reset_index() a second time and add a stray `index` column.
if 'datetime' not in df_test.columns:
    df_test = df_test.reset_index()

# Select the model inputs by name, in training order, instead of by position.
# This excludes `datetime` and stays correct after a `count` column has been
# added to df_test.
X_test = df_test[list(X_train.columns)]

In [None]:
# Preview the feature rows that will be fed to the model.
X_test.head()

In [None]:
# Predict counts for the test features (overwrites the earlier `result`).
result = regressor.predict(X_test)

In [None]:
# First five predictions.
result[:5]

In [None]:
# Attach the predictions to the test frame as the `count` column.
df_test["count"] = result

In [None]:
# Test rows with a negative predicted count.
df_test[df_test["count"]<0]

In [None]:
# Replace negative predictions with 0 (see adjust_count).
df_test["count"]=df_test["count"].map(adjust_count)

In [None]:
# Write the datetime/count pairs to a CSV file, without the row index.
df_test[['datetime','count']].to_csv('predicted_count.csv',index=False)

Thank You! 