In [None]:
# Colab-only step: prompt for a file upload through google.colab.
# NOTE(review): `uploaded` is not referenced again in the visible cells, and the
# data are later read from '../input/cab-booking/' paths — confirm whether this
# cell is still needed in the environment the notebook targets.
from google.colab import files
uploaded = files.upload()

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
import seaborn as sns
import plotly as py
from datetime import datetime
import calendar
from scipy import stats

from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.offline as offline
offline.init_notebook_mode()
import os

from sklearn.model_selection import train_test_split
from sklearn.model_selection import (GridSearchCV, RandomizedSearchCV)
from sklearn.metrics import (mean_squared_error, mean_absolute_error)
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRFRegressor
from sklearn.tree import DecisionTreeRegressor # calling model
from sklearn.ensemble import GradientBoostingRegressor

# Regressors considered:
from sklearn.svm import SVR
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import Ridge

print('Library Loaded')

In [None]:
# Load the feature tables and their separate label files.
# The label files are read with header=None, so pandas treats the first row as
# data and labels the columns 0, 1, ...; column 0 is used as the target below.
train = pd.read_csv('../input/cab-booking/train.csv')
train_label = pd.read_csv('../input/cab-booking/train_label.csv', header = None)
test = pd.read_csv('../input/cab-booking/test.csv')
test_label = pd.read_csv('../input/cab-booking/test_label.csv', header = None)

In [None]:
# Sanity check: report (rows, columns) of every loaded table, in load order.
for loaded_frame in (train, train_label, test, test_label):
    print(loaded_frame.shape)

In [None]:
# Attach the target (column 0 of each label table) to its feature frame.
# Assignment aligns on the row index of the two frames.
for features, labels in ((train, train_label), (test, test_label)):
    features['TotalBooking'] = labels[0]

In [None]:
# Preview the first rows of the training frame, now including TotalBooking.
train.head()

In [None]:
# Preview the first rows of the test frame, now including TotalBooking.
test.head()

***Feature Engineering and Visualization***

In [None]:
# Creating new columns from date time column
def add_datetime_features(frame):
    """Add `date`, `hour`, `weekday` and `month` columns to `frame` in place.

    The `datetime` column is expected to hold strings of the form
    "<month>/<day>/<year> <hour>:<rest>", the layout the per-row parsing
    previously used in this cell relied on.

    Columns written:
      date    -- text before the first whitespace
      hour    -- text before the first ":" of the time part; kept as a string,
                 so it stays an object column for the later encoding step
      weekday -- English day name of `date`
      month   -- English month name of `date`

    Returns the same frame so the call can be used in an assignment.
    """
    parts = frame['datetime'].str.split()
    frame['date'] = parts.str[0]
    frame['hour'] = parts.str[1].str.split(':').str[0]
    # Parse each date once, vectorised, instead of calling strptime twice per row.
    parsed_dates = pd.to_datetime(frame['date'], format='%m/%d/%Y')
    frame['weekday'] = parsed_dates.dt.day_name()
    frame['month'] = parsed_dates.dt.month_name()
    return frame


# Run the same feature engineering on both frames so their columns stay in sync.
train = add_datetime_features(train)
test = add_datetime_features(test)


In [None]:
# Check the newly derived date / hour / weekday / month columns.
train.head()

In [None]:
# Explore data: pairwise relationships between the continuous weather
# measurements and the target, with KDE curves on the diagonal.
pairplot_columns = ["temp", "humidity", "windspeed", "TotalBooking"]
sns.pairplot(train[pairplot_columns], diag_kind='kde')

In [None]:
# TotalBooking aggregated per weather category.
sns.barplot(data=train, x='weather', y='TotalBooking')

In [None]:
# TotalBooking aggregated per day of the week.
sns.barplot(data=train, x='weekday', y='TotalBooking')

In [None]:
# TotalBooking aggregated per calendar month.
sns.barplot(data=train, x='month', y='TotalBooking')

In [None]:
# TotalBooking aggregated per season.
sns.barplot(data=train, x='season', y='TotalBooking')

In [None]:
 
# Bookings against hour of day, with marginal distributions.
# x and y are passed by keyword because recent seaborn releases no longer accept
# them positionally. `hour` is stored as a string (it is sliced out of the
# datetime text), so it is cast to int for plotting only; this gives a numeric,
# correctly ordered axis and leaves the column in `train` untouched.
sns.jointplot(x=train['hour'].astype(int), y=train['TotalBooking'], kind='scatter')
plt.show()

In [None]:
# Correlation heatmap of the numeric columns.
# `train` still holds string columns at this point (datetime, date, hour,
# weekday, month). Recent pandas raises when asked to correlate those, so the
# frame is restricted to numeric dtypes explicitly.
corrmat = train.select_dtypes(include='number').corr()
fig = plt.figure(figsize = (15,9))

# Fixed colour scale of [-1, 1] so colours are comparable across cells.
sns.heatmap(corrmat, cmap ='BrBG', annot = True, vmax = 1, square = True,vmin=-1)
plt.show()

MISSING VALUE TREATMENT

In [None]:
# Count of null entries per column.
train.isnull().sum()  # no missing value observed

OUTLIERS TREATMENT

In [None]:
# Summary statistics of the numeric columns, used to eyeball extreme values.
train.describe()

In [None]:
# Distribution of the target before outlier removal.
train.TotalBooking.hist()

In [None]:
# Tukey fences for the target: values beyond 1.5 * IQR from the quartiles are
# treated as suspected outliers.
rev_stat=train.TotalBooking.describe()
print(rev_stat)
# calculating interquartile range
iqr=rev_stat['75%']-rev_stat['25%']
upper=rev_stat['75%']+1.5*iqr
lower=rev_stat['25%']-1.5*iqr
print()
# The wording now matches the order of the formatted values (lower first, then upper).
print('The lower and upper bounds for suspected outliers are {} and {}'.format(lower,upper))

In [None]:

# Rows whose target exceeds the upper fence; only the upper side is filtered.
rows_above_upper = train[train.TotalBooking > upper]
# Keep their index labels so a later cell can remove them.
outliers = rows_above_upper.index.tolist()
rows_above_upper

In [None]:
# Shape of the training frame while it still contains the flagged outlier rows.
train.shape

In [None]:
# Remove all flagged outlier rows in one call instead of one drop per row.
# errors='ignore' keeps the cell re-runnable: on a second run the labels are
# already gone and would otherwise raise KeyError.
train = train.drop(index=outliers, errors='ignore')

In [None]:
# Shape after removing the outlier rows; compare the row count with the cell above.
train.shape

ENCODING

In [None]:
# Column dtypes; object columns are the ones picked out as categorical below.
train.dtypes

In [None]:
# Segregating categorical variables: every column stored with object dtype.
columns = train.columns

column_dtypes = train.dtypes
cat_col = column_dtypes[column_dtypes == 'O'].index.tolist()
cat_col

In [None]:
# Cardinality of each categorical column (a missing value counts as its own value).
for col in cat_col:
    distinct_count = train[col].nunique(dropna=False)
    print("Unique Values in {} - {}".format(col, distinct_count))
    print()

In [None]:
# The raw timestamp text is no longer needed once hour / weekday / month exist.
del train["datetime"]

In [None]:
# The intermediate date string is only an input to weekday / month; remove it.
del train["date"]

In [None]:
# Drop `atemp` to avoid multicollinearity: temp and atemp are highly correlated.
del train["atemp"]

In [None]:
# Confirm that datetime, date and atemp are gone.
train.head()

In [None]:
# One-hot encode the remaining object columns; other columns are passed through.
# `hour` is a string column at this point, so it is expanded into indicator
# columns as well.
train = pd.get_dummies(train)
train.head()

In [None]:
# Shape after encoding; the column count grows with the indicator columns.
train.shape

SPLITTING X & Y

In [None]:
# Separate the target from the predictors.
y = train['TotalBooking']
X = train.drop(columns='TotalBooking')

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)
for split_part in (X_train, X_test):
    print(split_part.shape)


BUILDING MODEL

In [None]:
# Baseline comparison: fit several regressors with default hyper-parameters on
# the training split and keep their test-split predictions for the metrics cell.
# Every estimator that accepts a seed gets random_state=0 (the same value used
# for the train/test split) so a fresh run reproduces the same scores.

#--------------- Decision Tree Model---------------
DT_model = DecisionTreeRegressor(random_state=0)
DT_model.fit(X_train, y_train)
y_pred_DT = DT_model.predict(X_test)
# In-sample predictions, kept under their own name so the random forest values
# below no longer overwrite them.
y_pred_train_DT = DT_model.predict(X_train)

#-------------- RANDOM FOREST----------------------
RF_model = RandomForestRegressor(random_state=0)
RF_model.fit(X_train, y_train)
y_pred_RF = RF_model.predict(X_test)
y_pred_train_RF = RF_model.predict(X_train)
# Backward-compatible alias: this cell used to leave `y_pred_train` holding the
# random forest's in-sample predictions.
y_pred_train = y_pred_train_RF

#-------------- XGBoost random forest (XGBRFRegressor) ----------------------
XG_model = XGBRFRegressor(random_state=0)
XG_model.fit(X_train, y_train)
y_pred_XG = XG_model.predict(X_test)

#-------------- SVM ----------------------
# NOTE(review): SVR and KNN are fit on unscaled features; MinMaxScaler is
# imported at the top of the notebook but never applied. Consider scaling
# before drawing conclusions from these two scores.
SVM_model = SVR()
SVM_model.fit(X_train, y_train)
y_pred_SVM = SVM_model.predict(X_test)

#-------------- KNN ----------------------
KNN_model = KNeighborsRegressor()
KNN_model.fit(X_train, y_train)
y_pred_KNN = KNN_model.predict(X_test)

#-------------- Gradient Boosting ----------------------
GB_model = GradientBoostingRegressor(random_state=0)
GB_model.fit(X_train, y_train)
y_pred_GB = GB_model.predict(X_test)


print('Model fit')

PERFORMANCE METRICS

In [None]:
from sklearn.metrics import r2_score, mean_squared_error

# Caption / prediction pairs for each baseline model. The captions are kept
# verbatim so the printed report is unchanged.
r2_report = [
    ("R squared value for DT :", y_pred_DT),
    ("R squared value for RF:", y_pred_RF),
    ("R squared value for XG:", y_pred_XG),
    ("R squared value for SVM:", y_pred_SVM),
    ("R squared value for KNN:", y_pred_KNN),
    ("R squared value for GB:", y_pred_GB),
]
mse_report = [
    ("MSE for DT :", y_pred_DT),
    ("MSE for RF:", y_pred_RF),
    ("MSE for XG:", y_pred_XG),
    ("MSE for SVM:", y_pred_SVM),
    ("MSE for KNN:", y_pred_KNN),
    ("MSE for GB:", y_pred_GB),
]

# All R-squared scores first, then all mean squared errors, on the test split.
for caption, predicted in r2_report:
    print(caption, r2_score(y_test, predicted))

for caption, predicted in mse_report:
    print(caption, mean_squared_error(y_test, predicted))

In [None]:
# RANDOM FOREST with a larger ensemble.
# 1000 trees, all CPU cores (n_jobs=-1), and a fixed seed for repeatability.
# fit() returns the estimator itself, so construction and fitting are chained.
RF_reg = RandomForestRegressor(
    n_estimators=1000,
    n_jobs=-1,
    random_state=0,
).fit(X_train, y_train)

# Predictions on the held-out split and on the training split.
y_pred_RFR = RF_reg.predict(X_test)
y_train_pred_RF = RF_reg.predict(X_train)

print("R squared:",r2_score(y_test,y_pred_RFR))


GRIDSEARCH CV

In [None]:
# Hyper-parameter grid for the random forest: 3 depths x 4 ensemble sizes,
# always with bootstrap sampling.
param_grid = dict(
    bootstrap=[True],
    max_depth=[20, 30, 40],
    n_estimators=[300, 500, 1000, 1200],
)

# Exhaustive search over the grid with 3-fold cross-validation on the training
# split, run on all cores with per-fit progress output.
grid_search = GridSearchCV(RF_reg, param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

In [None]:
# Parameter combination with the best mean cross-validation score.
grid_search.best_params_

In [None]:
# Estimator selected by the grid search.
cvrf_grid = grid_search.best_estimator_

# Predictions on the training split and on the held-out split.
pred_clf = cvrf_grid.predict(X_train)
y_pred_clf = cvrf_grid.predict(X_test)

# Held-out performance of the tuned model.
grid_r2 = r2_score(y_test, y_pred_clf)
grid_mse = mean_squared_error(y_test, y_pred_clf)
print("R squared value for GridSearch :", grid_r2)
print("MSE for GridSearch :", grid_mse)


In [None]:
# FEATURE IMPORTANCE: the five most important features of the tuned forest,
# drawn as a horizontal bar chart on an explicit axes object.
f, ax = plt.subplots(figsize=(10, 5))
feat_importances = pd.Series(cvrf_grid.feature_importances_, index=X_train.columns)
top_five = feat_importances.nlargest(5)
top_five.plot.barh(ax=ax)

In [None]:
# Test frame before the same column pruning and encoding applied to train.
test.head()

In [None]:
# Mirror the training preparation: remove the raw timestamp text.
del test["datetime"]

In [None]:
# Mirror the training preparation: remove `atemp`, as was done for train.
del test["atemp"]

In [None]:
# Mirror the training preparation: remove the intermediate date string.
del test["date"]

In [None]:
# Shape of the test frame after column pruning, before encoding.
test.shape

In [None]:
# One-hot encode the object columns of the test frame.
# NOTE(review): the dummies are built independently from train, so the
# resulting column set and order may differ from X_train, and `TotalBooking` is
# still a column of `test` here — verify alignment before predicting.
test = pd.get_dummies(test)
test.head()

In [None]:
# Shape after encoding; compare the column count with the training features.
test.shape

In [None]:
# Score the tuned random forest on the separate test file (prediction only;
# the model is not refit here).
#
# The model must receive exactly the columns it was trained on, in the same
# order. Reindexing on X_train.columns:
#   * drops `TotalBooking`, the target that was attached to `test` earlier and
#     must not be fed to the model as a feature;
#   * drops any dummy column that exists only in the test frame;
#   * adds, filled with 0, any dummy column that exists only in the training frame.
test_features = test.reindex(columns=X_train.columns, fill_value=0)

New_pred = cvrf_grid.predict(test_features)
print("R squared for prediction v/s test_label:",r2_score(test_label, New_pred))
print("MSE for prediction v/s test_label :", mean_squared_error(test_label, New_pred))


In [None]:
# Wrap the predictions in a one-column frame so they can be joined to the test data.
New_pred = pd.DataFrame(New_pred, columns=['Predictions'])
New_pred

In [None]:
# Join the original test data, test_label and prediction data
# `test` was pruned and one-hot encoded above, so the raw files are read again
# to get the original columns back before attaching the target.
test = pd.read_csv('../input/cab-booking/test.csv')
test_label = pd.read_csv('../input/cab-booking/test_label.csv', header = None)
test['TotalBooking'] = test_label[0]

In [None]:
# Shape of the freshly reloaded test frame (raw columns plus TotalBooking).
test.shape

In [None]:
# Index-aligned join: adds the `Predictions` column next to the actual TotalBooking.
# NOTE(review): the bare expression below displays the joined frame; consider
# `.head()` if the output is too long.
Dataset_test = test.join(New_pred)
Dataset_test