Importing libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

Preparing dataset

In [None]:
# Mount Google Drive before touching any path under /content/drive so that this
# cell works on a fresh runtime. Mounting again when Drive is already attached
# only prints a notice.
from google.colab import drive
drive.mount('/content/drive')

# Labelled reservations used to train the model.
dataset = pd.read_csv('/content/drive/MyDrive/data_storm_2/new-set.csv')

In [None]:
# Column 0 is the row id and column 20 is the reservation outcome (the label),
# per the original note on this cell; neither belongs in the feature matrix.
cols = [0,20]
y = dataset.iloc[:,20].values

# Drop on a copy rather than in place: `dataset` keeps all of its columns, so
# re-running this cell yields the same x and y instead of removing two more
# columns and reading the label from the wrong position.
x = dataset.drop(columns=dataset.columns[cols]).values

In [None]:
# Replace each reservation-status string with its integer class code.
status_codes = (("Check-In", 1), ("Canceled", 2), ("No-Show", 3))
for status_name, status_code in status_codes:
    y = np.where(y == status_name, status_code, y)

In [None]:
# Sanity check: first feature row, then the encoded label vector.
print(x[0])
print(y)

['F' 40 'Latino' 'Grad' '<25K' 'North' 'City Hotel' '7/1/2015' '7/2/2015'
 '5/21/2015' 2 2 0 'BB' 'No' 'No' 'No Deposit' 'Online' 'Yes' 'Yes' 10 218]
[1 1 1 ... 3 1 1]


*Preprocessing - Date Conversion*

In [None]:
# Pull out the three date columns; they are still 'M/D/YYYY' strings here.
booking = x[:,9]       #booking dates
checkout = x[:,8]      #check-out dates
checkin = x[:,7]       #check-in dates

In [None]:
# Show the raw date strings in the order: booking, check-out, check-in.
print(booking)
print(checkout)
print(checkin)

['5/21/2015' '5/26/2015' '6/29/2015' ... '10/19/2016' '6/1/2016'
 '5/23/2016']
['7/2/2015' '7/2/2015' '7/6/2015' ... '11/20/2016' '11/19/2016'
 '11/21/2016']
['7/1/2015' '7/1/2015' '7/2/2015' ... '11/18/2016' '11/18/2016'
 '11/18/2016']


In [None]:
### Helpers to calculate the number of days between two calendar dates
### source : w3schools | added minor modifications

class Date:
    """Minimal calendar date: day, month and year stored as plain integers."""

    def __init__(self, d, m, y):
        self.d = d   # day of the month
        self.m = m   # month number, 1 = January
        self.y = y   # full year, e.g. 2015


# Days per month in a non-leap year; leap days are accounted for separately
# by countLeapYears.
monthDays = [31, 28, 31, 30, 31, 30,
             31, 31, 30, 31, 30, 31]


def countLeapYears(d):
    """Return the number of leap days that fall on or before date ``d``.

    If ``d`` lies in January or February, the current year's leap day (if
    any) has not happened yet, so only the previous years are counted.
    """
    years = d.y
    if d.m <= 2:
        years -= 1
    # Integer floor division keeps the arithmetic exact (no float round trip).
    return years // 4 - years // 100 + years // 400


def _dayNumber(dt):
    """Return the count of days from a fixed origin up to ``dt``."""
    return dt.y * 365 + dt.d + sum(monthDays[:dt.m - 1]) + countLeapYears(dt)


def getDifference(dt1, dt2):
    """Return the number of days from ``dt1`` to ``dt2``.

    The result is positive when ``dt2`` is the later date and negative when
    it is the earlier one.
    """
    return _dayNumber(dt2) - _dayNumber(dt1)

In [None]:
# Parse the three 'M/D/YYYY' string columns in one vectorized pass each.
# An explicit format avoids day/month ambiguity and makes a malformed date
# raise immediately instead of silently producing a wrong day count.
DATE_FORMAT = "%m/%d/%Y"
booking_dt = pd.to_datetime(booking, format=DATE_FORMAT)
checkin_dt = pd.to_datetime(checkin, format=DATE_FORMAT)
checkout_dt = pd.to_datetime(checkout, format=DATE_FORMAT)

month = checkin_dt.month.tolist()                 #check-in month, to capture any seasonal variations
diff1 = (checkin_dt - booking_dt).days.tolist()   #difference in days between booking and checkin
diff2 = (checkout_dt - checkin_dt).days.tolist()  #difference in days between checkin and checkout


In [None]:
# Turn each derived feature list into an (n_rows, 1) column vector so it can
# be appended to x as a new column.
np_month, np_diff1, np_diff2 = (
    np.array(values).reshape(len(booking), 1)
    for values in (month, diff1, diff2)
)

In [None]:
# Append the three derived columns to the right of x, in this order:
#   month -> check-in month, captures seasonal variations
#   diff1 -> days between booking and check-in (how far in advance the user books)
#   diff2 -> days between check-in and check-out (planned duration of stay)
x = np.concatenate((x, np_month, np_diff1, np_diff2), axis=1)

In [None]:
# First row again: the three derived values now sit at the end.
print(x[0])

['F' 40 'Latino' 'Grad' '<25K' 'North' 'City Hotel' '7/1/2015' '7/2/2015'
 '5/21/2015' 2 2 0 'BB' 'No' 'No' 'No Deposit' 'Online' 'Yes' 'Yes' 10 218
 7 41 1]


In [None]:
# Remove the raw date strings now that the derived features exist:
# column 7 = check-in, 8 = check-out, 9 = booking.
x = np.delete(x, [7, 8, 9], axis=1)

In [None]:
# First row after the raw date columns were removed.
print(x[0])

['F' 40 'Latino' 'Grad' '<25K' 'North' 'City Hotel' 2 2 0 'BB' 'No' 'No'
 'No Deposit' 'Online' 'Yes' 'Yes' 10 218 7 41 1]


Preprocessing - One Hot Encoding

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Positions of the categorical features in x after the raw date columns were
# removed; index 19 is the derived check-in month.
categorical_cols = [0,2,3,4,5,6,10,11,12,13,14,15,16,19]

# One-hot encode the categorical columns and pass the numeric ones through.
# sparse_threshold=0 forces a dense result: later cells assign scaled values
# into column slices of the transformed matrix, which requires a dense array
# no matter how sparse the encoded data happens to be.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), categorical_cols)],
    remainder='passthrough',
    sparse_threshold=0,
)

In [None]:
# Learn the category set of every encoded column from the training features
# and replace x with the one-hot encoded matrix.
x = ct.fit_transform(x)

In [None]:
# Inspect one encoded row; index 50 is printed separately to check where the
# passthrough (non-encoded) columns begin.
print(x[3])
print(x[3][50])

[0.0 1.0 1.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0
 1.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0 1.0 0.0 0.0 0.0 1.0 1.0 0.0 0.0 0.0 1.0
 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 25 4 3 0 5 144 12
 1]
25


Splitting the dataset

In [None]:
from sklearn.model_selection import train_test_split
# NOTE(review): test_size = 0.0001 keeps only a handful of rows for evaluation
# (the x_test printed further down has 4), so any metric computed on this
# split says very little about generalisation - confirm this is intentional.
# random_state = 1 makes the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.0001, random_state = 1)

In [None]:
from google.colab import drive
# Attach Google Drive at /content/drive.
# NOTE(review): the CSV read near the top of the notebook already needs this
# mount, so on a fresh runtime Drive has to be mounted before that cell runs.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Training features after encoding (NumPy abbreviates the printed array).
print (x_train)

[[0.0 1.0 1.0 ... 108 179 3]
 [1.0 0.0 0.0 ... 134 54 3]
 [0.0 1.0 0.0 ... 192 195 2]
 ...
 [0.0 1.0 1.0 ... 129 99 2]
 [1.0 0.0 0.0 ... 234 43 1]
 [1.0 0.0 0.0 ... 214 140 2]]


In [None]:
# Held-out features produced by the split above.
print (x_test)

[[0.0 1.0 1.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0
  1.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0 1.0 0.0 1.0 0.0 0.0 0.0 0.0 1.0 1.0 0.0
  0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 36 2 1 0 25 215
  22 2]
 [1.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0
  0.0 0.0 1.0 1.0 0.0 0.0 0.0 1.0 1.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0
  0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 56 1 1 0 25 232
  67 1]
 [0.0 1.0 0.0 0.0 1.0 0.0 1.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 1.0 0.0
  0.0 0.0 1.0 1.0 0.0 0.0 0.0 1.0 1.0 0.0 1.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0
  0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 30 2 2 1 10 230
  10 1]
 [1.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0
  0.0 0.0 1.0 0.0 1.0 0.0 1.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0 1.0
  0.0 1.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 30 2 2 1 10 234
  187 2]]


In [None]:
# Training labels (1 = Check-In, 2 = Canceled, 3 = No-Show).
print (y_train)

[3 3 3 ... 1 1 3]


In [None]:
# Held-out labels (1 = Check-In, 2 = Canceled, 3 = No-Show).
print (y_test)

[1 1 1 1]


Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler
# Scaler used to standardise the numeric columns so that features with large
# raw magnitudes do not dominate the others.
sc = StandardScaler()

In [None]:
# Columns before index 50 are one-hot indicators and are left untouched;
# everything from column 50 onwards is numeric and gets standardised.
numeric_part = slice(50, None)
x_train[:, numeric_part] = sc.fit_transform(x_train[:, numeric_part])   # fit the scaler on the training rows
x_test[:, numeric_part] = sc.transform(x_test[:, numeric_part])         # apply the already-fitted scaler

In [None]:
# First held-out row after scaling: the trailing numeric columns are now floats.
print(x_test[0])

[0.0 1.0 1.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0
 1.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0 1.0 0.0 1.0 0.0 0.0 0.0 0.0 1.0 1.0 0.0
 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0
 -0.5229314150694613 -0.2836381915102924 -1.0296061758089174
 -0.6109338239669695 1.1164166549436279 0.9115883729845674
 -1.1426906429980788 0.16994969721984118]


In [None]:
# First training row after scaling.
print(x_train[0])

[0.0 1.0 1.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0
 1.0 0.0 0.0 0.0 1.0 0.0 1.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 1.0 0.0 0.0 1.0
 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0
 0.7846943692404547 -0.2836381915102924 -1.0296061758089174
 -0.6109338239669695 -1.1152655425814606 -1.5270818702810562
 0.8982746794270753 1.1859525916094191]


# **Classification model**

Parameter Tuning

In [None]:
# Hyper-parameter search for an RBF-kernel SVM.
# Both names are imported here so the cell does not depend on a later cell.
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

# The search refits the model for every C/gamma pair under cross-validation,
# which is slow on the full training set, so it only runs when enabled here.
RUN_GRID_SEARCH = False

# defining parameter range
param_grid = {'C': [0.1, 1, 10, 100, 1000],
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf']}

if RUN_GRID_SEARCH:
    grid = GridSearchCV(SVC(), param_grid, refit = True, verbose = 3)

    # Fit on float features: an integer cast would truncate the standardised
    # columns toward zero and throw away most of their information.
    grid.fit(x_train.astype(float), y_train.astype(int))

    # print best parameter after tuning
    print(grid.best_params_)

    # print how our model looks after hyper-parameter tuning
    print(grid.best_estimator_)

Check on validation set

In [None]:
## add code here to test on validation

Training the model

In [None]:
from sklearn.svm import SVC

# Features are passed as floats. Casting them to int truncates the
# standardised numeric columns toward zero, while the prediction cells feed
# the model untruncated values - training and inference would then see
# different inputs. The labels are integer class codes, so they stay int.
classifier = SVC(kernel='poly', degree=8)
classifier.fit(x_train.astype(float), y_train.astype(int))

# Alternative models tried during experimentation, kept for reference:

# from sklearn.ensemble import RandomForestClassifier
# classifier = RandomForestClassifier(n_estimators = 10000, criterion = 'entropy', random_state = 0)
# classifier.fit(x_train.astype(float), y_train.astype(int))

# from sklearn.naive_bayes import GaussianNB
# classifier = GaussianNB()
# classifier.fit(x_train.astype(float), y_train.astype(int))

# from lightgbm import LGBMClassifier
# classifier = LGBMClassifier()
# classifier.fit(x_train.astype(float), y_train.astype(int))

GaussianNB(priors=None, var_smoothing=1e-09)

In [None]:
# # print best parameter after tuning 
# print(grid.best_params_) 
  
# # print how our model looks after hyper-parameter tuning 
# print(grid.best_estimator_) 

In [None]:
# import xgboost
# classifier = xgboost.XGBClassifier()
# classifier.fit(x_train, y_train)

Prediction on the held-out split of the training data

In [None]:
# Predict the class of every row in the held-out split (x_test) that was
# carved out of the training data.
y_pred = classifier.predict(x_test)

In [None]:
# print (np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

Confusion Matrix

In [None]:
# print (y_pred.shape)
# print (y_test.shape)

# y_pred = y_pred.reshape(len(y_pred),1)
# y_test = y_test.reshape(len(y_test),1)

# print (y_pred.shape)
# print (y_test.shape)

In [None]:
# Accuracy on the held-out split: share of predictions equal to the true label.
correct = sum(1 for predicted, actual in zip(y_pred, y_test) if predicted == actual)
print (correct/len(y_pred))

1.0


In [None]:
from sklearn.metrics import classification_report, confusion_matrix
# Cast both label vectors to int once, then print the confusion matrix
# followed by the per-class precision/recall report.
y_true_int, y_pred_int = y_test.astype(int), y_pred.astype(int)
for make_report in (confusion_matrix, classification_report):
    print(make_report(y_true_int, y_pred_int))

[[4]]
              precision    recall  f1-score   support

           1       1.00      1.00      1.00         4

    accuracy                           1.00         4
   macro avg       1.00      1.00      1.00         4
weighted avg       1.00      1.00      1.00         4



## Test on the new dataset

In [None]:
# Reservations to predict for. This read needs Google Drive to be mounted at
# /content/drive.
testset = pd.read_csv('/content/drive/MyDrive/data_storm_2/Hotel-A-test.csv')

In [None]:
# Feature matrix for the test set: every column except the first one.
z = testset.iloc[:,1:].values

In [None]:
# First test row, to check that the column layout matches the training features.
print (z[0])

['F' 52 'Latino' 'Grad' '25K --50K' 'South' 'City Hotel' '11/18/2016'
 '11/19/2016' '10/28/2016' 3 3 0 'HB' 'No' 'No' 'No Deposit' 'Direct'
 'Yes' 'Yes' 10 153]


In [None]:
# Pull out the three date columns of the test set (same positions as in training).
booking = z[:,9]           #booking dates
checkout = z[:,8]          #check-out dates
checkin = z[:,7]           #check-in dates

In [None]:
# Show the raw date strings in the order: booking, check-out, check-in.
print(booking)
print(checkout)
print(checkin)

['10/28/2016' '8/6/2016' '4/8/2017' ... '8/26/2016' '3/4/2017' '3/29/2017']
['11/19/2016' '11/19/2016' '5/1/2017' ... '3/30/2017' '3/30/2017'
 '3/30/2017']
['11/18/2016' '11/18/2016' '4/28/2017' ... '3/29/2017' '3/29/2017'
 '3/29/2017']


In [None]:
# Parse the three 'M/D/YYYY' string columns in one vectorized pass each.
# An explicit format avoids day/month ambiguity and makes a malformed date
# raise immediately instead of silently producing a wrong day count.
DATE_FORMAT = "%m/%d/%Y"
booking_dt = pd.to_datetime(booking, format=DATE_FORMAT)
checkin_dt = pd.to_datetime(checkin, format=DATE_FORMAT)
checkout_dt = pd.to_datetime(checkout, format=DATE_FORMAT)

month = checkin_dt.month.tolist()                 #check-in month, to capture any seasonal variations
diff1 = (checkin_dt - booking_dt).days.tolist()   #day difference between booking and checkin
diff2 = (checkout_dt - checkin_dt).days.tolist()  #day difference between checkin and checkout

In [None]:
# Turn each derived feature list into an (n_rows, 1) column vector so it can
# be appended to z as a new column.
np_month, np_diff1, np_diff2 = (
    np.array(values).reshape(len(booking), 1)
    for values in (month, diff1, diff2)
)

In [None]:
# Append the three derived columns to the right of z, in this order:
#   month -> month of check-in, to capture seasonal variations
#   diff1 -> days between booking and check-in
#   diff2 -> duration of the hotel stay, i.e. check-out minus check-in
z = np.concatenate((z, np_month, np_diff1, np_diff2), axis=1)

In [None]:
# Remove the raw date strings now that the derived features exist:
# column 7 = check-in, 8 = check-out, 9 = booking.
z = np.delete(z, [7, 8, 9], axis=1)

In [None]:
# First test row after the date columns were replaced by the derived features.
print (z[0])

['F' 52 'Latino' 'Grad' '25K --50K' 'South' 'City Hotel' 3 3 0 'HB' 'No'
 'No' 'No Deposit' 'Direct' 'Yes' 'Yes' 10 153 11 21 1]


In [None]:
# Reuse the ColumnTransformer that was fitted on the training features (do not
# refit it), so the test set gets exactly the same one-hot column layout.
z = ct.transform(z)

In [None]:
# Inspect the first encoded test row; index 50 is printed separately to check
# where the passthrough (non-encoded) columns begin.
print (z[0])
print (z[0][50])

[1.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0
 0.0 1.0 0.0 0.0 0.0 1.0 1.0 0.0 1.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0
 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 52 3 3 0 10 153
 21 1]
52


In [None]:
# Standardise the numeric columns (index 50 onwards) with the scaler that was
# fitted on the training rows; the scaler is not refitted here.
z[:, 50:] = sc.transform(z[:, 50:])

In [None]:
# Predict the reservation outcome for every test row with the trained classifier.
# Note that this overwrites the y_pred computed for the held-out split earlier.
y_pred = classifier.predict(z)

In [None]:
# Predicted class codes for the test set (NumPy abbreviates the printed array).
print (y_pred)

[1 2 1 ... 1 1 1]


In [None]:
import csv                                  #write out to a csv file

# One predicted class per row, without a header line.
with open('results.csv', 'w', newline='') as out_file:
    csv.writer(out_file).writerows([label] for label in y_pred)

# Print each distinct predicted class once, in order of first appearance.
results = list(dict.fromkeys(y_pred))
for label in results:
    print (label)

1
2
3
