# 1. Data Inspection

In [None]:
# Notebook display setting: with "all", every top-level expression in a cell is displayed,
# not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Seed NumPy's global random number generator with a fixed value (0).
np.random.seed(0)

Acknowledgements

The data is originally from the article "Hotel Booking Demand Datasets", written by Nuno Antonio, Ana Almeida, and Luis Nunes for Data in Brief, Volume 22, February 2019: https://www.sciencedirect.com/science/article/pii/S2352340918315191

The data was downloaded and cleaned by Thomas Mock and Antoine Bichat for #TidyTuesday during the week of February 11th, 2020: https://github.com/rfordatascience/tidytuesday/blob/master/data/2020/2020-02-11/readme.md

raw data: https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-02-11/hotels.csv

Task: classify bookings as canceled or not canceled

Reference:
https://www.kaggle.com/jessemostipak/hotel-booking-demand/notebooks


In [None]:
# Load the hotel booking data set from the Kaggle input directory and preview the first rows.
csv_path = '../input/hotel-booking-demand/hotel_bookings.csv'
df = pd.read_csv(csv_path)
df.head()

In [None]:
# Print the column names, non-null counts and dtypes of the freshly loaded frame.
df.info()

In [None]:
# Count the missing values per column and list only the columns that have any.
# At this point only 4 columns have missing values.
nulls = df.isna().sum()
nulls.loc[nulls.gt(0)]

In [None]:
# A few columns encode a missing value as the literal string 'Undefined' instead of NaN.
# Convert those entries to NaN so that isnull()/dropna() treat them as missing.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
df = df.replace('Undefined', np.nan)

In [None]:
# Fraction of missing cells across the whole dataset.
# df.size (rows * columns) is used instead of a hard-coded column count, so the ratio
# stays correct whatever the number of columns in the frame.
df.isnull().sum().sum() / df.size


# 2. Data Preprocessing for missing values

#### (1) Drop unhelpful columns & rows

In [None]:
# Share of missing values in each column; show the five columns with the largest share.
missing_counts = df.isnull().sum()
percentage = missing_counts / len(df)
percentage.sort_values(ascending=False).head()

In [None]:
# 'company' is missing in 94.4% of the rows, so it is not helpful: remove the column.
df = df.drop(columns=['company'])

In [None]:
# Remove the two reservation-status columns:
#  - reservation_status_date: object column with a lot of variety (926 distinct values)
#  - reservation_status: it contains the 'Canceled' value itself, so keeping it would be a
#    direct way to predict cancellations (100% accuracy) and would make the task meaningless
df = df.drop(columns=['reservation_status_date', 'reservation_status'])


In [None]:
# Keep only complete rows: any row that still has at least one missing value is removed
# (this includes the 'Undefined' entries that were converted to NaN earlier).
df = df.dropna(axis=0, how='any')

#### (2) Inspect all the columns' unique values for replacing the missing values

In [None]:
# Encode the hotel type as a binary flag and the arrival month name as its calendar number.
hotel_codes = {'Resort Hotel': 0, 'City Hotel': 1}
month_names = ['January', 'February', 'March', 'April', 'May', 'June', 'July',
               'August', 'September', 'October', 'November', 'December']
month_numbers = {name: number for number, name in enumerate(month_names, start=1)}

df['hotel'] = df['hotel'].map(hotel_codes).astype(int)
df['arrival_date_month'] = df['arrival_date_month'].map(month_numbers).astype(int)

In [None]:
# The country column has a high variety of values, so it is turned into numeric data:
# cast it to a categorical dtype, then keep the integer code of each category.
df['country'] = df['country'].astype('category').cat.codes

In [None]:
# Inspect the data again: column names, non-null counts and dtypes after the encoding steps above.
df.info()

In [None]:
# create new columns: 'is_family', 'deposit_given', 'previous_cancelled', 'previous_bookings_not_canceled_check', 'booking_changed', 'total_nights', 'booking_times'
def family_check(df):
    """Return 1 when a booking row looks like a family booking, otherwise 0.

    `df` is a single row (anything indexable by column name). The booking counts
    as a family when there is at least one adult together with at least one
    child, or at least one adult together with at least one baby.
    """
    # Adults travelling with children.
    if (df['adults'] > 0) & (df['children'] > 0):
        return 1
    # Adults travelling with babies.
    if (df['adults'] > 0) & (df['babies'] > 0):
        return 1
    return 0

def deposit(df):
    """Return 0 for a row whose deposit type is 'No Deposit' or 'Refundable', otherwise 1.

    `df` is a single row (anything indexable by column name).
    """
    kind = df['deposit_type']
    nothing_at_stake = (kind == 'No Deposit') | (kind == 'Refundable')
    return 0 if nothing_at_stake else 1
    
def previous_cancellations_check(df):
    """Return 0 when the row has exactly zero previous cancellations, otherwise 1.

    `df` is a single row (anything indexable by column name).
    """
    never_cancelled = df['previous_cancellations'] == 0
    return 0 if never_cancelled else 1
    
def previous_bookings_not_canceled_check(df):
    """Return 1 when the row has exactly zero previous non-canceled bookings, otherwise 0.

    `df` is a single row (anything indexable by column name).
    Note that the flag is 1 for a count of zero, the opposite polarity of the
    other *_check helpers in this notebook.
    NOTE(review): presumably intentional - confirm.
    """
    no_kept_bookings = df['previous_bookings_not_canceled'] == 0
    return 1 if no_kept_bookings else 0
    
def booking_changed_check(df):
    """Return 0 when the row has exactly zero booking changes, otherwise 1.

    `df` is a single row (anything indexable by column name).
    """
    untouched = df['booking_changes'] == 0
    return 0 if untouched else 1
    
def feature(df):
    """Add the engineered feature columns to `df` and return it.

    The columns are written onto the frame that was passed in (it is modified in
    place) and that same frame is returned.

    Columns added (all flags are 0/1 integers):
      - is_family: at least one adult together with at least one child or baby
      - deposit_given: 0 for 'No Deposit' / 'Refundable' deposit types, else 1
      - previous_cancelled: 0 when previous_cancellations == 0, else 1
      - previous_bookings_not_canceled_check: 1 when
        previous_bookings_not_canceled == 0, else 0
      - booking_changed: 0 when booking_changes == 0, else 1
      - total_nights: stays_in_weekend_nights + stays_in_week_nights
      - booking_times: previous_cancellations + previous_bookings_not_canceled

    The flags are computed with whole-column (vectorized) expressions rather than
    a Python-level call per row; the resulting values are the same as those of
    the row-wise helper functions, including for NaN inputs (every comparison
    with NaN is False, `!=` is True).
    """
    has_adult = df['adults'] > 0
    has_minor = (df['children'] > 0) | (df['babies'] > 0)
    # create new column 'is_family' based on 'adults', 'children', 'babies'
    df['is_family'] = (has_adult & has_minor).astype(int)

    # create new column 'deposit_given' based on 'deposit_type'
    no_deposit_at_stake = df['deposit_type'].isin(['No Deposit', 'Refundable'])
    df['deposit_given'] = (~no_deposit_at_stake).astype(int)

    df['previous_cancelled'] = (df['previous_cancellations'] != 0).astype(int)
    df['previous_bookings_not_canceled_check'] = (df['previous_bookings_not_canceled'] == 0).astype(int)
    df['booking_changed'] = (df['booking_changes'] != 0).astype(int)

    # create new column 'total_nights' based on 'stays_in_weekend_nights' and 'stays_in_week_nights'
    df['total_nights'] = df['stays_in_weekend_nights'] + df['stays_in_week_nights']
    df['booking_times'] = df['previous_cancellations'] + df['previous_bookings_not_canceled']
    return df

# Add the engineered columns defined in feature() to the working frame.
df = feature(df)

In [None]:
# The engineered flags ('deposit_given', 'previous_cancelled',
# 'previous_bookings_not_canceled_check', 'booking_changed') were created from these
# columns, so the source columns can be dropped.
df = df.drop(columns=['deposit_type',
                      'previous_cancellations',
                      'previous_bookings_not_canceled',
                      'booking_changes'])

In [None]:
# Check the correlation of every numeric column with "is_canceled".
# The frame still contains text columns at this point (they are one-hot encoded later),
# and DataFrame.corr() raises on non-numeric data in pandas >= 2.0, so the correlation
# is computed on the numeric columns only.
corr_matrix = df.select_dtypes(include='number').corr()
corr_matrix['is_canceled'].sort_values(ascending=False)
# 'arrival_date_month', 'arrival_date_week_number', 'arrival_date_year',
# 'children', 'stays_in_week_nights', 'arrival_date_day_of_month', 'total_nights'
# have a very low correlation with 'is_canceled', which is not significant,
# so it is a good idea to drop these columns for further prediction.
# NOTE(review): the original note quoted the threshold as "< 0.01%" - confirm the
# intended cut-off against the values printed above.

In [None]:
# Drop the columns judged to have a low correlation with 'is_canceled'.
low_correlation_columns = ['arrival_date_month', 'arrival_date_week_number',
                           'stays_in_week_nights', 'children', 'arrival_date_year',
                           'total_nights', 'arrival_date_day_of_month']
df = df.drop(columns=low_correlation_columns)

In [None]:
# Column names, non-null counts and dtypes after the feature engineering and column drops.
df.info()

In [None]:
# One-hot encode the remaining categorical columns.
categorical_columns = ['meal', 'market_segment', 'distribution_channel',
                       'reserved_room_type', 'assigned_room_type', 'customer_type']
df = pd.get_dummies(data=df, columns=categorical_columns)

In [None]:
# Inspect the data: (rows, columns) of the encoded frame, then summary statistics per column.
df.shape
df.describe()

#### (3). Data Preprocessing for ML

In [None]:
# Feature set (every column except the target) and target set ('is_canceled').
target_column = 'is_canceled'
y = df[target_column]
X = df.drop(columns=[target_column])

In [None]:
# Split into train and test sets with a fixed random_state so the split is repeatable.
from sklearn.model_selection import train_test_split
split_parts = train_test_split(X, y, random_state=0)
X_train, X_test, y_train, y_test = split_parts

#### (4) Data scaling

##### Choosing scaling
##### Use MinMax scaling to normalize the data set. The features in our data set are spread across a wide range of values, which might result in some features affecting the final result more than others. MinMax scaling reduces this effect by re-scaling every feature to a specified range of values, in this case 0-1.

In [None]:
# Data scaling - MinMaxScaler.
# The scaler is fitted on the training data only and then applied to both splits.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

# 3. Classification

####  Evaluation strategy: prefer a model whose training set score and test set score are close; among all models, the one with the highest test set score is the best

###  (1) KNN Classification

In [None]:
# Hyperparameter search for KNN: 5-fold cross-validated grid search over n_neighbors = 1..10.
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
knn = KNeighborsClassifier()
param_grid = {'n_neighbors': np.arange(1, 11)}
knn_cv = GridSearchCV(
    estimator=knn,
    param_grid=param_grid,
    cv=5,
    return_train_score=False,  # set to True to also record training scores
    n_jobs=-1,
)
knn_cv.fit(X_train, y_train)
knn_cv.best_params_

In [None]:
# Put the per-candidate cross-validation results of the KNN search into a DataFrame and preview them.
import pandas as pd
knn_cv_result = pd.DataFrame(knn_cv.cv_results_)
knn_cv_result.head()

In [None]:
# Fit the final KNN model with the n_neighbors value selected by the grid search instead of
# a hard-coded k, so the fitted model always matches the "Best parameters" printed below.
clf = KNeighborsClassifier(n_jobs=-1, **knn_cv.best_params_)
clf.fit(X_train, y_train)
print("Training set score: {:.3f}".format(clf.score(X_train, y_train)))
print("Test set accuracy: {:.3f}".format(clf.score(X_test, y_test)))
print("Best parameters: {}".format(knn_cv.best_params_))
print("Best cross-validation score: {:.4f}".format(knn_cv.best_score_))

### (2) Logistic Regression

In [None]:
# Hyperparameter search for logistic regression: 5-fold grid search over the
# regularisation strength C and the penalty type (l1 = lasso, l2 = ridge).
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
grid = {
    "C": [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    "penalty": ["l1", "l2"],
}
logreg = LogisticRegression(solver='liblinear')
logreg_cv = GridSearchCV(
    estimator=logreg,
    param_grid=grid,
    cv=5,
    return_train_score=False,
    n_jobs=-1,
)
logreg_cv.fit(X_train, y_train)
logreg_cv.best_params_

In [None]:
# Put the per-candidate cross-validation results of the logistic-regression search into a DataFrame and preview them.
import pandas as pd
logreg_cv_result = pd.DataFrame(logreg_cv.cv_results_)
logreg_cv_result.head()

In [None]:
# Fit the final logistic regression with the C / penalty selected by the grid search
# (logreg_cv.best_params_) rather than hard-coded values, so the fitted model always
# matches the "Best parameters" printed below.
logreg1 = LogisticRegression(solver='liblinear', random_state=0,
                             **logreg_cv.best_params_).fit(X_train, y_train)
print("Training set score: {:.3f}".format(logreg1.score(X_train, y_train)))
print("Test set score: {:.3f}".format(logreg1.score(X_test, y_test)))
print("Best parameters: {}".format(logreg_cv.best_params_))
print("Best cross-validation score: {:.4f}".format(logreg_cv.best_score_))

### (3) Linear Support Vector Machine

In [None]:
# Hyperparameter search for the linear SVM (slow): 5-fold grid search over C.
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC
Cs = [0.1, 1, 10, 100]
param_grid = dict(C=Cs)
linear_svm = LinearSVC(max_iter=500000)
linearSVC = GridSearchCV(
    estimator=linear_svm,
    param_grid=param_grid,
    cv=5,
    return_train_score=False,
    n_jobs=-1,
)
linearSVC.fit(X_train, y_train)
linearSVC.best_params_

In [None]:
# Put the per-candidate cross-validation results of the LinearSVC search into a DataFrame and preview them.
import pandas as pd
linearSVC_result = pd.DataFrame(linearSVC.cv_results_)
linearSVC_result.head()

In [None]:
# Use the estimator that GridSearchCV already refit on the whole training set with the best C.
# A hand-built LinearSVC(C=10) would hard-code the searched value and fall back to the
# default max_iter, whereas the search ran with max_iter=500000.
svm = linearSVC.best_estimator_
print("Training set score: {:.3f}".format(svm.score(X_train, y_train)))
print("Test set score: {:.3f}".format(svm.score(X_test, y_test)))
print("Best parameters: {}".format(linearSVC.best_params_))
print("Best cross-validation score: {:.4f}".format(linearSVC.best_score_))

### (4) Kernelized Support Vector Machine (rbf, poly, and linear)

In [None]:
# Linear-kernel SVC hyperparameter selection: 5-fold grid search over C.
from sklearn.svm import SVC
Cs = [0.01, 0.1, 1, 10, 100]
param_grid = dict(C=Cs)
linear_kernel_svc = SVC(kernel='linear')
kerenl_lin = GridSearchCV(
    estimator=linear_kernel_svc,
    param_grid=param_grid,
    cv=5,
    return_train_score=False,
    n_jobs=-1,
)
kerenl_lin.fit(X_train, y_train)
print("The best classifier is: ", kerenl_lin.best_params_)

In [None]:
# GridSearchCV(cv=5, estimator=SVC(kernel='linear'), n_jobs=-1,
#              param_grid={'C': [0.01, 0.1, 1, 10, 100]})
# The best classifier is:  {'C': 100}

In [None]:
# Put the per-candidate cross-validation results of the linear-kernel SVC search into a DataFrame and preview them.
import pandas as pd
kerenl_lin_result = pd.DataFrame(kerenl_lin.cv_results_)
kerenl_lin_result.head()

In [None]:
# Use the linear-kernel SVC that GridSearchCV already refit on the whole training set with
# the best C. This avoids hard-coding C and avoids training the slow SVC a second time.
svc = kerenl_lin.best_estimator_
print("Training set score: {:.3f}".format(svc.score(X_train, y_train)))
print("Test set score: {:.3f}".format(svc.score(X_test, y_test)))
print("Best parameters: {}".format(kerenl_lin.best_params_))
print("Best cross-validation score: {:.4f}".format(kerenl_lin.best_score_))

In [None]:
# Training set score: 0.797
# Test set score: 0.799
# Best parameters: {'C': 100}
# Best cross-validation score: 0.7962

In [None]:
# RBF-kernel SVC hyperparameter selection: 5-fold grid search over C.
C_range = [0.001, 0.01, 0.1, 1, 10, 100]
param_grid = dict(C=C_range)
rbf_kernel_svc = SVC(kernel='rbf')
kernel_rbf = GridSearchCV(
    estimator=rbf_kernel_svc,
    param_grid=param_grid,
    cv=5,
    return_train_score=False,
    n_jobs=-1,
)
kernel_rbf.fit(X_train, y_train)
print("The best classifier is: ", kernel_rbf.best_estimator_)

In [None]:
# GridSearchCV(cv=5, estimator=SVC(), n_jobs=-1,
#              param_grid={'C': [0.001, 0.01, 0.1, 1, 10, 100]})
# The best classifier is:  SVC(C=100)

In [None]:
# Put the per-candidate cross-validation results of the RBF-kernel SVC search into a DataFrame and preview them.
import pandas as pd
kernel_rbf_result = pd.DataFrame(kernel_rbf.cv_results_)
kernel_rbf_result.head()

In [None]:
# Use the RBF-kernel SVC that GridSearchCV already refit on the whole training set with the
# best C. A hand-built SVC(..., gamma='auto') would be a different model from the one the
# search evaluated (the search left gamma at its default), so its train/test scores would
# not correspond to the cross-validation score printed below.
svc = kernel_rbf.best_estimator_
print("Training set score: {:.3f}".format(svc.score(X_train, y_train)))
print("Test set score: {:.3f}".format(svc.score(X_test, y_test)))
print("Best parameters: {}".format(kernel_rbf.best_params_))
print("Best cross-validation score: {:.4f}".format(kernel_rbf.best_score_))

In [None]:
# Training set score: 0.822
# Test set score: 0.825
# Best parameters: {'C': 100}
# Best cross-validation score: 0.8232

In [None]:
# Polynomial-kernel SVC hyperparameter selection: 3-fold grid search over C.
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

Cs = [0.1, 1, 10, 100]
param_grid = dict(C=Cs)
poly_kernel_svc = SVC(kernel='poly')
kernel_poly = GridSearchCV(
    estimator=poly_kernel_svc,
    param_grid=param_grid,
    cv=3,
    return_train_score=False,
    n_jobs=-1,
)
kernel_poly.fit(X_train, y_train)
print("The best classifier is: ", kernel_poly.best_params_)

In [None]:
# GridSearchCV(cv=3, estimator=SVC(kernel='poly'), n_jobs=-1,
#              param_grid={'C': [0.1, 1, 10, 100]})
# The best classifier is:  {'C': 100}

In [None]:
# Put the per-candidate cross-validation results of the polynomial-kernel SVC search into a DataFrame and preview them.
import pandas as pd
kernel_poly_result = pd.DataFrame(kernel_poly.cv_results_)
kernel_poly_result.head()

In [None]:
# Use the polynomial-kernel SVC that GridSearchCV already refit on the whole training set
# with the best C. A hand-built SVC(..., gamma='auto') would be a different model from the
# one the search evaluated (the search left gamma at its default), so its train/test scores
# would not correspond to the cross-validation score printed below.
svc = kernel_poly.best_estimator_
print("Training set score: {:.3f}".format(svc.score(X_train, y_train)))
print("Test set score: {:.3f}".format(svc.score(X_test, y_test)))
print("Best parameters: {}".format(kernel_poly.best_params_))
print("Best cross-validation score: {:.4f}".format(kernel_poly.best_score_))

In [None]:
# Training set score: 0.815
# Test set score: 0.817
# Best parameters: {'C': 100}
# Best cross-validation score: 0.8228

### (5) Decision Tree

In [None]:
# Decision tree hyperparameter selection: 10-fold grid search over min_samples_split and max_depth.
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
parameters={'min_samples_split' : range(10,500,20),'max_depth': range(1,20,2)}
# random_state is fixed so that the fitted trees, and therefore the selected parameters and
# scores, are repeatable from run to run.
clf_tree=DecisionTreeClassifier(random_state=0)
grid_search=GridSearchCV(clf_tree,parameters, cv=10,return_train_score = False,n_jobs = -1)
grid_search.fit(X_train, y_train)
print("The best classifier is: ", grid_search.best_params_)

In [None]:
# GridSearchCV(cv=10, estimator=DecisionTreeClassifier(), n_jobs=-1,
#              param_grid={'max_depth': range(1, 20, 2),
#                          'min_samples_split': range(10, 500, 20)})
# The best classifier is:  {'max_depth': 19, 'min_samples_split': 10}

In [None]:
# Put the per-candidate cross-validation results of the decision-tree search into a DataFrame and preview them.
import pandas as pd
grid_search_result = pd.DataFrame(grid_search.cv_results_)
grid_search_result.head()

In [None]:
# Use the decision tree that GridSearchCV already refit on the whole training set with the
# best parameters, instead of hard-coding max_depth / min_samples_split, so the scored model
# always matches the "Best parameters" printed below.
clf_tree = grid_search.best_estimator_
# The first label previously read "Training clf_treeset score" (a garbled "Training set score").
print("Training set score: {:.3f}".format(clf_tree.score(X_train, y_train)))
print("Test set score: {:.3f}".format(clf_tree.score(X_test, y_test)))
print("Best parameters: {}".format(grid_search.best_params_))
print("Best cross-validation score: {:.4f}".format(grid_search.best_score_))

In [None]:
# Training clf_treeset score: 0.888
# Test set score: 0.849
# Best parameters: {'max_depth': 19, 'min_samples_split': 10}
# Best cross-validation score: 0.8478

### (6) Random Forest

In [None]:
# Random forest hyperparameter selection: 10-fold grid search over min_samples_split and max_depth.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
parameters={'min_samples_split' : range(10,500,20),'max_depth': range(1,20,2)}
# random_state is fixed so that the bootstrap samples and feature subsets, and therefore the
# selected parameters and scores, are repeatable from run to run.
clf_treeR=RandomForestClassifier(random_state=0)
grid_searchR=GridSearchCV(clf_treeR,parameters, cv=10,return_train_score = False,n_jobs = -1)
grid_searchR.fit(X_train, y_train)
print("The best classifier is: ", grid_searchR.best_params_)

In [None]:
# Put the per-candidate cross-validation results of the random-forest search into a DataFrame and preview them.
import pandas as pd
grid_searchR_result = pd.DataFrame(grid_searchR.cv_results_)
grid_searchR_result.head()

In [None]:
# Use the random forest that GridSearchCV already refit on the whole training set with the
# best parameters found by grid_searchR. A forest built with hand-typed max_depth /
# min_samples_split is not guaranteed to match "Best parameters" below, and it would differ
# from the model used for the final metrics, which are computed through grid_searchR.
clf_treeR = grid_searchR.best_estimator_
# The first label previously read "Training clf_treeset score" (a garbled "Training set score").
print("Training set score: {:.3f}".format(clf_treeR.score(X_train, y_train)))
print("Test set score: {:.3f}".format(clf_treeR.score(X_test, y_test)))
print("Best parameters: {}".format(grid_searchR.best_params_))
print("Best cross-validation score: {:.4f}".format(grid_searchR.best_score_))

# 4. Find the best model

##### 1. Knn: train 0.873, test 0.835, Best cross-validation score 0.8286

##### 2. Logistic Regression:  train  0.794,  test 0.796, Best cross-validation score 0.7935

##### 3. Linear Support Vector Machine : train 0.794, test 0.795, Best cross-validation score: 0.7932

##### 4. Kernelized Support Vector Machine (rbf, poly, and linear):   
##### (1) Linear train 0.797, test 0.799, Best cross-validation score: 0.7962 
##### (2) Rbf train 0.822, test 0.825, Best cross-validation score: 0.8232
##### (3) Poly train 0.815 test 0.817, Best cross-validation score: 0.8228

#### 5. decision tree: train 0.888  test 0.849, Best cross-validation score: 0.8478

#### 6. random forest: train 0.881  test 0.864, Best cross-validation score: 0.8617

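### The random forest model has the highest test score (0.864) and the highest best cross-validation score (0.8617); only the decision tree has a slightly higher training score (0.888 vs 0.881), and its larger train/test gap suggests more overfitting. Therefore, the random forest model is the best option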

In [None]:
from sklearn.metrics import accuracy_score, roc_auc_score

# Evaluate the tuned random forest (through the fitted grid search) on the held-out test set:
# accuracy from the predicted labels, ROC AUC from the predicted probability of class 1.
y_pred = grid_searchR.predict(X_test)
positive_class_proba = grid_searchR.predict_proba(X_test)[:, 1]
print('accuracy_score: ', accuracy_score(y_test, y_pred))
print('roc_auc_score: ', roc_auc_score(y_test, positive_class_proba))