#  Kaggle - Hotel Bookings Demand (IML Seminar 20/21)

## Modelling

### Split dataset into train/test

Use typical ratio 70:30 or 80:20 (in this case 80:20).

In [3]:
import pandas as pd

df = pd.read_csv("datasets/dataset_preprocessed.csv") 
df.head()

Unnamed: 0,is_canceled,lead_time,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,previous_cancellations,...,assigned_room_type_H,assigned_room_type_I,assigned_room_type_K,assigned_room_type_L,assigned_room_type_P,deposit_type_Non Refund,deposit_type_Refundable,customer_type_Group,customer_type_Transient,customer_type_Transient-Party
0,0,7,27,1,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,0,13,27,1,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,0,14,27,1,0,2,2,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,0,14,27,1,0,2,2,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,0,0,27,1,0,2,2,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [4]:
from sklearn.model_selection import train_test_split

labels = df.iloc[:,0]
features = df.iloc[:,1:]

X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size = 0.2, random_state = 0)
print(X_train.shape,X_test.shape, y_train.shape, y_test.shape)

(93541, 82) (23386, 82) (93541,) (23386,)


### Training and Testing

Identify the method we are going to use (RF, DT, SVM, NN, etc) and apply CV.

In [4]:
#Linear SVC
#Parameters: regularization (C): range: (0.01,100) init, (0.002) tuned.

from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV
import numpy as np

svm = LinearSVC()

svm.get_params().keys()

dict_keys(['C', 'class_weight', 'dual', 'fit_intercept', 'intercept_scaling', 'loss', 'max_iter', 'multi_class', 'penalty', 'random_state', 'tol', 'verbose'])

In [7]:
# Default model.
from sklearn.metrics import (classification_report,accuracy_score, precision_score, recall_score, f1_score)

svm.fit(X_train,y_train)
y_pred = svm.predict(X_test)

accuracy = accuracy_score(y_test,y_pred)
precision = precision_score(y_test,y_pred)
recall = recall_score(y_test,y_pred)
f1 = f1_score(y_test,y_pred)

print(accuracy,precision,recall,f1)

# report_dict = classification_report(y_test, y_pred, output_dict=True)
# report_plain_svm_df = pd.DataFrame(report_dict)
# report_plain_svm_df

0.7899598050115454 0.8698566374976727 0.5258300506471582 0.6554433221099888




In [14]:
data = [['accuracy', accuracy],['precision', precision],['recall', recall],['f1', f1]] 
report_plain_svm_df = pd.DataFrame(data, columns = ['Metric', 'Value']) 
report_plain_svm_df

Unnamed: 0,Metric,Value
0,accuracy,0.78996
1,precision,0.869857
2,recall,0.52583
3,f1,0.655443


In [13]:
report_plain_svm_df.to_csv('results/plain_linear_svm.csv', index=False)

In [6]:
# Optimized model. GridSearch to determine the optimal value of C.

svm_opt = LinearSVC()

param_grid = {
              'C':(0.01,0.1,1,10,100)
             }

linearSVC = GridSearchCV(svm_opt,param_grid,cv=5,return_train_score=True,
                             scoring=['accuracy','f1'],refit=False,n_jobs=-1)
linearSVC.fit(X_train,y_train)
#linearSVC.coef_
#linearSVC.intercept_

# bestlinearSVC = linearSVC.best_estimator_
# bestlinearSVC.fit(X_train,y_train)
# bestlinearSVC.score(X_train,y_train)

# print(bestlinearSVC)

GridSearchCV(cv=5, estimator=LinearSVC(), n_jobs=-1,
             param_grid={'C': (0.01, 0.1, 1, 10, 100)}, refit=False,
             return_train_score=True, scoring=['accuracy', 'f1'])

In [7]:
report_opt_svm_df = pd.DataFrame(linearSVC.cv_results_)
report_opt_svm_df.sort_values(by=['rank_test_f1'])

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,params,split0_test_accuracy,split1_test_accuracy,split2_test_accuracy,split3_test_accuracy,...,mean_test_f1,std_test_f1,rank_test_f1,split0_train_f1,split1_train_f1,split2_train_f1,split3_train_f1,split4_train_f1,mean_train_f1,std_train_f1
0,28.91015,0.193977,0.042961,0.016829,0.01,{'C': 0.01},0.805014,0.811364,0.809226,0.783996,...,0.693125,0.037334,1,0.707855,0.713939,0.720338,0.616344,0.710103,0.693716,0.038917
3,29.376717,0.136721,0.022738,0.000728,10.0,{'C': 10},0.69512,0.753581,0.54335,0.563235,...,0.621615,0.059963,2,0.666381,0.514693,0.617672,0.625882,0.678563,0.620638,0.057807
1,29.137687,0.1538,0.023038,0.00187,0.1,{'C': 0.1},0.755412,0.70836,0.775711,0.781056,...,0.602194,0.046623,3,0.52933,0.637278,0.6681,0.602864,0.591057,0.605726,0.046788
4,26.727593,5.335462,0.019072,0.004069,100.0,{'C': 100},0.651772,0.689491,0.750053,0.78437,...,0.590035,0.079282,4,0.651285,0.68769,0.493984,0.619602,0.497696,0.590052,0.079893
2,29.230575,0.123172,0.02106,0.001442,1.0,{'C': 1},0.706131,0.737171,0.755559,0.573231,...,0.537899,0.109155,5,0.388928,0.461092,0.515812,0.630518,0.686681,0.536606,0.10889


In [8]:
report_opt_svm_df.sort_values(by=['rank_test_accuracy'])

#So as a first step, we derive that values around 0.01 perform best, let's make it more accurate...

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,params,split0_test_accuracy,split1_test_accuracy,split2_test_accuracy,split3_test_accuracy,...,mean_test_f1,std_test_f1,rank_test_f1,split0_train_f1,split1_train_f1,split2_train_f1,split3_train_f1,split4_train_f1,mean_train_f1,std_train_f1
0,28.91015,0.193977,0.042961,0.016829,0.01,{'C': 0.01},0.805014,0.811364,0.809226,0.783996,...,0.693125,0.037334,1,0.707855,0.713939,0.720338,0.616344,0.710103,0.693716,0.038917
1,29.137687,0.1538,0.023038,0.00187,0.1,{'C': 0.1},0.755412,0.70836,0.775711,0.781056,...,0.602194,0.046623,3,0.52933,0.637278,0.6681,0.602864,0.591057,0.605726,0.046788
4,26.727593,5.335462,0.019072,0.004069,100.0,{'C': 100},0.651772,0.689491,0.750053,0.78437,...,0.590035,0.079282,4,0.651285,0.68769,0.493984,0.619602,0.497696,0.590052,0.079893
2,29.230575,0.123172,0.02106,0.001442,1.0,{'C': 1},0.706131,0.737171,0.755559,0.573231,...,0.537899,0.109155,5,0.388928,0.461092,0.515812,0.630518,0.686681,0.536606,0.10889
3,29.376717,0.136721,0.022738,0.000728,10.0,{'C': 10},0.69512,0.753581,0.54335,0.563235,...,0.621615,0.059963,2,0.666381,0.514693,0.617672,0.625882,0.678563,0.620638,0.057807


In [9]:
# Reduce the optimization range. Again apply GridSearch.
svm_opt = LinearSVC()

param_grid = {
              'C':np.arange(0.01,0.1+0.01,0.01), 
             }

linearSVC = GridSearchCV(svm_opt,param_grid,cv=5,scoring=['accuracy','f1'],
                             return_train_score=True,refit=False,n_jobs=-1)
linearSVC.fit(X_train,y_train)

GridSearchCV(cv=5, estimator=LinearSVC(), n_jobs=-1,
             param_grid={'C': array([0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1 ])},
             refit=False, return_train_score=True, scoring=['accuracy', 'f1'])

In [10]:
report_opt_svm_df = pd.DataFrame(linearSVC.cv_results_)
report_opt_svm_df.sort_values(by=['rank_test_f1'])

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,params,split0_test_accuracy,split1_test_accuracy,split2_test_accuracy,split3_test_accuracy,...,mean_test_f1,std_test_f1,rank_test_f1,split0_train_f1,split1_train_f1,split2_train_f1,split3_train_f1,split4_train_f1,mean_train_f1,std_train_f1
0,28.734499,0.11804,0.020744,0.002111,0.01,{'C': 0.01},0.810305,0.797306,0.808905,0.810135,...,0.692738,0.026214,1,0.720764,0.6605,0.717102,0.709202,0.66746,0.695006,0.025701
7,29.06536,0.1175,0.022143,0.001374,0.08,{'C': 0.08},0.778502,0.698418,0.789181,0.770526,...,0.668653,0.048054,2,0.598979,0.692152,0.623511,0.736701,0.685959,0.66746,0.049734
4,29.160348,0.076422,0.023161,0.001613,0.05,{'C': 0.05},0.720562,0.768067,0.773733,0.779934,...,0.666067,0.064506,3,0.694609,0.55384,0.700222,0.740317,0.637755,0.665349,0.064635
1,28.947249,0.078984,0.020772,0.001974,0.02,{'C': 0.02},0.773745,0.662604,0.793511,0.796558,...,0.661202,0.03959,4,0.600672,0.677322,0.648015,0.652802,0.722408,0.660244,0.039774
2,29.042072,0.111846,0.0208,0.001892,0.03,{'C': 0.03},0.790101,0.79629,0.758927,0.805431,...,0.653831,0.067936,5,0.655142,0.735524,0.519558,0.684287,0.662918,0.651486,0.071676
5,29.365292,0.286744,0.021255,0.001592,0.06,{'C': 0.060000000000000005},0.765781,0.673509,0.704565,0.786615,...,0.623135,0.065527,6,0.603214,0.679062,0.562162,0.723003,0.563629,0.626214,0.064358
6,29.115512,0.195066,0.021534,0.001292,0.07,{'C': 0.06999999999999999},0.787108,0.686818,0.752726,0.644323,...,0.614789,0.063271,7,0.6266,0.692213,0.503762,0.666457,0.587438,0.615294,0.066147
8,29.190096,0.14288,0.022527,0.000426,0.09,{'C': 0.09},0.784275,0.778865,0.749252,0.692271,...,0.611081,0.059419,8,0.609348,0.612853,0.496182,0.681036,0.645952,0.609074,0.062125
9,25.238023,4.517957,0.019128,0.003904,0.1,{'C': 0.09999999999999999},0.775082,0.759675,0.762722,0.755185,...,0.598884,0.065302,9,0.572231,0.698255,0.538088,0.518732,0.667136,0.598888,0.071223
3,29.136619,0.13909,0.021915,0.001018,0.04,{'C': 0.04},0.767652,0.770152,0.720547,0.75898,...,0.57246,0.071933,10,0.557589,0.57216,0.475721,0.544316,0.699386,0.569834,0.072729


In [11]:
report_opt_svm_df.sort_values(by=['rank_test_accuracy'])

#So as a second step, we confirm that values around 0.01 perform best, let's make it even more accurate...

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,params,split0_test_accuracy,split1_test_accuracy,split2_test_accuracy,split3_test_accuracy,...,mean_test_f1,std_test_f1,rank_test_f1,split0_train_f1,split1_train_f1,split2_train_f1,split3_train_f1,split4_train_f1,mean_train_f1,std_train_f1
0,28.734499,0.11804,0.020744,0.002111,0.01,{'C': 0.01},0.810305,0.797306,0.808905,0.810135,...,0.692738,0.026214,1,0.720764,0.6605,0.717102,0.709202,0.66746,0.695006,0.025701
2,29.042072,0.111846,0.0208,0.001892,0.03,{'C': 0.03},0.790101,0.79629,0.758927,0.805431,...,0.653831,0.067936,5,0.655142,0.735524,0.519558,0.684287,0.662918,0.651486,0.071676
9,25.238023,4.517957,0.019128,0.003904,0.1,{'C': 0.09999999999999999},0.775082,0.759675,0.762722,0.755185,...,0.598884,0.065302,9,0.572231,0.698255,0.538088,0.518732,0.667136,0.598888,0.071223
4,29.160348,0.076422,0.023161,0.001613,0.05,{'C': 0.05},0.720562,0.768067,0.773733,0.779934,...,0.666067,0.064506,3,0.694609,0.55384,0.700222,0.740317,0.637755,0.665349,0.064635
3,29.136619,0.13909,0.021915,0.001018,0.04,{'C': 0.04},0.767652,0.770152,0.720547,0.75898,...,0.57246,0.071933,10,0.557589,0.57216,0.475721,0.544316,0.699386,0.569834,0.072729
1,28.947249,0.078984,0.020772,0.001974,0.02,{'C': 0.02},0.773745,0.662604,0.793511,0.796558,...,0.661202,0.03959,4,0.600672,0.677322,0.648015,0.652802,0.722408,0.660244,0.039774
7,29.06536,0.1175,0.022143,0.001374,0.08,{'C': 0.08},0.778502,0.698418,0.789181,0.770526,...,0.668653,0.048054,2,0.598979,0.692152,0.623511,0.736701,0.685959,0.66746,0.049734
5,29.365292,0.286744,0.021255,0.001592,0.06,{'C': 0.060000000000000005},0.765781,0.673509,0.704565,0.786615,...,0.623135,0.065527,6,0.603214,0.679062,0.562162,0.723003,0.563629,0.626214,0.064358
6,29.115512,0.195066,0.021534,0.001292,0.07,{'C': 0.06999999999999999},0.787108,0.686818,0.752726,0.644323,...,0.614789,0.063271,7,0.6266,0.692213,0.503762,0.666457,0.587438,0.615294,0.066147
8,29.190096,0.14288,0.022527,0.000426,0.09,{'C': 0.09},0.784275,0.778865,0.749252,0.692271,...,0.611081,0.059419,8,0.609348,0.612853,0.496182,0.681036,0.645952,0.609074,0.062125


In [12]:
# Reduce the optimization range. Again apply GridSearch.
svm_opt = LinearSVC()

param_grid = {
              'C':np.arange(0.001,0.01+0.001,0.001), 
             }

linearSVC = GridSearchCV(svm_opt,param_grid,cv=5,scoring=['accuracy','f1'],
                             return_train_score=True,refit=False,n_jobs=-1)
linearSVC.fit(X_train,y_train)

GridSearchCV(cv=5, estimator=LinearSVC(), n_jobs=-1,
             param_grid={'C': array([0.001, 0.002, 0.003, 0.004, 0.005, 0.006, 0.007, 0.008, 0.009,
       0.01 ])},
             refit=False, return_train_score=True, scoring=['accuracy', 'f1'])

In [13]:
report_opt_svm_df = pd.DataFrame(linearSVC.cv_results_)
report_opt_svm_df.sort_values(by=['rank_test_f1'])

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,params,split0_test_accuracy,split1_test_accuracy,split2_test_accuracy,split3_test_accuracy,...,mean_test_f1,std_test_f1,rank_test_f1,split0_train_f1,split1_train_f1,split2_train_f1,split3_train_f1,split4_train_f1,mean_train_f1,std_train_f1
3,40.297651,2.774621,0.025328,0.006263,0.004,{'C': 0.004},0.812123,0.811792,0.810455,0.812112,...,0.716659,0.007705,1,0.719971,0.731742,0.723763,0.709504,0.704229,0.717842,0.009876
4,33.763945,1.758902,0.035502,0.01685,0.005,{'C': 0.005},0.810198,0.809279,0.810562,0.810509,...,0.71522,0.008089,2,0.713735,0.731812,0.717449,0.702847,0.716242,0.716417,0.009268
2,34.923083,0.559286,0.026629,0.008544,0.003,{'C': 0.003},0.811321,0.811418,0.80944,0.81409,...,0.711786,0.003668,3,0.715085,0.705993,0.715174,0.71419,0.711766,0.712441,0.003451
6,36.2974,1.155842,0.028668,0.006635,0.007,{'C': 0.007},0.80202,0.807088,0.809493,0.803881,...,0.711451,0.020214,4,0.734388,0.708698,0.724043,0.670181,0.717215,0.710905,0.022032
1,30.641839,4.476374,0.02142,0.003534,0.002,{'C': 0.002},0.811107,0.811204,0.808638,0.814732,...,0.709846,0.002951,5,0.71345,0.71225,0.712766,0.710455,0.706854,0.711155,0.002369
7,38.058834,1.196976,0.025121,0.004468,0.008,{'C': 0.008},0.806671,0.802224,0.80944,0.812326,...,0.706922,0.019737,6,0.697576,0.674506,0.726627,0.718753,0.718922,0.707277,0.019025
0,26.759702,0.080127,0.020627,0.001746,0.001,{'C': 0.001},0.809878,0.8096,0.80789,0.813021,...,0.704706,0.002176,7,0.705618,0.707948,0.708564,0.704705,0.702802,0.705927,0.002116
8,31.620199,0.599881,0.021839,0.001202,0.009,{'C': 0.009000000000000001},0.795446,0.801155,0.792602,0.810723,...,0.698162,0.02869,8,0.661115,0.673094,0.735759,0.720612,0.708025,0.699721,0.028297
5,37.408272,0.779904,0.027247,0.00977,0.006,{'C': 0.006},0.811481,0.802598,0.797573,0.804843,...,0.683523,0.019862,9,0.723962,0.675922,0.676604,0.675791,0.671586,0.684773,0.019674
9,27.64337,5.064725,0.018828,0.004563,0.01,{'C': 0.010000000000000002},0.789246,0.771167,0.776673,0.799391,...,0.659309,0.044886,10,0.659715,0.577428,0.712733,0.661695,0.684095,0.659133,0.045119


In [14]:
report_opt_svm_df.sort_values(by=['rank_test_accuracy'])

#Based on accuracy, C=0.003 is the best value (we could keep exploring, but...)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,params,split0_test_accuracy,split1_test_accuracy,split2_test_accuracy,split3_test_accuracy,...,mean_test_f1,std_test_f1,rank_test_f1,split0_train_f1,split1_train_f1,split2_train_f1,split3_train_f1,split4_train_f1,mean_train_f1,std_train_f1
2,34.923083,0.559286,0.026629,0.008544,0.003,{'C': 0.003},0.811321,0.811418,0.80944,0.81409,...,0.711786,0.003668,3,0.715085,0.705993,0.715174,0.71419,0.711766,0.712441,0.003451
1,30.641839,4.476374,0.02142,0.003534,0.002,{'C': 0.002},0.811107,0.811204,0.808638,0.814732,...,0.709846,0.002951,5,0.71345,0.71225,0.712766,0.710455,0.706854,0.711155,0.002369
3,40.297651,2.774621,0.025328,0.006263,0.004,{'C': 0.004},0.812123,0.811792,0.810455,0.812112,...,0.716659,0.007705,1,0.719971,0.731742,0.723763,0.709504,0.704229,0.717842,0.009876
4,33.763945,1.758902,0.035502,0.01685,0.005,{'C': 0.005},0.810198,0.809279,0.810562,0.810509,...,0.71522,0.008089,2,0.713735,0.731812,0.717449,0.702847,0.716242,0.716417,0.009268
0,26.759702,0.080127,0.020627,0.001746,0.001,{'C': 0.001},0.809878,0.8096,0.80789,0.813021,...,0.704706,0.002176,7,0.705618,0.707948,0.708564,0.704705,0.702802,0.705927,0.002116
7,38.058834,1.196976,0.025121,0.004468,0.008,{'C': 0.008},0.806671,0.802224,0.80944,0.812326,...,0.706922,0.019737,6,0.697576,0.674506,0.726627,0.718753,0.718922,0.707277,0.019025
6,36.2974,1.155842,0.028668,0.006635,0.007,{'C': 0.007},0.80202,0.807088,0.809493,0.803881,...,0.711451,0.020214,4,0.734388,0.708698,0.724043,0.670181,0.717215,0.710905,0.022032
5,37.408272,0.779904,0.027247,0.00977,0.006,{'C': 0.006},0.811481,0.802598,0.797573,0.804843,...,0.683523,0.019862,9,0.723962,0.675922,0.676604,0.675791,0.671586,0.684773,0.019674
8,31.620199,0.599881,0.021839,0.001202,0.009,{'C': 0.009000000000000001},0.795446,0.801155,0.792602,0.810723,...,0.698162,0.02869,8,0.661115,0.673094,0.735759,0.720612,0.708025,0.699721,0.028297
9,27.64337,5.064725,0.018828,0.004563,0.01,{'C': 0.010000000000000002},0.789246,0.771167,0.776673,0.799391,...,0.659309,0.044886,10,0.659715,0.577428,0.712733,0.661695,0.684095,0.659133,0.045119


In [15]:
# Optimized model.
from sklearn.metrics import (classification_report,accuracy_score, precision_score, recall_score, f1_score)

svm = LinearSVC(C=0.003)

svm.fit(X_train,y_train)
y_pred = svm.predict(X_test)

accuracy = accuracy_score(y_test,y_pred)
precision = precision_score(y_test,y_pred)
recall = recall_score(y_test,y_pred)
f1 = f1_score(y_test,y_pred)

print(accuracy,precision,recall,f1)

# report_dict = classification_report(y_test, y_pred, output_dict=True)
# report_opt_svm_df = pd.DataFrame(report_dict)
# report_opt_svm_df

0.8101855811169075 0.82105719237435 0.6398424310635903 0.7192105762540325




In [16]:
data = [['accuracy', accuracy],['precision', precision],['recall', recall],['f1', f1]] 
report_opt_svm_df = pd.DataFrame(data, columns = ['Metric', 'Value']) 
report_opt_svm_df

Unnamed: 0,Metric,Value
0,accuracy,0.810186
1,precision,0.821057
2,recall,0.639842
3,f1,0.719211


In [18]:
report_opt_svm_df.to_csv('results/optimized_linear_svm.csv',index=False)

In [1]:
#Gradient Boosting Classifier(https://www.analyticsvidhya.com/blog/2016/02/complete-guide-parameter-tuning-gradient-boosting-gbm-python/)
#Parameters: learning rate (learning_rate): range: (0.05,0.2)
#            stages (n_estimators): range: (10,100)
#            subsample (subsample): range: (0.7,0.9)
#            maximum depth (max_depth): range: (2,15) 

from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
import numpy as np

grad_boost = GradientBoostingClassifier()

grad_boost.get_params().keys()

dict_keys(['ccp_alpha', 'criterion', 'init', 'learning_rate', 'loss', 'max_depth', 'max_features', 'max_leaf_nodes', 'min_impurity_decrease', 'min_impurity_split', 'min_samples_leaf', 'min_samples_split', 'min_weight_fraction_leaf', 'n_estimators', 'n_iter_no_change', 'presort', 'random_state', 'subsample', 'tol', 'validation_fraction', 'verbose', 'warm_start'])

In [26]:
# Default model.
from sklearn.metrics import (classification_report,accuracy_score, precision_score, recall_score, f1_score)

grad_boost.fit(X_train,y_train)
y_pred = grad_boost.predict(X_test)

accuracy = accuracy_score(y_test,y_pred)
precision = precision_score(y_test,y_pred)
recall = recall_score(y_test,y_pred)
f1 = f1_score(y_test,y_pred)

print(accuracy,precision,recall,f1)

# report_dict = classification_report(y_test, y_pred, output_dict=True)
# report_plain_grad_boost_df = pd.DataFrame(report_dict)
# report_plain_grad_boost_df

0.8151885743607287 0.863585657370518 0.6099043331457513 0.7149076517150396


In [27]:
data = [['accuracy', accuracy],['precision', precision],['recall', recall],['f1', f1]] 
report_plain_grad_boost_df = pd.DataFrame(data, columns = ['Metric', 'Value']) 
report_plain_grad_boost_df

Unnamed: 0,Metric,Value
0,accuracy,0.815189
1,precision,0.863586
2,recall,0.609904
3,f1,0.714908


In [23]:
report_plain_grad_boost_df.to_csv('results/plain_grad_boost.csv',index=False)

In [28]:
# Gridsearch to determine the value of learning_rate, n_estimators, subsample and max_depth
param_grid = {
              'learning_rate':(0.05,0.1,0.15,0.2),
              'n_estimators':(10,100),
              'subsample':(0.7,0.8,0.9),
              'max_depth':(2,3,4,5)
             }

#RandomGridSearchCV if needed
grad_boostCV = GridSearchCV(grad_boost,param_grid,cv=5,scoring=['accuracy','f1'],
                             return_train_score=True,refit=False,n_jobs=-1)
grad_boostCV.fit(X_train,y_train)

GridSearchCV(cv=5, estimator=GradientBoostingClassifier(), n_jobs=-1,
             param_grid={'learning_rate': (0.05, 0.1, 0.15, 0.2),
                         'max_depth': (2, 3, 4, 5), 'n_estimators': (10, 100),
                         'subsample': (0.7, 0.8, 0.9)},
             refit=False, return_train_score=True, scoring=['accuracy', 'f1'])

In [29]:
report_opt_grad_boost_df = pd.DataFrame(grad_boostCV.cv_results_)
report_opt_grad_boost_df.sort_values(by=['rank_test_f1'])

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_learning_rate,param_max_depth,param_n_estimators,param_subsample,params,split0_test_accuracy,...,mean_test_f1,std_test_f1,rank_test_f1,split0_train_f1,split1_train_f1,split2_train_f1,split3_train_f1,split4_train_f1,mean_train_f1,std_train_f1
94,35.349153,0.533575,0.111097,0.002486,0.2,5,100,0.8,"{'learning_rate': 0.2, 'max_depth': 5, 'n_esti...",0.854295,...,0.792014,0.001983,1,0.808270,0.807573,0.809079,0.809286,0.804395,0.807720,0.001771
95,33.883182,5.569282,0.083052,0.023620,0.2,5,100,0.9,"{'learning_rate': 0.2, 'max_depth': 5, 'n_esti...",0.851729,...,0.789607,0.001201,2,0.805248,0.809963,0.807040,0.806354,0.802900,0.806301,0.002308
93,31.768985,0.458985,0.109215,0.001050,0.2,5,100,0.7,"{'learning_rate': 0.2, 'max_depth': 5, 'n_esti...",0.851729,...,0.788286,0.001582,3,0.806907,0.809207,0.805721,0.806634,0.802908,0.806275,0.002038
70,48.868341,1.728967,0.173005,0.024521,0.15,5,100,0.8,"{'learning_rate': 0.15, 'max_depth': 5, 'n_est...",0.848308,...,0.780033,0.002182,4,0.791596,0.793928,0.791268,0.791826,0.789814,0.791686,0.001322
69,41.023169,2.289100,0.136955,0.012478,0.15,5,100,0.7,"{'learning_rate': 0.15, 'max_depth': 5, 'n_est...",0.847400,...,0.778541,0.002921,5,0.793899,0.791272,0.793681,0.790981,0.791872,0.792341,0.001219
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26,1.904703,0.156968,0.034397,0.002300,0.1,2,10,0.9,"{'learning_rate': 0.1, 'max_depth': 2, 'n_esti...",0.752365,...,0.504571,0.006636,92,0.508665,0.508363,0.497847,0.507280,0.500881,0.504607,0.004411
24,1.724722,0.264839,0.035997,0.006703,0.1,2,10,0.7,"{'learning_rate': 0.1, 'max_depth': 2, 'n_esti...",0.752205,...,0.504540,0.006539,93,0.508507,0.508363,0.497967,0.507280,0.501121,0.504648,0.004306
2,2.034631,0.135207,0.033596,0.002123,0.05,2,10,0.9,"{'learning_rate': 0.05, 'max_depth': 2, 'n_est...",0.748250,...,0.495624,0.004895,94,0.495863,0.495903,0.494263,0.494492,0.497673,0.495639,0.001222
1,1.667932,0.032232,0.034986,0.004309,0.05,2,10,0.8,"{'learning_rate': 0.05, 'max_depth': 2, 'n_est...",0.748250,...,0.495624,0.004895,94,0.495863,0.495903,0.494263,0.494492,0.497673,0.495639,0.001222


In [30]:
report_opt_grad_boost_df.sort_values(by=['rank_test_accuracy'])

#So we conclude that lr=0.2, n_estim=100, subsample=0.8 and max_depth=5

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_learning_rate,param_max_depth,param_n_estimators,param_subsample,params,split0_test_accuracy,...,mean_test_f1,std_test_f1,rank_test_f1,split0_train_f1,split1_train_f1,split2_train_f1,split3_train_f1,split4_train_f1,mean_train_f1,std_train_f1
94,35.349153,0.533575,0.111097,0.002486,0.2,5,100,0.8,"{'learning_rate': 0.2, 'max_depth': 5, 'n_esti...",0.854295,...,0.792014,0.001983,1,0.808270,0.807573,0.809079,0.809286,0.804395,0.807720,0.001771
95,33.883182,5.569282,0.083052,0.023620,0.2,5,100,0.9,"{'learning_rate': 0.2, 'max_depth': 5, 'n_esti...",0.851729,...,0.789607,0.001201,2,0.805248,0.809963,0.807040,0.806354,0.802900,0.806301,0.002308
93,31.768985,0.458985,0.109215,0.001050,0.2,5,100,0.7,"{'learning_rate': 0.2, 'max_depth': 5, 'n_esti...",0.851729,...,0.788286,0.001582,3,0.806907,0.809207,0.805721,0.806634,0.802908,0.806275,0.002038
70,48.868341,1.728967,0.173005,0.024521,0.15,5,100,0.8,"{'learning_rate': 0.15, 'max_depth': 5, 'n_est...",0.848308,...,0.780033,0.002182,4,0.791596,0.793928,0.791268,0.791826,0.789814,0.791686,0.001322
69,41.023169,2.289100,0.136955,0.012478,0.15,5,100,0.7,"{'learning_rate': 0.15, 'max_depth': 5, 'n_est...",0.847400,...,0.778541,0.002921,5,0.793899,0.791272,0.793681,0.790981,0.791872,0.792341,0.001219
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26,1.904703,0.156968,0.034397,0.002300,0.1,2,10,0.9,"{'learning_rate': 0.1, 'max_depth': 2, 'n_esti...",0.752365,...,0.504571,0.006636,92,0.508665,0.508363,0.497847,0.507280,0.500881,0.504607,0.004411
24,1.724722,0.264839,0.035997,0.006703,0.1,2,10,0.7,"{'learning_rate': 0.1, 'max_depth': 2, 'n_esti...",0.752205,...,0.504540,0.006539,93,0.508507,0.508363,0.497967,0.507280,0.501121,0.504648,0.004306
2,2.034631,0.135207,0.033596,0.002123,0.05,2,10,0.9,"{'learning_rate': 0.05, 'max_depth': 2, 'n_est...",0.748250,...,0.495624,0.004895,94,0.495863,0.495903,0.494263,0.494492,0.497673,0.495639,0.001222
1,1.667932,0.032232,0.034986,0.004309,0.05,2,10,0.8,"{'learning_rate': 0.05, 'max_depth': 2, 'n_est...",0.748250,...,0.495624,0.004895,94,0.495863,0.495903,0.494263,0.494492,0.497673,0.495639,0.001222


In [5]:
# Gridsearch to determine the value of learning_rate, n_estimators, subsample and max_depth
param_grid = {
              'learning_rate':[0.2],
              'n_estimators':[10,100,200,300],
              'subsample':[0.8],
              'max_depth':(5,7,9,11,13,15)
             }

#RandomGridSearchCV if needed
grad_boostCV = GridSearchCV(grad_boost,param_grid,cv=5,scoring=['accuracy','f1'],
                             return_train_score=True,refit=False,n_jobs=-1)
grad_boostCV.fit(X_train,y_train)



GridSearchCV(cv=5, estimator=GradientBoostingClassifier(), n_jobs=-1,
             param_grid={'learning_rate': [0.2],
                         'max_depth': (5, 7, 9, 11, 13, 15),
                         'n_estimators': [10, 100, 200, 300],
                         'subsample': [0.8]},
             refit=False, return_train_score=True, scoring=['accuracy', 'f1'])

In [6]:
report_opt_grad_boost_df = pd.DataFrame(grad_boostCV.cv_results_)
report_opt_grad_boost_df.sort_values(by=['rank_test_f1'])

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_learning_rate,param_max_depth,param_n_estimators,param_subsample,params,split0_test_accuracy,...,mean_test_f1,std_test_f1,rank_test_f1,split0_train_f1,split1_train_f1,split2_train_f1,split3_train_f1,split4_train_f1,mean_train_f1,std_train_f1
15,252.660877,0.806545,0.465952,0.004717,0.2,11,300,0.8,"{'learning_rate': 0.2, 'max_depth': 11, 'n_est...",0.878668,...,0.836786,0.003371,1,0.995146,0.994865,0.994975,0.995201,0.994985,0.995034,0.000122
11,198.630946,0.809021,0.386275,0.00501,0.2,9,300,0.8,"{'learning_rate': 0.2, 'max_depth': 9, 'n_esti...",0.877439,...,0.835714,0.002749,2,0.987709,0.986861,0.986967,0.98658,0.987617,0.987147,0.000441
14,166.261324,0.293012,0.311046,0.004361,0.2,11,200,0.8,"{'learning_rate': 0.2, 'max_depth': 11, 'n_est...",0.88118,...,0.835526,0.002244,3,0.993164,0.992659,0.992656,0.99256,0.992663,0.992741,0.000215
18,205.896089,0.689119,0.38591,0.005358,0.2,13,200,0.8,"{'learning_rate': 0.2, 'max_depth': 13, 'n_est...",0.878882,...,0.835345,0.002637,4,0.995096,0.994865,0.994975,0.995257,0.995002,0.995039,0.000131
17,100.754434,0.577519,0.195047,0.002478,0.2,13,100,0.8,"{'learning_rate': 0.2, 'max_depth': 13, 'n_est...",0.877332,...,0.832981,0.002672,5,0.987021,0.98691,0.987036,0.986676,0.98896,0.987321,0.00083
19,311.774236,0.976776,0.575409,0.011531,0.2,13,300,0.8,"{'learning_rate': 0.2, 'max_depth': 13, 'n_est...",0.878401,...,0.8329,0.002002,6,0.99513,0.994913,0.994992,0.995273,0.995002,0.995062,0.000126
22,251.938431,1.282206,0.469477,0.017849,0.2,15,200,0.8,"{'learning_rate': 0.2, 'max_depth': 15, 'n_est...",0.877065,...,0.832789,0.002991,7,0.995095,0.99491,0.995011,0.995272,0.994986,0.995055,0.000124
21,123.352562,0.695087,0.236179,0.006382,0.2,15,100,0.8,"{'learning_rate': 0.2, 'max_depth': 15, 'n_est...",0.875408,...,0.832356,0.00371,8,0.994879,0.994697,0.994689,0.99481,0.994645,0.994744,8.7e-05
23,327.268461,64.003372,0.60729,0.103759,0.2,15,300,0.8,"{'learning_rate': 0.2, 'max_depth': 15, 'n_est...",0.875782,...,0.831601,0.002693,9,0.995148,0.994868,0.994993,0.995272,0.995004,0.995057,0.000139
10,130.391162,0.673063,0.261772,0.002992,0.2,9,200,0.8,"{'learning_rate': 0.2, 'max_depth': 9, 'n_esti...",0.876423,...,0.831089,0.001392,10,0.965107,0.963102,0.961523,0.964101,0.964023,0.963571,0.001205


In [7]:
report_opt_grad_boost_df.sort_values(by=['rank_test_accuracy'])

#So we conclude that lr=0.2, n_estim=300, subsample=0.8 and max_depth=11

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_learning_rate,param_max_depth,param_n_estimators,param_subsample,params,split0_test_accuracy,...,mean_test_f1,std_test_f1,rank_test_f1,split0_train_f1,split1_train_f1,split2_train_f1,split3_train_f1,split4_train_f1,mean_train_f1,std_train_f1
15,252.660877,0.806545,0.465952,0.004717,0.2,11,300,0.8,"{'learning_rate': 0.2, 'max_depth': 11, 'n_est...",0.878668,...,0.836786,0.003371,1,0.995146,0.994865,0.994975,0.995201,0.994985,0.995034,0.000122
18,205.896089,0.689119,0.38591,0.005358,0.2,13,200,0.8,"{'learning_rate': 0.2, 'max_depth': 13, 'n_est...",0.878882,...,0.835345,0.002637,4,0.995096,0.994865,0.994975,0.995257,0.995002,0.995039,0.000131
14,166.261324,0.293012,0.311046,0.004361,0.2,11,200,0.8,"{'learning_rate': 0.2, 'max_depth': 11, 'n_est...",0.88118,...,0.835526,0.002244,3,0.993164,0.992659,0.992656,0.99256,0.992663,0.992741,0.000215
11,198.630946,0.809021,0.386275,0.00501,0.2,9,300,0.8,"{'learning_rate': 0.2, 'max_depth': 9, 'n_esti...",0.877439,...,0.835714,0.002749,2,0.987709,0.986861,0.986967,0.98658,0.987617,0.987147,0.000441
22,251.938431,1.282206,0.469477,0.017849,0.2,15,200,0.8,"{'learning_rate': 0.2, 'max_depth': 15, 'n_est...",0.877065,...,0.832789,0.002991,7,0.995095,0.99491,0.995011,0.995272,0.994986,0.995055,0.000124
19,311.774236,0.976776,0.575409,0.011531,0.2,13,300,0.8,"{'learning_rate': 0.2, 'max_depth': 13, 'n_est...",0.878401,...,0.8329,0.002002,6,0.99513,0.994913,0.994992,0.995273,0.995002,0.995062,0.000126
21,123.352562,0.695087,0.236179,0.006382,0.2,15,100,0.8,"{'learning_rate': 0.2, 'max_depth': 15, 'n_est...",0.875408,...,0.832356,0.00371,8,0.994879,0.994697,0.994689,0.99481,0.994645,0.994744,8.7e-05
17,100.754434,0.577519,0.195047,0.002478,0.2,13,100,0.8,"{'learning_rate': 0.2, 'max_depth': 13, 'n_est...",0.877332,...,0.832981,0.002672,5,0.987021,0.98691,0.987036,0.986676,0.98896,0.987321,0.00083
23,327.268461,64.003372,0.60729,0.103759,0.2,15,300,0.8,"{'learning_rate': 0.2, 'max_depth': 15, 'n_est...",0.875782,...,0.831601,0.002693,9,0.995148,0.994868,0.994993,0.995272,0.995004,0.995057,0.000139
10,130.391162,0.673063,0.261772,0.002992,0.2,9,200,0.8,"{'learning_rate': 0.2, 'max_depth': 9, 'n_esti...",0.876423,...,0.831089,0.001392,10,0.965107,0.963102,0.961523,0.964101,0.964023,0.963571,0.001205


In [8]:
# Optimized model.
from sklearn.metrics import (classification_report,accuracy_score, precision_score, recall_score, f1_score)

grad_boost = GradientBoostingClassifier(learning_rate=0.2,n_estimators=300,subsample=0.8,max_depth=11)

grad_boost.fit(X_train,y_train)
y_pred = grad_boost.predict(X_test)

accuracy = accuracy_score(y_test,y_pred)
precision = precision_score(y_test,y_pred)
recall = recall_score(y_test,y_pred)
f1 = f1_score(y_test,y_pred)

print(accuracy,precision,recall,f1)

# report_dict = classification_report(y_test, y_pred, output_dict=True)
# report_opt_grad_boost_df = pd.DataFrame(report_dict)
# report_opt_grad_boost_df

0.8836910972376636 0.8722376524574327 0.8129431626336522 0.8415472445531866


In [9]:
data = [['accuracy', accuracy],['precision', precision],['recall', recall],['f1', f1]] 
report_opt_grad_boost_df = pd.DataFrame(data, columns = ['Metric', 'Value']) 
report_opt_grad_boost_df

Unnamed: 0,Metric,Value
0,accuracy,0.883691
1,precision,0.872238
2,recall,0.812943
3,f1,0.841547


In [10]:
report_opt_grad_boost_df.to_csv('results/optimized_grad_boost.csv',index=False)

### New Dataset with GB

#### 3 features 

In [49]:
import pandas as pd

df = pd.read_csv("datasets/dataset_preprocessed.csv") 
df

Unnamed: 0,is_canceled,lead_time,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,previous_cancellations,...,assigned_room_type_H,assigned_room_type_I,assigned_room_type_K,assigned_room_type_L,assigned_room_type_P,deposit_type_Non Refund,deposit_type_Refundable,customer_type_Group,customer_type_Transient,customer_type_Transient-Party
0,0,342,27,1,0,0,2,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,0,737,27,1,0,0,2,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,0,7,27,1,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,0,13,27,1,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,0,14,27,1,0,2,2,0,0,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
118721,0,23,35,30,2,5,2,0,0,0,...,0,0,0,0,0,0,0,0,1,0
118722,0,102,35,31,2,5,3,0,0,0,...,0,0,0,0,0,0,0,0,1,0
118723,0,34,35,31,2,5,2,0,0,0,...,0,0,0,0,0,0,0,0,1,0
118724,0,109,35,31,2,5,2,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [50]:
df["previous_cancellation_ratio"] = df["previous_cancellations"]/(df["previous_cancellations"]+df["previous_bookings_not_canceled"]+1e-5)
df["total_guests"] = df["adults"]+df["children"]+df["babies"]
df["total_nights"] = df["stays_in_week_nights"]+df["stays_in_weekend_nights"]
df

Unnamed: 0,is_canceled,lead_time,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,previous_cancellations,...,assigned_room_type_L,assigned_room_type_P,deposit_type_Non Refund,deposit_type_Refundable,customer_type_Group,customer_type_Transient,customer_type_Transient-Party,previous_cancellation_ratio,total_guests,total_nights
0,0,342,27,1,0,0,2,0,0,0,...,0,0,0,0,0,1,0,0.0,2,0
1,0,737,27,1,0,0,2,0,0,0,...,0,0,0,0,0,1,0,0.0,2,0
2,0,7,27,1,0,1,1,0,0,0,...,0,0,0,0,0,1,0,0.0,1,1
3,0,13,27,1,0,1,1,0,0,0,...,0,0,0,0,0,1,0,0.0,1,1
4,0,14,27,1,0,2,2,0,0,0,...,0,0,0,0,0,1,0,0.0,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
118721,0,23,35,30,2,5,2,0,0,0,...,0,0,0,0,0,1,0,0.0,2,7
118722,0,102,35,31,2,5,3,0,0,0,...,0,0,0,0,0,1,0,0.0,3,7
118723,0,34,35,31,2,5,2,0,0,0,...,0,0,0,0,0,1,0,0.0,2,7
118724,0,109,35,31,2,5,2,0,0,0,...,0,0,0,0,0,1,0,0.0,2,7


In [16]:
from sklearn.model_selection import train_test_split

labels = df.iloc[:,0]
features = df.iloc[:,1:]

X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size = 0.2, random_state = 0)
print(X_train.shape,X_test.shape, y_train.shape, y_test.shape)

(94980, 85) (23746, 85) (94980,) (23746,)


In [18]:
# Optimized model.
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import (classification_report,accuracy_score, precision_score, recall_score, f1_score)

grad_boost = GradientBoostingClassifier(learning_rate=0.2,n_estimators=300,subsample=0.8,max_depth=11)

grad_boost.fit(X_train,y_train)
y_pred = grad_boost.predict(X_test)

accuracy = accuracy_score(y_test,y_pred)
precision = precision_score(y_test,y_pred)
recall = recall_score(y_test,y_pred)
f1 = f1_score(y_test,y_pred)

print(accuracy,precision,recall,f1)

0.8862124147224796 0.8681775035595634 0.8213964975303099 0.8441393631748962


In [19]:
data = [['accuracy', accuracy],['precision', precision],['recall', recall],['f1', f1]] 
report_opt_grad_boost_df = pd.DataFrame(data, columns = ['Metric', 'Value']) 
report_opt_grad_boost_df

Unnamed: 0,Metric,Value
0,accuracy,0.886212
1,precision,0.868178
2,recall,0.821396
3,f1,0.844139


In [20]:
report_opt_grad_boost_df.to_csv('results/optimized_grad_boost_3_feats.csv',index=False)

#### 3 feats + children and babies mixed

In [51]:
col = df[['children','babies']].sum(axis=1)
df['children_babies'] = col
df

Unnamed: 0,is_canceled,lead_time,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,previous_cancellations,...,assigned_room_type_P,deposit_type_Non Refund,deposit_type_Refundable,customer_type_Group,customer_type_Transient,customer_type_Transient-Party,previous_cancellation_ratio,total_guests,total_nights,children_babies
0,0,342,27,1,0,0,2,0,0,0,...,0,0,0,0,1,0,0.0,2,0,0
1,0,737,27,1,0,0,2,0,0,0,...,0,0,0,0,1,0,0.0,2,0,0
2,0,7,27,1,0,1,1,0,0,0,...,0,0,0,0,1,0,0.0,1,1,0
3,0,13,27,1,0,1,1,0,0,0,...,0,0,0,0,1,0,0.0,1,1,0
4,0,14,27,1,0,2,2,0,0,0,...,0,0,0,0,1,0,0.0,2,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
118721,0,23,35,30,2,5,2,0,0,0,...,0,0,0,0,1,0,0.0,2,7,0
118722,0,102,35,31,2,5,3,0,0,0,...,0,0,0,0,1,0,0.0,3,7,0
118723,0,34,35,31,2,5,2,0,0,0,...,0,0,0,0,1,0,0.0,2,7,0
118724,0,109,35,31,2,5,2,0,0,0,...,0,0,0,0,1,0,0.0,2,7,0


In [22]:
from sklearn.model_selection import train_test_split

labels = df.iloc[:,0]
features = df.iloc[:,1:]

X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size = 0.2, random_state = 0)
print(X_train.shape,X_test.shape, y_train.shape, y_test.shape)

(94980, 86) (23746, 86) (94980,) (23746,)


In [23]:
# Optimized model.
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import (classification_report,accuracy_score, precision_score, recall_score, f1_score)

grad_boost = GradientBoostingClassifier(learning_rate=0.2,n_estimators=300,subsample=0.8,max_depth=11)

grad_boost.fit(X_train,y_train)
y_pred = grad_boost.predict(X_test)

accuracy = accuracy_score(y_test,y_pred)
precision = precision_score(y_test,y_pred)
recall = recall_score(y_test,y_pred)
f1 = f1_score(y_test,y_pred)

print(accuracy,precision,recall,f1)

0.8858334035205929 0.8660366213821619 0.822968118545128 0.8439532608069994


In [24]:
data = [['accuracy', accuracy],['precision', precision],['recall', recall],['f1', f1]] 
report_opt_grad_boost_df = pd.DataFrame(data, columns = ['Metric', 'Value']) 
report_opt_grad_boost_df

Unnamed: 0,Metric,Value
0,accuracy,0.885833
1,precision,0.866037
2,recall,0.822968
3,f1,0.843953


In [25]:
report_opt_grad_boost_df.to_csv('results/optimized_grad_boost_3_feats_child_babies.csv',index=False)

#### 3 feats + children and babies mixed + reserved room type dropped

In [52]:
df = df.drop(['reserved_room_type_B','reserved_room_type_C','reserved_room_type_D','reserved_room_type_E'
             ,'reserved_room_type_F','reserved_room_type_G','reserved_room_type_H','reserved_room_type_L'
             ,'reserved_room_type_P'],axis=1)
df

Unnamed: 0,is_canceled,lead_time,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,previous_cancellations,...,assigned_room_type_P,deposit_type_Non Refund,deposit_type_Refundable,customer_type_Group,customer_type_Transient,customer_type_Transient-Party,previous_cancellation_ratio,total_guests,total_nights,children_babies
0,0,342,27,1,0,0,2,0,0,0,...,0,0,0,0,1,0,0.0,2,0,0
1,0,737,27,1,0,0,2,0,0,0,...,0,0,0,0,1,0,0.0,2,0,0
2,0,7,27,1,0,1,1,0,0,0,...,0,0,0,0,1,0,0.0,1,1,0
3,0,13,27,1,0,1,1,0,0,0,...,0,0,0,0,1,0,0.0,1,1,0
4,0,14,27,1,0,2,2,0,0,0,...,0,0,0,0,1,0,0.0,2,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
118721,0,23,35,30,2,5,2,0,0,0,...,0,0,0,0,1,0,0.0,2,7,0
118722,0,102,35,31,2,5,3,0,0,0,...,0,0,0,0,1,0,0.0,3,7,0
118723,0,34,35,31,2,5,2,0,0,0,...,0,0,0,0,1,0,0.0,2,7,0
118724,0,109,35,31,2,5,2,0,0,0,...,0,0,0,0,1,0,0.0,2,7,0


In [28]:
from sklearn.model_selection import train_test_split

labels = df.iloc[:,0]
features = df.iloc[:,1:]

X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size = 0.2, random_state = 0)
print(X_train.shape,X_test.shape, y_train.shape, y_test.shape)

(94980, 77) (23746, 77) (94980,) (23746,)


In [29]:
# Optimized model.
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import (classification_report,accuracy_score, precision_score, recall_score, f1_score)

grad_boost = GradientBoostingClassifier(learning_rate=0.2,n_estimators=300,subsample=0.8,max_depth=11)

grad_boost.fit(X_train,y_train)
y_pred = grad_boost.predict(X_test)

accuracy = accuracy_score(y_test,y_pred)
precision = precision_score(y_test,y_pred)
recall = recall_score(y_test,y_pred)
f1 = f1_score(y_test,y_pred)

print(accuracy,precision,recall,f1)

0.8855386170302366 0.8660122989593189 0.8220700493938033 0.8434692467173462


In [30]:
data = [['accuracy', accuracy],['precision', precision],['recall', recall],['f1', f1]] 
report_opt_grad_boost_df = pd.DataFrame(data, columns = ['Metric', 'Value']) 
report_opt_grad_boost_df

Unnamed: 0,Metric,Value
0,accuracy,0.885539
1,precision,0.866012
2,recall,0.82207
3,f1,0.843469


In [31]:
report_opt_grad_boost_df.to_csv('results/optimized_grad_boost_3_feats_child_babies_reser_dropped.csv',index=False)

#### 3 feats + children and babies mixed + reserved room type dropped + indicator if reserved and assigned room types are different + is family indicator and drop adults and children_babies

In [54]:
import numpy as np

conditions = [(df['adults'] > 0) & (df['children_babies'] > 0)]
choices = [1]
col = np.select(conditions, choices, default=0)

df = df.drop(['adults','children','babies','children_babies'],axis=1)
df['is_family'] = col
df

Unnamed: 0,is_canceled,lead_time,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,previous_cancellations,previous_bookings_not_canceled,booking_changes,days_in_waiting_list,...,assigned_room_type_P,deposit_type_Non Refund,deposit_type_Refundable,customer_type_Group,customer_type_Transient,customer_type_Transient-Party,previous_cancellation_ratio,total_guests,total_nights,is_family
0,0,342,27,1,0,0,0,0,3,0,...,0,0,0,0,1,0,0.0,2,0,0
1,0,737,27,1,0,0,0,0,4,0,...,0,0,0,0,1,0,0.0,2,0,0
2,0,7,27,1,0,1,0,0,0,0,...,0,0,0,0,1,0,0.0,1,1,0
3,0,13,27,1,0,1,0,0,0,0,...,0,0,0,0,1,0,0.0,1,1,0
4,0,14,27,1,0,2,0,0,0,0,...,0,0,0,0,1,0,0.0,2,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
118721,0,23,35,30,2,5,0,0,0,0,...,0,0,0,0,1,0,0.0,2,7,0
118722,0,102,35,31,2,5,0,0,0,0,...,0,0,0,0,1,0,0.0,3,7,0
118723,0,34,35,31,2,5,0,0,0,0,...,0,0,0,0,1,0,0.0,2,7,0
118724,0,109,35,31,2,5,0,0,0,0,...,0,0,0,0,1,0,0.0,2,7,0


In [55]:
from sklearn.model_selection import train_test_split

labels = df.iloc[:,0]
features = df.iloc[:,1:]

X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size = 0.2, random_state = 0)
print(X_train.shape,X_test.shape, y_train.shape, y_test.shape)

(94980, 74) (23746, 74) (94980,) (23746,)


In [56]:
# Optimized model.
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import (classification_report,accuracy_score, precision_score, recall_score, f1_score)

grad_boost = GradientBoostingClassifier(learning_rate=0.2,n_estimators=300,subsample=0.8,max_depth=11)

grad_boost.fit(X_train,y_train)
y_pred = grad_boost.predict(X_test)

accuracy = accuracy_score(y_test,y_pred)
precision = precision_score(y_test,y_pred)
recall = recall_score(y_test,y_pred)
f1 = f1_score(y_test,y_pred)

print(accuracy,precision,recall,f1)

0.8854543923187063 0.8643429109750354 0.8239784463403682 0.8436781609195403


In [57]:
data = [['accuracy', accuracy],['precision', precision],['recall', recall],['f1', f1]] 
report_opt_grad_boost_df = pd.DataFrame(data, columns = ['Metric', 'Value']) 
report_opt_grad_boost_df

Unnamed: 0,Metric,Value
0,accuracy,0.885454
1,precision,0.864343
2,recall,0.823978
3,f1,0.843678


In [58]:
report_opt_grad_boost_df.to_csv('results/optimized_grad_boost_3_feats_reser_dropped_is_family.csv',index=False)