In [1]:
import numpy as np
import pandas as pd

#Display all the columns of the dataframe
pd.pandas.set_option('display.max_columns', None)

from warnings import filterwarnings
filterwarnings('ignore')

In [2]:
df = pd.read_csv('C:\\Users\\yasha\\OneDrive\\Desktop\\Deployment\\bookings_cleaned.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,hotel,is_canceled,lead_time,country,market_segment,distribution_channel,is_repeated_guest,booking_changes,deposit_type,agent,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,guests,room,net_canceled
0,0,0,0,296.0,135,3,1,0,3,0,0,0,2,0.0,0,0,2,1,0
1,1,0,0,296.0,135,3,1,0,4,0,0,0,2,0.0,0,0,2,1,0
2,2,0,0,7.0,59,3,1,0,0,0,0,0,2,75.0,0,0,1,0,0
3,3,0,0,13.0,59,2,0,0,0,0,304,0,2,75.0,0,0,1,1,0
4,4,0,0,14.0,59,6,3,0,0,0,240,0,2,98.0,0,1,2,1,0


In [4]:
df.columns

Index(['Unnamed: 0', 'hotel', 'is_canceled', 'lead_time', 'country',
       'market_segment', 'distribution_channel', 'is_repeated_guest',
       'booking_changes', 'deposit_type', 'agent', 'days_in_waiting_list',
       'customer_type', 'adr', 'required_car_parking_spaces',
       'total_of_special_requests', 'guests', 'room', 'net_canceled'],
      dtype='object')

In [6]:
df.drop('Unnamed: 0', axis=1, inplace=True)
df.head()

Unnamed: 0,hotel,is_canceled,lead_time,country,market_segment,distribution_channel,is_repeated_guest,booking_changes,deposit_type,agent,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,guests,room,net_canceled
0,0,0,296.0,135,3,1,0,3,0,0,0,2,0.0,0,0,2,1,0
1,0,0,296.0,135,3,1,0,4,0,0,0,2,0.0,0,0,2,1,0
2,0,0,7.0,59,3,1,0,0,0,0,0,2,75.0,0,0,1,0,0
3,0,0,13.0,59,2,0,0,0,0,304,0,2,75.0,0,0,1,1,0
4,0,0,14.0,59,6,3,0,0,0,240,0,2,98.0,0,1,2,1,0


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87204 entries, 0 to 87203
Data columns (total 18 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   hotel                        87204 non-null  int64  
 1   is_canceled                  87204 non-null  int64  
 2   lead_time                    87204 non-null  float64
 3   country                      87204 non-null  int64  
 4   market_segment               87204 non-null  int64  
 5   distribution_channel         87204 non-null  int64  
 6   is_repeated_guest            87204 non-null  int64  
 7   booking_changes              87204 non-null  int64  
 8   deposit_type                 87204 non-null  int64  
 9   agent                        87204 non-null  int64  
 10  days_in_waiting_list         87204 non-null  int64  
 11  customer_type                87204 non-null  int64  
 12  adr                          87204 non-null  float64
 13  required_car_par

In [9]:
df.isnull().sum()

hotel                          0
is_canceled                    0
lead_time                      0
country                        0
market_segment                 0
distribution_channel           0
is_repeated_guest              0
booking_changes                0
deposit_type                   0
agent                          0
days_in_waiting_list           0
customer_type                  0
adr                            0
required_car_parking_spaces    0
total_of_special_requests      0
guests                         0
room                           0
net_canceled                   0
dtype: int64

## Handling Imbalance

In [10]:
X = df.drop('is_canceled', axis='columns')
y = df['is_canceled']
X.shape, y.shape

((87204, 17), (87204,))

In [11]:
#Divide by class
df_class_0 = df[df['is_canceled'] == 0]
df_class_1 = df[df['is_canceled'] == 1]
print(df_class_0.shape, df_class_1.shape)

(63196, 18) (24008, 18)


In [12]:
from imblearn.combine import SMOTETomek
smk = SMOTETomek(random_state=42, sampling_strategy = 0.5)
X_res, y_res = smk.fit_resample(X,y)

In [13]:
X_res.shape, y_res.shape

((85646, 17), (85646,))

## Train-Test Split

In [14]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size = 0.3, random_state = 15, stratify=y_res)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(59952, 17) (25694, 17) (59952,) (25694,)


## XGBoost Model

In [15]:
## Hyper Parameter Optimization
params={
 "learning_rate"    : [0.05, 0.10, 0.15, 0.20, 0.25, 0.30 ] ,
 "max_depth"        : [ 3, 4, 5, 6, 8, 10, 12, 15],
 "min_child_weight" : [ 1, 3, 5, 7 ],
 "gamma"            : [ 0.0, 0.1, 0.2 , 0.3, 0.4 ],
 "colsample_bytree" : [ 0.3, 0.4, 0.5 , 0.7 ]    
}

In [16]:
## Hyperparameter optimization using RandomizedSearchCV
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
import xgboost

In [17]:
model = xgboost.XGBClassifier()

In [18]:
random_search = RandomizedSearchCV(model, param_distributions=params, n_iter=5, scoring='roc_auc', n_jobs=-1,cv=5,verbose=3)

In [19]:
random_search.fit(X_train, y_train)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


RandomizedSearchCV(cv=5,
                   estimator=XGBClassifier(base_score=None, booster=None,
                                           colsample_bylevel=None,
                                           colsample_bynode=None,
                                           colsample_bytree=None,
                                           enable_categorical=False, gamma=None,
                                           gpu_id=None, importance_type=None,
                                           interaction_constraints=None,
                                           learning_rate=None,
                                           max_delta_step=None, max_depth=None,
                                           min_child_weight=None, missing=nan,
                                           monotone_constraints...
                                           reg_alpha=None, reg_lambda=None,
                                           scale_pos_weight=None,
                                       

In [20]:
random_search.best_estimator_

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.4,
              enable_categorical=False, gamma=0.2, gpu_id=-1,
              importance_type=None, interaction_constraints='',
              learning_rate=0.2, max_delta_step=0, max_depth=12,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=12, num_parallel_tree=1,
              predictor='auto', random_state=0, reg_alpha=0, reg_lambda=1,
              scale_pos_weight=1, subsample=1, tree_method='exact',
              validate_parameters=1, verbosity=None)

In [21]:
model = xgboost.XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.4,
              enable_categorical=False, gamma=0.2, gpu_id=-1,
              importance_type=None, interaction_constraints='',
              learning_rate=0.2, max_delta_step=0, max_depth=12,
              min_child_weight=1, monotone_constraints='()',
              n_estimators=100, n_jobs=12, num_parallel_tree=1,
              predictor='auto', random_state=0, reg_alpha=0, reg_lambda=1,
              scale_pos_weight=1, subsample=1, tree_method='exact',
              validate_parameters=1, verbosity=None)

In [22]:
model.fit(X_train, y_train)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.4,
              enable_categorical=False, gamma=0.2, gpu_id=-1,
              importance_type=None, interaction_constraints='',
              learning_rate=0.2, max_delta_step=0, max_depth=12,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=12, num_parallel_tree=1,
              predictor='auto', random_state=0, reg_alpha=0, reg_lambda=1,
              scale_pos_weight=1, subsample=1, tree_method='exact',
              validate_parameters=1, verbosity=None)

In [23]:
model.score(X_train, y_train)

0.9194021884174006

In [24]:
model.score(X_test, y_test)

0.8745232349964972

In [26]:
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import accuracy_score
y_pred = model.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[16159  1428]
 [ 1796  6311]]
0.8745232349964972
              precision    recall  f1-score   support

           0       0.90      0.92      0.91     17587
           1       0.82      0.78      0.80      8107

    accuracy                           0.87     25694
   macro avg       0.86      0.85      0.85     25694
weighted avg       0.87      0.87      0.87     25694



In [27]:
from sklearn.model_selection import cross_val_score
score=cross_val_score(model,X_train, y_train,cv=10)



In [28]:
score

array([0.86440961, 0.87525017, 0.86638866, 0.86738949, 0.86271893,
       0.87122602, 0.8707256 , 0.87155963, 0.87206005, 0.86588824])

In [29]:
score.mean()

0.868761640370756