In [13]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_validate
from sklearn.preprocessing import StandardScaler, PowerTransformer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from category_encoders import JamesSteinEncoder
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import classification_report
from feature_engine.encoding import RareLabelEncoder
import joblib as jb

In [67]:
df = pd.read_csv('hotel_bookings.csv')

In [68]:
df = df[~((df['arrival_date_year'] == 2015) & (df['arrival_date_month'] == 'July') | (df['arrival_date_year'] == 2017) &
         (df['arrival_date_month'] == 'August'))]
df.shape

(111689, 32)

In [69]:
df = df[~((df['children'] == 0) & (df['adults'] == 0) & (df['babies'] == 0))]
df.shape

(111513, 32)

In [70]:
df.drop(columns= ['company', 'agent', 'reservation_status_date', 'assigned_room_type', 'reservation_status',
                  'arrival_date_year', 'arrival_date_month', 'arrival_date_day_of_month'], 
        inplace= True)
df.shape

(111513, 24)

In [71]:
df.dropna(inplace= True)
df.shape

(111044, 24)

In [72]:
df.duplicated().sum()

31653

In [73]:
df['is_canceled'].value_counts(normalize= True).mul(100).round(2)

0    63.02
1    36.98
Name: is_canceled, dtype: float64

In [75]:
df_categorical = df.select_dtypes('object')
df_numerical = df.select_dtypes('number')

Unnamed: 0,hotel,meal,country,market_segment,distribution_channel,reserved_room_type,deposit_type,customer_type
82162,City Hotel,BB,PRT,Corporate,Corporate,A,No Deposit,Transient-Party
12953,Resort Hotel,HB,PRT,Offline TA/TO,TA/TO,E,No Deposit,Contract
31360,Resort Hotel,HB,GBR,Direct,Direct,A,No Deposit,Transient
115771,City Hotel,BB,GBR,Online TA,TA/TO,F,No Deposit,Transient
63645,City Hotel,BB,PRT,Online TA,TA/TO,G,No Deposit,Transient


In [76]:
df_categorical.shape

(111044, 8)

In [78]:
df_numerical.shape

(111044, 16)

In [79]:
X = df.drop(['is_canceled'], axis= 1)
y = df['is_canceled']

categorical_variables = X.select_dtypes('object').columns.to_list()
numerical_variables = X.select_dtypes('number').columns.to_list()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.33, random_state= 4)

In [81]:
df['country'].value_counts(dropna=False, normalize= True).shape

(175,)

In [82]:
df['country'].value_counts(dropna=False, normalize= True)[0:22].sum()

0.9496055617593026

In [83]:
df['country'].value_counts(dropna=False, normalize= True)[0:22].tail()

ISR    0.005809
RUS    0.005286
NOR    0.005169
ROU    0.004097
FIN    0.003908
Name: country, dtype: float64

In [84]:
rare_encoder = RareLabelEncoder(tol= 0.003900, variables= ['country'], replace_with= 'Other Country')
rare_encoder.fit(X_train)
X_train = rare_encoder.transform(X_train)
X_test = rare_encoder.transform(X_test)

X_train['country'].value_counts(dropna=False, normalize= True).shape

(23,)

In [85]:
X_train['country'].value_counts(dropna=False, normalize= True)

PRT              0.404468
GBR              0.103536
FRA              0.088509
ESP              0.070565
DEU              0.062958
Other Country    0.051063
ITA              0.030780
IRL              0.028871
BEL              0.019973
BRA              0.019046
NLD              0.018280
USA              0.016694
CHE              0.014422
CN               0.010833
AUT              0.010753
SWE              0.008710
CHN              0.008683
POL              0.008038
ISR              0.005497
RUS              0.005282
NOR              0.004839
ROU              0.004113
FIN              0.004086
Name: country, dtype: float64

In [86]:
X_train[numerical_variables].head()

Unnamed: 0,lead_time,arrival_date_week_number,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,booking_changes,days_in_waiting_list,adr,required_car_parking_spaces,total_of_special_requests
65992,121,15,0,4,3,0.0,0,0,0,0,0,0,173.25,0,0
110968,175,18,1,3,2,0.0,0,0,0,0,0,0,99.0,0,2
14150,4,50,0,1,1,0.0,0,0,0,5,0,0,27.0,0,0
116903,88,30,1,2,2,0.0,0,0,0,0,0,0,125.0,0,1
42135,10,36,2,5,2,0.0,0,0,0,0,0,0,85.0,0,0


In [87]:
categorical_variables

['hotel',
 'meal',
 'country',
 'market_segment',
 'distribution_channel',
 'reserved_room_type',
 'deposit_type',
 'customer_type']

In [88]:
country_variable = [categorical_variables[2]]
country_variable

['country']

In [90]:
other_variables = []
for i in categorical_variables:
    if i not in ['country', 'reserved_room_type']:
        other_variables.append(i)
    else:
        pass
        
other_variables

['hotel',
 'meal',
 'market_segment',
 'distribution_channel',
 'deposit_type',
 'customer_type']

### GaussianNB

In [24]:
transformer = ColumnTransformer([('jmss', JamesSteinEncoder(), categorical_variables),
                                 ('pwr', PowerTransformer(), numerical_variables ),
                                 ('stdscaler', StandardScaler(), numerical_variables)])

gnb = GaussianNB(var_smoothing= 1.5)

pipe = Pipeline([('transformer', transformer), ('model', gnb)])

gnb_cv = cross_validate(pipe, X_train, y_train, cv= 10, n_jobs= -1, 
                        scoring= ['f1', 'precision', 'recall'], return_train_score= True)

In [25]:
gnb_cv_df = pd.DataFrame(gnb_cv)
gnb_cv_df.mean()

fit_time           3.016941
score_time         0.102267
test_f1            0.662546
train_f1           0.662266
test_precision     0.582817
train_precision    0.582456
test_recall        0.767781
train_recall       0.767551
dtype: float64

In [26]:
pipe.fit(X_train, y_train)

print(classification_report(y_test, pipe.predict(X_test)))

              precision    recall  f1-score   support

           0       0.84      0.68      0.75     23067
           1       0.59      0.78      0.67     13578

    accuracy                           0.71     36645
   macro avg       0.71      0.73      0.71     36645
weighted avg       0.74      0.71      0.72     36645



In [27]:
jb.dump(value= pipe, filename= 'gaussian_nb_model')

['gaussian_nb_model']

### AdaBoostClassifier | Estimador Base: Gaussian Naive Bayes

In [64]:
transformer = ColumnTransformer([('jmss', JamesSteinEncoder(), categorical_variables),
                                 ('pwr', PowerTransformer(), numerical_variables ),
                                 ('stdscaler', StandardScaler(), numerical_variables)])

ada_gnb = AdaBoostClassifier(GaussianNB(var_smoothing= 1.5), n_estimators= 100, random_state=4)

pipe = Pipeline([('transformer', transformer), ('model', ada_gnb)])

param_grid = {'model__learning_rate': np.arange(1, 1.5, 0.05)}   
    
rscv_ada_gnb = RandomizedSearchCV(pipe, param_grid, cv= 10, n_jobs= -1, scoring= ['recall', 'precision', 'f1', 'roc_auc'], 
                             refit= 'f1', return_train_score= True, random_state= 4)

rscv_ada_gnb.fit(X_train, y_train)

best_results = pd.DataFrame(rscv_ada_gnb.cv_results_)

In [65]:
pd.set_option('max.colwidth', None)
best_results[['params','mean_train_recall', 'mean_test_recall', 'mean_train_precision', 'mean_test_precision', 
              'mean_train_f1', 'mean_test_f1', 'mean_train_roc_auc', 'mean_test_roc_auc']].sort_values(
                                                                                'mean_test_f1', ascending= False).round(2)

Unnamed: 0,params,mean_train_recall,mean_test_recall,mean_train_precision,mean_test_precision,mean_train_f1,mean_test_f1,mean_train_roc_auc,mean_test_roc_auc
1,{'model__learning_rate': 1.05},0.64,0.64,0.41,0.41,0.44,0.44,0.63,0.63
3,{'model__learning_rate': 1.1500000000000001},0.64,0.64,0.43,0.44,0.42,0.42,0.6,0.6
8,{'model__learning_rate': 1.4000000000000004},0.55,0.55,0.37,0.37,0.37,0.37,0.54,0.54
9,{'model__learning_rate': 1.4500000000000004},0.52,0.52,0.46,0.45,0.36,0.36,0.57,0.57
5,{'model__learning_rate': 1.2500000000000002},0.46,0.47,0.49,0.5,0.36,0.36,0.64,0.64
2,{'model__learning_rate': 1.1},0.4,0.4,0.49,0.39,0.31,0.31,0.6,0.6
0,{'model__learning_rate': 1.0},0.46,0.45,0.5,0.51,0.3,0.3,0.62,0.62
7,{'model__learning_rate': 1.3500000000000003},0.41,0.41,0.41,0.42,0.29,0.29,0.57,0.57
6,{'model__learning_rate': 1.3000000000000003},0.42,0.42,0.47,0.47,0.29,0.29,0.59,0.59
4,{'model__learning_rate': 1.2000000000000002},0.31,0.31,0.4,0.41,0.27,0.28,0.55,0.56


In [66]:
print(rscv_ada_gnb.best_params_)
print('')

y_pred = rscv_ada_gnb.predict(X_test)
print(classification_report(y_test, y_pred))

{'model__learning_rate': 1.05}

              precision    recall  f1-score   support

           0       0.81      0.53      0.64     23067
           1       0.50      0.79      0.61     13578

    accuracy                           0.63     36645
   macro avg       0.65      0.66      0.63     36645
weighted avg       0.69      0.63      0.63     36645



In [69]:
jb.dump(value= rscv_ada_gnb, filename= 'ada_gnb_model')

['ada_gnb_model']