In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from category_encoders import JamesSteinEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from feature_engine.encoding import RareLabelEncoder
import joblib as jb

In [67]:
# Load the raw bookings table from 'hotel_bookings.csv' (path is relative to
# the notebook's working directory).
df = pd.read_csv('hotel_bookings.csv')

In [68]:
# Exclude bookings whose arrival falls in July 2015 or in August 2017.
is_july_2015 = (df['arrival_date_year'] == 2015) & (df['arrival_date_month'] == 'July')
is_august_2017 = (df['arrival_date_year'] == 2017) & (df['arrival_date_month'] == 'August')
df = df[~(is_july_2015 | is_august_2017)]
df.shape

(111689, 32)

In [69]:
# Remove bookings that list no guests at all: zero children, zero adults and
# zero babies at the same time.
has_no_guests = (df['children'] == 0) & (df['adults'] == 0) & (df['babies'] == 0)
df = df[~has_no_guests]
df.shape

(111513, 32)

In [70]:
# Discard the columns that are not carried forward into the modelling frame.
columns_to_drop = [
    'company',
    'agent',
    'reservation_status_date',
    'assigned_room_type',
    'reservation_status',
    'arrival_date_year',
    'arrival_date_month',
    'arrival_date_day_of_month',
]
df = df.drop(columns=columns_to_drop)
df.shape

(111513, 24)

In [71]:
# Keep only the rows that have no missing value in any remaining column.
df = df.dropna()
df.shape

(111044, 24)

In [72]:
# Count rows that repeat an earlier row across all remaining columns.
# The duplicates are only counted here; no visible cell removes them.
df.duplicated().sum()

31653

In [73]:
# Class balance of the target, expressed as percentages rounded to two decimals.
df['is_canceled'].value_counts(normalize= True).mul(100).round(2)

0    63.02
1    36.98
Name: is_canceled, dtype: float64

In [75]:
# Two views of the frame by dtype: object (string) columns and numeric columns.
df_numerical = df.select_dtypes(include='number')
df_categorical = df.select_dtypes(include='object')

Unnamed: 0,hotel,meal,country,market_segment,distribution_channel,reserved_room_type,deposit_type,customer_type
82162,City Hotel,BB,PRT,Corporate,Corporate,A,No Deposit,Transient-Party
12953,Resort Hotel,HB,PRT,Offline TA/TO,TA/TO,E,No Deposit,Contract
31360,Resort Hotel,HB,GBR,Direct,Direct,A,No Deposit,Transient
115771,City Hotel,BB,GBR,Online TA,TA/TO,F,No Deposit,Transient
63645,City Hotel,BB,PRT,Online TA,TA/TO,G,No Deposit,Transient


In [76]:
# (rows, columns) of the object-dtype subset.
df_categorical.shape

(111044, 8)

In [78]:
# (rows, columns) of the numeric subset; this still includes the target column.
df_numerical.shape

(111044, 16)

In [79]:
# Target vector and predictor matrix.
y = df['is_canceled']
X = df.drop(columns=['is_canceled'])

# Predictor names grouped by dtype; later cells use these lists to route
# columns to the matching encoder or scaler.
categorical_variables = X.select_dtypes(include='object').columns.tolist()
numerical_variables = X.select_dtypes(include='number').columns.tolist()

# Hold out 33% of the rows for testing; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=4,
)

In [81]:
# Number of distinct country values (dropna=False would count missing values as one more entry).
df['country'].value_counts(dropna=False, normalize= True).shape

(175,)

In [82]:
# Cumulative share of bookings covered by the 22 most frequent countries.
df['country'].value_counts(dropna=False, normalize=True).head(22).sum()

0.9496055617593026

In [83]:
# The five least frequent countries among the top 22, with their shares.
df['country'].value_counts(dropna=False, normalize=True).head(22).tail()

ISR    0.005809
RUS    0.005286
NOR    0.005169
ROU    0.004097
FIN    0.003908
Name: country, dtype: float64

In [84]:
# Collapse infrequent countries into a single 'Other Country' label.
# The tolerance sits just below the share of the 22nd most frequent country
# seen in the full frame (FIN, 0.003908). The encoder is fitted on the
# training split only and then applied to both splits.
rare_encoder = RareLabelEncoder(
    tol=0.0039,
    variables=['country'],
    replace_with='Other Country',
).fit(X_train)

X_train = rare_encoder.transform(X_train)
X_test = rare_encoder.transform(X_test)

# Number of country categories left in the training split after grouping.
X_train['country'].value_counts(dropna=False, normalize=True).shape

(23,)

In [85]:
# Relative frequency of each country label in the training split after rare-label grouping.
X_train['country'].value_counts(dropna=False, normalize= True)

PRT              0.404468
GBR              0.103536
FRA              0.088509
ESP              0.070565
DEU              0.062958
Other Country    0.051063
ITA              0.030780
IRL              0.028871
BEL              0.019973
BRA              0.019046
NLD              0.018280
USA              0.016694
CHE              0.014422
CN               0.010833
AUT              0.010753
SWE              0.008710
CHN              0.008683
POL              0.008038
ISR              0.005497
RUS              0.005282
NOR              0.004839
ROU              0.004113
FIN              0.004086
Name: country, dtype: float64

In [86]:
# Preview the first rows of the numerical predictors in the training split.
X_train[numerical_variables].head()

Unnamed: 0,lead_time,arrival_date_week_number,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,booking_changes,days_in_waiting_list,adr,required_car_parking_spaces,total_of_special_requests
65992,121,15,0,4,3,0.0,0,0,0,0,0,0,173.25,0,0
110968,175,18,1,3,2,0.0,0,0,0,0,0,0,99.0,0,2
14150,4,50,0,1,1,0.0,0,0,0,5,0,0,27.0,0,0
116903,88,30,1,2,2,0.0,0,0,0,0,0,0,125.0,0,1
42135,10,36,2,5,2,0.0,0,0,0,0,0,0,85.0,0,0


In [87]:
# Display the names of the categorical predictors.
categorical_variables

['hotel',
 'meal',
 'country',
 'market_segment',
 'distribution_channel',
 'reserved_room_type',
 'deposit_type',
 'customer_type']

In [88]:
# The country column is encoded separately from the other categoricals, so it
# lives in its own single-item list. It is selected by name rather than by
# position so the result does not depend on the column order of the frame.
country_variable = ['country']
country_variable

['country']

In [90]:
# Categorical columns handled as one group: every categorical predictor except
# 'country' (kept in its own list) and 'reserved_room_type'.
# NOTE(review): 'reserved_room_type' is not placed in any other column group in
# the visible cells, so with ColumnTransformer's default remainder it would not
# reach the model. Confirm that this exclusion is intentional.
other_variables = [
    name
    for name in categorical_variables
    if name not in ('country', 'reserved_room_type')
]

other_variables

['hotel',
 'meal',
 'market_segment',
 'distribution_channel',
 'deposit_type',
 'customer_type']

### Random Forest

#### OHE

In [26]:
# Show the complete ``params`` dictionaries when the search results are displayed.
pd.set_option('max_colwidth', None)

# Preprocessing: one-hot encode both categorical groups (country included) and
# standardise the numerical columns.
other_variables_pipe = Pipeline(steps=[
    ('encoder_other_variables', OneHotEncoder(handle_unknown='ignore')),
])
country_pipe = Pipeline(steps=[
    ('encoder_country', OneHotEncoder(handle_unknown='ignore')),
])
num_pipe = Pipeline(steps=[
    ('num', StandardScaler()),
])

transformer = ColumnTransformer(transformers=[
    ('other_variables_transf', other_variables_pipe, other_variables),
    ('country_transf', country_pipe, country_variable),
    ('num_transf', num_pipe, numerical_variables),
])

# Full modelling pipeline: preprocessing followed by a random forest.
rf = RandomForestClassifier(n_estimators=100, random_state=4)
pipe = Pipeline(steps=[
    ('transformer', transformer),
    ('model', rf),
])

# Search space: minimum leaf size 1..10 and complementary class weights, where
# class 0 gets x from np.arange(0.2, 0.8, 0.005) and class 1 gets 1 - x.
weights = np.arange(0.2, 0.8, 0.005)
param_grid = {
    'model__min_samples_leaf': list(range(1, 11)),
    'model__class_weight': [{0: x, 1: 1 - x} for x in weights],
}

# Randomised search with 10-fold CV; several metrics are tracked and the final
# model is refit on the configuration with the best F1.
rscv_rf = RandomizedSearchCV(
    pipe,
    param_grid,
    scoring=['recall', 'precision', 'f1', 'roc_auc'],
    refit='f1',
    cv=10,
    n_jobs=-1,
    return_train_score=True,
    random_state=4,
)
rscv_rf.fit(X_train, y_train)

best_results = pd.DataFrame(rscv_rf.cv_results_)

In [27]:
# Mean train/test score per metric for every sampled configuration, ordered by
# the cross-validated F1 (best first).
score_columns = [
    f'mean_{split}_{metric}'
    for metric in ('recall', 'precision', 'f1', 'roc_auc')
    for split in ('train', 'test')
]
(
    best_results[['params'] + score_columns]
    .sort_values('mean_test_f1', ascending=False)
    .round(2)
)

Unnamed: 0,params,mean_train_recall,mean_test_recall,mean_train_precision,mean_test_precision,mean_train_f1,mean_test_f1,mean_train_roc_auc,mean_test_roc_auc
4,"{'model__min_samples_leaf': 1, 'model__class_weight': {0: 0.38000000000000017, 1: 0.6199999999999999}}",1.0,0.79,0.99,0.87,0.99,0.83,1.0,0.95
7,"{'model__min_samples_leaf': 4, 'model__class_weight': {0: 0.3950000000000002, 1: 0.6049999999999998}}",0.86,0.8,0.87,0.83,0.87,0.82,0.97,0.94
1,"{'model__min_samples_leaf': 5, 'model__class_weight': {0: 0.2850000000000001, 1: 0.7149999999999999}}",0.92,0.87,0.8,0.76,0.86,0.81,0.97,0.94
5,"{'model__min_samples_leaf': 7, 'model__class_weight': {0: 0.4250000000000002, 1: 0.5749999999999997}}",0.81,0.78,0.86,0.84,0.84,0.81,0.96,0.94
2,"{'model__min_samples_leaf': 10, 'model__class_weight': {0: 0.4150000000000002, 1: 0.5849999999999997}}",0.81,0.78,0.85,0.83,0.83,0.81,0.95,0.93
9,"{'model__min_samples_leaf': 10, 'model__class_weight': {0: 0.25000000000000006, 1: 0.75}}",0.93,0.9,0.72,0.7,0.81,0.79,0.95,0.93
3,"{'model__min_samples_leaf': 10, 'model__class_weight': {0: 0.5500000000000003, 1: 0.44999999999999973}}",0.71,0.69,0.91,0.89,0.79,0.78,0.95,0.93
6,"{'model__min_samples_leaf': 9, 'model__class_weight': {0: 0.6050000000000004, 1: 0.3949999999999996}}",0.67,0.65,0.94,0.92,0.78,0.76,0.95,0.93
8,"{'model__min_samples_leaf': 3, 'model__class_weight': {0: 0.7400000000000004, 1: 0.25999999999999956}}",0.66,0.6,0.99,0.95,0.79,0.74,0.98,0.94
0,"{'model__min_samples_leaf': 7, 'model__class_weight': {0: 0.7700000000000005, 1: 0.22999999999999954}}",0.51,0.49,1.0,0.98,0.68,0.66,0.96,0.94


In [29]:
# Show the winning hyper-parameters followed by a blank line, then score the
# refit model on the held-out test split.
print(rscv_rf.best_params_, end='\n\n')

y_pred = rscv_rf.predict(X_test)
print(classification_report(y_test, y_pred))

{'model__min_samples_leaf': 1, 'model__class_weight': {0: 0.38000000000000017, 1: 0.6199999999999999}}

              precision    recall  f1-score   support

           0       0.89      0.93      0.91     23067
           1       0.87      0.80      0.83     13578

    accuracy                           0.88     36645
   macro avg       0.88      0.87      0.87     36645
weighted avg       0.88      0.88      0.88     36645



#### OHE + JamesStein

In [26]:
# Same search as the one-hot variant, except that the country column is encoded
# with category_encoders' JamesSteinEncoder instead of OneHotEncoder.
other_variables_pipe = Pipeline([('encoder_other_variables', OneHotEncoder(handle_unknown='ignore'))])
country_pipe = Pipeline([('encoder_country', JamesSteinEncoder())])
num_pipe = Pipeline([('num', StandardScaler())])

transformer = ColumnTransformer([('other_variables_transf', other_variables_pipe, other_variables),
                                 ('country_transf', country_pipe, country_variable),
                                 ('num_transf', num_pipe, numerical_variables)])

rf = RandomForestClassifier(n_estimators=100, random_state=4)

pipe = Pipeline([('transformer', transformer), ('model', rf)])

# Search space: minimum leaf size 1..10 and complementary class weights, where
# class 0 gets x from np.arange(0.2, 0.8, 0.005) and class 1 gets 1 - x.
weights = np.arange(0.2, 0.8, 0.005)
param_grid = {'model__min_samples_leaf': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
              'model__class_weight': [{0: x, 1: 1 - x} for x in weights]}

# 10-fold randomised search tracking four metrics; the final model is refit on
# the configuration with the best F1.
rscv_ohe_jss = RandomizedSearchCV(pipe, param_grid, cv=10, n_jobs=-1,
                                  scoring=['recall', 'precision', 'f1', 'roc_auc'], refit='f1',
                                  return_train_score=True, random_state=4)

rscv_ohe_jss.fit(X_train, y_train)

best_results = pd.DataFrame(rscv_ohe_jss.cv_results_)

In [27]:
# Mean train/test score per metric for every sampled configuration, ordered by
# the cross-validated F1 (best first).
score_columns = [
    f'mean_{split}_{metric}'
    for metric in ('recall', 'precision', 'f1', 'roc_auc')
    for split in ('train', 'test')
]
(
    best_results[['params'] + score_columns]
    .sort_values('mean_test_f1', ascending=False)
    .round(2)
)

Unnamed: 0,params,mean_train_recall,mean_test_recall,mean_train_precision,mean_test_precision,mean_train_f1,mean_test_f1,mean_train_roc_auc,mean_test_roc_auc
9,"{'model__min_samples_leaf': 6, 'model__class_w...",0.88,0.82,0.78,0.74,0.82,0.78,0.95,0.91
1,"{'model__min_samples_leaf': 3, 'model__class_w...",0.96,0.87,0.77,0.7,0.85,0.78,0.97,0.92
6,"{'model__min_samples_leaf': 8, 'model__class_w...",0.76,0.73,0.85,0.82,0.8,0.77,0.94,0.91
2,"{'model__min_samples_leaf': 2, 'model__class_w...",0.75,0.66,0.98,0.91,0.85,0.76,0.98,0.92
4,"{'model__min_samples_leaf': 8, 'model__class_w...",0.71,0.69,0.87,0.86,0.78,0.76,0.94,0.91
8,"{'model__min_samples_leaf': 10, 'model__class_...",0.9,0.86,0.7,0.68,0.79,0.76,0.93,0.91
0,"{'model__min_samples_leaf': 8, 'model__class_w...",0.68,0.66,0.89,0.87,0.77,0.75,0.94,0.91
3,"{'model__min_samples_leaf': 6, 'model__class_w...",0.68,0.66,0.9,0.88,0.77,0.75,0.94,0.91
7,"{'model__min_samples_leaf': 6, 'model__class_w...",0.66,0.64,0.91,0.89,0.77,0.74,0.95,0.91
5,"{'model__min_samples_leaf': 8, 'model__class_w...",0.64,0.62,0.91,0.89,0.75,0.73,0.94,0.91


In [29]:
# Show the winning hyper-parameters followed by a blank line, then score the
# refit model on the held-out test split.
print(rscv_ohe_jss.best_params_, end='\n\n')

y_pred = rscv_ohe_jss.predict(X_test)
print(classification_report(y_test, y_pred))

# Restore the default column width for later displays.
# NOTE(review): 'max.colwidth' is not the literal option name; pandas treats it
# as a pattern, so it presumably resolves to display.max_colwidth. Confirm.
pd.reset_option('max.colwidth')

{'model__min_samples_leaf': 6, 'model__class_weight': {0: 0.2900000000000001, 1: 0.71}}

              precision    recall  f1-score   support

           0       0.90      0.83      0.86     23067
           1       0.75      0.83      0.79     13578

    accuracy                           0.83     36645
   macro avg       0.82      0.83      0.83     36645
weighted avg       0.84      0.83      0.84     36645



In [30]:
# Persist the fitted one-hot search object with joblib. This saves the whole
# RandomizedSearchCV instance, not only its best estimator, to the file
# 'random_forest_model' in the working directory.
jb.dump(rscv_rf, 'random_forest_model')

['random_forest_model']

### Análise das Features do modelo

In [458]:
# Feature importances of the refit random forest from the one-hot search
# (rscv_rf, the same object that is persisted with joblib), labelled with the
# column names produced by the fitted ColumnTransformer.
best_pipeline = rscv_rf.best_estimator_
feature_importance = pd.Series(best_pipeline['model'].feature_importances_,
                               index=best_pipeline['transformer'].get_feature_names_out())

# Sort from most to least important, round to two decimals and render the
# values as in-cell bars.
feature_importance = pd.DataFrame(feature_importance.sort_values(ascending=False).round(2), columns=['Importance'])
feature_importance.style.bar(color=['red', 'green'])

Unnamed: 0,Importance
num_transf__lead_time,0.15
num_transf__adr,0.1
other_variables_transf__deposit_type_No Deposit,0.09
num_transf__arrival_date_week_number,0.09
other_variables_transf__deposit_type_Non Refund,0.08
country_transf__country_PRT,0.06
num_transf__total_of_special_requests,0.06
num_transf__stays_in_week_nights,0.05
num_transf__previous_cancellations,0.03
num_transf__stays_in_weekend_nights,0.03
