In [18]:
import warnings
warnings.filterwarnings('ignore')

import joblib as jb
import numpy as np
import pandas as pd
from category_encoders import JamesSteinEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, PowerTransformer, StandardScaler
from sklearn.tree import DecisionTreeClassifier

pd.set_option('display.max_columns', None)

In [67]:
# Load the raw bookings table; the path is relative to the notebook's working directory.
df = pd.read_csv('hotel_bookings.csv')

In [68]:
# Remove every booking that arrives in July 2015 or in August 2017.
arrives_july_2015 = df['arrival_date_year'].eq(2015) & df['arrival_date_month'].eq('July')
arrives_august_2017 = df['arrival_date_year'].eq(2017) & df['arrival_date_month'].eq('August')
df = df.loc[~(arrives_july_2015 | arrives_august_2017)]
df.shape

(111689, 32)

In [69]:
# Remove bookings that list no guests at all (0 adults, 0 children and 0 babies).
has_no_guests = df[['children', 'adults', 'babies']].eq(0).all(axis=1)
df = df.loc[~has_no_guests]
df.shape

(111513, 32)

In [70]:
# Columns excluded from modelling.
# NOTE(review): 'reservation_status' presumably encodes the booking outcome and is
# dropped to avoid leaking the target — confirm.
columns_to_drop = ['company', 'agent', 'reservation_status_date', 'assigned_room_type',
                   'reservation_status', 'arrival_date_year', 'arrival_date_month',
                   'arrival_date_day_of_month']
df = df.drop(columns=columns_to_drop)
df.shape

(111513, 24)

In [71]:
# Keep only rows with no missing value in any remaining column.
df = df.dropna()
df.shape

(111044, 24)

In [72]:
# Count rows that are exact duplicates of an earlier row.
# NOTE(review): no visible cell drops these duplicates, so identical rows can land on
# both sides of the train/test split — confirm this is intended.
df.duplicated().sum()

31653

In [73]:
# Share of each target class, as a percentage rounded to two decimals.
df['is_canceled'].value_counts(normalize= True).mul(100).round(2)

0    63.02
1    36.98
Name: is_canceled, dtype: float64

In [75]:
# Split the frame into its object-typed and its numeric columns for inspection.
df_categorical = df.select_dtypes(include='object')
df_numerical = df.select_dtypes(include='number')

Unnamed: 0,hotel,meal,country,market_segment,distribution_channel,reserved_room_type,deposit_type,customer_type
82162,City Hotel,BB,PRT,Corporate,Corporate,A,No Deposit,Transient-Party
12953,Resort Hotel,HB,PRT,Offline TA/TO,TA/TO,E,No Deposit,Contract
31360,Resort Hotel,HB,GBR,Direct,Direct,A,No Deposit,Transient
115771,City Hotel,BB,GBR,Online TA,TA/TO,F,No Deposit,Transient
63645,City Hotel,BB,PRT,Online TA,TA/TO,G,No Deposit,Transient


In [76]:
# (rows, columns) of the object-typed subset.
df_categorical.shape

(111044, 8)

In [78]:
# (rows, columns) of the numeric subset; this still includes the target 'is_canceled'.
df_numerical.shape

(111044, 16)

In [79]:
# Separate the features from the target.
X = df.drop(columns=['is_canceled'])
y = df['is_canceled']

# Feature names grouped by dtype; the column transformers are built from these lists.
categorical_variables = list(X.select_dtypes(include='object').columns)
numerical_variables = list(X.select_dtypes(include='number').columns)

# Hold out 33% of the rows for the final evaluation (fixed seed, no stratification).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=4,
)

In [86]:
# Preview the numeric training features.
X_train[numerical_variables].head()

Unnamed: 0,lead_time,arrival_date_week_number,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,booking_changes,days_in_waiting_list,adr,required_car_parking_spaces,total_of_special_requests
65992,121,15,0,4,3,0.0,0,0,0,0,0,0,173.25,0,0
110968,175,18,1,3,2,0.0,0,0,0,0,0,0,99.0,0,2
14150,4,50,0,1,1,0.0,0,0,0,5,0,0,27.0,0,0
116903,88,30,1,2,2,0.0,0,0,0,0,0,0,125.0,0,1
42135,10,36,2,5,2,0.0,0,0,0,0,0,0,85.0,0,0


In [87]:
# Display the list of categorical feature names.
categorical_variables

['hotel',
 'meal',
 'country',
 'market_segment',
 'distribution_channel',
 'reserved_room_type',
 'deposit_type',
 'customer_type']

In [88]:
# 'country' gets its own encoder, so it is kept in a separate one-element list.
# It is selected by name, not by position, so a change in column order cannot
# silently pick a different feature.
country_variable = ['country']
assert set(country_variable) <= set(categorical_variables), "'country' is not a categorical feature"
country_variable

['country']

In [90]:
# Categorical features other than 'country' and 'reserved_room_type'.
# NOTE(review): 'reserved_room_type' is in neither this list nor country_variable, so
# transformers built only from those lists and numerical_variables never receive it —
# confirm the omission is deliberate.
excluded_variables = ('country', 'reserved_room_type')
other_variables = [name for name in categorical_variables if name not in excluded_variables]

other_variables

['hotel',
 'meal',
 'market_segment',
 'distribution_channel',
 'deposit_type',
 'customer_type']

#### Testando combinação de diferentes encoders.

In [91]:
def encoders_model_results_tree(encoders_cat, encoders_country, model_):
    """Cross-validate ``model_`` for every pairing of categorical encoders.

    Each encoder in ``encoders_cat`` (applied to ``other_variables``) is combined
    with each encoder in ``encoders_country`` (applied to ``country_variable``).
    For every pairing, a preprocessing + model pipeline is scored with 10-fold
    cross-validation on ``X_train`` / ``y_train``, and the mean recall, precision
    and F1 are printed.

    Uses the notebook-level names ``other_variables``, ``country_variable``,
    ``numerical_variables``, ``X_train`` and ``y_train``.

    Parameters
    ----------
    encoders_cat : iterable
        Encoders to try on ``other_variables``.
    encoders_country : iterable
        Encoders to try on ``country_variable``.
    model_ : estimator
        Classifier evaluated with each encoder pairing.

    Returns
    -------
    None
        Results are printed, not returned.
    """

    def differentencoders_model_results(encoder_other, encoder_country):
        """Score one encoder pairing and print the averaged metrics."""
        num_pipe = Pipeline([('num_pwr_transf', PowerTransformer()),
                             ('num_std_scaler', StandardScaler())])

        transformer = ColumnTransformer([('other_variables_transf', encoder_other, other_variables),
                                         ('country_transf', encoder_country, country_variable),
                                         ('num_transf', num_pipe, numerical_variables)])

        # Preprocessing lives inside the cross-validated pipeline so that encoders and
        # scalers are fitted on each training fold only. Fitting them on all of X_train
        # first would leak validation rows (and, for target-based encoders, their
        # labels) into every fold.
        pipe = Pipeline([('transformer', transformer), ('model', model_)])

        cv = cross_validate(pipe, X_train, y_train, cv=10,
                            scoring=('recall', 'precision', 'f1'), n_jobs=-1)

        print(model_)
        print('Recall:', cv['test_recall'].mean().round(2), 'Precision:',
              cv['test_precision'].mean().round(2), 'F1-Score:', cv['test_f1'].mean().round(2))
        print('')

    for i in encoders_cat:
        for j in encoders_country:
            print(i, ' - other variables      ', j, ' - country variable')
            differentencoders_model_results(i, j)

In [92]:
# Candidate encoders for the general categorical features and for 'country'.
others = [
    OneHotEncoder(handle_unknown='ignore'),
    JamesSteinEncoder(),
]
country = [
    OneHotEncoder(handle_unknown='ignore'),
    JamesSteinEncoder(),
]

# Decision tree
# NOTE(review): the saved output of this cell prints LogisticRegression(), which does
# not match the DecisionTreeClassifier passed here — the output looks stale; re-run.
encoders_model_results_tree(others, country, DecisionTreeClassifier(random_state=4))

OneHotEncoder(handle_unknown='ignore')  - other variables       OneHotEncoder(handle_unknown='ignore')  - country variable
LogisticRegression()
Recall: 0.68 Precision: 0.82 F1-Score: 0.74

OneHotEncoder(handle_unknown='ignore')  - other variables       JamesSteinEncoder()  - country variable
LogisticRegression()
Recall: 0.67 Precision: 0.82 F1-Score: 0.74

JamesSteinEncoder()  - other variables       OneHotEncoder(handle_unknown='ignore')  - country variable
LogisticRegression()
Recall: 0.64 Precision: 0.82 F1-Score: 0.72

JamesSteinEncoder()  - other variables       JamesSteinEncoder()  - country variable
LogisticRegression()
Recall: 0.64 Precision: 0.82 F1-Score: 0.72



### Decision Tree

#### OHE

In [12]:
# Preprocessing: one-hot encode every categorical feature, standardise the numeric ones.
transformer = ColumnTransformer([
    ('ohe', OneHotEncoder(handle_unknown='ignore'), categorical_variables),
    ('stdscaler', StandardScaler(), numerical_variables),
])

tree = DecisionTreeClassifier(random_state=4)

pipe = Pipeline([
    ('transformer', transformer),
    ('model', tree),
])

# Candidate class weights: class 0 gets w, class 1 gets 1 - w.
# The float step in np.arange leaves rounding noise in the values
# (e.g. 0.6450000000000005 in the results table).
weights = np.arange(0.2, 0.8, 0.005)
param_grid = {
    'model__min_samples_leaf': [1, 2, 3, 4, 5],
    'model__class_weight': [{0: w, 1: 1 - w} for w in weights],
}

# Random search with 10-fold CV; the final model is refitted on the best mean test F1.
rscv_tree = RandomizedSearchCV(
    pipe,
    param_grid,
    cv=10,
    n_jobs=-1,
    scoring=['recall', 'precision', 'f1', 'roc_auc'],
    refit='f1',
    return_train_score=True,
    random_state=4,
)

rscv_tree.fit(X_train, y_train)

# Despite the name, this holds the CV results of every sampled candidate.
best_results = pd.DataFrame(rscv_tree.cv_results_)

In [13]:
# Show the full params dict instead of a truncated cell.
pd.set_option('max.colwidth', None)

# Train/test means of each metric, best mean test F1 first.
report_columns = [
    'params',
    'mean_train_recall', 'mean_test_recall',
    'mean_train_precision', 'mean_test_precision',
    'mean_train_f1', 'mean_test_f1',
    'mean_train_roc_auc', 'mean_test_roc_auc',
]
best_results[report_columns].sort_values('mean_test_f1', ascending=False).round(2)

Unnamed: 0,params,mean_train_recall,mean_test_recall,mean_train_precision,mean_test_precision,mean_train_f1,mean_test_f1,mean_train_roc_auc,mean_test_roc_auc
7,"{'model__min_samples_leaf': 2, 'model__class_weight': {0: 0.6450000000000005, 1: 0.35499999999999954}}",0.91,0.76,0.98,0.8,0.95,0.78,1.0,0.85
6,"{'model__min_samples_leaf': 2, 'model__class_weight': {0: 0.4250000000000002, 1: 0.5749999999999997}}",0.98,0.81,0.92,0.75,0.95,0.78,1.0,0.85
9,"{'model__min_samples_leaf': 1, 'model__class_weight': {0: 0.7500000000000004, 1: 0.24999999999999956}}",0.99,0.79,1.0,0.77,0.99,0.78,1.0,0.83
5,"{'model__min_samples_leaf': 2, 'model__class_weight': {0: 0.4100000000000002, 1: 0.5899999999999999}}",0.98,0.81,0.92,0.75,0.95,0.78,1.0,0.85
2,"{'model__min_samples_leaf': 2, 'model__class_weight': {0: 0.5450000000000004, 1: 0.4549999999999996}}",0.92,0.75,0.98,0.8,0.95,0.78,1.0,0.85
8,"{'model__min_samples_leaf': 4, 'model__class_weight': {0: 0.35500000000000015, 1: 0.6449999999999998}}",0.95,0.81,0.87,0.74,0.91,0.78,0.99,0.88
4,"{'model__min_samples_leaf': 2, 'model__class_weight': {0: 0.34500000000000014, 1: 0.6549999999999998}}",0.98,0.8,0.92,0.75,0.95,0.78,1.0,0.85
3,"{'model__min_samples_leaf': 2, 'model__class_weight': {0: 0.7850000000000006, 1: 0.2149999999999994}}",0.88,0.73,1.0,0.82,0.93,0.78,1.0,0.85
0,"{'model__min_samples_leaf': 4, 'model__class_weight': {0: 0.6200000000000003, 1: 0.37999999999999967}}",0.84,0.73,0.95,0.82,0.9,0.77,0.99,0.88
1,"{'model__min_samples_leaf': 3, 'model__class_weight': {0: 0.7300000000000004, 1: 0.2699999999999996}}",0.83,0.71,0.98,0.84,0.9,0.77,0.99,0.87


In [14]:
# Best hyper-parameters, followed by a blank line.
print(rscv_tree.best_params_, end='\n\n')

# Evaluate the refitted best pipeline on the held-out test set.
y_pred = rscv_tree.predict(X_test)
print(classification_report(y_test, y_pred))

{'model__min_samples_leaf': 2, 'model__class_weight': {0: 0.6450000000000005, 1: 0.35499999999999954}}

              precision    recall  f1-score   support

           0       0.87      0.89      0.88     23067
           1       0.81      0.77      0.79     13578

    accuracy                           0.85     36645
   macro avg       0.84      0.83      0.83     36645
weighted avg       0.85      0.85      0.85     36645



#### OHE + JamesStein

In [19]:
# Preprocessing: one-hot encode the general categorical features, James-Stein encode
# 'country', standardise the numeric features.
# NOTE(review): 'reserved_room_type' is in none of these column lists, unlike the
# all-one-hot transformer built from categorical_variables — confirm this is intended.
transformer = ColumnTransformer([
    ('other_variables_transf', OneHotEncoder(handle_unknown='ignore'), other_variables),
    ('country_transf', JamesSteinEncoder(), country_variable),
    ('stdscaler', StandardScaler(), numerical_variables),
])

tree = DecisionTreeClassifier(random_state=4)

pipe = Pipeline([
    ('transformer', transformer),
    ('model', tree),
])

# Candidate class weights: class 0 gets w, class 1 gets 1 - w.
weights = np.arange(0.2, 0.8, 0.005)
param_grid = {
    'model__min_samples_leaf': [1, 2, 3, 4, 5],
    'model__class_weight': [{0: w, 1: 1 - w} for w in weights],
}

# Random search with 10-fold CV; the final model is refitted on the best mean test F1.
rscv_tree_jmss = RandomizedSearchCV(
    pipe,
    param_grid,
    cv=10,
    n_jobs=-1,
    scoring=['recall', 'precision', 'f1', 'roc_auc'],
    refit='f1',
    return_train_score=True,
    random_state=4,
)

rscv_tree_jmss.fit(X_train, y_train)

# Despite the name, this holds the CV results of every sampled candidate.
best_results = pd.DataFrame(rscv_tree_jmss.cv_results_)

In [20]:
# Train/test means of each metric, best mean test F1 first.
report_columns = [
    'params',
    'mean_train_recall', 'mean_test_recall',
    'mean_train_precision', 'mean_test_precision',
    'mean_train_f1', 'mean_test_f1',
    'mean_train_roc_auc', 'mean_test_roc_auc',
]
best_results[report_columns].sort_values('mean_test_f1', ascending=False).round(2)

Unnamed: 0,params,mean_train_recall,mean_test_recall,mean_train_precision,mean_test_precision,mean_train_f1,mean_test_f1,mean_train_roc_auc,mean_test_roc_auc
9,"{'model__min_samples_leaf': 1, 'model__class_weight': {0: 0.7500000000000004, 1: 0.24999999999999956}}",0.99,0.79,1.0,0.77,0.99,0.78,1.0,0.83
8,"{'model__min_samples_leaf': 4, 'model__class_weight': {0: 0.35500000000000015, 1: 0.6449999999999998}}",0.95,0.81,0.87,0.75,0.91,0.78,0.99,0.88
4,"{'model__min_samples_leaf': 2, 'model__class_weight': {0: 0.34500000000000014, 1: 0.6549999999999998}}",0.98,0.8,0.93,0.75,0.95,0.78,1.0,0.85
6,"{'model__min_samples_leaf': 2, 'model__class_weight': {0: 0.4250000000000002, 1: 0.5749999999999997}}",0.98,0.8,0.93,0.75,0.95,0.78,1.0,0.85
2,"{'model__min_samples_leaf': 2, 'model__class_weight': {0: 0.5450000000000004, 1: 0.4549999999999996}}",0.92,0.76,0.98,0.8,0.95,0.78,1.0,0.85
7,"{'model__min_samples_leaf': 2, 'model__class_weight': {0: 0.6450000000000005, 1: 0.35499999999999954}}",0.92,0.76,0.98,0.8,0.95,0.78,1.0,0.85
3,"{'model__min_samples_leaf': 2, 'model__class_weight': {0: 0.7850000000000006, 1: 0.2149999999999994}}",0.88,0.74,1.0,0.82,0.93,0.77,1.0,0.85
0,"{'model__min_samples_leaf': 4, 'model__class_weight': {0: 0.6200000000000003, 1: 0.37999999999999967}}",0.85,0.73,0.96,0.82,0.9,0.77,0.99,0.88
5,"{'model__min_samples_leaf': 2, 'model__class_weight': {0: 0.4100000000000002, 1: 0.5899999999999999}}",0.98,0.8,0.93,0.75,0.95,0.77,1.0,0.85
1,"{'model__min_samples_leaf': 3, 'model__class_weight': {0: 0.7300000000000004, 1: 0.2699999999999996}}",0.84,0.71,0.98,0.83,0.9,0.77,0.99,0.87


In [21]:
# Best hyper-parameters, followed by a blank line.
print(rscv_tree_jmss.best_params_, end='\n\n')

# Evaluate the refitted best pipeline on the held-out test set.
y_pred = rscv_tree_jmss.predict(X_test)
print(classification_report(y_test, y_pred))

# Restore the default column width changed earlier for the results tables.
pd.reset_option('max.colwidth')

{'model__min_samples_leaf': 1, 'model__class_weight': {0: 0.7500000000000004, 1: 0.24999999999999956}}

              precision    recall  f1-score   support

           0       0.88      0.87      0.87     23067
           1       0.78      0.80      0.79     13578

    accuracy                           0.84     36645
   macro avg       0.83      0.83      0.83     36645
weighted avg       0.84      0.84      0.84     36645

