In [66]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_validate
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PowerTransformer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from category_encoders import JamesSteinEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import classification_report
from feature_engine.encoding import RareLabelEncoder
import joblib as jb

In [67]:
df = pd.read_csv('hotel_bookings.csv')

In [68]:
df = df[~((df['arrival_date_year'] == 2015) & (df['arrival_date_month'] == 'July') | (df['arrival_date_year'] == 2017) &
         (df['arrival_date_month'] == 'August'))]
df.shape

(111689, 32)

In [69]:
df = df[~((df['children'] == 0) & (df['adults'] == 0) & (df['babies'] == 0))]
df.shape

(111513, 32)

In [70]:
df.drop(columns= ['company', 'agent', 'reservation_status_date', 'assigned_room_type', 'reservation_status',
                  'arrival_date_year', 'arrival_date_month', 'arrival_date_day_of_month'], 
        inplace= True)
df.shape

(111513, 24)

In [71]:
df.dropna(inplace= True)
df.shape

(111044, 24)

In [72]:
df.duplicated().sum()

31653

In [73]:
df['is_canceled'].value_counts(normalize= True).mul(100).round(2)

0    63.02
1    36.98
Name: is_canceled, dtype: float64

In [75]:
df_categorical = df.select_dtypes('object')
df_numerical = df.select_dtypes('number')

Unnamed: 0,hotel,meal,country,market_segment,distribution_channel,reserved_room_type,deposit_type,customer_type
82162,City Hotel,BB,PRT,Corporate,Corporate,A,No Deposit,Transient-Party
12953,Resort Hotel,HB,PRT,Offline TA/TO,TA/TO,E,No Deposit,Contract
31360,Resort Hotel,HB,GBR,Direct,Direct,A,No Deposit,Transient
115771,City Hotel,BB,GBR,Online TA,TA/TO,F,No Deposit,Transient
63645,City Hotel,BB,PRT,Online TA,TA/TO,G,No Deposit,Transient


In [76]:
df_categorical.shape

(111044, 8)

In [78]:
df_numerical.shape

(111044, 16)

In [79]:
X = df.drop(['is_canceled'], axis= 1)
y = df['is_canceled']

categorical_variables = X.select_dtypes('object').columns.to_list()
numerical_variables = X.select_dtypes('number').columns.to_list()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.33, random_state= 4)

In [81]:
df['country'].value_counts(dropna=False, normalize= True).shape

(175,)

In [82]:
df['country'].value_counts(dropna=False, normalize= True)[0:22].sum()

0.9496055617593026

In [83]:
df['country'].value_counts(dropna=False, normalize= True)[0:22].tail()

ISR    0.005809
RUS    0.005286
NOR    0.005169
ROU    0.004097
FIN    0.003908
Name: country, dtype: float64

In [84]:
rare_encoder = RareLabelEncoder(tol= 0.003900, variables= ['country'], replace_with= 'Other Country')
rare_encoder.fit(X_train)
X_train = rare_encoder.transform(X_train)
X_test = rare_encoder.transform(X_test)

X_train['country'].value_counts(dropna=False, normalize= True).shape

(23,)

In [85]:
X_train['country'].value_counts(dropna=False, normalize= True)

PRT              0.404468
GBR              0.103536
FRA              0.088509
ESP              0.070565
DEU              0.062958
Other Country    0.051063
ITA              0.030780
IRL              0.028871
BEL              0.019973
BRA              0.019046
NLD              0.018280
USA              0.016694
CHE              0.014422
CN               0.010833
AUT              0.010753
SWE              0.008710
CHN              0.008683
POL              0.008038
ISR              0.005497
RUS              0.005282
NOR              0.004839
ROU              0.004113
FIN              0.004086
Name: country, dtype: float64

In [86]:
X_train[numerical_variables].head()

Unnamed: 0,lead_time,arrival_date_week_number,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,booking_changes,days_in_waiting_list,adr,required_car_parking_spaces,total_of_special_requests
65992,121,15,0,4,3,0.0,0,0,0,0,0,0,173.25,0,0
110968,175,18,1,3,2,0.0,0,0,0,0,0,0,99.0,0,2
14150,4,50,0,1,1,0.0,0,0,0,5,0,0,27.0,0,0
116903,88,30,1,2,2,0.0,0,0,0,0,0,0,125.0,0,1
42135,10,36,2,5,2,0.0,0,0,0,0,0,0,85.0,0,0


In [87]:
categorical_variables

['hotel',
 'meal',
 'country',
 'market_segment',
 'distribution_channel',
 'reserved_room_type',
 'deposit_type',
 'customer_type']

In [88]:
country_variable = [categorical_variables[2]]
country_variable

['country']

In [90]:
other_variables = []
for i in categorical_variables:
    if i not in ['country', 'reserved_room_type']:
        other_variables.append(i)
    else:
        pass
        
other_variables

['hotel',
 'meal',
 'market_segment',
 'distribution_channel',
 'deposit_type',
 'customer_type']

#### Testando combinação de  diferentes encoders.

In [91]:
def encoders_model_results_logreg(encoders_cat, encoders_country, model_):
        
    def differentencoders_model_results(x, y):
        other_variables_pipe = Pipeline([('encoder_other_variables', x)])
        country_pipe = Pipeline([('encoder_country', y)])
        num_pipe = Pipeline([('num_pwr_transf', PowerTransformer()), 
                             ('num_std_scaler', StandardScaler())])
        
        transformer = ColumnTransformer([('other_variables_transf', other_variables_pipe, other_variables),
                                         ('country_transf', country_pipe, country_variable), 
                                         ('num_transf', num_pipe, numerical_variables)])
        
        X_train_transformed = transformer.fit_transform(X_train, y_train)
        X_test_transformed = transformer.transform(X_test)
    
        cv = cross_validate(model_, X_train_transformed, y_train, cv= 10, scoring= ('recall', 'precision', 'f1'), 
                               n_jobs= -1)
        
        print(model_)
        print('Recall:', cv['test_recall'].mean().round(2), 'Precision:', 
              cv['test_precision'].mean().round(2), 'F1-Score:', cv['test_f1'].mean().round(2))
        print('')
    
    for i in encoders_cat:
        for j in encoders_country:
            print(i, ' - other variables      ', j, ' - country variable')                    
            differentencoders_model_results(i, j)

In [92]:
others = [OneHotEncoder(handle_unknown= 'ignore'), JamesSteinEncoder()]
country = [OneHotEncoder(handle_unknown= 'ignore'), JamesSteinEncoder()]

#Regressão Logística
encoders_model_results_logreg(others, country, LogisticRegression())

OneHotEncoder(handle_unknown='ignore')  - other variables       OneHotEncoder(handle_unknown='ignore')  - country variable
LogisticRegression()
Recall: 0.68 Precision: 0.82 F1-Score: 0.74

OneHotEncoder(handle_unknown='ignore')  - other variables       JamesSteinEncoder()  - country variable
LogisticRegression()
Recall: 0.67 Precision: 0.82 F1-Score: 0.74

JamesSteinEncoder()  - other variables       OneHotEncoder(handle_unknown='ignore')  - country variable
LogisticRegression()
Recall: 0.64 Precision: 0.82 F1-Score: 0.72

JamesSteinEncoder()  - other variables       JamesSteinEncoder()  - country variable
LogisticRegression()
Recall: 0.64 Precision: 0.82 F1-Score: 0.72



### Logistic Regression

#### OHE

In [93]:
pd.set_option('max.colwidth', None)

other_variables_pipe = Pipeline([('encoder_other_variables', OneHotEncoder(handle_unknown= 'ignore'))])
country_pipe = Pipeline([('encoder_country', OneHotEncoder(handle_unknown= 'ignore'))])
num_pipe = Pipeline([('num_pwr_transf', PowerTransformer()), 
                     ('num_std_scaler', StandardScaler())])
   
transformer = ColumnTransformer([('other_variables_transf', other_variables_pipe, other_variables),
                                 ('country_transf', country_pipe, country_variable), 
                                 ('num_transf', num_pipe, numerical_variables)])

logreg = LogisticRegression()

pipe = Pipeline([('transformer', transformer), ('model', logreg)])

weights = np.arange(0.2, 0.8, 0.005)
param_grid = {'model__class_weight':[{0:x, 1:1-x} for x in weights]}

rscv_logreg = RandomizedSearchCV(pipe, param_grid, cv= 10, n_jobs= -1, 
                    scoring= ['recall', 'precision', 'f1', 'roc_auc'], refit= 'f1', return_train_score= True, random_state= 4)

rscv_logreg.fit(X_train, y_train)

best_results = pd.DataFrame(rscv_logreg.cv_results_)

In [94]:
best_results[['params','mean_train_recall', 'mean_test_recall', 'mean_train_precision', 'mean_test_precision', 
              'mean_train_f1', 'mean_test_f1', 'mean_train_roc_auc', 'mean_test_roc_auc']].sort_values(
                                                                                'mean_test_f1', ascending= False).round(2)

Unnamed: 0,params,mean_train_recall,mean_test_recall,mean_train_precision,mean_test_precision,mean_train_f1,mean_test_f1,mean_train_roc_auc,mean_test_roc_auc
2,"{'model__class_weight': {0: 0.3250000000000001, 1: 0.6749999999999998}}",0.82,0.82,0.71,0.71,0.76,0.76,0.9,0.9
7,"{'model__class_weight': {0: 0.3200000000000001, 1: 0.6799999999999999}}",0.83,0.83,0.71,0.71,0.76,0.76,0.9,0.9
5,"{'model__class_weight': {0: 0.4050000000000002, 1: 0.5949999999999998}}",0.76,0.76,0.76,0.76,0.76,0.76,0.9,0.9
9,"{'model__class_weight': {0: 0.3000000000000001, 1: 0.7}}",0.84,0.84,0.69,0.69,0.76,0.76,0.9,0.9
4,"{'model__class_weight': {0: 0.2950000000000001, 1: 0.7049999999999998}}",0.85,0.84,0.69,0.69,0.76,0.76,0.9,0.9
3,"{'model__class_weight': {0: 0.2800000000000001, 1: 0.72}}",0.86,0.86,0.67,0.67,0.75,0.75,0.9,0.9
0,"{'model__class_weight': {0: 0.26500000000000007, 1: 0.7349999999999999}}",0.87,0.87,0.66,0.66,0.75,0.75,0.9,0.9
6,"{'model__class_weight': {0: 0.22500000000000003, 1: 0.7749999999999999}}",0.9,0.9,0.63,0.63,0.74,0.74,0.9,0.9
1,"{'model__class_weight': {0: 0.21000000000000002, 1: 0.79}}",0.91,0.91,0.62,0.61,0.73,0.73,0.9,0.9
8,"{'model__class_weight': {0: 0.6100000000000003, 1: 0.3899999999999997}}",0.59,0.59,0.88,0.88,0.7,0.7,0.9,0.9


In [95]:
print(rscv_logreg.best_params_)
print('')

y_pred = rscv_logreg.predict(X_test)

print(classification_report(y_test, y_pred))

{'model__class_weight': {0: 0.3250000000000001, 1: 0.6749999999999998}}

              precision    recall  f1-score   support

           0       0.89      0.80      0.84     23067
           1       0.71      0.83      0.77     13578

    accuracy                           0.81     36645
   macro avg       0.80      0.82      0.80     36645
weighted avg       0.82      0.81      0.81     36645



- Percebe-se que o modelo generalizou de forma consistente quando foi alimentado com os dados responsáveis para simular o modelo em produção. Os valores se manteram bastante próximos das métricas de teste.

####  OHE + JamesStein 

In [96]:
other_variables_pipe = Pipeline([('encoder_other_variables', OneHotEncoder(handle_unknown= 'ignore'))])
country_pipe = Pipeline([('encoder_country', JamesSteinEncoder())])
num_pipe = Pipeline([('num_pwr_transf', PowerTransformer()), 
                     ('num_std_scaler', StandardScaler())])
   
transformer = ColumnTransformer([('other_variables_transf', other_variables_pipe, other_variables),
                                 ('country_transf', country_pipe, country_variable), 
                                 ('num_transf', num_pipe, numerical_variables)])

logreg = LogisticRegression()

pipe = Pipeline([('transformer', transformer), ('model', logreg)])

weights = np.arange(0.2, 0.8, 0.005)
param_grid = {'model__class_weight':[{0:x, 1:1-x} for x in weights]}

rscv_logreg_jss = RandomizedSearchCV(pipe, param_grid, cv= 10, n_jobs= -1, 
                    scoring= ['recall', 'precision', 'f1', 'roc_auc'], refit= 'f1', return_train_score= True, random_state= 4)

rscv_logreg_jss.fit(X_train, y_train)

best_results = pd.DataFrame(rscv_logreg_jss.cv_results_)

In [97]:
best_results[['params','mean_train_recall', 'mean_test_recall', 'mean_train_precision', 'mean_test_precision', 
              'mean_train_f1', 'mean_test_f1', 'mean_train_roc_auc', 'mean_test_roc_auc']].sort_values(
                                                                                'mean_test_f1', ascending= False).round(2)

Unnamed: 0,params,mean_train_recall,mean_test_recall,mean_train_precision,mean_test_precision,mean_train_f1,mean_test_f1,mean_train_roc_auc,mean_test_roc_auc
2,"{'model__class_weight': {0: 0.3250000000000001, 1: 0.6749999999999998}}",0.82,0.82,0.71,0.71,0.76,0.76,0.9,0.9
7,"{'model__class_weight': {0: 0.3200000000000001, 1: 0.6799999999999999}}",0.82,0.82,0.7,0.7,0.76,0.76,0.9,0.9
5,"{'model__class_weight': {0: 0.4050000000000002, 1: 0.5949999999999998}}",0.75,0.75,0.76,0.76,0.76,0.76,0.9,0.9
9,"{'model__class_weight': {0: 0.3000000000000001, 1: 0.7}}",0.84,0.84,0.69,0.69,0.76,0.76,0.9,0.9
4,"{'model__class_weight': {0: 0.2950000000000001, 1: 0.7049999999999998}}",0.84,0.84,0.68,0.68,0.76,0.75,0.9,0.9
3,"{'model__class_weight': {0: 0.2800000000000001, 1: 0.72}}",0.86,0.86,0.67,0.67,0.75,0.75,0.9,0.9
0,"{'model__class_weight': {0: 0.26500000000000007, 1: 0.7349999999999999}}",0.87,0.87,0.66,0.66,0.75,0.75,0.9,0.9
6,"{'model__class_weight': {0: 0.22500000000000003, 1: 0.7749999999999999}}",0.9,0.9,0.62,0.62,0.74,0.74,0.9,0.9
1,"{'model__class_weight': {0: 0.21000000000000002, 1: 0.79}}",0.91,0.91,0.61,0.61,0.73,0.73,0.9,0.9
8,"{'model__class_weight': {0: 0.6100000000000003, 1: 0.3899999999999997}}",0.58,0.58,0.88,0.88,0.7,0.7,0.9,0.9


In [98]:
print(rscv_logreg_jss.best_params_)
print('')

y_pred = rscv_logreg_jss.predict(X_test)

print(classification_report(y_test, y_pred))

pd.reset_option('max.colwidth')

{'model__class_weight': {0: 0.3250000000000001, 1: 0.6749999999999998}}

              precision    recall  f1-score   support

           0       0.89      0.80      0.84     23067
           1       0.71      0.82      0.76     13578

    accuracy                           0.81     36645
   macro avg       0.80      0.81      0.80     36645
weighted avg       0.82      0.81      0.81     36645



In [99]:
jb.dump(rscv_logreg, 'logistic_regression_model')

['logistic_regression_model']

### Análise das Features  do modelo 

In [100]:
feature_importance = pd.DataFrame(rscv_logreg.best_estimator_['model'].coef_.T, columns= ['importance'], 
                                  index= rscv_logreg.best_estimator_['transformer'].get_feature_names_out())
feature_importance.sort_values('importance', ascending= False).round(2).style.bar(color=['red', 'green'])

Unnamed: 0,importance
other_variables_transf__deposit_type_Non Refund,2.56
country_transf__country_PRT,1.93
other_variables_transf__market_segment_Online TA,1.21
country_transf__country_CHN,0.97
num_transf__previous_cancellations,0.8
num_transf__lead_time,0.78
other_variables_transf__customer_type_Transient,0.6
country_transf__country_RUS,0.59
country_transf__country_BRA,0.49
other_variables_transf__market_segment_Complementary,0.46


### AdaBoostClassifier | Estimador Base: Logistic Regression

In [27]:
categorical_variables_pipe = Pipeline([('encoder_categorical_variables', OneHotEncoder(handle_unknown= 'ignore'))])
num_pipe = Pipeline([('num_pwr_transf', PowerTransformer()), 
                     ('num_std_scaler', StandardScaler())])
   
transformer = ColumnTransformer([('ohe', OneHotEncoder(handle_unknown= 'ignore'), categorical_variables),
                                 ('pwr', PowerTransformer(), numerical_variables),
                                 ('stdscaler', StandardScaler(), numerical_variables)])

ada_logreg = AdaBoostClassifier(LogisticRegression(class_weight= {0:0.325, 1:0.675}), n_estimators= 100, random_state=4)

pipe = Pipeline([('transformer', transformer), ('model', ada_logreg)])

param_grid = {'model__learning_rate': np.arange(1, 1.5, 0.05)}   
    
rscv_ada_logreg = RandomizedSearchCV(pipe, param_grid, cv= 10, n_jobs= -1, scoring= ['recall', 'precision', 'f1', 'roc_auc'], 
                             refit= 'f1', return_train_score= True, random_state= 4)

rscv_ada_logreg.fit(X_train, y_train)

best_results = pd.DataFrame(rscv_ada_logreg.cv_results_)

In [28]:
pd.set_option('max.colwidth', None)
best_results[['params','mean_train_recall', 'mean_test_recall', 'mean_train_precision', 'mean_test_precision', 
              'mean_train_f1', 'mean_test_f1', 'mean_train_roc_auc', 'mean_test_roc_auc']].sort_values(
                                                                                'mean_test_f1', ascending= False).round(2)

Unnamed: 0,params,mean_train_recall,mean_test_recall,mean_train_precision,mean_test_precision,mean_train_f1,mean_test_f1,mean_train_roc_auc,mean_test_roc_auc
8,{'model__learning_rate': 1.4000000000000004},0.82,0.82,0.7,0.7,0.76,0.76,0.9,0.9
7,{'model__learning_rate': 1.3500000000000003},0.82,0.82,0.7,0.7,0.76,0.76,0.9,0.9
9,{'model__learning_rate': 1.4500000000000004},0.82,0.82,0.7,0.7,0.76,0.76,0.9,0.9
6,{'model__learning_rate': 1.3000000000000003},0.82,0.82,0.7,0.7,0.76,0.76,0.9,0.9
5,{'model__learning_rate': 1.2500000000000002},0.82,0.82,0.7,0.7,0.76,0.76,0.9,0.9
4,{'model__learning_rate': 1.2000000000000002},0.82,0.82,0.7,0.7,0.76,0.75,0.9,0.9
3,{'model__learning_rate': 1.1500000000000001},0.82,0.82,0.7,0.7,0.75,0.75,0.9,0.9
2,{'model__learning_rate': 1.1},0.82,0.82,0.7,0.7,0.75,0.75,0.9,0.9
1,{'model__learning_rate': 1.05},0.82,0.82,0.7,0.7,0.75,0.75,0.9,0.9
0,{'model__learning_rate': 1.0},0.82,0.82,0.7,0.7,0.75,0.75,0.9,0.9


In [29]:
print(rscv_ada_logreg.best_params_)
print('')

y_pred = rscv_ada_logreg.predict(X_test)
print(classification_report(y_test, y_pred))

{'model__learning_rate': 1.4000000000000004}

              precision    recall  f1-score   support

           0       0.88      0.79      0.84     23067
           1       0.70      0.82      0.76     13578

    accuracy                           0.80     36645
   macro avg       0.79      0.81      0.80     36645
weighted avg       0.82      0.80      0.81     36645



- Percebe-se que o modelo generalizou de forma consistente quando foi alimentado com os dados responsáveis para simular o modelo em produção. Os valores se manteram bastante próximos das métricas de teste. 

In [31]:
jb.dump(value= rscv_ada_logreg, filename= 'ada_logreg_model')

['ada_logreg_model']