In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_validate
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PowerTransformer
from category_encoders import JamesSteinEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import joblib as jb

In [2]:
df = pd.read_csv('hotel_bookings.csv')

In [3]:
df = df[~((df['arrival_date_year'] == 2015) & (df['arrival_date_month'] == 'July') | (df['arrival_date_year'] == 2017) &
         (df['arrival_date_month'] == 'August'))]
df.shape

(111689, 32)

In [4]:
df = df[~((df['children'] == 0) & (df['adults'] == 0) & (df['babies'] == 0))]
df.shape

(111513, 32)

In [5]:
df.drop(columns= ['company', 'agent', 'reservation_status_date', 'assigned_room_type', 'reservation_status',
                  'arrival_date_year', 'arrival_date_month', 'arrival_date_day_of_month'], 
        inplace= True)
df.shape

(111513, 24)

In [6]:
df.dropna(inplace= True)
df.shape

(111044, 24)

In [7]:
df.duplicated().sum()

31653

In [8]:
df['is_canceled'].value_counts(normalize= True).mul(100).round(2)

0    63.02
1    36.98
Name: is_canceled, dtype: float64

In [9]:
df.to_csv('df_changed.csv', index= False)

In [10]:
X = df.drop(['is_canceled'], axis= 1)
y = df['is_canceled']

categorical_variables = X.select_dtypes('object').columns.to_list()
numerical_variables = X.select_dtypes('number').columns.to_list()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.33, random_state= 4)

In [11]:
categorical_variables

['hotel',
 'meal',
 'country',
 'market_segment',
 'distribution_channel',
 'reserved_room_type',
 'deposit_type',
 'customer_type']

In [12]:
country_variable = [categorical_variables[2]]
country_variable

['country']

In [13]:
other_variables = []
for i in categorical_variables:
    if i != 'country':
        other_variables.append(i)
    else:
        pass
        
other_variables

['hotel',
 'meal',
 'market_segment',
 'distribution_channel',
 'reserved_room_type',
 'deposit_type',
 'customer_type']

#### Testando diferentes modelos com diferentes encoders.

In [14]:
def encoders_model_results_logreg(encoders_cat, encoders_country, model_):
        
    def differentencoders_model_results(x, y):
        other_variables_pipe = Pipeline([('encoder_other_variables', x)])
        country_pipe = Pipeline([('encoder_country', y)])
        num_pipe = Pipeline([('num_std_scaler', StandardScaler()),
                     ('num_pwr_transf', PowerTransformer())])
        
        transformer = ColumnTransformer([('other_variables_transf', other_variables_pipe, other_variables),
                                         ('country_transf', country_pipe, country_variable), 
                                         ('num_transf', num_pipe, numerical_variables)])
        
        X_train_transformed = transformer.fit_transform(X_train, y_train)
        X_test_transformed = transformer.transform(X_test)
    
        cv = cross_validate(model_, X_train_transformed, y_train, cv= 10, scoring= ('recall', 'precision', 'f1'), 
                               n_jobs= -1)
        
        print(model_)
        print('Recall:', cv['test_recall'].mean().round(2), 'Precision:', 
              cv['test_precision'].mean().round(2), 'F1-Score:', cv['test_f1'].mean().round(2))
        print('')
    
    for i in encoders_cat:
        for j in encoders_country:
            print(i, ' - other variables      ', j, ' - country variable')                    
            differentencoders_model_results(i, j)

In [15]:
others = [OneHotEncoder(handle_unknown= 'ignore'), JamesSteinEncoder()]
country = [OneHotEncoder(handle_unknown= 'ignore'), JamesSteinEncoder()]

#Regressão Logística
encoders_model_results_logreg(others, country, LogisticRegression())

OneHotEncoder(handle_unknown='ignore')  - other variables       OneHotEncoder(handle_unknown='ignore')  - country variable
LogisticRegression()
Recall: 0.67 Precision: 0.82 F1-Score: 0.74

OneHotEncoder(handle_unknown='ignore')  - other variables       JamesSteinEncoder()  - country variable
LogisticRegression()
Recall: 0.67 Precision: 0.82 F1-Score: 0.74

JamesSteinEncoder()  - other variables       OneHotEncoder(handle_unknown='ignore')  - country variable
LogisticRegression()
Recall: 0.64 Precision: 0.82 F1-Score: 0.72

JamesSteinEncoder()  - other variables       JamesSteinEncoder()  - country variable
LogisticRegression()
Recall: 0.63 Precision: 0.82 F1-Score: 0.71



### Logistic Regression

#### OHE

In [16]:
pd.set_option('max.colwidth', None)

other_variables_pipe = Pipeline([('encoder_other_variables', OneHotEncoder(handle_unknown= 'ignore'))])
country_pipe = Pipeline([('encoder_country', OneHotEncoder(handle_unknown= 'ignore'))])
num_pipe = Pipeline([('num_pwr_transf', PowerTransformer()), 
                     ('num_std_scaler', StandardScaler())])
   
transformer = ColumnTransformer([('other_variables_transf', other_variables_pipe, other_variables),
                                 ('country_transf', country_pipe, country_variable), 
                                 ('num_transf', num_pipe, numerical_variables)])

logreg = LogisticRegression()

pipe = Pipeline([('transformer', transformer), ('model', logreg)])

weights = np.arange(0.2, 0.8, 0.005)
param_grid = {'model__class_weight':[{0:x, 1:1-x} for x in weights]}

rscv_logreg = RandomizedSearchCV(pipe, param_grid, cv= 10, n_jobs= -1, 
                    scoring= ['recall', 'precision', 'f1', 'roc_auc'], refit= 'f1', return_train_score= True, random_state= 4)

rscv_logreg.fit(X_train, y_train)

best_results = pd.DataFrame(rscv_logreg.cv_results_)

In [17]:
best_results[['params','mean_train_recall', 'mean_test_recall', 'mean_train_precision', 'mean_test_precision', 
              'mean_train_f1', 'mean_test_f1', 'mean_train_roc_auc', 'mean_test_roc_auc']].sort_values(
                                                                                'mean_test_f1', ascending= False).round(2)

Unnamed: 0,params,mean_train_recall,mean_test_recall,mean_train_precision,mean_test_precision,mean_train_f1,mean_test_f1,mean_train_roc_auc,mean_test_roc_auc
2,"{'model__class_weight': {0: 0.3250000000000001, 1: 0.6749999999999998}}",0.83,0.82,0.71,0.71,0.76,0.76,0.9,0.9
7,"{'model__class_weight': {0: 0.3200000000000001, 1: 0.6799999999999999}}",0.83,0.83,0.71,0.71,0.76,0.76,0.9,0.9
5,"{'model__class_weight': {0: 0.4050000000000002, 1: 0.5949999999999998}}",0.76,0.76,0.77,0.76,0.76,0.76,0.9,0.9
9,"{'model__class_weight': {0: 0.3000000000000001, 1: 0.7}}",0.84,0.84,0.69,0.69,0.76,0.76,0.9,0.9
4,"{'model__class_weight': {0: 0.2950000000000001, 1: 0.7049999999999998}}",0.85,0.85,0.69,0.69,0.76,0.76,0.9,0.9
3,"{'model__class_weight': {0: 0.2800000000000001, 1: 0.72}}",0.86,0.86,0.68,0.68,0.76,0.76,0.9,0.9
0,"{'model__class_weight': {0: 0.26500000000000007, 1: 0.7349999999999999}}",0.87,0.87,0.66,0.66,0.75,0.75,0.9,0.9
6,"{'model__class_weight': {0: 0.22500000000000003, 1: 0.7749999999999999}}",0.9,0.9,0.63,0.63,0.74,0.74,0.9,0.9
1,"{'model__class_weight': {0: 0.21000000000000002, 1: 0.79}}",0.91,0.91,0.62,0.62,0.74,0.74,0.9,0.9
8,"{'model__class_weight': {0: 0.6100000000000003, 1: 0.3899999999999997}}",0.59,0.59,0.88,0.88,0.71,0.71,0.9,0.9


In [18]:
print(rscv_logreg.best_params_)
print('')

y_pred = rscv_logreg.predict(X_test)

print(classification_report(y_test, y_pred))

{'model__class_weight': {0: 0.3250000000000001, 1: 0.6749999999999998}}

              precision    recall  f1-score   support

           0       0.89      0.80      0.84     23067
           1       0.71      0.83      0.77     13578

    accuracy                           0.81     36645
   macro avg       0.80      0.82      0.80     36645
weighted avg       0.82      0.81      0.81     36645



####  OHE + JamesStein 

In [19]:
other_variables_pipe = Pipeline([('encoder_other_variables', OneHotEncoder(handle_unknown= 'ignore'))])
country_pipe = Pipeline([('encoder_country', JamesSteinEncoder())])
num_pipe = Pipeline([('num_pwr_transf', PowerTransformer()), 
                     ('num_std_scaler', StandardScaler())])
   
transformer = ColumnTransformer([('other_variables_transf', other_variables_pipe, other_variables),
                                 ('country_transf', country_pipe, country_variable), 
                                 ('num_transf', num_pipe, numerical_variables)])

logreg = LogisticRegression()

pipe = Pipeline([('transformer', transformer), ('model', logreg)])

weights = np.arange(0.2, 0.8, 0.005)
param_grid = {'model__class_weight':[{0:x, 1:1-x} for x in weights]}

rscv_logreg_jss = RandomizedSearchCV(pipe, param_grid, cv= 10, n_jobs= -1, 
                    scoring= ['recall', 'precision', 'f1', 'roc_auc'], refit= 'f1', return_train_score= True, random_state= 4)

rscv_logreg_jss.fit(X_train, y_train)

best_results = pd.DataFrame(rscv_logreg_jss.cv_results_)

In [20]:
best_results[['params','mean_train_recall', 'mean_test_recall', 'mean_train_precision', 'mean_test_precision', 
              'mean_train_f1', 'mean_test_f1', 'mean_train_roc_auc', 'mean_test_roc_auc']].sort_values(
                                                                                'mean_test_f1', ascending= False).round(2)

Unnamed: 0,params,mean_train_recall,mean_test_recall,mean_train_precision,mean_test_precision,mean_train_f1,mean_test_f1,mean_train_roc_auc,mean_test_roc_auc
7,"{'model__class_weight': {0: 0.3200000000000001, 1: 0.6799999999999999}}",0.82,0.82,0.71,0.7,0.76,0.76,0.9,0.9
2,"{'model__class_weight': {0: 0.3250000000000001, 1: 0.6749999999999998}}",0.82,0.82,0.71,0.71,0.76,0.76,0.9,0.9
5,"{'model__class_weight': {0: 0.4050000000000002, 1: 0.5949999999999998}}",0.76,0.75,0.76,0.76,0.76,0.76,0.9,0.9
9,"{'model__class_weight': {0: 0.3000000000000001, 1: 0.7}}",0.84,0.84,0.69,0.69,0.76,0.76,0.9,0.9
4,"{'model__class_weight': {0: 0.2950000000000001, 1: 0.7049999999999998}}",0.84,0.84,0.69,0.69,0.76,0.76,0.9,0.9
3,"{'model__class_weight': {0: 0.2800000000000001, 1: 0.72}}",0.86,0.85,0.67,0.67,0.75,0.75,0.9,0.9
0,"{'model__class_weight': {0: 0.26500000000000007, 1: 0.7349999999999999}}",0.87,0.87,0.66,0.66,0.75,0.75,0.9,0.9
6,"{'model__class_weight': {0: 0.22500000000000003, 1: 0.7749999999999999}}",0.9,0.9,0.63,0.63,0.74,0.74,0.9,0.9
1,"{'model__class_weight': {0: 0.21000000000000002, 1: 0.79}}",0.91,0.91,0.61,0.61,0.73,0.73,0.9,0.9
8,"{'model__class_weight': {0: 0.6100000000000003, 1: 0.3899999999999997}}",0.59,0.58,0.88,0.88,0.7,0.7,0.9,0.9


In [21]:
print(rscv_logreg_jss.best_params_)
print('')

y_pred = rscv_logreg_jss.predict(X_test)

print(classification_report(y_test, y_pred))

pd.reset_option('max.colwidth')

{'model__class_weight': {0: 0.3200000000000001, 1: 0.6799999999999999}}

              precision    recall  f1-score   support

           0       0.89      0.80      0.84     23067
           1       0.70      0.83      0.76     13578

    accuracy                           0.81     36645
   macro avg       0.80      0.81      0.80     36645
weighted avg       0.82      0.81      0.81     36645

