In [24]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_validate
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PowerTransformer
from category_encoders import JamesSteinEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report
from feature_engine.encoding import RareLabelEncoder
import joblib as jb

In [2]:
df = pd.read_csv('df_changed.csv')

In [3]:
X = df.drop(['is_canceled'], axis= 1)
y = df['is_canceled']

categorical_variables = X.select_dtypes('object').columns.to_list()
numerical_variables = X.select_dtypes('number').columns.to_list()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.33, random_state= 4)

In [4]:
rare_encoder = RareLabelEncoder(tol= 0.003900, variables= ['country'], replace_with= 'Other Country')
rare_encoder.fit(X_train)
X_train = rare_encoder.transform(X_train)
X_test = rare_encoder.transform(X_test)

In [5]:
categorical_variables

['hotel',
 'meal',
 'country',
 'market_segment',
 'distribution_channel',
 'reserved_room_type',
 'deposit_type',
 'customer_type']

In [6]:
country_variable = [categorical_variables[2]]

In [7]:
other_variables = []
for i in categorical_variables:
    if i != 'country':
        other_variables.append(i)
    else:
        pass

#### Testando combinação de  diferentes encoders.

In [8]:
def encoders_model_results_tree(encoders_cat, encoders_country, model_):
        
    def differentencoders_model_results(x, y):
        other_variables_pipe = Pipeline([('encoder_other_variables', x)])
        country_pipe = Pipeline([('encoder_country', y)])
        num_pipe = Pipeline([('num_std_scaler', StandardScaler())])
        
        transformer = ColumnTransformer([('other_variables_transf', other_variables_pipe, other_variables),
                                         ('country_transf', country_pipe, country_variable), 
                                         ('num_transf', num_pipe, numerical_variables)])
        
        X_train_transformed = transformer.fit_transform(X_train, y_train)
        X_test_transformed = transformer.transform(X_test)
    
        cv = cross_validate(model_, X_train_transformed, y_train, cv= 10, scoring= ('recall', 'precision', 'f1'), 
                               n_jobs= -1)
        
        print(model_)
        print('Recall:', cv['test_recall'].mean().round(2), 'Precision:', 
              cv['test_precision'].mean().round(2), 'F1-Score:', cv['test_f1'].mean().round(2))
        print('')
    
    for i in encoders_cat:
        for j in encoders_country:
            print(i, ' - other variables      ', j, ' - country variable')                    
            differentencoders_model_results(i, j)

In [9]:
others = [OneHotEncoder(handle_unknown= 'ignore'), JamesSteinEncoder()]
country = [OneHotEncoder(handle_unknown= 'ignore'), JamesSteinEncoder()]

#Árvore de Decisão
encoders_model_results_tree(others, country, DecisionTreeClassifier(random_state= 4))

OneHotEncoder(handle_unknown='ignore')  - other variables       OneHotEncoder(handle_unknown='ignore')  - country variable
DecisionTreeClassifier(random_state=4)
Recall: 0.79 Precision: 0.78 F1-Score: 0.78

OneHotEncoder(handle_unknown='ignore')  - other variables       JamesSteinEncoder()  - country variable
DecisionTreeClassifier(random_state=4)
Recall: 0.78 Precision: 0.78 F1-Score: 0.78

JamesSteinEncoder()  - other variables       OneHotEncoder(handle_unknown='ignore')  - country variable
DecisionTreeClassifier(random_state=4)
Recall: 0.79 Precision: 0.78 F1-Score: 0.78

JamesSteinEncoder()  - other variables       JamesSteinEncoder()  - country variable
DecisionTreeClassifier(random_state=4)
Recall: 0.78 Precision: 0.78 F1-Score: 0.78



### Decision Tree

#### OHE

In [10]:
transformer = ColumnTransformer([('ohe', OneHotEncoder(handle_unknown= 'ignore'), categorical_variables), 
                                  ('stdscaler', StandardScaler(), numerical_variables)])

tree = DecisionTreeClassifier(random_state=4)

pipe = Pipeline([('transformer', transformer), ('model', tree)])

weights = np.arange(0.2, 0.8, 0.005)
param_grid = {'model__min_samples_leaf':[1,2,3,4,5], 
              'model__class_weight':[{0:x, 1:1-x} for x in weights]}   
    
rscv_tree = RandomizedSearchCV(pipe, param_grid, cv= 10, n_jobs= -1, scoring= ['recall', 'precision', 'f1', 'roc_auc'], 
                             refit= 'f1', return_train_score= True, random_state= 4)

rscv_tree.fit(X_train, y_train)

best_results = pd.DataFrame(rscv_tree.cv_results_)

In [11]:
pd.set_option('max.colwidth', None)
best_results[['params','mean_train_recall', 'mean_test_recall', 'mean_train_precision', 'mean_test_precision', 
              'mean_train_f1', 'mean_test_f1', 'mean_train_roc_auc', 'mean_test_roc_auc']].sort_values(
                                                                                'mean_test_f1', ascending= False).round(2)

Unnamed: 0,params,mean_train_recall,mean_test_recall,mean_train_precision,mean_test_precision,mean_train_f1,mean_test_f1,mean_train_roc_auc,mean_test_roc_auc
9,"{'model__min_samples_leaf': 1, 'model__class_weight': {0: 0.7500000000000004, 1: 0.24999999999999956}}",0.99,0.79,1.0,0.78,0.99,0.78,1.0,0.83
6,"{'model__min_samples_leaf': 2, 'model__class_weight': {0: 0.4250000000000002, 1: 0.5749999999999997}}",0.98,0.81,0.93,0.75,0.95,0.78,1.0,0.85
4,"{'model__min_samples_leaf': 2, 'model__class_weight': {0: 0.34500000000000014, 1: 0.6549999999999998}}",0.98,0.81,0.92,0.75,0.95,0.78,1.0,0.85
7,"{'model__min_samples_leaf': 2, 'model__class_weight': {0: 0.6450000000000005, 1: 0.35499999999999954}}",0.91,0.76,0.98,0.8,0.95,0.78,1.0,0.85
5,"{'model__min_samples_leaf': 2, 'model__class_weight': {0: 0.4100000000000002, 1: 0.5899999999999999}}",0.98,0.81,0.92,0.75,0.95,0.78,1.0,0.85
2,"{'model__min_samples_leaf': 2, 'model__class_weight': {0: 0.5450000000000004, 1: 0.4549999999999996}}",0.92,0.76,0.98,0.8,0.95,0.78,1.0,0.85
3,"{'model__min_samples_leaf': 2, 'model__class_weight': {0: 0.7850000000000006, 1: 0.2149999999999994}}",0.88,0.74,1.0,0.82,0.93,0.78,1.0,0.85
8,"{'model__min_samples_leaf': 4, 'model__class_weight': {0: 0.35500000000000015, 1: 0.6449999999999998}}",0.95,0.81,0.87,0.74,0.91,0.78,0.99,0.88
0,"{'model__min_samples_leaf': 4, 'model__class_weight': {0: 0.6200000000000003, 1: 0.37999999999999967}}",0.84,0.73,0.95,0.82,0.9,0.77,0.99,0.88
1,"{'model__min_samples_leaf': 3, 'model__class_weight': {0: 0.7300000000000004, 1: 0.2699999999999996}}",0.83,0.72,0.98,0.84,0.9,0.77,0.99,0.87


In [12]:
print(rscv_tree.best_params_)
print('')

tree_final = rscv_tree.best_estimator_.fit(X_train, y_train)
y_pred = tree_final.predict(X_test)
print(classification_report(y_test, y_pred))

{'model__min_samples_leaf': 1, 'model__class_weight': {0: 0.7500000000000004, 1: 0.24999999999999956}}

              precision    recall  f1-score   support

           0       0.88      0.87      0.88     23067
           1       0.79      0.80      0.79     13578

    accuracy                           0.84     36645
   macro avg       0.83      0.84      0.83     36645
weighted avg       0.85      0.84      0.85     36645



In [13]:
jb.dump(value= tree_final, filename= 'decision_tree_model')

['decision_tree_model']

####  OHE + JamesStein 

In [14]:
transformer = ColumnTransformer([('other_variables_transf', OneHotEncoder(handle_unknown= 'ignore'), other_variables),
                                 ('country_transf', JamesSteinEncoder(), country_variable),
                                 ('stdscaler', StandardScaler(), numerical_variables)])

tree = DecisionTreeClassifier(random_state=4)

pipe = Pipeline([('transformer', transformer), ('model', tree)])

weights = np.arange(0.2, 0.8, 0.005)
param_grid = {'model__min_samples_leaf':[1,2,3,4,5], 
              'model__class_weight':[{0:x, 1:1-x} for x in weights]}   
    
rscv_tree_jmss = RandomizedSearchCV(pipe, param_grid, cv= 10, n_jobs= -1, scoring= ['recall', 'precision', 'f1', 'roc_auc'], 
                             refit= 'f1', return_train_score= True, random_state= 4)

rscv_tree_jmss.fit(X_train, y_train)

best_results = pd.DataFrame(rscv_tree_jmss.cv_results_)

In [15]:
best_results[['params','mean_train_recall', 'mean_test_recall', 'mean_train_precision', 'mean_test_precision', 
              'mean_train_f1', 'mean_test_f1', 'mean_train_roc_auc', 'mean_test_roc_auc']].sort_values(
                                                                                'mean_test_f1', ascending= False).round(2)

Unnamed: 0,params,mean_train_recall,mean_test_recall,mean_train_precision,mean_test_precision,mean_train_f1,mean_test_f1,mean_train_roc_auc,mean_test_roc_auc
9,"{'model__min_samples_leaf': 1, 'model__class_weight': {0: 0.7500000000000004, 1: 0.24999999999999956}}",0.99,0.79,1.0,0.77,0.99,0.78,1.0,0.83
8,"{'model__min_samples_leaf': 4, 'model__class_weight': {0: 0.35500000000000015, 1: 0.6449999999999998}}",0.95,0.81,0.87,0.75,0.91,0.78,0.99,0.88
4,"{'model__min_samples_leaf': 2, 'model__class_weight': {0: 0.34500000000000014, 1: 0.6549999999999998}}",0.98,0.8,0.93,0.75,0.95,0.78,1.0,0.85
6,"{'model__min_samples_leaf': 2, 'model__class_weight': {0: 0.4250000000000002, 1: 0.5749999999999997}}",0.98,0.8,0.93,0.75,0.95,0.78,1.0,0.85
5,"{'model__min_samples_leaf': 2, 'model__class_weight': {0: 0.4100000000000002, 1: 0.5899999999999999}}",0.98,0.8,0.93,0.75,0.95,0.78,1.0,0.85
7,"{'model__min_samples_leaf': 2, 'model__class_weight': {0: 0.6450000000000005, 1: 0.35499999999999954}}",0.92,0.76,0.98,0.8,0.95,0.77,1.0,0.85
2,"{'model__min_samples_leaf': 2, 'model__class_weight': {0: 0.5450000000000004, 1: 0.4549999999999996}}",0.92,0.75,0.98,0.8,0.95,0.77,1.0,0.85
0,"{'model__min_samples_leaf': 4, 'model__class_weight': {0: 0.6200000000000003, 1: 0.37999999999999967}}",0.85,0.73,0.96,0.82,0.9,0.77,0.99,0.88
3,"{'model__min_samples_leaf': 2, 'model__class_weight': {0: 0.7850000000000006, 1: 0.2149999999999994}}",0.88,0.73,1.0,0.82,0.93,0.77,1.0,0.85
1,"{'model__min_samples_leaf': 3, 'model__class_weight': {0: 0.7300000000000004, 1: 0.2699999999999996}}",0.84,0.72,0.98,0.83,0.9,0.77,0.99,0.87


In [16]:
print(rscv_tree_jmss.best_params_)
print('')

y_pred = rscv_tree_jmss.predict(X_test)

print(classification_report(y_test, y_pred))

pd.reset_option('max.colwidth')

{'model__min_samples_leaf': 1, 'model__class_weight': {0: 0.7500000000000004, 1: 0.24999999999999956}}

              precision    recall  f1-score   support

           0       0.88      0.87      0.87     23067
           1       0.78      0.80      0.79     13578

    accuracy                           0.84     36645
   macro avg       0.83      0.83      0.83     36645
weighted avg       0.84      0.84      0.84     36645



### GaussianNB

In [25]:
transformer = ColumnTransformer([('jmss', JamesSteinEncoder(), categorical_variables),
                                 ('pwr', PowerTransformer(), numerical_variables ),
                                 ('stdscaler', StandardScaler(), numerical_variables)])

gnb = GaussianNB(var_smoothing= 1.5)

pipe = Pipeline([('transformer', transformer), ('model', gnb)])

gnb_cv = cross_validate(pipe, X_train, y_train, cv= 10, n_jobs= -1, 
                        scoring= ['f1', 'precision', 'recall'], return_train_score= True)

In [26]:
gnb_cv_df = pd.DataFrame(gnb_cv)
gnb_cv_df.mean().round(2)

fit_time           3.93
score_time         0.12
test_f1            0.63
train_f1           0.63
test_precision     0.50
train_precision    0.50
test_recall        0.85
train_recall       0.85
dtype: float64

In [27]:
gnb_final = pipe.fit(X_train, y_train)

print(classification_report(y_test, gnb_final.predict(X_test)))

              precision    recall  f1-score   support

           0       0.86      0.50      0.63     23067
           1       0.50      0.86      0.64     13578

    accuracy                           0.63     36645
   macro avg       0.68      0.68      0.63     36645
weighted avg       0.73      0.63      0.63     36645



In [28]:
jb.dump(value= gnb_final, filename= 'gaussian_nb_model')

['gaussian_nb_model']