In [6]:
# instalando biblioteca para rodar bayesian search
!pip install hyperopt --q


[notice] A new release of pip is available: 24.1.1 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [34]:
# carregando bibliotecas

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_curve, roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
import lightgbm as lgb
from catboost import CatBoostClassifier
import xgboost as xgb
import hyperopt

In [9]:
# carregando os dados em dataframes separados

dfcontract = pd.read_csv('contract.csv')
dfinternet = pd.read_csv('internet.csv')
dfpersonal = pd.read_csv('personal.csv')
dfphone = pd.read_csv('phone.csv')

In [10]:
# convertendo a coluna totalchargers de object para numérica e descartando registros com NaN, pois precisamos de números e não strings

dfcontract["TotalCharges"] = pd.to_numeric(dfcontract["TotalCharges"], errors='coerce')
print(dfcontract["TotalCharges"].isnull().sum())
dfcontract = dfcontract.dropna(subset=['TotalCharges'])

11


In [11]:
# criando uma coluna binária para enddate
dfcontract["EndDateYN"] = np.where(dfcontract["EndDate"] != 'No', 1,0)

In [12]:
# consolidando os dataframes usando outer para preservar o maior número de dados possível e porque usuários podem assinar produtos diferentes

totaldf = pd.merge(dfcontract,dfinternet, on='customerID', how='outer')
totaldf = pd.merge(totaldf,dfpersonal,on='customerID',how='outer')
totaldf = pd.merge(totaldf,dfphone, on='customerID',how='outer')
totaldf = totaldf.fillna(0.0)
totaldf.sample(10)

Unnamed: 0,customerID,BeginDate,EndDate,Type,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,EndDateYN,InternetService,...,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,gender,SeniorCitizen,Partner,Dependents,MultipleLines
3856,5423-BHIXO,2017-06-01,No,Month-to-month,Yes,Electronic check,54.2,1739.6,0.0,DSL,...,Yes,No,No,No,No,Female,0,No,No,Yes
1366,2027-OAQQC,2016-04-01,2019-11-01 00:00:00,Month-to-month,Yes,Bank transfer (automatic),49.05,2076.2,1.0,DSL,...,No,Yes,No,Yes,Yes,Female,0,No,No,0.0
6586,9362-MWODR,2016-10-01,No,Month-to-month,Yes,Credit card (automatic),64.1,2460.35,0.0,DSL,...,No,No,Yes,Yes,No,Female,0,No,Yes,No
4606,6496-SLWHQ,2019-10-01,2020-01-01 00:00:00,Month-to-month,Yes,Electronic check,105.0,294.45,1.0,Fiber optic,...,Yes,Yes,No,Yes,Yes,Male,1,No,No,Yes
4684,6609-MXJHJ,2017-08-01,No,Two year,Yes,Mailed check,25.1,789.55,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,Female,0,Yes,Yes,Yes
2618,3727-JEZTU,2020-01-01,No,Month-to-month,No,Mailed check,20.4,20.4,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,Female,0,No,No,No
3190,4570-QHXHL,2019-05-01,No,Month-to-month,Yes,Electronic check,43.75,405.7,0.0,DSL,...,No,No,No,No,No,Female,0,No,No,No
3923,5520-FVEWJ,2018-10-01,2019-10-01 00:00:00,Month-to-month,Yes,Electronic check,84.5,916.9,1.0,Fiber optic,...,Yes,Yes,No,No,No,Female,0,Yes,Yes,Yes
5004,7030-FZTFM,2014-02-01,No,Two year,No,Credit card (automatic),110.9,7922.75,0.0,Fiber optic,...,Yes,Yes,Yes,Yes,Yes,Male,0,Yes,Yes,Yes
4923,6924-TDGMT,2019-03-01,No,One year,Yes,Mailed check,20.55,184.95,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,Male,0,Yes,No,No


In [13]:
# alterando cabeçalhos para minúscula pois é uma boa prática

totaldf = totaldf.rename(columns=lambda x: x.lower())
totaldf.sample(10)

Unnamed: 0,customerid,begindate,enddate,type,paperlessbilling,paymentmethod,monthlycharges,totalcharges,enddateyn,internetservice,...,onlinebackup,deviceprotection,techsupport,streamingtv,streamingmovies,gender,seniorcitizen,partner,dependents,multiplelines
1687,2460-FPSYH,2016-08-01,2019-12-01 00:00:00,Month-to-month,Yes,Electronic check,55.8,2109.35,1.0,DSL,...,Yes,No,No,Yes,Yes,Female,1,No,No,0.0
2986,4280-DLSHD,2019-06-01,No,Month-to-month,Yes,Mailed check,54.75,445.85,0.0,DSL,...,Yes,Yes,No,No,No,Male,0,Yes,No,No
6623,9412-ARGBX,2015-12-01,2019-12-01 00:00:00,Two year,Yes,Mailed check,95.5,4627.85,1.0,Fiber optic,...,Yes,Yes,Yes,Yes,No,Female,0,No,Yes,No
5577,7854-EDSSA,2018-02-01,2019-12-01 00:00:00,Month-to-month,No,Electronic check,59.0,1254.7,1.0,DSL,...,Yes,No,No,No,No,Male,0,No,No,Yes
402,0599-XNYDO,2018-06-01,No,Month-to-month,Yes,Bank transfer (automatic),69.8,1540.35,0.0,Fiber optic,...,No,No,No,No,No,Female,0,Yes,No,No
5827,8201-AAXCB,2018-01-01,No,Month-to-month,No,Electronic check,60.35,1404.65,0.0,DSL,...,No,No,No,Yes,No,Male,0,Yes,Yes,Yes
3569,5057-LCOUI,2016-10-01,2020-01-01 00:00:00,Month-to-month,Yes,Electronic check,50.75,2011.4,1.0,DSL,...,No,Yes,No,Yes,Yes,Female,0,No,No,0.0
4397,6199-IPCAO,2017-09-01,No,Two year,No,Mailed check,26.1,692.55,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,Female,0,Yes,Yes,Yes
715,1052-QJIBV,2014-03-01,No,Two year,No,Credit card (automatic),19.9,1397.3,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,Female,0,Yes,Yes,No
1532,2239-CGBUZ,2015-11-01,No,One year,Yes,Bank transfer (automatic),60.5,3145.15,0.0,DSL,...,Yes,No,Yes,No,No,Female,0,Yes,No,No


In [14]:
# alterando valores Yes e No para 1 e 0, pois a coluna objetivo precisa ser numérica para o modelo poder ser treinado

totaldf = totaldf.replace({'Yes': 1, 'No': 0})
totaldf.sample(10)

  totaldf = totaldf.replace({'Yes': 1, 'No': 0})


Unnamed: 0,customerid,begindate,enddate,type,paperlessbilling,paymentmethod,monthlycharges,totalcharges,enddateyn,internetservice,...,onlinebackup,deviceprotection,techsupport,streamingtv,streamingmovies,gender,seniorcitizen,partner,dependents,multiplelines
5286,7463-IFMQU,2014-02-01,0,Two year,0.0,Bank transfer (automatic),20.05,1423.65,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,Female,0,1,0,0.0
3324,4735-ASGMA,2017-09-01,2019-11-01 00:00:00,Month-to-month,1.0,Electronic check,98.35,2515.3,1.0,Fiber optic,...,0.0,1.0,0.0,1.0,1.0,Male,0,0,0,1.0
1089,1585-MQSSU,2018-08-01,2020-01-01 00:00:00,Month-to-month,0.0,Mailed check,51.5,900.5,1.0,DSL,...,0.0,0.0,1.0,0.0,0.0,Male,0,0,0,0.0
2245,3208-YPIOE,2016-10-01,2020-01-01 00:00:00,Month-to-month,0.0,Electronic check,75.25,3017.65,1.0,Fiber optic,...,0.0,0.0,0.0,0.0,0.0,Male,0,0,0,1.0
5506,7753-USQYQ,2015-07-01,0,One year,1.0,Electronic check,64.2,3627.3,0.0,DSL,...,1.0,0.0,1.0,0.0,1.0,Male,0,0,0,0.0
3381,4815-YOSUK,2014-02-01,0,One year,1.0,Credit card (automatic),104.9,7537.5,0.0,Fiber optic,...,1.0,1.0,1.0,0.0,1.0,Male,0,1,1,1.0
3625,5136-KCKGI,2017-02-01,2019-11-01 00:00:00,One year,1.0,Mailed check,103.7,3467.0,1.0,Fiber optic,...,0.0,1.0,0.0,1.0,1.0,Female,0,1,1,1.0
6595,9372-TXXPS,2015-02-01,0,Two year,0.0,Bank transfer (automatic),59.85,3483.45,0.0,DSL,...,1.0,0.0,1.0,0.0,0.0,Female,0,1,0,0.0
1936,2811-POVEX,2018-03-01,0,Month-to-month,1.0,Bank transfer (automatic),88.45,2130.55,0.0,Fiber optic,...,1.0,1.0,0.0,0.0,1.0,Female,1,1,1,0.0
2644,3756-VNWDH,2014-09-01,0,One year,1.0,Electronic check,100.75,6674.65,0.0,Fiber optic,...,0.0,0.0,0.0,1.0,1.0,Male,1,1,0,1.0


In [15]:
# codificando colunas usando one hot pois são poucas opções de valores textuais, ou são binários e não há relação entre eles

totaldf_encoded = pd.get_dummies(totaldf,columns=['type','paymentmethod'])
totaldf_encoded.sample(10)

Unnamed: 0,customerid,begindate,enddate,paperlessbilling,monthlycharges,totalcharges,enddateyn,internetservice,onlinesecurity,onlinebackup,...,multiplelines,type_0.0,type_Month-to-month,type_One year,type_Two year,paymentmethod_0.0,paymentmethod_Bank transfer (automatic),paymentmethod_Credit card (automatic),paymentmethod_Electronic check,paymentmethod_Mailed check
6828,9700-ISPUP,2019-04-01,0,1.0,65.5,616.9,0.0,DSL,0.0,1.0,...,0.0,False,True,False,False,False,False,False,True,False
3832,5383-MMTWC,2019-04-01,2019-12-01 00:00:00,1.0,84.0,613.4,1.0,Fiber optic,0.0,0.0,...,1.0,False,True,False,False,False,False,False,True,False
985,1429-UYJSV,2019-02-01,0,0.0,19.7,260.9,0.0,0.0,0.0,0.0,...,0.0,False,False,True,False,False,False,False,False,True
6201,8775-CEBBJ,2019-02-01,2019-11-01 00:00:00,1.0,44.2,403.35,1.0,DSL,0.0,0.0,...,0.0,False,True,False,False,False,True,False,False,False
6726,9552-TGUZV,2019-06-01,0,1.0,75.0,658.1,0.0,Fiber optic,1.0,0.0,...,0.0,False,True,False,False,False,False,False,False,True
3897,5481-NTDOH,2014-07-01,0,1.0,107.05,7142.5,0.0,Fiber optic,1.0,0.0,...,1.0,False,False,True,False,False,False,True,False,False
5570,7850-VWJUU,2018-03-01,0,1.0,75.0,1778.5,0.0,Fiber optic,1.0,0.0,...,0.0,False,True,False,False,False,True,False,False,False
1491,2197-OMWGI,2014-06-01,0,1.0,53.0,3656.25,0.0,DSL,0.0,1.0,...,0.0,False,False,False,True,False,False,False,True,False
3469,4915-BFSXL,2014-04-01,0,0.0,68.95,4858.7,0.0,DSL,1.0,0.0,...,1.0,False,False,False,True,False,False,True,False,False
1901,2761-XECQW,2019-06-01,0,1.0,43.35,371.4,0.0,DSL,0.0,1.0,...,0.0,False,True,False,False,False,False,False,False,True


In [16]:
# codificando colunas com one hot dropando a primeira coluna para evitar multicolinearidade nos dados

totaldf_encoded2 = pd.get_dummies(totaldf_encoded,columns=['internetservice','gender'],drop_first=True)
totaldf_encoded2.sample(10)

Unnamed: 0,customerid,begindate,enddate,paperlessbilling,monthlycharges,totalcharges,enddateyn,onlinesecurity,onlinebackup,deviceprotection,...,type_One year,type_Two year,paymentmethod_0.0,paymentmethod_Bank transfer (automatic),paymentmethod_Credit card (automatic),paymentmethod_Electronic check,paymentmethod_Mailed check,internetservice_DSL,internetservice_Fiber optic,gender_Male
4483,6328-ZPBGN,2018-11-01,2019-10-01 00:00:00,1.0,95.15,997.65,1.0,0.0,0.0,0.0,...,False,False,False,True,False,False,False,False,True,False
6239,8819-ZBYNA,2015-04-01,0,1.0,109.1,6393.65,0.0,1.0,1.0,0.0,...,False,True,False,False,True,False,False,False,True,False
5631,7932-WPTDS,2019-10-01,2019-11-01 00:00:00,1.0,24.8,24.8,1.0,0.0,0.0,0.0,...,False,False,False,False,False,True,False,True,False,False
1755,2565-JSLRY,2019-10-01,2019-11-01 00:00:00,1.0,24.05,24.05,1.0,0.0,0.0,0.0,...,False,False,False,False,False,False,True,False,False,True
1258,1862-QRWPE,2016-02-01,0,0.0,20.65,1057.0,0.0,0.0,0.0,0.0,...,False,True,False,True,False,False,False,False,False,False
1905,2774-LVQUS,2018-10-01,2020-01-01 00:00:00,1.0,83.05,1258.3,1.0,0.0,0.0,1.0,...,False,False,False,False,False,True,False,False,True,False
4566,6437-UDQJM,2014-02-01,0,1.0,84.1,6129.65,0.0,1.0,1.0,1.0,...,False,True,False,False,True,False,False,True,False,False
3287,4695-VADHF,2018-07-01,2020-01-01 00:00:00,0.0,57.45,990.85,1.0,0.0,0.0,1.0,...,False,False,False,False,False,True,False,True,False,True
6194,8760-ZRHKE,2014-03-01,0,0.0,69.2,4982.5,0.0,1.0,1.0,0.0,...,True,False,False,False,False,True,False,True,False,False
5983,8436-BJUMM,2017-11-01,2020-01-01 00:00:00,1.0,83.75,2070.6,1.0,0.0,0.0,0.0,...,False,False,False,False,False,True,False,False,True,True


In [17]:
# declarando features e target, removendo colunas desnecessárias nas features

target = totaldf_encoded2['enddateyn']
features = totaldf_encoded2.drop(['customerid','begindate','enddate','enddateyn'],axis=1)

In [18]:
# separando conjunto de treino e teste, usando random_state para replicabilidade da divisão

features_train, features_valid, target_train, target_valid = train_test_split(features,target, test_size=.25, random_state=12345)

In [19]:
# treinando modelo arvore de decisão usando gridSearchCV

# Definindo os hiperparametros
param_grid = {
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 5, 10],
    'criterion': ['gini', 'entropy'],
    'max_depth': [2,5,7,10,15],
    'class_weight':["balanced"]
}

# Instanciando o modelo
dtc = DecisionTreeClassifier()

# instanciando o GridSearchCV
grid_search = GridSearchCV(estimator=dtc, param_grid=param_grid, cv=5, scoring='roc_auc')

# Fazendo o fit
grid_search.fit(features_train, target_train)

# coletando resultados
print("Best Parameters: ", grid_search.best_params_)
print("Best AUC-ROC Score: ", grid_search.best_score_)

Best Parameters:  {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 5, 'min_samples_leaf': 1, 'min_samples_split': 2}
Best AUC-ROC Score:  0.8241719524978522


In [20]:
# treinando modelo floresta aleatória usando gridSearchCV

# Definindo os hiperparametros
param_grid = {
    'max_features': ['auto', 'sqrt', 'log2'],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 5, 10],
    'n_estimators':[50, 60, 70, 80, 100],
    'max_depth':[2,5,7,10,15],
    'class_weight':["balanced"]
}

# Instanciando o modelo
rfc = RandomForestClassifier()

# instanciando o GridSearchCV
rfcgrid_search = GridSearchCV(estimator=rfc, param_grid=param_grid, cv=5, scoring='roc_auc')

# treinando o modelo
rfcgrid_search.fit(features_train, target_train)

# coletando resultados
print("Best Parameters: ", rfcgrid_search.best_params_)
print("Best AUC-ROC Score: ", rfcgrid_search.best_score_)

1125 fits failed out of a total of 3375.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1125 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\daniz\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\daniz\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py", line 1467, in wrapper
    estimator._validate_params()
  File "c:\Users\daniz\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\daniz\AppData\Local\Programs\Python\Python31

Best Parameters:  {'class_weight': 'balanced', 'max_depth': 15, 'max_features': 'sqrt', 'min_samples_leaf': 10, 'min_samples_split': 2, 'n_estimators': 70}
Best AUC-ROC Score:  0.8410025766158853


In [21]:
# treinando modelo regressão usando gridSearchCV

# Definindo os hiperparametros
param_grid = {
    'C': [0.1, 1, 5, 10, 25, 50, 100],
    'penalty': ['l1', 'l2'],
    #'max_iter': [100, 500, 1000],
    #'solver':['liblinear','saga'],
    'class_weight':['balanced', None]
}

# Instanciando o modelo
lr = LogisticRegression(solver='liblinear')

# instanciando o GridSearchCV
lrgrid_search = GridSearchCV(estimator=lr, param_grid=param_grid, cv=5, scoring='roc_auc', n_jobs=-1)

# treinando o modelo
lrgrid_search.fit(features_train, target_train)

# coletando resultados
print("Best Parameters: ", lrgrid_search.best_params_)
print("Best AUC-ROC Score: ", lrgrid_search.best_score_)

Best Parameters:  {'C': 50, 'class_weight': None, 'penalty': 'l1'}
Best AUC-ROC Score:  0.8337357417005056


In [22]:
# treinando modelo usando LGBMClassifier

# Instanciando o modelo
lightgb = lgb.LGBMClassifier()

# treinando o modelo
lightgb.fit(features_train, target_train)

# previsões e resultados
target_pred_proba = lightgb.predict_proba(features_valid)[:, 1]
auc_roc = roc_auc_score(target_valid, target_pred_proba)
print("AUC-ROC:", auc_roc) 

[LightGBM] [Info] Number of positive: 1411, number of negative: 3871
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001163 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 552
[LightGBM] [Info] Number of data points in the train set: 5282, number of used features: 23
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.267134 -> initscore=-1.009214
[LightGBM] [Info] Start training from score -1.009214
AUC-ROC: 0.8404571914996297


In [23]:
# otimizando LGBMClassifier usando gridSearchCV

# listando hiperparametros e opções de valores 
param_grid = {
    'num_leaves': [10, 31, 62, 127],
    'learning_rate': [0.01, 0.05, 0.1, 0.2]
}

# instanciando e treinando o modelo
grid_search = GridSearchCV(lightgb, param_grid, cv=5, scoring='roc_auc')
grid_search.fit(features_train, target_train)

# coletando resultados dos melhores hiperparametros
print("Best parameters:", grid_search.best_params_)
print("Best AUC-ROC:", grid_search.best_score_)

[LightGBM] [Info] Number of positive: 1128, number of negative: 3097
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000242 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 552
[LightGBM] [Info] Number of data points in the train set: 4225, number of used features: 23
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.266982 -> initscore=-1.009988
[LightGBM] [Info] Start training from score -1.009988
[LightGBM] [Info] Number of positive: 1129, number of negative: 3096
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000237 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 552
[LightGBM] [Info] Number of data points in the train set: 4225, number of used features: 23
[LightGBM] [Info] [binary:

In [24]:
# treinando modelo usando GradientBoostingClassifier

# Instanciando o modelo
gboost = GradientBoostingClassifier()

# treinando o modelo
gboost.fit(features_train, target_train)

# previsões e resultados
target_pred_proba = gboost.predict_proba(features_valid)[:, 1]
auc_roc = roc_auc_score(target_valid, target_pred_proba)
print("AUC-ROC:", auc_roc)

AUC-ROC: 0.8582235486130427


In [25]:
# otimizando GradientBoostingClassifier usando gridSearchCV

# listando hiperparametros e opções de valores
param_grid = {
    'learning_rate': [0.05, 0.1, 0.2],
    'n_estimators': [20, 50, 100, 200]
}

# instanciando e treinando o modelo
grid_search = GridSearchCV(gboost, param_grid, cv=5, scoring='roc_auc')
grid_search.fit(features_train, target_train)

# coletando resultados dos melhores hiperparametros
print("Best parameters:", grid_search.best_params_)
print("Best AUC-ROC:", grid_search.best_score_)

Best parameters: {'learning_rate': 0.05, 'n_estimators': 100}
Best AUC-ROC: 0.8414472570623254


In [26]:
# treinando modelo usando CatBoostClassifier

# Instanciando o modelo
catboost = CatBoostClassifier()

# treinando o modelo
catboost.fit(features_train, target_train)

# previsões e resultados
target_pred_proba = catboost.predict_proba(features_valid)[:, 1]
auc_roc = roc_auc_score(target_valid, target_pred_proba)
print("AUC-ROC:", auc_roc)

Learning rate set to 0.020969
0:	learn: 0.6785162	total: 152ms	remaining: 2m 31s
1:	learn: 0.6642282	total: 154ms	remaining: 1m 16s
2:	learn: 0.6509378	total: 157ms	remaining: 52s
3:	learn: 0.6399908	total: 159ms	remaining: 39.6s
4:	learn: 0.6280742	total: 177ms	remaining: 35.3s
5:	learn: 0.6179177	total: 186ms	remaining: 30.8s
6:	learn: 0.6083228	total: 190ms	remaining: 26.9s
7:	learn: 0.5989824	total: 201ms	remaining: 24.9s
8:	learn: 0.5894303	total: 205ms	remaining: 22.6s
9:	learn: 0.5813158	total: 209ms	remaining: 20.7s
10:	learn: 0.5732034	total: 212ms	remaining: 19.1s
11:	learn: 0.5650178	total: 216ms	remaining: 17.8s
12:	learn: 0.5572442	total: 220ms	remaining: 16.7s
13:	learn: 0.5510140	total: 223ms	remaining: 15.7s
14:	learn: 0.5444768	total: 233ms	remaining: 15.3s
15:	learn: 0.5381160	total: 238ms	remaining: 14.7s
16:	learn: 0.5323505	total: 248ms	remaining: 14.3s
17:	learn: 0.5270828	total: 252ms	remaining: 13.7s
18:	learn: 0.5221900	total: 256ms	remaining: 13.2s
19:	learn: 

In [27]:
# otimizando CatBoost usando gridSearchCV

# listando hiperparametros e opções de valores
param_grid = {
    'learning_rate': [0.05, 0.1, 0.2],
    'depth': [3, 5, 7, 10]
}

# instanciando e treinando o modelo
grid_search = GridSearchCV(catboost, param_grid, cv=5, scoring='roc_auc')
grid_search.fit(features_train, target_train)

# coletando resultados dos melhores hiperparametros
print("Best parameters:", grid_search.best_params_)
print("Best AUC-ROC:", grid_search.best_score_)

0:	learn: 0.6618014	total: 1.6ms	remaining: 1.59s
1:	learn: 0.6360446	total: 3.98ms	remaining: 1.99s
2:	learn: 0.6154948	total: 5.45ms	remaining: 1.81s
3:	learn: 0.5979561	total: 6.71ms	remaining: 1.67s
4:	learn: 0.5819932	total: 8.19ms	remaining: 1.63s
5:	learn: 0.5658500	total: 9.46ms	remaining: 1.57s
6:	learn: 0.5539019	total: 10.8ms	remaining: 1.53s
7:	learn: 0.5427694	total: 12ms	remaining: 1.48s
8:	learn: 0.5315860	total: 13.4ms	remaining: 1.47s
9:	learn: 0.5213452	total: 15ms	remaining: 1.49s
10:	learn: 0.5130235	total: 16.4ms	remaining: 1.47s
11:	learn: 0.5056964	total: 17.8ms	remaining: 1.46s
12:	learn: 0.4992946	total: 19.1ms	remaining: 1.45s
13:	learn: 0.4935662	total: 20.5ms	remaining: 1.44s
14:	learn: 0.4877149	total: 21.9ms	remaining: 1.44s
15:	learn: 0.4837140	total: 23.3ms	remaining: 1.43s
16:	learn: 0.4796360	total: 24.6ms	remaining: 1.42s
17:	learn: 0.4750022	total: 25.9ms	remaining: 1.41s
18:	learn: 0.4718557	total: 27.4ms	remaining: 1.42s
19:	learn: 0.4691527	total:

In [28]:
# treinando modelo usando XGBoost

# Instanciando o modelo
xgboost = xgb.XGBClassifier()

# treinando o modelo
xgboost.fit(features_train, target_train)

# previsões e resultados
target_pred_proba = xgboost.predict_proba(features_valid)[:, 1]
auc_roc = roc_auc_score(target_valid, target_pred_proba)
print("AUC-ROC:", auc_roc)

AUC-ROC: 0.8305430531490983


In [29]:
# otimizando XGBoost usando gridSearchCV

# listando hiperparametros e opções de valores
param_grid = {
    'learning_rate': [0.05, 0.1, 0.2],
    'max_depth': [3, 5, 7], 
    'n_estimators': [50, 100, 200]
}

# instanciando e treinando o modelo
grid_search = GridSearchCV(xgboost, param_grid, cv=5, scoring='roc_auc')
grid_search.fit(features_train, target_train)

# coletando resultados dos melhores hiperparametros
print("Best parameters:", grid_search.best_params_)
print("Best AUC-ROC:", grid_search.best_score_)

Best parameters: {'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 100}
Best AUC-ROC: 0.8426740178516298


In [30]:
# treinando modelo usando KNN

# Instanciando o modelo
knn = KNeighborsClassifier()

knn.fit(features_train, target_train)

# previsões e resultados
target_pred_proba = knn.predict_proba(features_valid)[:, 1]
auc_roc = roc_auc_score(target_valid, target_pred_proba)
print("AUC-ROC:", auc_roc)

AUC-ROC: 0.7596828950322903


In [31]:
# otimizando KNN usando gridSearchCV

# listando hiperparametros e opções de valores
param_grid = {'n_neighbors': [3, 5, 7, 9, 20, 50]}

# instanciando e treinando o modelo
grid_search = GridSearchCV(knn, param_grid, cv=5, scoring='roc_auc')
grid_search.fit(features_train, target_train)

# coletando resultados dos melhores hiperparametros
print("Best parameters:", grid_search.best_params_)
print("Best AUC-ROC:", grid_search.best_score_)

Best parameters: {'n_neighbors': 20}
Best AUC-ROC: 0.7695557677417751


In [35]:
# Aplicando bayesian para melhorar ainda mais o Gradient Boosting, que apresentou o melhor score

# # listando hiperparametros e intervalos de valores a testar
space = {
    'learning_rate': hyperopt.hp.loguniform('learning_rate', -5, 0),
    'n_estimators': hyperopt.hp.quniform('n_estimators', 10, 100, 10),
    'max_depth': hyperopt.hp.quniform('max_depth', 3, 10, 1),
    'min_samples_split': hyperopt.hp.quniform('min_samples_split', 2, 10, 1),
    'min_samples_leaf': hyperopt.hp.quniform('min_samples_leaf', 1, 5, 1)
}

# criando uma função para instanciar o modelo, treinar e coletar resultados de auc-roc
def objective(params):
    # Train a Gradient Boosting Classifier model with the given hyperparameters
    gbc = GradientBoostingClassifier(
        learning_rate=params['learning_rate'],
        n_estimators=int(params['n_estimators']),
        max_depth=int(params['max_depth']),
        min_samples_split=int(params['min_samples_split']),
        min_samples_leaf=int(params['min_samples_leaf'])
    )
    gbc.fit(features_train, target_train)

    # Evaluate the model on the testing data
    target_pred_proba = gbc.predict_proba(features_valid)[:, 1]
    auc_roc = roc_auc_score(target_valid, target_pred_proba)

    # Return the negative AUC-ROC score (since hyperopt minimizes the objective function)
    return -auc_roc

# instanciando os trials
trials = hyperopt.Trials()

# rodando os trials para buscar os melhores hiperparametros
best = hyperopt.fmin(objective, space, algo=hyperopt.rand.suggest, trials=trials, max_evals=300)

# coletando resultados
print("Best hyperparameters:", best)
print("Best AUC-ROC:", -trials.best_trial['result']['loss'])

100%|██████████| 300/300 [03:11<00:00,  1.57trial/s, best loss: -0.8611425765867816]
Best hyperparameters: {'learning_rate': 0.03429853902906441, 'max_depth': 5.0, 'min_samples_leaf': 1.0, 'min_samples_split': 8.0, 'n_estimators': 70.0}
Best AUC-ROC: 0.8611425765867816


# Bayesian Results

## Attempt 1 - max_evals=50
Best hyperparameters: {'learning_rate': 0.07081385211123575, 'max_depth': 4.0, 'min_samples_leaf': 2.0, 'min_samples_split': 3.0, 'n_estimators': 80.0}
Best AUC-ROC: 0.8434281355974937

## Attempt 2 - max_evals=100
Best hyperparameters: {'learning_rate': 0.039589185015199285, 'max_depth': 4.0, 'min_samples_leaf': 3.0, 'min_samples_split': 7.0, 'n_estimators': 60.0}
Best AUC-ROC: 0.8444177602965092

## Attempt 3 = max_evals=200
Best hyperparameters: {'learning_rate': 0.03429853902906441, 'max_depth': 5.0, 'min_samples_leaf': 1.0, 'min_samples_split': 8.0, 'n_estimators': 70.0}
Best AUC-ROC: 0.8611425765867816

## Attempt 4 = max_evals=300
Best hyperparameters: {'learning_rate': 0.08386938804518637, 'max_depth': 3.0, 'min_samples_leaf': 3.0, 'min_samples_split': 7.0, 'n_estimators': 50.0}
Best AUC-ROC: 0.8453411999041893

# Conclusão
- **Best Model**: The best-performing model is the **Gradient Boosting Classifier**, with hyperparameters optimized in the third attempt, resulting in an **AUC-ROC** score of approximately **0.86**.
