In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Carregando os dados

In [2]:
import pandas as pd
import os

WORK_DIR = '/content/drive/My Drive/datasets'
DATA_DIR = os.path.join(WORK_DIR, 'olist')
df_abt = pd.read_csv(os.path.join(DATA_DIR, 'propensao_revenda_abt.csv'))

# pega a base de treinamento
df_train = df_abt.query('data_ref_safra < "2018-03-01"')

# pega a base de avaliação (out of time)
df_oot   = df_abt.query('data_ref_safra == "2018-03-01"')

key_vars = ['data_ref_safra', 'seller_id']
num_vars = ['tot_orders_12m', 'tot_items_12m', 'tot_items_dist_12m', 'receita_12m', 'recencia']
cat_vars = ['uf']
target = 'nao_revendeu_next_6m'

features = cat_vars + num_vars

# dados de treinamento
X_train = df_train[features]
y_train = df_train[target]

# dados de avaliação (out of time)
X_oot = df_oot[features]
y_oot = df_oot[target]



In [3]:
!pip install feature-engine

Collecting feature-engine
  Downloading feature_engine-1.6.2-py2.py3-none-any.whl (328 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m328.9/328.9 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: feature-engine
Successfully installed feature-engine-1.6.2


In [4]:
from sklearn.pipeline import Pipeline
from feature_engine.imputation import ArbitraryNumberImputer
from feature_engine.imputation import CategoricalImputer
from feature_engine.encoding import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier

dt = Pipeline(steps=[
    ('numeric_imputer', ArbitraryNumberImputer(variables=num_vars, arbitrary_number=-999)),
    ('categoric_imputer', CategoricalImputer(variables=cat_vars, return_object=True)),
    ('one_hot_encoder', OneHotEncoder(variables=cat_vars)),
    ('Decision_Tree', DecisionTreeClassifier())
])

## Grid Search

In [5]:
# Importamos GridSearchCV
from sklearn.model_selection import GridSearchCV

# Criamos um dicionário que os hiperparâmetros que queremos treinar o modelo
parameters = {
  'Decision_Tree__max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
}

# Criamos um GridSearch passando o modelo, os parâmetros, a métrica que queremos otimizar.
grid_search = GridSearchCV(dt, parameters, scoring='roc_auc', cv=5, n_jobs=-1)

# Realiza o treinamento com os dados
grid_search.fit(X_train, y_train)

In [6]:
# criando uma tabela com os resultados do grid_search
results = pd.DataFrame(grid_search.cv_results_)

# rankeando esses resultados
results.sort_values(by='rank_test_score')

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_Decision_Tree__max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
3,0.268214,0.030594,0.184664,0.018283,4,{'Decision_Tree__max_depth': 4},0.89905,0.870713,0.900654,0.90516,0.898453,0.894806,0.012273,1
4,0.231622,0.042509,0.186807,0.019362,5,{'Decision_Tree__max_depth': 5},0.899596,0.872724,0.896551,0.900762,0.892545,0.892436,0.010258,2
2,0.222832,0.051051,0.157715,0.050669,3,{'Decision_Tree__max_depth': 3},0.892092,0.869464,0.892527,0.902151,0.895258,0.890299,0.011021,3
5,0.232839,0.0637,0.128406,0.050198,6,{'Decision_Tree__max_depth': 6},0.886496,0.857201,0.879699,0.882799,0.888707,0.87898,0.011318,4
1,0.191277,0.057861,0.171765,0.04372,2,{'Decision_Tree__max_depth': 2},0.878607,0.848479,0.880941,0.885121,0.881562,0.874942,0.013395,5
6,0.190982,0.01758,0.108592,0.014816,7,{'Decision_Tree__max_depth': 7},0.876339,0.853959,0.86764,0.868321,0.877099,0.868672,0.008336,6
7,0.116347,0.026213,0.078768,0.031645,8,{'Decision_Tree__max_depth': 8},0.862052,0.828812,0.84708,0.863863,0.85565,0.851491,0.012772,7
8,0.183383,0.016469,0.094671,0.028526,9,{'Decision_Tree__max_depth': 9},0.852038,0.809669,0.833688,0.859231,0.841649,0.839255,0.017173,8
9,0.158379,0.027573,0.080989,0.038569,10,{'Decision_Tree__max_depth': 10},0.825071,0.801286,0.831539,0.837612,0.828218,0.824745,0.012444,9
0,0.195175,0.054208,0.105321,0.033148,1,{'Decision_Tree__max_depth': 1},0.805868,0.791368,0.817157,0.824619,0.812373,0.810277,0.011264,10


Vamos realizar um GridSearch com uma Random Forest.

In [7]:
from sklearn.ensemble import RandomForestClassifier

rf = Pipeline(steps=[
    ('numeric_imputer', ArbitraryNumberImputer(variables=num_vars, arbitrary_number=-999)),
    ('categoric_imputer', CategoricalImputer(variables=cat_vars, return_object=True)),
    ('one_hot_encoder', OneHotEncoder(variables=cat_vars)),
    ('Random_Forest', RandomForestClassifier(n_jobs=-1))
])

In [8]:
parameters = {'Random_Forest__max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
              'Random_Forest__n_estimators': [100, 300, 500]}

grid_search = GridSearchCV(rf, parameters, scoring='roc_auc', cv=5, n_jobs=-1)

In [9]:
grid_search.fit(X_train, y_train)

In [10]:
results = pd.DataFrame(grid_search.cv_results_)
results.sort_values(by='rank_test_score').head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_Random_Forest__max_depth,param_Random_Forest__n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
28,2.062982,0.575846,0.198326,0.029786,10,300,"{'Random_Forest__max_depth': 10, 'Random_Fores...",0.924322,0.90602,0.915661,0.925333,0.922155,0.918698,0.007177,1
29,3.555834,0.908974,0.26716,0.080974,10,500,"{'Random_Forest__max_depth': 10, 'Random_Fores...",0.924001,0.903133,0.914254,0.925394,0.921226,0.917602,0.00819,2
27,0.782887,0.164576,0.194451,0.118591,10,100,"{'Random_Forest__max_depth': 10, 'Random_Fores...",0.921227,0.902362,0.915756,0.925029,0.921252,0.917125,0.007952,3
24,0.595426,0.01006,0.092512,0.004474,9,100,"{'Random_Forest__max_depth': 9, 'Random_Forest...",0.922129,0.90143,0.913343,0.922859,0.917858,0.915523,0.007826,4
25,1.646715,0.022881,0.165427,0.011881,9,300,"{'Random_Forest__max_depth': 9, 'Random_Forest...",0.921929,0.900333,0.913638,0.923119,0.918483,0.9155,0.008268,5


In [11]:
grid_search.best_estimator_.steps[-1][1].get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': 10,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 300,
 'n_jobs': -1,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

### Exercício

**[Responda]** Faça um Grid Search com o LightGBM

## Randomized Search

In [12]:
from lightgbm import LGBMClassifier

lgbm = Pipeline(steps=[
    ('numeric_imputer', ArbitraryNumberImputer(variables=num_vars, arbitrary_number=-999)),
    ('categoric_imputer', CategoricalImputer(variables=cat_vars, return_object=True)),
    ('one_hot_encoder', OneHotEncoder(variables=cat_vars)),
    ('LGBM', LGBMClassifier(n_jobs=-1))
])

In [13]:
from sklearn.model_selection import RandomizedSearchCV

parameters = {'LGBM__learning_rate': [0.001, 0.01],
              'LGBM__num_leaves': [2, 128],
              'LGBM__min_child_samples': [1, 100],
              'LGBM__subsample': [0.05, 1.0],
              'LGBM__colsample_bytree': [0.1, 1.0]}

random_search = RandomizedSearchCV(lgbm, parameters, scoring='roc_auc', cv=5, n_iter=5, n_jobs=-1)

random_search.fit(X_train, y_train)

[LightGBM] [Info] Number of positive: 1332, number of negative: 2163
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000252 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 936
[LightGBM] [Info] Number of data points in the train set: 3495, number of used features: 27
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.381116 -> initscore=-0.484815
[LightGBM] [Info] Start training from score -0.484815


In [14]:
results = pd.DataFrame(random_search.cv_results_)
results.sort_values(by='rank_test_score')

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_LGBM__subsample,param_LGBM__num_leaves,param_LGBM__min_child_samples,param_LGBM__learning_rate,param_LGBM__colsample_bytree,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
4,3.245349,2.660671,0.063144,0.019764,0.05,128,1,0.01,1.0,"{'LGBM__subsample': 0.05, 'LGBM__num_leaves': ...",0.890983,0.880193,0.900346,0.901522,0.907482,0.896105,0.009552,1
0,2.102325,0.50035,0.155004,0.059919,1.0,128,1,0.01,0.1,"{'LGBM__subsample': 1.0, 'LGBM__num_leaves': 1...",0.86223,0.855146,0.874386,0.885247,0.863099,0.868022,0.01059,2
1,3.365517,2.121757,0.062515,0.012805,1.0,128,1,0.001,0.1,"{'LGBM__subsample': 1.0, 'LGBM__num_leaves': 1...",0.855467,0.85069,0.868247,0.878649,0.856032,0.861817,0.01022,3
2,1.18767,0.473722,0.099921,0.028669,1.0,2,100,0.01,0.1,"{'LGBM__subsample': 1.0, 'LGBM__num_leaves': 2...",0.850322,0.847465,0.866928,0.828075,0.857573,0.850072,0.012893,4
3,0.48274,0.257974,0.13526,0.04346,0.05,2,100,0.001,1.0,"{'LGBM__subsample': 0.05, 'LGBM__num_leaves': ...",0.830533,0.815877,0.870201,0.842861,0.848574,0.841609,0.018179,5


In [15]:
from sklearn.utils.fixes import loguniform

parameters = {'LGBM__learning_rate': loguniform(1e-3, 1e-1),
              'LGBM__num_leaves': [2, 128],
              'LGBM__min_child_samples': [1, 100],
              'LGBM__subsample': [0.05, 1.0],
              'LGBM__colsample_bytree': [0.1, 1.0]}

random_search = RandomizedSearchCV(lgbm, parameters, scoring='roc_auc', cv=5, n_iter=30, n_jobs=-1)

In [16]:
random_search.fit(X_train, y_train)

[LightGBM] [Info] Number of positive: 1332, number of negative: 2163
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000269 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 936
[LightGBM] [Info] Number of data points in the train set: 3495, number of used features: 27
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.381116 -> initscore=-0.484815
[LightGBM] [Info] Start training from score -0.484815


In [17]:
results = pd.DataFrame(random_search.cv_results_)
results.sort_values(by='rank_test_score')

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_LGBM__colsample_bytree,param_LGBM__learning_rate,param_LGBM__min_child_samples,param_LGBM__num_leaves,param_LGBM__subsample,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
19,7.885187,4.194302,0.085431,0.058641,1.0,0.077098,1,128,0.05,"{'LGBM__colsample_bytree': 1.0, 'LGBM__learnin...",0.917204,0.905699,0.911988,0.920592,0.926444,0.916386,0.007114,1
8,4.280187,1.521359,0.08031,0.039499,1.0,0.026777,100,128,1.0,"{'LGBM__colsample_bytree': 1.0, 'LGBM__learnin...",0.911326,0.888507,0.904635,0.917775,0.906822,0.905813,0.009751,2
6,1.309485,0.721599,0.10346,0.050755,1.0,0.079163,1,2,0.05,"{'LGBM__colsample_bytree': 1.0, 'LGBM__learnin...",0.908946,0.886808,0.904166,0.916811,0.907569,0.90486,0.009932,3
0,1.760899,0.901354,0.106286,0.046739,1.0,0.009163,100,128,1.0,"{'LGBM__colsample_bytree': 1.0, 'LGBM__learnin...",0.911395,0.886951,0.902924,0.915644,0.906467,0.904676,0.009858,4
27,0.733707,0.256409,0.131417,0.028545,1.0,0.051393,1,2,1.0,"{'LGBM__colsample_bytree': 1.0, 'LGBM__learnin...",0.90801,0.886015,0.904079,0.916052,0.90875,0.904581,0.010057,5
20,1.741341,1.064551,0.058806,0.01491,1.0,0.008345,100,128,1.0,"{'LGBM__colsample_bytree': 1.0, 'LGBM__learnin...",0.911578,0.886171,0.902616,0.915674,0.906836,0.904575,0.010197,6
22,2.723231,0.570544,0.107823,0.060875,1.0,0.002697,100,128,0.05,"{'LGBM__colsample_bytree': 1.0, 'LGBM__learnin...",0.909779,0.883912,0.902208,0.913434,0.904127,0.902692,0.010204,7
26,0.729271,0.195958,0.148039,0.015919,0.1,0.087756,100,2,1.0,"{'LGBM__colsample_bytree': 0.1, 'LGBM__learnin...",0.891529,0.873348,0.889649,0.876192,0.886424,0.883428,0.007311,8
16,1.201286,1.01105,0.066006,0.018048,0.1,0.096554,1,2,1.0,"{'LGBM__colsample_bytree': 0.1, 'LGBM__learnin...",0.883869,0.867509,0.883224,0.88997,0.877494,0.880413,0.007566,9
24,0.993362,1.067634,0.057802,0.00306,0.1,0.071356,100,2,0.05,"{'LGBM__colsample_bytree': 0.1, 'LGBM__learnin...",0.887892,0.869568,0.886619,0.869958,0.883376,0.879483,0.008072,10


### Exercício

**[Responda]** Faça a mesma busca de hiperparâmetros usando uma Random Search, mas agora com o XGBoosting.