In [1]:
!pip install feature-engine

Collecting feature-engine
  Downloading feature_engine-1.6.2-py2.py3-none-any.whl (328 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m328.9/328.9 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: feature-engine
Successfully installed feature-engine-1.6.2


# Boosting

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import os
import pandas as pd

WORK_DIR = '/content/drive/My Drive/datasets'
DATA_DIR = os.path.join(WORK_DIR, 'olist')
df_abt = pd.read_csv(os.path.join(DATA_DIR, 'propensao_revenda_abt.csv'))

df_train = df_abt.query('data_ref_safra < "2018-03-01"')
df_oot = df_abt.query('data_ref_safra == "2018-03-01"')

key_vars = ['data_ref_safra', 'seller_id']
num_vars = ['tot_orders_12m', 'tot_items_12m', 'tot_items_dist_12m', 'receita_12m', 'recencia']
cat_vars = ['uf']
target = 'nao_revendeu_next_6m'
features = cat_vars + num_vars

# Dados de treinamento
X_train = df_train[features]
y_train = df_train[target]

# Dados de avaliação (out of time)
X_oot = df_oot[features]
y_oot = df_oot[target]

In [4]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from feature_engine.imputation import ArbitraryNumberImputer
from feature_engine.imputation import CategoricalImputer
from feature_engine.encoding import OneHotEncoder
from feature_engine.wrappers import SklearnTransformerWrapper
from sklearn.metrics import accuracy_score

## Gradient Boosting

In [5]:
from sklearn.ensemble import GradientBoostingClassifier

In [6]:
gbt = Pipeline(steps=[
    ('numeric_imputer', ArbitraryNumberImputer(variables=num_vars, arbitrary_number=-999)),
    ('categoric_imputer', CategoricalImputer(variables=cat_vars, return_object=True)),
    ('one_hot_encoder', OneHotEncoder(variables=cat_vars)),
    ('gradient_booting_tree', GradientBoostingClassifier(random_state=42))
])

In [7]:
gbt.fit(X_train, y_train)

In [8]:
y_pred = gbt.predict(X_oot)
print(gbt[-1].__class__.__name__, accuracy_score(y_oot, y_pred))

GradientBoostingClassifier 0.8388473852721452


## XGBoosting

In [9]:
!pip install xgboost



In [10]:
from xgboost import XGBClassifier

xgb = Pipeline(steps=[
    ('numeric_imputer', ArbitraryNumberImputer(variables=num_vars, arbitrary_number=-999)),
    ('categoric_imputer', CategoricalImputer(variables=cat_vars, return_object=True)),
    ('one_hot_encoder', OneHotEncoder(variables=cat_vars)),
    ('XGBoosting', XGBClassifier(random_state=42))
])

In [11]:
xgb.fit(X_train, y_train)

In [12]:
y_pred = xgb.predict(X_oot)
print(xgb[-1].__class__.__name__, accuracy_score(y_oot, y_pred))

XGBClassifier 0.8617929562433297


## LightGBM

In [13]:
!pip install lightgbm



In [14]:
from lightgbm import LGBMClassifier

lgbm = Pipeline(steps=[
    ('numeric_imputer', ArbitraryNumberImputer(variables=num_vars, arbitrary_number=-999)),
    ('categoric_imputer', CategoricalImputer(variables=cat_vars, return_object=True)),
    ('one_hot_encoder', OneHotEncoder(variables=cat_vars)),
    ('LGBMClassifier', LGBMClassifier(random_state=42))
])

lgbm.fit(X_train, y_train)

[LightGBM] [Info] Number of positive: 1332, number of negative: 2163
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000450 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 908
[LightGBM] [Info] Number of data points in the train set: 3495, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.381116 -> initscore=-0.484815
[LightGBM] [Info] Start training from score -0.484815


In [15]:
y_pred = lgbm.predict(X_oot)
print(lgbm[-1].__class__.__name__, accuracy_score(y_oot, y_pred))

LGBMClassifier 0.8527214514407684


# CatBoost

In [16]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.2-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.2


In [17]:
from catboost import CatBoostClassifier

catboost = Pipeline(steps=[
    ('numeric_imputer', ArbitraryNumberImputer(variables=num_vars, arbitrary_number=-999)),
    ('categoric_imputer', CategoricalImputer(variables=cat_vars, return_object=True)),
    ('one_hot_encoder', OneHotEncoder(variables=cat_vars)),
    ('CatBoostClassifier', CatBoostClassifier(random_state=42))
])

catboost.fit(X_train, y_train)

Learning rate set to 0.017579
0:	learn: 0.6799136	total: 58.2ms	remaining: 58.2s
1:	learn: 0.6681828	total: 61.8ms	remaining: 30.8s
2:	learn: 0.6565842	total: 65.2ms	remaining: 21.7s
3:	learn: 0.6453770	total: 68.5ms	remaining: 17.1s
4:	learn: 0.6342850	total: 71.8ms	remaining: 14.3s
5:	learn: 0.6237834	total: 75.2ms	remaining: 12.5s
6:	learn: 0.6139152	total: 78.2ms	remaining: 11.1s
7:	learn: 0.6039922	total: 81.3ms	remaining: 10.1s
8:	learn: 0.5947968	total: 84.4ms	remaining: 9.29s
9:	learn: 0.5856757	total: 87.3ms	remaining: 8.64s
10:	learn: 0.5766364	total: 90.3ms	remaining: 8.12s
11:	learn: 0.5680487	total: 93.6ms	remaining: 7.7s
12:	learn: 0.5595120	total: 96.8ms	remaining: 7.35s
13:	learn: 0.5525649	total: 99.6ms	remaining: 7.02s
14:	learn: 0.5447315	total: 102ms	remaining: 6.73s
15:	learn: 0.5382335	total: 106ms	remaining: 6.49s
16:	learn: 0.5320354	total: 108ms	remaining: 6.27s
17:	learn: 0.5254555	total: 111ms	remaining: 6.06s
18:	learn: 0.5192068	total: 114ms	remaining: 5.87

In [18]:
y_pred = catboost.predict(X_oot)
print(lgbm[-1].__class__.__name__, accuracy_score(y_oot, y_pred))

LGBMClassifier 0.8585912486659552
