In [1]:
# Mount Google Drive into the Colab filesystem at /content/drive so the
# dataset paths used later in the notebook resolve.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Carregando os dados

In [2]:
import os
import numpy as np
import pandas as pd

# Root folder on the mounted Drive and the sub-folder holding the Olist files.
WORK_DIR = '/content/drive/My Drive/datasets'
DATA_DIR = os.path.join(WORK_DIR, 'olist')

# Reference period ("safra") held out for out-of-time (OOT) evaluation.
# Declared once so the train filter and the OOT filter cannot drift apart.
OOT_SAFRA = '2018-03-01'

df_abt = pd.read_csv(os.path.join(DATA_DIR, 'propensao_revenda_abt.csv'))

# NOTE(review): the column is read without date parsing, so these comparisons
# presumably happen on ISO "YYYY-MM-DD" strings (which sort chronologically)
# -- confirm against the CSV.
# Everything strictly before the OOT period is used for training.
df_train = df_abt.query('data_ref_safra < @OOT_SAFRA')

# Rows from the OOT period itself are kept aside for evaluation.
df_oot = df_abt.query('data_ref_safra == @OOT_SAFRA')

# Column roles. key_vars identify a row and are not part of `features`.
key_vars = ['data_ref_safra', 'seller_id']
num_vars = ['tot_orders_12m', 'tot_items_12m', 'tot_items_dist_12m', 'receita_12m', 'recencia']
cat_vars = ['uf']
target = 'nao_revendeu_next_6m'
features = cat_vars + num_vars

# Training data
X_train = df_train[features]
y_train = df_train[target]

# Evaluation data (out of time)
X_oot = df_oot[features]
y_oot = df_oot[target]

print('Conjunto de Dados:', X_train.shape)
print('Conjunto de Teste:', X_oot.shape)

Conjunto de Dados: (3495, 6)
Conjunto de Teste: (1874, 6)


## Pipeline utilizado

Vamos utilizar o estimador LogisticRegression para testar todos os cenários

In [3]:
!pip install feature-engine

Collecting feature-engine
  Downloading feature_engine-1.6.2-py2.py3-none-any.whl (328 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 328.9/328.9 kB 5.1 MB/s eta 0:00:00
Installing collected packages: feature-engine
Successfully installed feature-engine-1.6.2


In [4]:
from sklearn.pipeline import Pipeline
from feature_engine.imputation import MeanMedianImputer
from feature_engine.imputation import CategoricalImputer
from feature_engine.encoding import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from feature_engine.wrappers import SklearnTransformerWrapper

# Preprocessing-only pipeline (no estimator). Steps run in the listed order.
preprocessing_steps = [
    # Fill missing values in the numeric columns (statistic left at the library default).
    ('numeric_imputer', MeanMedianImputer(variables=num_vars)),
    # Standardize the numeric columns only; the wrapper restricts StandardScaler to num_vars.
    ('std', SklearnTransformerWrapper(transformer=StandardScaler(), variables=num_vars)),
    # Fill missing values in the categorical columns.
    ('categoric_imputer', CategoricalImputer(variables=cat_vars)),
    # Replace each categorical column with one indicator column per category.
    ('one_hot_encoder', OneHotEncoder(variables=cat_vars)),
]

data_pipeline = Pipeline(steps=preprocessing_steps)

In [5]:
# Fit every preprocessing step on the training features only; the
# out-of-time rows are not seen here.
data_pipeline.fit(X_train)

In [6]:
# Preview the first three rows of the raw (untransformed) training features.
X_train.head(3)

Unnamed: 0,uf,tot_orders_12m,tot_items_12m,tot_items_dist_12m,receita_12m,recencia
0,SP,3,3,1,2685.0,74
1,ES,171,207,9,21275.23,2
2,SP,38,42,15,781.8,2


In [7]:
# Run the training features through the preprocessing pipeline (transform
# only, no fitting in this cell) and preview the first three rows.
X_train_transformado = data_pipeline.transform(X_train)
X_train_transformado.head(3)

Unnamed: 0,tot_orders_12m,tot_items_12m,tot_items_dist_12m,receita_12m,recencia,uf_SP,uf_ES,uf_GO,uf_PR,uf_MG,...,uf_MT,uf_AM,uf_RO,uf_CE,uf_BA,uf_SE,uf_MS,uf_PA,uf_MA,uf_PI
0,-0.31483,-0.320346,-0.439887,-0.088916,0.000132,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1.896072,2.070354,-0.062829,1.644549,-0.754429,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0.145774,0.1367,0.219964,-0.266382,-0.754429,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
# Run the out-of-time features through the same pipeline object (transform
# only, no fitting in this cell) and preview the first three rows.
X_oot_transformado = data_pipeline.transform(X_oot)
X_oot_transformado.head(3)

Unnamed: 0,tot_orders_12m,tot_items_12m,tot_items_dist_12m,receita_12m,recencia,uf_SP,uf_ES,uf_GO,uf_PR,uf_MG,...,uf_MT,uf_AM,uf_RO,uf_CE,uf_BA,uf_SE,uf_MS,uf_PA,uf_MA,uf_PI
3495,-0.31483,-0.320346,-0.439887,-0.088916,0.618452,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3496,1.988193,2.093792,-0.062829,1.676803,-0.691548,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3497,0.224735,0.207014,0.455625,-0.243312,-0.733469,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Também podemos incluir o algoritmo dentro do pipeline.

In [9]:
# Criando Pipelines
from sklearn.linear_model import LogisticRegression

# Preprocessing steps followed by the estimator as the final pipeline step.
modeling_steps = [
    ('numeric_imputer', MeanMedianImputer(variables=num_vars)),
    ('std', SklearnTransformerWrapper(transformer=StandardScaler(), variables=num_vars)),
    ('categoric_imputer', CategoricalImputer(variables=cat_vars)),
    ('one_hot_encoder', OneHotEncoder(variables=cat_vars)),
    ('rl', LogisticRegression()),
]

rl_pipeline = Pipeline(steps=modeling_steps)

# y_train must be passed because the last step of the pipeline trains the model.
rl_pipeline.fit(X_train, y_train)

In [10]:
from sklearn.metrics import accuracy_score
# Accuracy on the same rows the pipeline was fitted on.
y_train_pred = rl_pipeline.predict(X_train)
accuracy_score(y_train, y_train_pred)

0.8266094420600858

In [11]:
# Accuracy on the held-out out-of-time rows.
y_oot_pred = rl_pipeline.predict(X_oot)
accuracy_score(y_oot, y_oot_pred)

0.8255069370330843