In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
import numpy as np
import pandas as pd

In [3]:
WORK_DIR = '/content/drive/My Drive/datasets'
DATA_DIR = os.path.join(WORK_DIR, 'olist')

In [4]:
df_abt = pd.read_csv(os.path.join(DATA_DIR, 'propensao_revenda_abt.csv'))

In [5]:
df_abt.head()

Unnamed: 0,data_ref_safra,seller_id,uf,tot_orders_12m,tot_items_12m,tot_items_dist_12m,receita_12m,recencia,nao_revendeu_next_6m
0,2018-01-01,0015a82c2db000af6aaaf3ae2ecb0532,SP,3,3,1,2685.0,74,1
1,2018-01-01,001cca7ae9ae17fb1caed9dfb1094831,ES,171,207,9,21275.23,2,0
2,2018-01-01,002100f778ceb8431b7a1020ff7ab48f,SP,38,42,15,781.8,2,0
3,2018-01-01,003554e2dce176b5555353e4f3555ac8,GO,1,1,1,120.0,16,1
4,2018-01-01,004c9cd9d87a3c30c522c48c4fc07416,SP,130,141,75,16228.88,8,0


## Identificar as variáveis de modelagem

In [6]:
df_train = df_abt.query('data_ref_safra < "2018-03-01"')

df_oot = df_abt.query('data_ref_safra == "2018-03-01"')

In [7]:
key_vars = ['data_ref_safra', 'seller_id']
num_vars = ['tot_orders_12m', 'tot_items_12m', 'tot_items_dist_12m', 'receita_12m', 'recencia']
cat_vars = ['uf']
target = 'nao_revendeu_next_6m'
features = cat_vars + num_vars

# Dados de treinamento
X_train = df_train[features]
y_train = df_train[target]

# Dados de avaliação (out of time)
X_oot = df_oot[features]
y_oot = df_oot[target]

In [8]:
X_train.head()

Unnamed: 0,uf,tot_orders_12m,tot_items_12m,tot_items_dist_12m,receita_12m,recencia
0,SP,3,3,1,2685.0,74
1,ES,171,207,9,21275.23,2
2,SP,38,42,15,781.8,2
3,GO,1,1,1,120.0,16
4,SP,130,141,75,16228.88,8


In [9]:
y_train.head()

0    1
1    0
2    0
3    1
4    0
Name: nao_revendeu_next_6m, dtype: int64

In [10]:
X_train.shape, X_oot.shape

((3495, 6), (1874, 6))

In [11]:
y_train.shape, y_oot.shape

((3495,), (1874,))

# Aplicando Feature Engineering

- Imputação de Missing Values
- Feature Scaling -> só pra modelos lineares (regressão logística, SVM, redes neurais) Exceção: qualquer modelo baseado em árvore
- Decodificação/tratamento das Variáveis Categóricas

In [12]:
X_train.isnull().sum()

uf                    0
tot_orders_12m        0
tot_items_12m         0
tot_items_dist_12m    0
receita_12m           0
recencia              0
dtype: int64

In [13]:
X_train.describe()

Unnamed: 0,tot_orders_12m,tot_items_12m,tot_items_dist_12m,receita_12m,recencia
count,3495.0,3495.0,3495.0,3495.0,3495.0
mean,26.923033,30.335336,10.333047,3638.560727,73.987411
std,75.997937,85.342871,21.219937,10725.853565,95.433442
min,1.0,1.0,1.0,6.0,0.0
25%,2.0,2.0,2.0,220.49,8.0
50%,6.0,7.0,4.0,809.0,28.0
75%,21.0,23.0,10.0,2816.325,106.0
max,1276.0,1328.0,297.0,184408.31,364.0


Na nossa base de treinamento não temos valores faltantes. Mesmo assim, é bom criar uma estratégia de imputação pois em produção pode acontecer de vir dados faltantes.

Estratégia para imputação de missing values:

* Variáveis categóricas: substitui o valor faltante pela palavra `missing`.
* Variáveis numéricas: substitui o valor faltante com a média/mediana da variável.

In [14]:
X_train[num_vars].mean()

tot_orders_12m          26.923033
tot_items_12m           30.335336
tot_items_dist_12m      10.333047
receita_12m           3638.560727
recencia                73.987411
dtype: float64

In [15]:
!pip install feature-engine

Collecting feature-engine
  Downloading feature_engine-1.6.2-py2.py3-none-any.whl (328 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/328.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.4/328.9 kB[0m [31m1.8 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m328.9/328.9 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: feature-engine
Successfully installed feature-engine-1.6.2


In [16]:
from feature_engine.imputation import MeanMedianImputer
mmi = MeanMedianImputer(variables=num_vars, imputation_method='median')
mmi.fit(X_train)
mmi.transform(X_train).head(2)

Unnamed: 0,uf,tot_orders_12m,tot_items_12m,tot_items_dist_12m,receita_12m,recencia
0,SP,3,3,1,2685.0,74
1,ES,171,207,9,21275.23,2


In [17]:
mmi.transform(X_train).head(2)

Unnamed: 0,uf,tot_orders_12m,tot_items_12m,tot_items_dist_12m,receita_12m,recencia
0,SP,3,3,1,2685.0,74
1,ES,171,207,9,21275.23,2


In [18]:
from feature_engine.imputation import CategoricalImputer
ci = CategoricalImputer(variables=cat_vars , fill_value='not_av')
ci.fit(X_train)
ci.transform(X_train).head(2)

Unnamed: 0,uf,tot_orders_12m,tot_items_12m,tot_items_dist_12m,receita_12m,recencia
0,SP,3,3,1,2685.0,74
1,ES,171,207,9,21275.23,2


In [19]:
ci.transform(X_oot).head(2)

Unnamed: 0,uf,tot_orders_12m,tot_items_12m,tot_items_dist_12m,receita_12m,recencia
3495,SP,3,3,1,2685.0,133
3496,ES,178,209,9,21621.13,8


## Feature Scaling

In [20]:
from feature_engine.wrappers import SklearnTransformerWrapper
from sklearn.preprocessing import StandardScaler

std_scaler = SklearnTransformerWrapper(transformer = StandardScaler(),
                                       variables = num_vars)

std_scaler.fit(X_train)
std_scaler.transform(X_train).head(2)

Unnamed: 0,uf,tot_orders_12m,tot_items_12m,tot_items_dist_12m,receita_12m,recencia
0,SP,-0.31483,-0.320346,-0.439887,-0.088916,0.000132
1,ES,1.896072,2.070354,-0.062829,1.644549,-0.754429


In [21]:
std_scaler.transform(X_oot).head(2)

Unnamed: 0,uf,tot_orders_12m,tot_items_12m,tot_items_dist_12m,receita_12m,recencia
3495,SP,-0.31483,-0.320346,-0.439887,-0.088916,0.618452
3496,ES,1.988193,2.093792,-0.062829,1.676803,-0.691548


## Tratamento das Variáveis Categóricas

In [22]:
from feature_engine.encoding import OneHotEncoder

ohe = OneHotEncoder(variables=cat_vars)
ohe.fit(X_train)
ohe.transform(X_train).head(2)

Unnamed: 0,tot_orders_12m,tot_items_12m,tot_items_dist_12m,receita_12m,recencia,uf_SP,uf_ES,uf_GO,uf_PR,uf_MG,...,uf_MT,uf_AM,uf_RO,uf_CE,uf_BA,uf_SE,uf_MS,uf_PA,uf_MA,uf_PI
0,3,3,1,2685.0,74,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,171,207,9,21275.23,2,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [23]:
ohe.transform(X_oot).head(2)

Unnamed: 0,tot_orders_12m,tot_items_12m,tot_items_dist_12m,receita_12m,recencia,uf_SP,uf_ES,uf_GO,uf_PR,uf_MG,...,uf_MT,uf_AM,uf_RO,uf_CE,uf_BA,uf_SE,uf_MS,uf_PA,uf_MA,uf_PI
3495,3,3,1,2685.0,133,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3496,178,209,9,21621.13,8,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Reconstruindo as bases de treino e teste

In [24]:
# Variáveis numericas
std_scaler = SklearnTransformerWrapper(transformer = StandardScaler(), variables = num_vars)
std_scaler.fit(X_train)

# Variáveis categoricas
ohe = OneHotEncoder(variables=cat_vars)
ohe.fit(X_train)

In [25]:
X_train_transformado = std_scaler.transform(X_train)
X_train_transformado.head(2)

Unnamed: 0,uf,tot_orders_12m,tot_items_12m,tot_items_dist_12m,receita_12m,recencia
0,SP,-0.31483,-0.320346,-0.439887,-0.088916,0.000132
1,ES,1.896072,2.070354,-0.062829,1.644549,-0.754429


In [26]:
X_train_transformado_final = ohe.transform(X_train_transformado)
X_train_transformado_final.head(2)

Unnamed: 0,tot_orders_12m,tot_items_12m,tot_items_dist_12m,receita_12m,recencia,uf_SP,uf_ES,uf_GO,uf_PR,uf_MG,...,uf_MT,uf_AM,uf_RO,uf_CE,uf_BA,uf_SE,uf_MS,uf_PA,uf_MA,uf_PI
0,-0.31483,-0.320346,-0.439887,-0.088916,0.000132,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1.896072,2.070354,-0.062829,1.644549,-0.754429,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [27]:
# Reconstruindo os dados de OOT
X_oot_transformado = std_scaler.transform(X_oot)
X_oot_transformado_final = ohe.transform(X_oot_transformado)
X_oot_transformado_final.head(2)

Unnamed: 0,tot_orders_12m,tot_items_12m,tot_items_dist_12m,receita_12m,recencia,uf_SP,uf_ES,uf_GO,uf_PR,uf_MG,...,uf_MT,uf_AM,uf_RO,uf_CE,uf_BA,uf_SE,uf_MS,uf_PA,uf_MA,uf_PI
3495,-0.31483,-0.320346,-0.439887,-0.088916,0.618452,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3496,1.988193,2.093792,-0.062829,1.676803,-0.691548,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Treinando Regressão Logística

In [28]:
from sklearn.linear_model import LogisticRegression

# instanciar um modelo de regressão logística
lr_model = LogisticRegression(random_state=42)

In [29]:
# treinando o modelo na base de treino transformada
lr_model.fit(X_train_transformado_final, y_train)

In [30]:
lr_model.predict(X_train_transformado_final)

array([0, 0, 0, ..., 0, 0, 0])

In [31]:
from sklearn.metrics import accuracy_score

acc_train = accuracy_score(y_train, lr_model.predict(X_train_transformado_final))
acc_test = accuracy_score(y_oot, lr_model.predict(X_oot_transformado_final))

print(f'Acurácia Treino: {100 * acc_train:.2f}%')
print(f'Acurácia OOT: {100 * acc_test:.2f}%')

Acurácia Treino: 82.66%
Acurácia OOT: 82.55%
