In [1]:
# Mount Google Drive at /content/drive so that files stored on the drive can
# be read through regular filesystem paths by the cells that follow.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Loading the Data

In [2]:
import os
import numpy as np
import pandas as pd

# Folder layout on the mounted drive from which the dataset is read.
WORK_DIR = '/content/drive/My Drive/datasets'
DATA_DIR = os.path.join(WORK_DIR, 'olist')

# Reference date held out for out-of-time (OOT) evaluation. It is defined once
# and reused for both splits so the train and OOT filters can never drift
# apart. Rows dated after OOT_SAFRA belong to neither split.
# NOTE(review): the "<" comparison assumes `data_ref_safra` holds ISO
# 'YYYY-MM-DD' strings, for which string order equals date order -- confirm.
OOT_SAFRA = '2018-03-01'

df_abt = pd.read_csv(os.path.join(DATA_DIR, 'propensao_revenda_abt.csv'))

# Training rows: every reference date strictly before the OOT one.
df_train = df_abt[df_abt['data_ref_safra'] < OOT_SAFRA]

# Out-of-time rows: exactly the held-out reference date.
df_oot = df_abt[df_abt['data_ref_safra'] == OOT_SAFRA]

# Fail early with a clear message instead of deep inside model fitting.
assert not df_train.empty and not df_oot.empty, (
    f'Empty split: train={len(df_train)} rows, oot={len(df_oot)} rows '
    f'(OOT_SAFRA={OOT_SAFRA!r})'
)

# Column roles.
key_vars = ['data_ref_safra', 'seller_id']
num_vars = ['tot_orders_12m', 'tot_items_12m', 'tot_items_dist_12m', 'receita_12m', 'recencia']
cat_vars = ['uf']
target = 'nao_revendeu_next_6m'
features = cat_vars + num_vars

# Training data
X_train = df_train[features]
y_train = df_train[target]

# Evaluation data (out of time)
X_oot = df_oot[features]
y_oot = df_oot[target]

print('Conjunto de Dados:', X_train.shape)
print('Conjunto de Teste:', X_oot.shape)

Conjunto de Dados: (3495, 6)
Conjunto de Teste: (1874, 6)


# Treinando uma Regressão Logística

In [3]:
!pip install feature-engine

Collecting feature-engine
  Downloading feature_engine-1.6.2-py2.py3-none-any.whl (328 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/328.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.4/328.9 kB[0m [31m1.6 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m327.7/328.9 kB[0m [31m5.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m328.9/328.9 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: feature-engine
Successfully installed feature-engine-1.6.2


In [4]:
from sklearn.pipeline import Pipeline
from feature_engine.imputation import MeanMedianImputer
from feature_engine.imputation import CategoricalImputer
from feature_engine.encoding import OneHotEncoder
from feature_engine.wrappers import SklearnTransformerWrapper
from sklearn.preprocessing import StandardScaler

In [5]:
from sklearn.linear_model import LogisticRegression

# Preprocessing steps followed by a logistic regression, bundled into a single
# estimator so that every fit/predict call runs the same chain of steps.
lr_model_pipe = Pipeline(
    steps=[
        # Median imputation for the numeric features.
        (
            'numeric_imputer',
            MeanMedianImputer(imputation_method='median', variables=num_vars),
        ),
        # StandardScaler applied to the numeric columns only, via the wrapper.
        (
            'numeric_scaler',
            SklearnTransformerWrapper(transformer=StandardScaler(), variables=num_vars),
        ),
        # Missing categorical values are filled with the label 'missing'.
        (
            'categoric_imputer',
            CategoricalImputer(fill_value='missing', variables=cat_vars),
        ),
        # One-hot encoding of the categorical features.
        (
            'one_hot_encoder',
            OneHotEncoder(variables=cat_vars),
        ),
        # Final estimator, seeded for repeatable runs.
        (
            'model',
            LogisticRegression(random_state=42),
        ),
    ]
)

## Avaliando o modelo na base de treino

In [6]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

# Five stratified folds, shuffled with a fixed seed.
skf = StratifiedKFold(random_state=42, shuffle=True, n_splits=5)

# Accuracy of the pipeline on each held-out fold, using all available cores.
cv_results = cross_val_score(
    estimator=lr_model_pipe,
    X=X_train,
    y=y_train,
    cv=skf,
    scoring='accuracy',
    n_jobs=-1,
)

# Per-fold scores, then their mean and standard deviation, one per line.
print(cv_results, cv_results.mean(), cv_results.std(), sep='\n')

[0.82689557 0.81688126 0.81688126 0.81545064 0.84978541]
0.8251788268955652
0.012967403941201307


## Avaliando o modelo na base OOT

In [7]:
# Evaluating the model on the out-of-time set
from sklearn.metrics import accuracy_score

# Fit on the full training set, then predict on both the training rows and
# the held-out out-of-time rows with the same fitted pipeline.
lr_model_pipe.fit(X_train, y_train)
y_pred_train, y_pred_oot = (
    lr_model_pipe.predict(split) for split in (X_train, X_oot)
)

# Share of correctly classified rows in each set.
acc_train = accuracy_score(y_true=y_train, y_pred=y_pred_train)
acc_oot = accuracy_score(y_true=y_oot, y_pred=y_pred_oot)

print(f'Acurácia Treino: {100 * acc_train:.2f}%')
print(f'Acurácia Teste: {100 * acc_oot:.2f}%')

Acurácia Treino: 82.66%
Acurácia Teste: 82.55%
