In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
import numpy as np
import pandas as pd

WORK_DIR = '/content/drive/My Drive/datasets'
DATA_DIR = os.path.join(WORK_DIR, 'olist')

# Loading the Data

In [3]:
df_abt = pd.read_csv(os.path.join(DATA_DIR, 'propensao_revenda_abt.csv'))
df_abt.head()

Unnamed: 0,data_ref_safra,seller_id,uf,tot_orders_12m,tot_items_12m,tot_items_dist_12m,receita_12m,recencia,nao_revendeu_next_6m
0,2018-01-01,0015a82c2db000af6aaaf3ae2ecb0532,SP,3,3,1,2685.0,74,1
1,2018-01-01,001cca7ae9ae17fb1caed9dfb1094831,ES,171,207,9,21275.23,2,0
2,2018-01-01,002100f778ceb8431b7a1020ff7ab48f,SP,38,42,15,781.8,2,0
3,2018-01-01,003554e2dce176b5555353e4f3555ac8,GO,1,1,1,120.0,16,1
4,2018-01-01,004c9cd9d87a3c30c522c48c4fc07416,SP,130,141,75,16228.88,8,0


In [4]:
df_train = df_abt.query('data_ref_safra < "2018-03-01"')

df_oot = df_abt.query('data_ref_safra == "2018-03-01"')

# Identificar as varáveis para modelagem

In [5]:
key_vars = ['data_ref_safra', 'seller_id']
num_vars = ['tot_orders_12m', 'tot_items_12m', 'tot_items_dist_12m', 'receita_12m', 'recencia']
cat_vars = ['uf']
target = 'nao_revendeu_next_6m'
features = cat_vars + num_vars

# Dados de treinamento somente com as variáveis numéricas
X_train = df_train[num_vars]
y_train = df_train[target]

# Dados de avaliação (out of time) somente com as variáveis numéricas
X_oot = df_oot[num_vars]
y_oot = df_oot[target]

In [6]:
print(X_train.shape, y_train.shape)

(3495, 5) (3495,)


In [7]:
print(X_oot.shape, y_oot.shape)

(1874, 5) (1874,)


# Treinando uma DecisionTree

In [8]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

dt_model = DecisionTreeClassifier(random_state=42)

dt_model.fit(X_train, y_train)

## Avaliando o modelo na base de treino

In [11]:
acc_train = accuracy_score(y_train, dt_model.predict(X_train))
print(f'Acurácia Treino: {100 * acc_train:.2f}%')

Acurácia Treino: 99.97%


## Avaliando o modelo na base OOT

In [12]:
acc_oot = accuracy_score(y_oot, dt_model.predict(X_oot))
print(f'Acurácia OOT: {100 * acc_oot:.2f}%')

Acurácia OOT: 76.68%
