In [1]:
!pip install feature-engine

Collecting feature-engine
  Downloading feature_engine-1.6.2-py2.py3-none-any.whl (328 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m328.9/328.9 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: feature-engine
Successfully installed feature-engine-1.6.2


# Feature Selection

In [2]:
# Mount Google Drive into the Colab filesystem at /content/drive so the
# dataset files stored there can be read with regular file paths.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Carregando o conjunto de dados

In [3]:
# carregando os pacotes
import os
import numpy as np
import pandas as pd

In [4]:
from sklearn.model_selection import train_test_split
import pandas as pd
import os

# Folder layout on the mounted Drive: <WORK_DIR>/olist/<csv files>.
WORK_DIR = '/content/drive/My Drive/datasets'
DATA_DIR = os.path.join(WORK_DIR, 'olist')

# Load the full analytical base table (ABT) from CSV.
df_abt = pd.read_csv(
    filepath_or_buffer=os.path.join(DATA_DIR, 'propensao_revenda_abt.csv'),
)

# Training base: every row whose reference date is before 2018-03-01.
df_train = df_abt.query(expr='data_ref_safra < "2018-03-01"')

# Evaluation base (out of time): rows whose reference date is exactly 2018-03-01.
df_oot = df_abt.query(expr='data_ref_safra == "2018-03-01"')



In [5]:
# Column groups of the ABT.
# key_vars are listed for reference only; they are not part of `features`.
key_vars = ['data_ref_safra', 'seller_id']
num_vars = [
    'tot_orders_12m',
    'tot_items_12m',
    'tot_items_dist_12m',
    'receita_12m',
    'recencia',
]
cat_vars = ['uf']
# Target column name (presumably a "did not resell in the next 6 months" flag; confirm).
target = 'nao_revendeu_next_6m'

# Model inputs: categorical columns first, then the numeric ones.
features = [*cat_vars, *num_vars]

# Training data.
X_train, y_train = df_train[features], df_train[target]

# Evaluation data (out of time).
X_oot, y_oot = df_oot[features], df_oot[target]

In [6]:
from sklearn.pipeline import Pipeline
from feature_engine.imputation import ArbitraryNumberImputer
from feature_engine.imputation import CategoricalImputer
from feature_engine.encoding import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier

# Full modelling pipeline: impute numeric columns with a sentinel value, impute
# the categorical column, one-hot encode it, then fit a shallow random forest.
rf = Pipeline(
    [
        (
            'numeric_imputer',
            ArbitraryNumberImputer(arbitrary_number=-999, variables=num_vars),
        ),
        (
            'categoric_imputer',
            CategoricalImputer(return_object=True, variables=cat_vars),
        ),
        (
            'one_hot_encoder',
            OneHotEncoder(variables=cat_vars),
        ),
        (
            'Random_Forest',
            RandomForestClassifier(random_state=42, max_depth=3),
        ),
    ]
)

# Every step except the final classifier, i.e. the preprocessing part only.
data_pipe = rf[:-1]

# Fit the preprocessing on the training features and keep the transformed frame.
X_transformed = data_pipe.fit_transform(X_train)

In [7]:
# Preview the first rows of the preprocessed training features.
X_transformed.head()

Unnamed: 0,tot_orders_12m,tot_items_12m,tot_items_dist_12m,receita_12m,recencia,uf_SP,uf_ES,uf_GO,uf_PR,uf_MG,...,uf_MT,uf_AM,uf_RO,uf_CE,uf_BA,uf_SE,uf_MS,uf_PA,uf_MA,uf_PI
0,3,3,1,2685.0,74,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,171,207,9,21275.23,2,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,38,42,15,781.8,2,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,1,1,120.0,16,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,130,141,75,16228.88,8,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Recursive Feature Elimination

Aqui treinamos o modelo repetidamente e, a cada iteração, removemos a variável com menor importância, até restar o número de features especificado em `n_features_to_select`.

In [8]:
from sklearn.feature_selection import RFE

# Recursive feature elimination around the pipeline's final step (the
# RandomForestClassifier), stopping once 2 features remain.
rfe = RFE(estimator=rf[-1], n_features_to_select=2)

# Fit on the training matrix that was already preprocessed into X_transformed,
# instead of re-fitting the imputers and encoder on X_train a second time.
rfe.fit(X_transformed, y_train)

In [9]:
# Names of the columns kept by RFE. The boolean support mask is applied to the
# columns of the already-transformed training frame, so the preprocessing
# steps are not re-fitted just to recover the column names.
X_transformed.columns[rfe.get_support()]

Index(['tot_orders_12m', 'recencia'], dtype='object')