In [1]:
import pandas as pd

## Carrega a base do 1- Data_Prep_ABT

In [2]:
df_abt = pd.read_csv("/content/drive/MyDrive/Aula Fev Março Abri Maio 2024/03 - Aprendizagem Supervisionada - Classificação/Tópico 1 - Introdução/propensao_revenda_abt.csv")
df_abt.shape

(5369, 9)

In [None]:
df_train  = df_abt.query("data_ref_safra < '2018-03-01'")
df_oot  = df_abt.query("data_ref_safra == '2018-03-01'")

key_vars = ['data_ref_safra', 'seller_id']
num_vars = ['tot_orders_12m', 'tot_items_12m','tot_items_dist_12m', 'receita_12m', 'recencia']
cat_vars = ["uf"]
target = ['nao_revendeu_next_6m']
features= cat_vars + num_vars

X_train = df_train[features]
y_train = df_train[target]

X_oot = df_oot[features]
y_oot = df_oot[target]

In [None]:
!pip install feature-engine



## Treina os modelos

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from feature_engine.imputation import ArbitraryNumberImputer, CategoricalImputer
from feature_engine.encoding import OneHotEncoder

In [None]:
dt= Pipeline(steps=[
    ("numeric_imputer", ArbitraryNumberImputer(variables=num_vars, arbitrary_number=-999)),
    ("categoric_imputer", CategoricalImputer(variables=cat_vars,fill_value="missing")),
    ("one",OneHotEncoder(variables=cat_vars)),
    ("dt", DecisionTreeClassifier(random_state=42))
])

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
rf= Pipeline(steps=[
    ("numeric_imputer", ArbitraryNumberImputer(variables=num_vars, arbitrary_number=-999)),
    ("categoric_imputer", CategoricalImputer(variables=cat_vars,fill_value="missing")),
    ("one",OneHotEncoder(variables=cat_vars)),
    ("rf", RandomForestClassifier(random_state=42))
])

In [None]:
from lightgbm import LGBMClassifier

In [None]:
lgbm= Pipeline(steps=[
    ("numeric_imputer", ArbitraryNumberImputer(variables=num_vars, arbitrary_number=-999)),
    ("categoric_imputer", CategoricalImputer(variables=cat_vars,fill_value="missing")),
    ("one",OneHotEncoder(variables=cat_vars)),
    ("lg", LGBMClassifier(random_state=42))
])

## Avaliar o melhor modelo

In [None]:
from sklearn.model_selection import StratifiedKFold, cross_validate

skf =StratifiedKFold(n_splits=5,shuffle=True, random_state=42)

In [None]:
cv_results_dt= cross_validate(dt,X_train,y_train,scoring=["accuracy", "precision", "recall", "f1","roc_auc"], cv=skf,n_jobs=-1)

cv_results_dt= pd.DataFrame(cv_results_dt)
cv_results_dt.mean()

fit_time          0.081295
score_time        0.136538
test_accuracy     0.776252
test_precision    0.703995
test_recall       0.713213
test_f1           0.708302
test_roc_auc      0.764132
dtype: float64

In [None]:
cv_results_rf= cross_validate(rf,X_train,y_train,scoring=["accuracy", "precision", "recall", "f1","roc_auc"], cv=skf,n_jobs=-1)

cv_results_rf= pd.DataFrame(cv_results_rf)
cv_results_rf.mean()

fit_time          1.351390
score_time        0.403691
test_accuracy     0.836624
test_precision    0.798771
test_recall       0.763530
test_f1           0.780582
test_roc_auc      0.912172
dtype: float64

In [None]:
cv_results_lg= cross_validate(lgbm,X_train,y_train,scoring=["accuracy", "precision", "recall", "f1","roc_auc"], cv=skf,n_jobs=-1)

cv_results_lg= pd.DataFrame(cv_results_lg)
cv_results_lg.mean()

fit_time          0.339493
score_time        0.279775
test_accuracy     0.835479
test_precision    0.791366
test_recall       0.772535
test_f1           0.781464
test_roc_auc      0.907614
dtype: float64

In [None]:
from sklearn.metrics import accuracy_score, precision_score,f1_score,recall_score,roc_auc_score

## Treina com 100% o melhor modelo

In [None]:
#treinou em 100% de treino
rf.fit(X_train,y_train)

  return fit_method(estimator, *args, **kwargs)


In [None]:
X_oot

Unnamed: 0,uf,tot_orders_12m,tot_items_12m,tot_items_dist_12m,receita_12m,recencia
3495,SP,3,3,1,2685.00,133
3496,ES,178,209,9,21621.13,8
3497,SP,44,48,20,1029.20,4
3498,GO,1,1,1,120.00,75
3499,SP,124,132,72,15104.92,12
...,...,...,...,...,...,...
5364,MG,4,4,3,124.60,12
5365,SP,5,5,5,385.59,0
5366,PR,11,12,8,1450.20,7
5367,SP,13,13,3,1709.87,0


In [None]:
rf.predict(X_oot)

array([1, 0, 0, ..., 0, 0, 0])

In [None]:
previsoes_rf =rf.predict(X_oot)
previsoes_rf_proba =rf.predict_proba(X_oot)[:,1]

In [None]:
acc_rf = accuracy_score(y_oot,previsoes_rf)
recall_rf = recall_score(y_oot,previsoes_rf)
precision_rf= precision_score(y_oot,previsoes_rf)
f1_rf = f1_score(y_oot,previsoes_rf)
roc_rf = roc_auc_score(y_oot,previsoes_rf_proba)

In [None]:
print(f"Valor acc do rf {acc_rf}, precision {precision_rf}, recall {recall_rf}, f1 {f1_rf}, roc {roc_rf}")


Valor acc do rf 0.8671291355389541, precision 0.8390804597701149, recall 0.8099861303744799, f1 0.8242766407904023, roc 0.9229213304736


## Exporta o melhor modelo treinado

In [None]:
import joblib

In [None]:
joblib.dump(rf,"/content/drive/MyDrive/modelo_churn_20240326.pkl")

['/content/drive/MyDrive/modelo_churn_20240326.pkl']