In [None]:
import pandas as pd

# Read the churn dataset (abt_churn.csv) from the sibling data directory
# and preview its first rows.
df = pd.read_csv(filepath_or_buffer='../data/abt_churn.csv')
df.head()

In [None]:
# Out-of-time (OOT) sample: every row whose dtRef equals the largest dtRef
# found in the dataset. A copy is taken so `oot` is independent of `df`.
most_recent_ref = df['dtRef'].max()
is_most_recent = df['dtRef'] == most_recent_ref
oot = df.loc[is_most_recent].copy()
oot.head()

In [None]:
# Modelling base: every row strictly before the largest dtRef, i.e. everything
# that is not part of the out-of-time sample. Copied so it is independent of `df`.
cutoff_ref = df['dtRef'].max()
is_before_cutoff = df['dtRef'] < cutoff_ref
df_train = df.loc[is_before_cutoff].copy()
df_train.head()

In [None]:
# Name of the response column.
target = 'flagChurn'

# Predictors are selected by position: the third column through the
# second-to-last one (the first two columns and the last one are skipped).
# NOTE(review): presumably the skipped columns are identifiers/dates and the
# target itself -- confirm against the CSV layout.
features = df_train.columns[2:-1]

X = df_train[features]
y = df_train[target]

Sample

In [None]:
from sklearn import model_selection

# Hold out 20% of the modelling base for testing. The split is stratified on
# the target and uses a fixed seed so it is reproducible.
split_parts = model_selection.train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Response rate (mean of the target, shown as a percentage) in each partition.
# The second message is labelled "teste" because it reports y_test; it
# previously repeated the "treino" label, making both lines look like train.
print("Taxa variável resposta no treino: %.2f%%" % (y_train.mean()*100))
print("Taxa variável resposta no teste: %.2f%%" % (y_test.mean()*100))

Explore

In [None]:
# Number of missing values per training feature, largest count first.
missing_per_feature = X_train.isna().sum()
missing_per_feature.sort_values(ascending=False)

In [None]:
# Let pandas render up to 500 rows and 500 columns before truncating output.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

In [None]:
# Analysis frame: training features plus the target as an extra column.
df_analise = X_train.copy()
df_analise[target] = y_train

# Mean and median of every feature within each target class. After the
# transpose, rows are (feature, statistic) pairs and columns are the classes.
by_target = df_analise.groupby(by=target)
summary = by_target.agg(['mean', 'median']).T
summary

In [None]:
# Compare the two target classes (columns 0 and 1 of `summary`):
# absolute difference and ratio of class 0 over class 1.
class_0 = summary[0]
class_1 = summary[1]

summary['diff_abs'] = class_0 - class_1
summary['diff_rel'] = class_0 / class_1

# Largest ratio first.
summary = summary.sort_values(by='diff_rel', ascending=False)
summary.head()

In [None]:
from sklearn import tree

# Decision tree with a fixed seed, fitted on the training split; its feature
# importances are what the following cells read.
model_tree = tree.DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
model_tree

In [None]:
# Importance of each training column according to the fitted tree.
importance_by_feature = pd.Series(
    model_tree.feature_importances_,
    index=X_train.columns,
)

# Most important first; reset_index turns the feature names into the 'index'
# column and leaves the importance values in column 0.
feature_importances = importance_by_feature.sort_values(ascending=False).reset_index()

# Running total of importance, then show the rows whose total is below 0.96.
feature_importances['acum.'] = feature_importances[0].cumsum()
feature_importances.loc[feature_importances['acum.'] < 0.96]

In [None]:
# How many features have a cumulative importance below the 0.96 threshold.
below_threshold = feature_importances['acum.'] < 0.96
best_features = below_threshold.sum()
best_features

In [None]:
from feature_engine import discretisation
from feature_engine import encoding

# modify

# Names of the first `best_features` rows of the importance ranking
# (label-based slice, so the end label best_features-1 is included).
top_features = feature_importances['index'].loc[:best_features - 1].tolist()

# discretisation: bin each selected feature with a classification tree
# (regression=False), 3-fold cross-validation, output as bin numbers.
tree_discretisation = discretisation.DecisionTreeDiscretiser(
    variables=top_features,
    bin_output='bin_number',
    regression=False,
    cv=3,
)

# onehot: encode the binned features; ignore_format=True is needed because the
# listed variables are not checked for being categorical.
onehot = encoding.OneHotEncoder(
    variables=top_features,
    ignore_format=True,
)

In [None]:
from sklearn import linear_model
from sklearn import naive_bayes
from sklearn import ensemble

# model
#
# Alternative estimators, left commented out:
#   linear_model.LogisticRegression(penalty=None, max_iter=1000, random_state=42)
#   naive_bayes.BernoulliNB()
#
# Active estimator: random forest with 100 trees, at least 20 samples per
# leaf, all CPU cores (n_jobs=-1) and a fixed seed.
model = ensemble.RandomForestClassifier(
    n_estimators=100,
    min_samples_leaf=20,
    n_jobs=-1,
    random_state=42,
)

In [None]:
from sklearn import pipeline
from sklearn import metrics

import mlflow

# Full modelling pipeline: tree discretisation -> one-hot encoding -> estimator.
# NOTE: the last step keeps its original name 'logistic_regression' even though
# `model` may be a different estimator; scikit-learn derives parameter names
# from step names, so renaming it would change those keys.
model_pipeline = pipeline.Pipeline(
    steps=[
        ('discretisation', tree_discretisation),
        ('onehot', onehot),
        ('logistic_regression', model),
    ]
)

mlflow.set_tracking_uri("http://127.0.0.1:5000")

mlflow.set_experiment(experiment_name="churn_model_experiment")

# run_name has to be a string: use the estimator's text representation
# (the bound method `model.__str__` was being passed instead of its result).
with mlflow.start_run(run_name=str(model)):
    mlflow.sklearn.autolog()
    model_pipeline.fit(X_train, y_train)

    # Train metrics are computed on the TRAIN split. They were previously
    # computed on X_test / y_test, which made them identical to the test metrics.
    y_train_predict = model_pipeline.predict(X_train)
    y_train_proba = model_pipeline.predict_proba(X_train)[:, 1]

    acc_train = metrics.accuracy_score(y_train, y_train_predict)
    auc_train = metrics.roc_auc_score(y_train, y_train_proba)
    roc_train = metrics.roc_curve(y_train, y_train_proba)

    print("Acurácia no treino: %.2f%%" % (acc_train*100))
    print("AUC no treino: %.2f%%" % (auc_train*100))

    # Held-out test split.
    y_test_predict = model_pipeline.predict(X_test)
    y_test_proba = model_pipeline.predict_proba(X_test)[:, 1]

    acc_test = metrics.accuracy_score(y_test, y_test_predict)
    auc_test = metrics.roc_auc_score(y_test, y_test_proba)
    roc_test = metrics.roc_curve(y_test, y_test_proba)

    print("Acurácia no teste: %.2f%%" % (acc_test*100))
    print("AUC no teste: %.2f%%" % (auc_test*100))

    # Out-of-time sample (rows at the most recent dtRef).
    y_oot_predict = model_pipeline.predict(oot[features])
    y_oot_proba = model_pipeline.predict_proba(oot[features])[:, 1]

    acc_oot = metrics.accuracy_score(oot[target], y_oot_predict)
    auc_oot = metrics.roc_auc_score(oot[target], y_oot_proba)
    roc_oot = metrics.roc_curve(oot[target], y_oot_proba)

    print("Acurácia no OOT: %.2f%%" % (acc_oot*100))
    print("AUC no OOT: %.2f%%" % (auc_oot*100))

    # Log accuracy and AUC for all three partitions (auc_train was missing).
    mlflow.log_metrics({
        "acc_train": acc_train,
        "auc_train": auc_train,
        "acc_test": acc_test,
        "auc_test": auc_test,
        "acc_oot": acc_oot,
        "auc_oot": auc_oot,
    })



Acurácia no treino: 75.46%
AUC no treino: 82.68%




Acurácia no teste: 75.46%
AUC no teste: 82.68%




Acurácia no OOT: 76.90%
AUC no OOT: 84.15%
🏃 View run youthful-cod-516 at: http://127.0.0.1:5000/#/experiments/799564197647611794/runs/c2e2670c248249b1b9693603c9e56538
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/799564197647611794


In [None]:
import matplotlib.pyplot as plt

# One ROC curve per partition, drawn in train / test / OOT order so the lines
# match the legend entries below. Element 0 of each roc_curve result goes on
# the x-axis and element 1 on the y-axis.
for roc_points in (roc_train, roc_test, roc_oot):
    plt.plot(roc_points[0], roc_points[1])

plt.title('ROC Curve')
plt.grid(True)

# Legend labels carry the AUC of each partition, rounded to two decimals.
legend_labels = [
    f'Train: {auc_train:.2f}',
    f'Test: {auc_test:.2f}',
    f'OOT: {auc_oot:.2f}',
]
plt.legend(legend_labels)