In [404]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split
from sklearn.metrics import fbeta_score, make_scorer, accuracy_score, precision_score, recall_score
import xgboost as xgb

In [405]:
dados_tratados = "../data_sampling/dados_tratados/%s"

In [406]:
df = pd.read_parquet(dados_tratados%"df_woe.parquet")

In [407]:
y = pd.read_parquet(dados_tratados%"df.parquet")["resp"].values

In [408]:
X = df.copy()

In [409]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42)

In [410]:
f_scorer = make_scorer(fbeta_score, beta=.5)

In [411]:
parameters = {"solver":["saga"], 'C':[100,10,1, .1,.01,0.001],"penalty": ["l1","l2"]}

In [412]:
unselect = [
    # "StreamingTV",
    # "gender",
    # "DeviceProtection",
    # "StreamingMovies",
    # "OnlineBackup",
    # "Partner",
    # "PhoneService",
    # "MonthlyCharges"
    ]

In [413]:
model = LogisticRegression(random_state=42)

In [414]:
selector = RFE(model, n_features_to_select=.2, step=1,)

In [415]:
selector.fit(X_train.drop(unselect, axis=1), y_train)

In [416]:
sele = X_train.columns[selector.support_]

In [417]:
model = LogisticRegression(random_state=42)
gs = GridSearchCV(
    model, 
    parameters, 
    cv=10,
    n_jobs=-1,
    scoring=f_scorer,
    verbose = 1)
gs.fit(X_train[sele],y_train)

Fitting 10 folds for each of 12 candidates, totalling 120 fits


In [418]:
model = LogisticRegression(**gs.best_params_)
model.fit(X_train[sele], y_train)

In [419]:
gs.best_params_

{'C': 100, 'penalty': 'l1', 'solver': 'saga'}

In [420]:
pd.DataFrame({
    "beta": model.coef_[0], 
    "var": model.feature_names_in_}).sort_values(by="beta", ascending=False)

Unnamed: 0,beta,var
2,5.612889,MultipleLines
0,1.225903,tenure
1,-4.056513,PhoneService


In [423]:
pred = model.predict_proba(X_test[sele])
res = pd.DataFrame({"pred":pred[:,1],"resp":y_test})
res["cut"] = pd.qcut(res["pred"],5)
group = res.groupby("cut")
total_fraud = res["resp"].sum()
out = []
acum = 0
percent = 100
for i, g in group:
    acum = acum + g["resp"].sum()
    parc = pd.DataFrame({
        "cut": [i], 
        "Percent": [percent], 
        "Churn":  ["{:.2f}%".format((1-(acum/total_fraud))*100)],
        "N": [len(g)],
        "Qt_churn": [g["resp"].sum()]})
    percent -= 5
    out.append(parc)

  group = res.groupby("cut")


In [424]:
pd.concat(out)

Unnamed: 0,cut,Percent,Churn,N,Qt_churn
0,"(0.0289, 0.115]",100,86.40%,516,51
0,"(0.115, 0.158]",95,84.27%,55,8
0,"(0.158, 0.266]",90,62.40%,357,82
0,"(0.266, 0.427]",85,31.47%,275,116
0,"(0.427, 0.676]",80,0.00%,206,118


In [425]:
y_predict = []
for x in pred[:,1]:
    if x <   0.427:
        y_predict.append(0)
    else:
        y_predict.append(1)

In [426]:
pd.crosstab(y_test, y_predict, margins="All", rownames=["Verdadeiro"], colnames=["Predito"])

Predito,0,1,All
Verdadeiro,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,800,234,1034
1,142,233,375
All,942,467,1409


In [427]:
display("Accuracy",accuracy_score(y_test,y_predict))
display("Precision", precision_score(y_test,y_predict))
display("Recall", recall_score(y_test,y_predict))

'Accuracy'

0.7331440738112136

'Precision'

0.4989293361884368

'Recall'

0.6213333333333333

In [395]:
model = xgb.XGBClassifier(random_state=0)

In [396]:
selector = RFE(model, n_features_to_select=.5, step=1,)
selector.fit(X_train, y_train)
sele = X_train.columns[selector.support_]

In [397]:
parameters = {
    "n_estimators": [350,650,750],
    "max_depth": [5,30], 
    "learning_rate": [0.01],
    "subsample": [.2,.5, 1],
}

In [398]:
model = xgb.XGBClassifier(
    random_state=0)
gs = GridSearchCV(
    model, 
    parameters, 
    cv=10,
    n_jobs=-1,
    scoring=f_scorer,
    verbose = 1)
gs.fit(X_train[sele],y_train)

Fitting 10 folds for each of 18 candidates, totalling 180 fits


In [401]:
model_xg = xgb.XGBClassifier(**gs.best_params_)
model_xg.fit(X_train[sele], y_train)

In [402]:
pred = model_xg.predict_proba(X_test[sele])
res = pd.DataFrame({"pred":pred[:,1],"resp":y_test})
res["cut"] = pd.cut(res["pred"],20)
group = res.groupby("cut")
total_fraud = res["resp"].sum()
out = []
acum = 0
percent = 95
for i, g in group:
    acum = acum + g["resp"].sum()
    parc = pd.DataFrame({
        "cut": [i], 
        "Percent": [percent], 
        "Fraud": ["{:.2f}%".format((1-(acum/total_fraud))*100)],
        "N": [len(g["resp"])],
        "Qt_fraud": [g["resp"].sum()]})
    percent -= 5
    out.append(parc)

  group = res.groupby("cut")


In [403]:
pd.concat(out)

Unnamed: 0,cut,Percent,Fraud,N,Qt_fraud
0,"(0.0169, 0.0598]",95,98.13%,373,7
0,"(0.0598, 0.102]",90,94.40%,169,14
0,"(0.102, 0.144]",85,92.27%,51,8
0,"(0.144, 0.186]",80,88.80%,84,13
0,"(0.186, 0.228]",75,84.27%,80,17
0,"(0.228, 0.27]",70,75.20%,106,34
0,"(0.27, 0.312]",65,73.33%,21,7
0,"(0.312, 0.354]",60,69.07%,59,16
0,"(0.354, 0.396]",55,64.53%,65,17
0,"(0.396, 0.438]",50,62.67%,18,7


In [290]:
y_predict = []
for x in pred[:,1]:
    if x < 0.79:
        y_predict.append(0)
    else:
        y_predict.append(1)

In [272]:
pd.crosstab(y_test, y_predict, margins="All", rownames=["Verdadeiro"], colnames=["Predito"])

Predito,0,1,All
Verdadeiro,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,637,397,1034
1,48,327,375
All,685,724,1409


In [273]:
display("Accuracy",accuracy_score(y_test,y_predict))
display("Precision", precision_score(y_test,y_predict))
display("Recall", recall_score(y_test,y_predict))

'Accuracy'

0.6841731724627396

'Precision'

0.4516574585635359

'Recall'

0.872

In [235]:
import pickle
with open('../notebooks/artefatos/model_lg.pickle', 'wb') as handle:
    pickle.dump(model_lg, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open('../notebooks/artefatos/model_xg.pickle', 'wb') as handle:
    pickle.dump(model_xg, handle, protocol=pickle.HIGHEST_PROTOCOL)