### Churn Prediction

#### Dataset: Telco: https://www.kaggle.com/datasets/blastchar/telco-customer-churn

#### Model selection.

From the previous notebook (churn_predicion.ipynb) we saw that logistic regression outperformed random forest using a standard train test splitting and default model parameters. In this notebook we are gonna compare logistic regression with RF, but changing its hyperparameters. The hyperparameters search will be performed employing bayesian optimization and the metrics will be recorded via MLFlow.

In [1]:
import joblib
import mlflow
import numpy as np
import pandas as pd
from hyperopt import STATUS_OK, Trials, fmin, hp, space_eval, tpe
from hyperopt.pyll.base import scope
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
)
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split

In [2]:
def preprocess():
    """Preprocessing of DataFrame returning cleaned selected variables"""

    df = pd.read_csv("WA_Fn-UseC_-Telco-Customer-Churn.csv")

    df.TotalCharges.replace(" ", np.nan, inplace=True)
    df.TotalCharges = df.TotalCharges.astype(float)
    df.dropna(inplace=True)
    df.Churn = (df.Churn == "Yes").astype(int)
    sel_vars = [
        "Contract",
        "OnlineSecurity",
        "TechSupport",
        "OnlineBackup",
        "InternetService",
        "MonthlyCharges",
        "TotalCharges",
        "tenure",
    ]
    df_sel = df[sel_vars]
    dic_df = df_sel.to_dict(orient="records")
    dv = DictVectorizer(sparse=False)
    X = dv.fit_transform(dic_df)  # Returns an np array
    y = df.Churn  # Pandas series, anyway the classifier can deal with it

    return X, y

In [3]:
X, y = preprocess()

### Model evaluation strategy:
* 25% hold-out set
* 75% CV 5-fold for hyperparameters search

In [4]:
KF = 5
TEST_SIZE = 0.25
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=0, stratify=y
)

In [5]:
# Hyperparams for bayes optimization with hyperopt
lr_space = {
    "penalty": hp.choice("penalty", ["l1", "l2"]),
    "C": hp.uniform("C", 0, 10),
}

In [6]:
rf_space = {
    "max_depth": scope.int(hp.quniform("max_depth", 3, 18, 1)),
    "n_estimators": scope.int(hp.quniform("n_estimators", 80, 320, 20)),
    "min_samples_leaf": scope.int(hp.quniform("min_samples_leaf", 3, 13, 2)),
    "random_state": scope.int(hp.choice("random_state", [22, 44, 66])),
}

In [7]:
def objective_lr(space, kf=KF):

    clf = LogisticRegression(solver="liblinear", C=space["C"], penalty=space["penalty"])

    cv = StratifiedKFold(random_state=22, n_splits=KF, shuffle=True)
    score = cross_val_score(
        clf, X_train, y_train, cv=cv, scoring="accuracy", n_jobs=-1
    ).mean()

    loss = -score
    print(f"Accuracy: {score}")

    return {"loss": loss, "params": space, "status": STATUS_OK}

In [8]:
def objective_rf(space, kf=KF):

    clf = RandomForestClassifier(**space)

    cv = StratifiedKFold(random_state=22, n_splits=KF, shuffle=True)
    score = cross_val_score(
        clf, X_train, y_train, cv=cv, scoring="accuracy", n_jobs=-1
    ).mean()

    loss = -score
    print(f"Accuracy: {score}")

    return {"loss": loss, "params": space, "status": STATUS_OK}

In [9]:
mlflow.set_experiment("Hyperparams_Search")

2022/11/01 01:03:48 INFO mlflow.tracking.fluent: Experiment with name 'Hyperparams_Search' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///home/saul/telco-churn-pred/notebooks/mlruns/1', creation_time=1667286228901, experiment_id='1', last_update_time=1667286228901, lifecycle_stage='active', name='Hyperparams_Search', tags={}>

In [10]:
trials_lr = Trials()
with mlflow.start_run(run_name="hypersearch_lr") as run:

    best = fmin(
        fn=objective_lr,
        space=lr_space,
        algo=tpe.suggest,
        max_evals=12,
        trials=trials_lr,
        rstate=np.random.default_rng(789),
    )
    best_lr = space_eval(lr_space, best)
    mlflow.log_dict(best_lr, "best_lr_params.json")
    mlflow.log_params(best_lr)
    mlflow.log_metric("best_acc", -trials_lr.best_trial["result"]["loss"])

Accuracy: 0.7935152926787593                          
Accuracy: 0.7925661663534088                                                     
Accuracy: 0.7925661663534088                                                     
Accuracy: 0.7925661663534088                                                     
Accuracy: 0.7937048661384749                                                     
Accuracy: 0.7946527334370531                                                     
Accuracy: 0.7925661663534088                                                     
Accuracy: 0.7948423068967687                                                     
Accuracy: 0.7925661663534088                                                     
Accuracy: 0.7933257192190437                                                     
Accuracy: 0.7937048661384749                                                      
Accuracy: 0.7925661663534088                                                      
100%|██████████| 12/12 [00:04<00:00,  2.8

In [11]:
trials_rf = Trials()
with mlflow.start_run(run_name="hypersearch_rf") as run:

    best = fmin(
        fn=objective_rf,
        space=rf_space,
        algo=tpe.suggest,
        max_evals=12,
        trials=trials_rf,
        rstate=np.random.default_rng(789),
    )
    best_rf = space_eval(rf_space, best)
    mlflow.log_dict(best_rf, "best_rf_params.json")
    mlflow.log_params(best_rf)
    mlflow.log_metric("best_acc", -trials_rf.best_trial["result"]["loss"])

Accuracy: 0.7918078725145463                          
Accuracy: 0.7954106675539807                                                     
Accuracy: 0.795409228666241                                                      
Accuracy: 0.7944620808115327                                                     
Accuracy: 0.7916170400280583                                                     
Accuracy: 0.7935142135129545                                                     
Accuracy: 0.7916179393328957                                                     
Accuracy: 0.7916186587767656                                                     
Accuracy: 0.7946521938541506                                                     
Accuracy: 0.7938928208494833                                                     
Accuracy: 0.7944615412286302                                                      
Accuracy: 0.7938933604323858                                                      
100%|██████████| 12/12 [00:35<00:00,  2.9

In [12]:
# Best accuracy Log Reg:
-trials_lr.best_trial["result"]["loss"]

0.7948423068967687

In [13]:
# Best accuracy RF:
-trials_rf.best_trial["result"]["loss"]

0.7954106675539807

#### Metrics with the hold-out set:

#### Log reg

In [14]:
lr = LogisticRegression(**best_lr, solver="liblinear")

In [15]:
lr.fit(X_train, y_train)

In [16]:
y_pred = lr.predict(X_test)

In [17]:
# Accuracy hold out set
accuracy_score(y_pred, y_test)

0.8048919226393629

In [18]:
# f1-score
f1_score(y_pred, y_test)

0.5940828402366864

In [19]:
# roc_auc
roc_auc_score(y_pred, y_test)

0.7537497124453647

RF

In [20]:
rf = RandomForestClassifier(**best_rf)

In [21]:
rf.fit(X_train, y_train)

In [22]:
y_pred = rf.predict(X_test)

In [23]:
# Accuracy hold out set
accuracy_score(y_pred, y_test)

0.7974971558589306

In [24]:
# f1-score
f1_score(y_pred, y_test)

0.5658536585365854

In [25]:
# roc_auc
roc_auc_score(y_pred, y_test)

0.744982004778563

### Model selected:
#### Logistic Regression

In [27]:
joblib.dump(lr,'../model/churn_prediction')

['../model/churn_prediction']