In [None]:
import os
import pickle
from collections import Counter

import imblearn
import lightgbm as lgb
import mlflow
import numpy as np
import pandas as pd
import xgboost as xgb
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
from hyperopt.pyll import scope
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler
from sklearn import metrics
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.svm import SVC

In [7]:
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "secret.json"

### Load dataframe

In [6]:
df = pd.read_csv("data/telco-customers.csv")

In [11]:
train = df.sample(frac=0.8, random_state=42)
test = df.drop(train.index)

In [12]:
train.to_csv("data/train.csv", index=False)
test.to_csv("data/test.csv", index=False)

In [13]:
train.groupby("Churn").size()

Churn
No     4122
Yes    1512
dtype: int64

In [14]:
test.groupby("Churn").size()

Churn
No     1052
Yes     357
dtype: int64

In [15]:
ohe = OneHotEncoder(sparse=False)

In [16]:
num_features = ["tenure", "MonthlyCharges", "TotalCharges"]

cat_features = [
    "gender",
    "SeniorCitizen",
    "Partner",
    "Dependents",
    "PhoneService",
    "MultipleLines",
    "InternetService",
    "OnlineSecurity",
    "OnlineBackup",
    "DeviceProtection",
    "TechSupport",
    "StreamingTV",
    "StreamingMovies",
    "Contract",
    "PaperlessBilling",
    "PaymentMethod",
]

In [17]:
def prepare_dataset(df):
    df.TotalCharges = pd.to_numeric(df.TotalCharges, errors="coerce")
    df = df.iloc[:, 1:]
    df = df.dropna()
    df = df.reset_index(drop=True)

    df["Churn"] = df["Churn"].replace(to_replace="Yes", value=1)
    df["Churn"] = df["Churn"].replace(to_replace="No", value=0)

    df_dummies = ohe.fit_transform(df[cat_features])
    X = pd.DataFrame(df_dummies, columns=ohe.get_feature_names_out())
    X = pd.concat([X, df[num_features]], axis=1)
    y = df["Churn"].values

    features = X.columns.values
    scaler = MinMaxScaler(feature_range=(0, 1))

    X = pd.DataFrame(scaler.fit_transform(X))
    X.columns = features

    with open("app/models/min_max_scaler.bin", "wb") as f_out:
        pickle.dump(scaler, f_out)

    with open("app/models/ohe.bin", "wb") as f_out:
        pickle.dump(ohe, f_out)

    return X, y

In [18]:
X, y = prepare_dataset(train)

In [19]:
over = SMOTE(sampling_strategy=1)

X, y = over.fit_resample(X, y)
Counter(y)

Counter({1: 4115, 0: 4115})

In [20]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=101, shuffle=True
)

### Training sklearn models

In [21]:
TRACKING_SERVER_HOST = "Your host address"

mlflow.set_tracking_uri(f"http://{TRACKING_SERVER_HOST}:5000")
mlflow.set_experiment("telco-customers-churn")

mlflow.sklearn.autolog()

In [23]:
def train_with_mlflow(model, model_name):
    with mlflow.start_run():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        recall = metrics.recall_score(y_test, y_pred)
        f1 = metrics.f1_score(y_test, y_pred)
        accuracy = metrics.accuracy_score(y_test, y_pred)
        roc = metrics.roc_auc_score(y_test, y_pred)

        mlflow.log_metrics(
            {
                "test_recall_score": recall,
                "test_f1_score": f1,
                "test_accuracy_score": accuracy,
                "test_roc_auc_score": roc,
            }
        )

        mlflow.log_param("model", model_name)

In [24]:
lr = LogisticRegression()
train_with_mlflow(lr, "logistic regression")

In [25]:
svc = SVC(kernel="linear")
train_with_mlflow(svc, "SVC")

In [26]:
ada = AdaBoostClassifier()
train_with_mlflow(ada, "AdaBoostClassifier")

### Training xgboost

In [27]:
space = {
    "n_estimators": hp.choice("n_estimators", np.arange(50, 200, dtype=int)),
    "max_depth": hp.choice("max_depth", np.arange(1, 20, dtype=int)),
    "min_samples_split": hp.uniform("min_samples_split", 0.1, 1.0),
    "min_samples_leaf": hp.uniform("min_samples_leaf", 0.1, 0.5),
}


def objective(params):
    with mlflow.start_run():
        clf = RandomForestClassifier(**params, random_state=42)
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        recall = metrics.recall_score(y_test, y_pred)
        f1 = metrics.f1_score(y_test, y_pred)
        accuracy = metrics.accuracy_score(y_test, y_pred)
        roc = metrics.roc_auc_score(y_test, y_pred)

        mlflow.log_metrics(
            {
                "test_recall_score": recall,
                "test_f1_score": f1,
                "test_accuracy_score": accuracy,
                "test_roc_auc_score": roc,
            }
        )
        mlflow.log_param("model", "random_forest")
        return recall


best = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=50,
    verbose=1,
    trials=Trials(),
)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [04:04<00:00,  4.89s/trial, best loss: 0.0]


In [28]:
mlflow.xgboost.autolog()

In [29]:
space = {
    "n_estimators": hp.choice("n_estimators", np.arange(50, 200, dtype=int)),
    "max_depth": hp.choice("max_depth", np.arange(1, 20, dtype=int)),
    "learning_rate": hp.uniform("learning_rate", 0.001, 0.3),
    "min_child_weight": hp.uniform("min_child_weight", 1, 10),
    "subsample": hp.uniform("subsample", 0.5, 1),
    "gamma": hp.uniform("gamma", 0, 1),
    "colsample_bytree": hp.uniform("colsample_bytree", 0.5, 1),
}


def objective(params):
    with mlflow.start_run():
        clf = xgb.XGBClassifier(**params, random_state=42)
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        recall = metrics.recall_score(y_test, y_pred)
        f1 = metrics.f1_score(y_test, y_pred)
        accuracy = metrics.accuracy_score(y_test, y_pred)
        roc = metrics.roc_auc_score(y_test, y_pred)

        mlflow.log_metrics(
            {
                "test_recall_score": recall,
                "test_f1_score": f1,
                "test_accuracy_score": accuracy,
                "test_roc_auc_score": roc,
            }
        )

        mlflow.log_param("model", "xgboost")
        return recall

In [None]:
best = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=500,
    verbose=1,
    trials=Trials(),
)

### Training LightGBM

In [31]:
mlflow.lightgbm.autolog()

In [None]:
space = {
    "n_estimators": hp.choice("n_estimators", np.arange(50, 200, dtype=int)),
    "max_depth": hp.choice("max_depth", np.arange(1, 20, dtype=int)),
    "learning_rate": hp.uniform("learning_rate", 0.001, 0.3),
    "min_child_weight": hp.uniform("min_child_weight", 1, 10),
    "num_leaves": hp.choice("num_leaves", np.arange(2, 100, dtype=int)),
    "subsample": hp.uniform("subsample", 0.5, 1),
    "colsample_bytree": hp.uniform("colsample_bytree", 0.5, 1),
}


def objective(params):
    with mlflow.start_run():
        clf = lgb.LGBMClassifier(**params, random_state=42)
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        recall = metrics.recall_score(y_test, y_pred)
        f1 = metrics.f1_score(y_test, y_pred)
        accuracy = metrics.accuracy_score(y_test, y_pred)
        roc = metrics.roc_auc_score(y_test, y_pred)

        mlflow.log_metrics(
            {
                "test_recall_score": recall,
                "test_f1_score": f1,
                "test_accuracy_score": accuracy,
                "test_roc_auc_score": roc,
            }
        )

        mlflow.log_param("model", "lightgbm")
        return recall


# Assuming you have X_train, X_test, y_train, and y_test datasets available

# Trials object to track the results
trials = Trials()

# Run the hyperparameter search using TPE (Tree-structured Parzen Estimator) algorithm
best = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=500,  # Number of iterations for hyperparameter search
    trials=trials,
    verbose=1,
)