**Imports**

In [56]:
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.dummy import DummyClassifier
from sklearn.metrics import f1_score, classification_report, confusion_matrix, precision_score, recall_score, roc_auc_score
import pandas as pd
import sys

sys.path.append('C:\\Coding\\customer-churn-prediction\\src')
from utils import download_telco_churn_dataset, split_test_train
from preprocessing import get_preproc

RND_SEED = 42

**Getting Preprocessing and Data**

In [57]:
telco = pd.read_csv("../data/raw/WA_Fn-UseC_-Telco-Customer-Churn.csv")
X_train, X_test, y_train, y_test = split_test_train(telco)

preprocessing = get_preproc()

# Models

**Help Function**

Функция для оценки различных моделей на тестовой и тернировочных данных.

In [58]:
from sklearn.utils import class_weight

def fit_and_evaluate(models, cv=10):
    # Крафтим пайплайны для каждой модели
    pipelines = {}

    for name, model in models:
        pipelines[name] = (Pipeline([
            ("preproc", preprocessing),
            ("model", model)
        ]))
        
    metrics = {}

    # Оценивать модели будем по f1 score, но важна нам именно метрика recall для положительного класса
    for name, model in pipelines.items():
        # Исправляем дисбаланс классов
        classes_weights = class_weight.compute_sample_weight(
            class_weight='balanced',
            y=y_train
        )
        model.fit(X_train, y_train, model__sample_weight=classes_weights)
        
        # Predictions on test and train
        print("\n" + f"Оценка {name}:")
        
        roc_auc_train = cross_val_score(
            model, X_test, y_test, scoring='roc_auc', cv=cv, n_jobs=-1).mean()
        f1_cv = cross_val_score(
            model, X_test, y_test, scoring='f1', cv=cv, n_jobs=-1).mean()
        
        # print(f"TRAIN ROC-AUC: {roc_auc_train}")
        y_pred_test = model.predict(X_test)

        # Metrics evaluating
        roc_auc_test = roc_auc_score(y_test, y_pred_test)
        # print(f"TEST ROC-AUC: {roc_auc_test}")

        # Confusion Matrix
        cm = confusion_matrix(y_test, y_pred_test)
        cm_df = pd.DataFrame(cm,
                             index=['Факт: 0', 'Факт: 1'],
                             columns=['Прогноз: 0', 'Прогноз: 1'])
        print(cm_df)
        
        metrics[name] = {
            'roc_auc_test': roc_auc_test,
            'roc_auc_cv': roc_auc_train,
            'f1_cv': f1_cv,
        }
    
    sorted_metrics = dict(sorted(metrics.items(), key=lambda item: -item[1]['f1_cv']))
    
    print(f"ТОП МОДЕЛЕЙ:")
    for name, metric in sorted_metrics.items():
        print("\n" + f"{name} : \nF1: {metric['f1_cv']}\nAUC: {metric['roc_auc_cv']}")

## Models Comparation

In [59]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from lightgbm import LGBMClassifier

lgbm = LGBMClassifier(random_state=RND_SEED, verbose=-1)

lr = LogisticRegression(random_state=RND_SEED, penalty='l2')

lin_svc = LinearSVC(C=1, random_state=RND_SEED)

gb = GradientBoostingClassifier(random_state=RND_SEED)

svc = SVC(C=1, random_state=RND_SEED)

rnd_forest = RandomForestClassifier(random_state=RND_SEED, n_jobs=-1)

xgb = XGBClassifier(random_state=RND_SEED, n_jobs=-1)

models = [
    ("Logistic Regression L2", lr),
    ("LightGBM", lgbm),
    ("RND Forest", rnd_forest),
    ("XGB Classifier", xgb),
    ("Gradient Boosting", gb),
    ("Linear SVC", lin_svc),
    ("SVC", svc),
]

In [60]:
fit_and_evaluate(models)


Оценка Logistic Regression L2:
         Прогноз: 0  Прогноз: 1
Факт: 0         759         276
Факт: 1          81         293

Оценка LightGBM:




         Прогноз: 0  Прогноз: 1
Факт: 0         794         241
Факт: 1          91         283

Оценка RND Forest:
         Прогноз: 0  Прогноз: 1
Факт: 0         931         104
Факт: 1         197         177

Оценка XGB Classifier:
         Прогноз: 0  Прогноз: 1
Факт: 0         819         216
Факт: 1         113         261

Оценка Gradient Boosting:
         Прогноз: 0  Прогноз: 1
Факт: 0         753         282
Факт: 1          80         294

Оценка Linear SVC:
         Прогноз: 0  Прогноз: 1
Факт: 0         753         282
Факт: 1          82         292

Оценка SVC:
         Прогноз: 0  Прогноз: 1
Факт: 0         773         262
Факт: 1          81         293
ТОП МОДЕЛЕЙ:

Gradient Boosting : 
F1: 0.5574284635465031
AUC: 0.821506052158837

RND Forest : 
F1: 0.5563693061786401
AUC: 0.8132436323257733

Logistic Regression L2 : 
F1: 0.5549720188962309
AUC: 0.8386088188144907

Linear SVC : 
F1: 0.5539445671142795
AUC: 0.835851418810029

SVC : 
F1: 0.5389385604088957
AUC: 0.7941

Непосредственно в этой задаче мне важен `Recall` на положительном класса, то есть процент верно определенных `Churn`.

## Lin SVC + GB + Logist Regr

In [61]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from scipy.stats import loguniform, uniform, randint


def fine_tuning_models(models_data):
    for name, model_data in models_data.items():
        models_data[name] = {
            "pipeline": Pipeline([
                ("preproc", preprocessing),
                ("model", model_data["model"]),
            ]),
            "param_distrib": model_data["param_disturb"],
        }

        f1_losses = cross_val_score(
            models_data[name]["pipeline"], X_train, y_train, cv=10, n_jobs=-1, scoring='f1')
        print(f'{name} f1:\n{pd.Series(f1_losses).mean()}')
        
    # Исправляем дисбаланс классов
    classes_weights = class_weight.compute_sample_weight(
        class_weight='balanced',
        y=y_train
    )
    
    print('=' * 50 + "Tuned models!!!" + '=' * 50)
    
    best_models = {}
    
    for name, model_data in models_data.items():
        rnd_search = RandomizedSearchCV(model_data["pipeline"], param_distributions=model_data["param_distrib"], n_iter=50, cv=5, n_jobs=-1, random_state=RND_SEED)
        rnd_search.fit(X_train, y_train, model__sample_weight=classes_weights)

        f1_losses_cv = cross_val_score(rnd_search.best_estimator_, X_train, y_train, cv=10, n_jobs=-1, scoring='f1')
        
        print(f'{name} f1:\n{pd.Series(f1_losses_cv).mean()}')
        
        best_models[name] = rnd_search.best_estimator_
        
    return best_models
        
    

In [62]:
pd_lin_svc = {
    "model__tol": uniform(1e-6, 1e-4),
    "model__C": loguniform(0.1, 10),
    "model__fit_intercept": [True, False],
    "model__intercept_scaling": loguniform(1, 10),
}

pd_gb = {
    "model__learning_rate": loguniform(0.05, 0.1),
    "model__n_estimators": randint(100, 200),
    "model__max_depth": randint(3, 5),
    "model__max_features": ['sqrt', 'log2', None],
}

pd_lr = [
    {
        "model__penalty": ['l2'],
        "model__tol": uniform(1e-6, 1e-4),
        "model__C": loguniform(0.5, 10),
        "model__max_iter": randint(50, 500),
        "model__solver": ['lbfgs'],
    },
    {
        "model__penalty": ['l2', 'l1'],
        "model__tol": uniform(1e-6, 1e-4),
        "model__C": loguniform(0.5, 10),
        "model__max_iter": randint(50, 500),
        "model__solver": ['liblinear'],
    }
]

models_data = {
    "Logistic Regression": {
        "model": lr,
        "param_disturb": pd_lr,
    },
    "Gradient Boosting": {
        "model": gb,
        "param_disturb": pd_gb,
    },
    "Linear SVC": {
        "model": lin_svc,
        "param_disturb": pd_lin_svc,
    },
}

In [63]:
best_models = fine_tuning_models(models_data)

Logistic Regression f1:
0.5929170175962092
Gradient Boosting f1:
0.5813504460806895
Linear SVC f1:
0.5687812406106663
Logistic Regression f1:
0.5924823663382313
Gradient Boosting f1:
0.5858352880636593
Linear SVC f1:
0.573080332393418


Не улучшилось - выбираем `Logistic Regression`.