**Imports**

In [1]:
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.dummy import DummyClassifier
from sklearn.metrics import f1_score, classification_report, confusion_matrix, precision_score, recall_score, roc_auc_score
import pandas as pd
import sys

sys.path.append('C:\\Coding\\customer-churn-prediction\\src')
from utils import download_telco_churn_dataset, split_test_train
from preprocessing import get_preproc

RND_SEED = 42

**Getting Preprocessing and Data**

In [2]:
telco = pd.read_csv("../data/raw/WA_Fn-UseC_-Telco-Customer-Churn.csv")
X_train, X_test, y_train, y_test = split_test_train(telco)

preprocessing = get_preproc()

# Models

**Help Function**

Функция для оценки различных моделей на тестовой и тернировочных данных.

In [3]:
from sklearn.utils import class_weight

def fit_and_evaluate(models, cv=10):
    # Крафтим пайплайны для каждой модели
    pipelines = {}

    for name, model in models:
        pipelines[name] = (Pipeline([
            ("preproc", preprocessing),
            ("model", model)
        ]))
        
    metrics = {}

    # Оценивать модели будем по f1 score, но важна нам именно метрика recall для положительного класса
    for name, model in pipelines.items():
        # Исправляем дисбаланс классов
        classes_weights = class_weight.compute_sample_weight(
            class_weight='balanced',
            y=y_train
        )
        model.fit(X_train, y_train, model__sample_weight=classes_weights)
        
        # Predictions on test and train
        print("\n" + f"Оценка {name}:")
        
        roc_auc_train = cross_val_score(
            model, X_test, y_test, scoring='roc_auc', cv=cv, n_jobs=-1).mean()
        
        print(f"TRAIN ROC-AUC: {roc_auc_train}")
        y_pred_test = model.predict(X_test)

        # Metrics evaluating
        roc_auc_test = roc_auc_score(y_test, y_pred_test)
        print(f"TEST ROC-AUC: {roc_auc_test}")

        # Confusion Matrix
        print("\n" + "🔍 МАТРИЦА ОШИБОК:")
        cm = confusion_matrix(y_test, y_pred_test)
        cm_df = pd.DataFrame(cm,
                             index=['Факт: 0', 'Факт: 1'],
                             columns=['Прогноз: 0', 'Прогноз: 1'])
        print(cm_df)
        
        metrics[name] = {
            'roc_auc_test': roc_auc_test,
            'roc_auc_train': roc_auc_train,
            'TP': cm[1][1],
            'FN': cm[1][0],
        }
    
    sorted_metrics = dict(sorted(metrics.items(), key=lambda item: -item[1]['roc_auc_train']))
    
    print(f"ТОП МОДЕЛЕЙ:")
    for name, metric in sorted_metrics.items():
        print("\n" + f"{name} : {metric['roc_auc_train']}")

## Models Comparation

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from lightgbm import LGBMClassifier

lgbm = LGBMClassifier(
    class_weight='balanced',
    random_state=RND_SEED,
    n_estimators=100
)

lr = LogisticRegression(random_state=RND_SEED, penalty='l2')

lin_svc = LinearSVC(C=1, random_state=RND_SEED)

gb = GradientBoostingClassifier(random_state=RND_SEED)

svc = SVC(C=1, random_state=RND_SEED)

rnd_forest = RandomForestClassifier(random_state=RND_SEED, n_jobs=-1)

xgb = XGBClassifier(random_state=RND_SEED, n_jobs=-1)

models = [
    ("Logistic Regression L2", lr),
    ("LightGBM", lgbm),
    ("RND Forest", rnd_forest),
    ("XGB Classifier", xgb),
    ("Gradient Boosting", gb),
    ("Linear SVC", lin_svc),
    ("SVC", svc),
]

In [5]:
fit_and_evaluate(models)


Оценка Logistic Regression L2:
TRAIN ROC-AUC: 0.8386088188144907
TEST ROC-AUC: 0.7583778966131908

🔍 МАТРИЦА ОШИБОК:
         Прогноз: 0  Прогноз: 1
Факт: 0         759         276
Факт: 1          81         293
[LightGBM] [Info] Number of positive: 1495, number of negative: 4139
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000455 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 887
[LightGBM] [Info] Number of data points in the train set: 5634, number of used features: 28
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.734647 -> initscore=1.018328
[LightGBM] [Info] Start training from score 1.018328

Оценка LightGBM:
TRAIN ROC-AUC: 0.8099076480080569
TEST ROC-AUC: 0.7435699708078225

🔍 МАТРИЦА ОШИБОК:
         Прогноз: 0  Прогноз: 1
Факт: 0         673         362
Факт: 1          61         313





Оценка RND Forest:
TRAIN ROC-AUC: 0.8132436323257733
TEST ROC-AUC: 0.6863894701490608

🔍 МАТРИЦА ОШИБОК:
         Прогноз: 0  Прогноз: 1
Факт: 0         931         104
Факт: 1         197         177

Оценка XGB Classifier:
TRAIN ROC-AUC: 0.7980948500876963
TEST ROC-AUC: 0.7445826551964659

🔍 МАТРИЦА ОШИБОК:
         Прогноз: 0  Прогноз: 1
Факт: 0         819         216
Факт: 1         113         261

Оценка Gradient Boosting:
TRAIN ROC-AUC: 0.821506052158837
TEST ROC-AUC: 0.756816244284275

🔍 МАТРИЦА ОШИБОК:
         Прогноз: 0  Прогноз: 1
Факт: 0         753         282
Факт: 1          80         294

Оценка Linear SVC:
TRAIN ROC-AUC: 0.835851418810029
TEST ROC-AUC: 0.7541424474928311

🔍 МАТРИЦА ОШИБОК:
         Прогноз: 0  Прогноз: 1
Факт: 0         753         282
Факт: 1          82         292

Оценка SVC:
TRAIN ROC-AUC: 0.7941406694025498
TEST ROC-AUC: 0.7651411816373455

🔍 МАТРИЦА ОШИБОК:
         Прогноз: 0  Прогноз: 1
Факт: 0         773         262
Факт: 1          81  

Непосредственно в этой задаче мне важен `Recall` на положительном класса, то есть процент верно определенных `Churn`.

## LR L2 + LightGBM