**Imports**

In [1]:
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.dummy import DummyClassifier
from sklearn.metrics import f1_score, classification_report, confusion_matrix, precision_score, recall_score, roc_auc_score
import pandas as pd
import sys

sys.path.append('C:\\Coding\\customer-churn-prediction\\src')
from utils import download_telco_churn_dataset, split_test_train
from preprocessing import get_preproc

RND_SEED = 42

**Getting Preprocessing and Data**

In [2]:
telco = pd.read_csv("../data/raw/WA_Fn-UseC_-Telco-Customer-Churn.csv")
X_train, X_test, y_train, y_test = split_test_train(telco)

preprocessing = get_preproc()

# Models

**Help Function**

–§—É–Ω–∫—Ü–∏—è –¥–ª—è –æ—Ü–µ–Ω–∫–∏ —Ä–∞–∑–ª–∏—á–Ω—ã—Ö –º–æ–¥–µ–ª–µ–π –Ω–∞ —Ç–µ—Å—Ç–æ–≤–æ–π –∏ —Ç–µ—Ä–Ω–∏—Ä–æ–≤–æ—á–Ω—ã—Ö –¥–∞–Ω–Ω—ã—Ö.

In [3]:
from sklearn.utils import class_weight

def fit_and_evaluate(models, cv=10):
    # –ö—Ä–∞—Ñ—Ç–∏–º –ø–∞–π–ø–ª–∞–π–Ω—ã –¥–ª—è –∫–∞–∂–¥–æ–π –º–æ–¥–µ–ª–∏
    pipelines = {}

    for name, model in models:
        pipelines[name] = (Pipeline([
            ("preproc", preprocessing),
            ("model", model)
        ]))
        
    metrics = {}

    # –û—Ü–µ–Ω–∏–≤–∞—Ç—å –º–æ–¥–µ–ª–∏ –±—É–¥–µ–º –ø–æ f1 score, –Ω–æ –≤–∞–∂–Ω–∞ –Ω–∞–º –∏–º–µ–Ω–Ω–æ –º–µ—Ç—Ä–∏–∫–∞ recall –¥–ª—è –ø–æ–ª–æ–∂–∏—Ç–µ–ª—å–Ω–æ–≥–æ –∫–ª–∞—Å—Å–∞
    for name, model in pipelines.items():
        # –ò—Å–ø—Ä–∞–≤–ª—è–µ–º –¥–∏—Å–±–∞–ª–∞–Ω—Å –∫–ª–∞—Å—Å–æ–≤
        classes_weights = class_weight.compute_sample_weight(
            class_weight='balanced',
            y=y_train
        )
        model.fit(X_train, y_train, model__sample_weight=classes_weights)
        
        # Predictions on test and train
        print("\n" + f"–û—Ü–µ–Ω–∫–∞ {name}:")
        
        roc_auc_train = cross_val_score(
            model, X_test, y_test, scoring='roc_auc', cv=cv, n_jobs=-1).mean()
        
        print(f"TRAIN ROC-AUC: {roc_auc_train}")
        y_pred_test = model.predict(X_test)

        # Metrics evaluating
        roc_auc_test = roc_auc_score(y_test, y_pred_test)
        print(f"TEST ROC-AUC: {roc_auc_test}")

        # Confusion Matrix
        print("\n" + "üîç –ú–ê–¢–†–ò–¶–ê –û–®–ò–ë–û–ö:")
        cm = confusion_matrix(y_test, y_pred_test)
        cm_df = pd.DataFrame(cm,
                             index=['–§–∞–∫—Ç: 0', '–§–∞–∫—Ç: 1'],
                             columns=['–ü—Ä–æ–≥–Ω–æ–∑: 0', '–ü—Ä–æ–≥–Ω–æ–∑: 1'])
        print(cm_df)
        
        metrics[name] = {
            'roc_auc_test': roc_auc_test,
            'roc_auc_train': roc_auc_train,
            'TP': cm[1][1],
            'FN': cm[1][0],
        }
    
    sorted_metrics = dict(sorted(metrics.items(), key=lambda item: -item[1]['roc_auc_train']))
    
    print(f"–¢–û–ü –ú–û–î–ï–õ–ï–ô:")
    for name, metric in sorted_metrics.items():
        print("\n" + f"{name} : {metric['roc_auc_train']}")

## Models Comparation

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from lightgbm import LGBMClassifier

lgbm = LGBMClassifier(
    class_weight='balanced',
    random_state=RND_SEED,
    n_estimators=100
)

lr = LogisticRegression(random_state=RND_SEED, penalty='l2')

lin_svc = LinearSVC(C=1, random_state=RND_SEED)

gb = GradientBoostingClassifier(random_state=RND_SEED)

svc = SVC(C=1, random_state=RND_SEED)

rnd_forest = RandomForestClassifier(random_state=RND_SEED, n_jobs=-1)

xgb = XGBClassifier(random_state=RND_SEED, n_jobs=-1)

models = [
    ("Logistic Regression L2", lr),
    ("LightGBM", lgbm),
    ("RND Forest", rnd_forest),
    ("XGB Classifier", xgb),
    ("Gradient Boosting", gb),
    ("Linear SVC", lin_svc),
    ("SVC", svc),
]

In [5]:
fit_and_evaluate(models)


–û—Ü–µ–Ω–∫–∞ Logistic Regression L2:
TRAIN ROC-AUC: 0.8386088188144907
TEST ROC-AUC: 0.7583778966131908

üîç –ú–ê–¢–†–ò–¶–ê –û–®–ò–ë–û–ö:
         –ü—Ä–æ–≥–Ω–æ–∑: 0  –ü—Ä–æ–≥–Ω–æ–∑: 1
–§–∞–∫—Ç: 0         759         276
–§–∞–∫—Ç: 1          81         293
[LightGBM] [Info] Number of positive: 1495, number of negative: 4139
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000455 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 887
[LightGBM] [Info] Number of data points in the train set: 5634, number of used features: 28
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.734647 -> initscore=1.018328
[LightGBM] [Info] Start training from score 1.018328

–û—Ü–µ–Ω–∫–∞ LightGBM:
TRAIN ROC-AUC: 0.8099076480080569
TEST ROC-AUC: 0.7435699708078225

üîç –ú–ê–¢–†–ò–¶–ê –û–®–ò–ë–û–ö:
         –ü—Ä–æ–≥–Ω–æ–∑: 0  –ü—Ä–æ–≥–Ω–æ–∑: 1
–§–∞–∫—Ç: 0       




–û—Ü–µ–Ω–∫–∞ RND Forest:
TRAIN ROC-AUC: 0.8132436323257733
TEST ROC-AUC: 0.6863894701490608

üîç –ú–ê–¢–†–ò–¶–ê –û–®–ò–ë–û–ö:
         –ü—Ä–æ–≥–Ω–æ–∑: 0  –ü—Ä–æ–≥–Ω–æ–∑: 1
–§–∞–∫—Ç: 0         931         104
–§–∞–∫—Ç: 1         197         177

–û—Ü–µ–Ω–∫–∞ XGB Classifier:
TRAIN ROC-AUC: 0.7980948500876963
TEST ROC-AUC: 0.7445826551964659

üîç –ú–ê–¢–†–ò–¶–ê –û–®–ò–ë–û–ö:
         –ü—Ä–æ–≥–Ω–æ–∑: 0  –ü—Ä–æ–≥–Ω–æ–∑: 1
–§–∞–∫—Ç: 0         819         216
–§–∞–∫—Ç: 1         113         261

–û—Ü–µ–Ω–∫–∞ Gradient Boosting:
TRAIN ROC-AUC: 0.821506052158837
TEST ROC-AUC: 0.756816244284275

üîç –ú–ê–¢–†–ò–¶–ê –û–®–ò–ë–û–ö:
         –ü—Ä–æ–≥–Ω–æ–∑: 0  –ü—Ä–æ–≥–Ω–æ–∑: 1
–§–∞–∫—Ç: 0         753         282
–§–∞–∫—Ç: 1          80         294

–û—Ü–µ–Ω–∫–∞ Linear SVC:
TRAIN ROC-AUC: 0.835851418810029
TEST ROC-AUC: 0.7541424474928311

üîç –ú–ê–¢–†–ò–¶–ê –û–®–ò–ë–û–ö:
         –ü—Ä–æ–≥–Ω–æ–∑: 0  –ü—Ä–æ–≥–Ω–æ–∑: 1
–§–∞–∫—Ç: 0         753         282
–§–∞–∫—Ç: 1          82         292

–û—Ü–µ–

–ù–µ–ø–æ—Å—Ä–µ–¥—Å—Ç–≤–µ–Ω–Ω–æ –≤ —ç—Ç–æ–π –∑–∞–¥–∞—á–µ –º–Ω–µ –≤–∞–∂–µ–Ω `Recall` –Ω–∞ –ø–æ–ª–æ–∂–∏—Ç–µ–ª—å–Ω–æ–º –∫–ª–∞—Å—Å–∞, —Ç–æ –µ—Å—Ç—å –ø—Ä–æ—Ü–µ–Ω—Ç –≤–µ—Ä–Ω–æ –æ–ø—Ä–µ–¥–µ–ª–µ–Ω–Ω—ã—Ö `Churn`.

## LR L2 + LightGBM