In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import joblib
import warnings

In [2]:
import lightgbm as lgb
import optuna
from optuna.integration import LightGBMPruningCallback

In [3]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import (
    roc_auc_score,
    f1_score,
    recall_score,
    precision_score,
    classification_report,
    confusion_matrix,
    ConfusionMatrixDisplay,
    RocCurveDisplay,
    PrecisionRecallDisplay,
    average_precision_score
)

In [4]:
warnings.filterwarnings(
    "ignore",
    category=UserWarning,
    message="The reported value is ignored because"
)

In [5]:
DATA_DIR = '../data/processed/'
X_TRAIN_PATH = os.path.join(DATA_DIR, 'X_train_processed.csv')
X_TEST_PATH = os.path.join(DATA_DIR, 'X_test_processed.csv')
Y_TRAIN_PATH = os.path.join(DATA_DIR, 'y_train.csv')
Y_TEST_PATH = os.path.join(DATA_DIR, 'y_test.csv')

In [6]:
OPTUNA_DB_URL = 'sqlite:///optuna_lgbm_churn.db'
STUDY_NAME = 'lgbm_churn_optimization_v1'
N_TRIALS = 1000
N_SPLITS_CV = 5
EARLY_STOPPING_ROUNDS = 50
RANDOM_SEED = 42

In [7]:
MODEL_DIR = '../models/'
os.makedirs(MODEL_DIR, exist_ok=True)
MODEL_PATH = os.path.join(MODEL_DIR, 'lgbm_churn_model.joblib')

In [8]:
print("--- Загрузка обработанных данных ---")
try:
    X_train = pd.read_csv(X_TRAIN_PATH)
    X_test = pd.read_csv(X_TEST_PATH)
    y_train = pd.read_csv(Y_TRAIN_PATH).squeeze('columns')
    y_test = pd.read_csv(Y_TEST_PATH).squeeze('columns')

    if not isinstance(y_train, pd.Series):
        y_train = y_train[y_train.columns[0]]
    if not isinstance(y_test, pd.Series):
        y_test = y_test[y_test.columns[0]]

    print("Данные успешно загружены.")
    print(f"Размер X_train: {X_train.shape}")
    print(f"Размер X_test: {X_test.shape}")
    print(f"Тип y_train: {type(y_train)}")

except FileNotFoundError as e:
    print(f"Ошибка: Не найден файл. Проверьте пути. {e}")
except Exception as e:
    print(f"Произошла ошибка при загрузке данных: {e}")

--- Загрузка обработанных данных ---
Данные успешно загружены.
Размер X_train: (8000, 14)
Размер X_test: (2000, 14)
Тип y_train: <class 'pandas.core.series.Series'>


In [9]:
def evaluate_model(model, X_test, y_test, model_name="Model"):
    """Оценивает модель и выводит основные метрики."""
    print(f"\n--- Оценка модели: {model_name} ---")
    try:
        if hasattr(model, "predict_proba"):
            y_pred_proba = model.predict_proba(X_test)[:, 1]
            roc_auc = roc_auc_score(y_test, y_pred_proba)
            pr_auc = average_precision_score(y_test, y_pred_proba)
            print(f"ROC AUC: {roc_auc:.4f}")
            print(f"PR AUC (Average Precision): {pr_auc:.4f}")
        else:
            y_pred_proba = None
            roc_auc = None
            pr_auc = None
            
            print("ROC AUC / PR AUC не могут быть посчитаны.")

        y_pred = model.predict(X_test)
        f1 = f1_score(y_test, y_pred, pos_label=1, zero_division=0)
        recall = recall_score(y_test, y_pred, pos_label=1, zero_division=0)
        precision = precision_score(y_test, y_pred, pos_label=1, zero_division=0)

        print(f"F1 Score (Class 1): {f1:.4f}")
        print(f"Recall (Class 1): {recall:.4f}")
        print(f"Precision (Class 1): {precision:.4f}")
        print("\nClassification Report:")
        print(classification_report(y_test, y_pred, target_names=['Лоялен (0)', 'Отток (1)'], zero_division=0))
        print("Confusion Matrix:")
        cm = confusion_matrix(y_test, y_pred)
        print(cm)
        
        return {'roc_auc': roc_auc, 'f1_score_1': f1, 'recall_1': recall, 'precision_1': precision, 'pr_auc': pr_auc}
    
    except Exception as e:
        print(f"Ошибка при оценке модели {model_name}: {e}")
        return None

In [10]:
def objective(trial):
    """Функция, которую Optuna будет минимизировать/максимизировать."""
    params = {
        'objective': 'binary',
        'metric': 'auc',
        'boosting_type': 'gbdt',
        'n_estimators': trial.suggest_int('n_estimators', 500, 6000, step=100),
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.03, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 10, 80),
        'max_depth': trial.suggest_int('max_depth', 2, 7),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 10.0, log=True),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.4, 1.0),
        'subsample': trial.suggest_float('subsample', 0.4, 1.0),
        'subsample_freq': trial.suggest_int('subsample_freq', 1, 7),
        'min_child_samples': trial.suggest_int('min_child_samples', 20, 150),
        'class_weight': 'balanced',
        'random_state': RANDOM_SEED,
        'n_jobs': -1,
        'verbose': -1
    }

    cv = StratifiedKFold(n_splits=N_SPLITS_CV, shuffle=True, random_state=RANDOM_SEED)
    cv_scores = []

    for fold, (train_idx, val_idx) in enumerate(cv.split(X_train, y_train)):
        X_train_fold, X_val_fold = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_train_fold, y_val_fold = y_train.iloc[train_idx], y_train.iloc[val_idx]

        pruning_callback = LightGBMPruningCallback(trial, 'auc')

        model = lgb.LGBMClassifier(**params)
        try:
            model.fit(X_train_fold, y_train_fold,
                      eval_set=[(X_val_fold, y_val_fold)],
                      eval_metric='auc',
                      callbacks=[lgb.early_stopping(EARLY_STOPPING_ROUNDS, verbose=False),
                                 pruning_callback]
                     )
            preds_proba = model.predict_proba(X_val_fold)[:, 1]
            fold_auc = roc_auc_score(y_val_fold, preds_proba)
            cv_scores.append(fold_auc)
        except optuna.TrialPruned as e:
             raise e
        except Exception as e:
            print(f"Ошибка в Trial {trial.number}, Fold {fold}: {e}")
            return 0.0

    if not cv_scores:
        return 0.0

    mean_auc = np.mean(cv_scores)
    return mean_auc

In [11]:
print(f"\n--- Запуск Optuna для исследования '{STUDY_NAME}' ---")
print(f"Хранилище: {OPTUNA_DB_URL}")
print(f"Количество испытаний: {N_TRIALS}")
print(f"Количество фолдов CV: {N_SPLITS_CV}")
print(f"Метрика оптимизации: ROC AUC (Maximize)")

storage = optuna.storages.RDBStorage(url=OPTUNA_DB_URL)

study = optuna.create_study(
    storage=storage,
    study_name=STUDY_NAME,
    direction='maximize',
    load_if_exists=True
)

try:
    study.optimize(objective, n_trials=N_TRIALS, timeout=None)
except KeyboardInterrupt:
    print("Оптимизация прервана пользователем.")


--- Запуск Optuna для исследования 'lgbm_churn_optimization_v1' ---
Хранилище: sqlite:///optuna_lgbm_churn.db
Количество испытаний: 1000
Количество фолдов CV: 5
Метрика оптимизации: ROC AUC (Maximize)


[I 2025-03-26 17:03:41,911] A new study created in RDB with name: lgbm_churn_optimization_v1
[I 2025-03-26 17:04:13,674] Trial 0 finished with value: 0.8636621047663994 and parameters: {'n_estimators': 3400, 'learning_rate': 0.014167679979322917, 'num_leaves': 20, 'max_depth': 6, 'reg_alpha': 0.2560838647562468, 'reg_lambda': 0.012980554682202477, 'colsample_bytree': 0.9810306059256777, 'subsample': 0.6160566256653015, 'subsample_freq': 3, 'min_child_samples': 69}. Best is trial 0 with value: 0.8636621047663994.
[I 2025-03-26 17:05:44,937] Trial 1 finished with value: 0.8615018636052817 and parameters: {'n_estimators': 5600, 'learning_rate': 0.021415437040384924, 'num_leaves': 70, 'max_depth': 2, 'reg_alpha': 0.41094741485257963, 'reg_lambda': 6.522994159748028, 'colsample_bytree': 0.7602407786445065, 'subsample': 0.8621636193160056, 'subsample_freq': 1, 'min_child_samples': 120}. Best is trial 0 with value: 0.8636621047663994.
[I 2025-03-26 17:06:36,408] Trial 2 finished with value: 0

In [12]:
print("\n--- Результаты оптимизации Optuna ---")
print(f"Количество завершенных испытаний: {len(study.trials)}")

best_trial = study.best_trial
print(f"Лучшее испытание (Trial {best_trial.number}):")
print(f"  Value (ROC AUC на CV): {best_trial.value:.5f}")
print("  Best Parameters:")
for key, value in best_trial.params.items():
    print(f"    {key}: {value}")


--- Результаты оптимизации Optuna ---
Количество завершенных испытаний: 1000
Лучшее испытание (Trial 136):
  Value (ROC AUC на CV): 0.86537
  Best Parameters:
    n_estimators: 4400
    learning_rate: 0.0231336496056604
    num_leaves: 21
    max_depth: 7
    reg_alpha: 0.013084385071543842
    reg_lambda: 0.40470051985952027
    colsample_bytree: 0.7837665479281083
    subsample: 0.8341784879213847
    subsample_freq: 4
    min_child_samples: 34


In [13]:
best_params = best_trial.params

best_params['objective'] = 'binary'
best_params['metric'] = 'auc'
best_params['boosting_type'] = 'gbdt'
best_params['random_state'] = RANDOM_SEED
best_params['n_jobs'] = -1
best_params['class_weight'] = 'balanced'

In [14]:
print("\n--- Обучение финальной модели LightGBM с лучшими параметрами ---")

final_model = lgb.LGBMClassifier(**best_params)

final_model.fit(X_train, y_train)

print("Финальная модель обучена.")


--- Обучение финальной модели LightGBM с лучшими параметрами ---
Финальная модель обучена.


In [15]:
print(f"\n--- Сохранение финальной модели в {MODEL_PATH} ---")
try:
    joblib.dump(final_model, MODEL_PATH)
    print("Модель успешно сохранена.")
except Exception as e:
    print(f"Ошибка при сохранении модели: {e}")


--- Сохранение финальной модели в ../models/lgbm_churn_model.joblib ---
Модель успешно сохранена.


In [16]:
results_lgbm = evaluate_model(final_model, X_test, y_test, "LightGBM (Optimized)")

print("\n--- Сравнение с Baseline (Logistic Regression) ---")

results_log_reg = {'roc_auc': 0.8443, 'f1_score_1': 0.5547, 'recall_1': 0.7543, 'precision_1': 0.4386, 'pr_auc': 0.6310}

comparison_data = {}
if results_lgbm:
    comparison_data["LightGBM (Optimized)"] = results_lgbm
    
if comparison_data:
    comparison_df = pd.DataFrame(comparison_data).T
    print(comparison_df[['roc_auc', 'pr_auc', 'f1_score_1', 'recall_1', 'precision_1']].round(4))
else:
    print("Не удалось сформировать таблицу сравнения.")


--- Оценка модели: LightGBM (Optimized) ---
ROC AUC: 0.8303
PR AUC (Average Precision): 0.6437
F1 Score (Class 1): 0.5576
Recall (Class 1): 0.5528
Precision (Class 1): 0.5625

Classification Report:
              precision    recall  f1-score   support

  Лоялен (0)       0.89      0.89      0.89      1593
   Отток (1)       0.56      0.55      0.56       407

    accuracy                           0.82      2000
   macro avg       0.72      0.72      0.72      2000
weighted avg       0.82      0.82      0.82      2000

Confusion Matrix:
[[1418  175]
 [ 182  225]]

--- Сравнение с Baseline (Logistic Regression) ---
                      roc_auc  pr_auc  f1_score_1  recall_1  precision_1
LightGBM (Optimized)   0.8303  0.6437      0.5576    0.5528       0.5625


In [17]:
print("\n--- Optuna Dashboard ---")
print("Оптимизация завершена или прервана.")
print("\nНаходясь в директории проекта, выполните команду:")
print(f"optuna-dashboard {OPTUNA_DB_URL}")


--- Optuna Dashboard ---
Оптимизация завершена или прервана.

Находясь в директории проекта, выполните команду:
optuna-dashboard sqlite:///optuna_lgbm_churn.db
