##### 1. Установим новые библиотеки

In [1]:
# %pip install optuna==3.4.0

In [2]:
from collections import defaultdict
import os

import psycopg
import pandas as pd
from numpy import random, array, median
import mlflow
from mlflow.utils.mlflow_tags import MLFLOW_PARENT_RUN_ID
import optuna
from optuna.integration.mlflow import MLflowCallback
from catboost import CatBoostClassifier
from scipy.stats import uniform, loguniform, randint
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import (
    roc_auc_score,
    f1_score,
    precision_score,
    recall_score,
    confusion_matrix,
    log_loss,
)


* 'schema_extra' has been renamed to 'json_schema_extra'


In [3]:
# Seed NumPy's global random generator (`random` is imported from numpy above).
random.seed(0)

##### 1. Определим глобальные переменные

In [4]:
# Table that is read in full from PostgreSQL.
TABLE_NAME = "users_churn"

# Host and port from which the MLflow tracking/registry URI is built.
TRACKING_SERVER_HOST = "127.0.0.1"
TRACKING_SERVER_PORT = 5000

# MLflow experiment to log into and the name of the parent run of the search.
EXPERIMENT_NAME = "churn_nikolaistepanov"
RUN_NAME = "model_bayesian_search"

# Optuna storage URL (a local SQLite file) and the study name persisted in it.
STUDY_DB_NAME = "sqlite:///local.study.db"
STUDY_NAME = "churn_model"

##### 2. Заберем данные

In [5]:
# Credentials are taken from the environment; a missing variable yields None.
postgres_credentials = dict(
    host=os.getenv("POSTGRES_HOST"),
    port=os.getenv("POSTGRES_PORT"),
    dbname=os.getenv("POSTGRES_DBNAME"),
    user=os.getenv("POSTGRES_USER"),
    password=os.getenv("POSTGRES_PASSWORD"),
)

# Fixed connection options first, then the credentials merged on top.
connection = {
    "sslmode": "verify-full",
    "target_session_attrs": "read-write",
    **postgres_credentials,
}

In [6]:
# Read the whole table into memory; the cursor and the connection context are
# both exited before the DataFrame is assembled.
with psycopg.connect(**connection) as conn, conn.cursor() as cur:
    cur.execute(f"SELECT * FROM {TABLE_NAME}")
    data = cur.fetchall()
    # The first item of every description entry is used as the column name.
    columns = [column_meta[0] for column_meta in cur.description]

df = pd.DataFrame(data, columns=columns)

##### 3. Подключимся к MLFlow

In [7]:
# Object-storage settings exported through the process environment.
# NOTE(review): the "..." values look like redacted placeholders; real values
# should come from the environment or a secrets store, not from the notebook.
os.environ["MLFLOW_S3_ENDPOINT_URL"] = "..."
os.environ["AWS_ACCESS_KEY_ID"] = "..."
os.environ["AWS_SECRET_ACCESS_KEY"] = "..."

In [8]:
# Tracking and model registry both point at the same MLflow server URL.
tracking_server_uri = f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}"
mlflow.set_tracking_uri(tracking_server_uri)
mlflow.set_registry_uri(tracking_server_uri)

##### 4. Разделим данные на train, test по `begin_date`

In [9]:
# Model inputs and the label column.
features = ["monthly_charges", "total_charges", "senior_citizen"]
target = "target"

# Column the frame is ordered by before the train/test split.
split_column = "begin_date"
# NOTE(review): stratify_column is not referenced by the visible cells — confirm it is still needed.
stratify_column = ["type"]
# Fraction of rows held out as the test set.
test_size = 0.2

In [10]:
# Order rows by `split_column` so that the unshuffled split takes the last rows as the test set.
df = df.sort_values(by=[split_column])

In [11]:
# shuffle=False keeps the current row order: the first (1 - test_size) share of
# rows becomes the training set and the remaining rows become the test set.
X_train, X_test, y_train, y_test = train_test_split(
    df[features],
    df[target],
    test_size=test_size,
    shuffle=False,
)

In [12]:
# Report the (rows, columns) shape of each partition as a sanity check.
print(f"Размер выборки для обучения: {X_train.shape}")
print(f"Размер выборки для теста: {X_test.shape}")

Размер выборки для обучения: (5634, 3)
Размер выборки для теста: (1409, 3)


##### 5. Определим функцию оптимизации

In [13]:
def objective(trial: optuna.Trial) -> float:
    """Optuna objective: cross-validated ROC AUC of a CatBoost classifier.

    Hyperparameters are sampled from ``trial``; the model is then fitted on
    each stratified fold of the module-level ``X_train`` / ``y_train`` and
    evaluated on the held-out part of that fold.

    Args:
        trial: Optuna trial used to sample hyperparameters. The median of
            every collected metric is stored on it as a user attribute
            (``err1``, ``err2``, ``auc``, ``precision``, ``recall``, ``f1``,
            ``logloss``).

    Returns:
        Median ROC AUC across the validation folds.
    """
    param = {
        "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.1, log=True),
        "depth": trial.suggest_int("depth", 1, 12),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 0.1, 5),
        "random_strength": trial.suggest_float("random_strength", 0.1, 5),
        "loss_function": "Logloss",
        "task_type": "CPU",
        "random_seed": 0,
        "iterations": 300,
        "verbose": False,
    }

    model = CatBoostClassifier(**param)

    skf = StratifiedKFold(n_splits=2)

    metrics = defaultdict(list)
    for train_index, val_index in skf.split(X_train, y_train):
        train_x = X_train.iloc[train_index]
        train_y = y_train.iloc[train_index]
        val_x = X_train.iloc[val_index]
        val_y = y_train.iloc[val_index]

        model.fit(train_x, train_y)

        prediction = model.predict(val_x)
        probas = model.predict_proba(val_x)[:, 1]

        # A raveled binary confusion matrix is ordered (tn, fp, fn, tp):
        # err1 is the false-positive share, err2 the false-negative share.
        _, err1, err2, _ = confusion_matrix(val_y, prediction, normalize="all").ravel()

        metrics["err1"].append(err1)
        metrics["err2"].append(err2)
        metrics["auc"].append(roc_auc_score(val_y, probas))
        # zero_division=0 yields the same 0.0 as the default when no positives
        # are predicted, but without emitting a warning for every fold.
        metrics["precision"].append(precision_score(val_y, prediction, zero_division=0))
        metrics["recall"].append(recall_score(val_y, prediction))
        metrics["f1"].append(f1_score(val_y, prediction))
        # Log loss is defined on predicted probabilities, not on hard labels.
        metrics["logloss"].append(log_loss(val_y, probas))

    # Aggregate every metric across folds and keep the values with the trial
    # instead of discarding them.
    aggregated = {name: float(median(array(values))) for name, values in metrics.items()}
    for name, value in aggregated.items():
        trial.set_user_attr(name, value)

    return aggregated["auc"]

##### 6. Запустим подбор гиперпараметров
- в начале создаем общий `run` для сессии подбора гиперпараметров
- инициализируем исследование (`study`), указывая:
    - какой алгоритм оптимизации использовать
    - максимизировать или минимизировать нашу метрику
    - `storage` для хранения данных об обучении

In [14]:
# Resolve the experiment, creating it on first use: get_experiment_by_name
# returns None for an unknown name, so reading `.experiment_id` directly from
# its result would fail with AttributeError.
experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
if experiment is None:
    experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)
else:
    experiment_id = experiment.experiment_id

# The parent run is opened only to obtain its id; the runs created by the
# callback are tagged with that id as their parent.
with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    run_id = run.info.run_id

# Callback handed to `study.optimize`; the objective value is recorded under
# the metric name "AUC".
mlflc = MLflowCallback(
    tracking_uri=f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}",
    metric_name="AUC",
    create_experiment=False,
    mlflow_kwargs = {
        "experiment_id": experiment_id, 
        "tags": {MLFLOW_PARENT_RUN_ID: run_id}
    }
)

  mlflc = MLflowCallback(


In [15]:
# The study is persisted in STUDY_DB_NAME, so a second execution of this cell
# (e.g. Restart & Run All) would hit an already existing study of the same
# name; load_if_exists=True reuses it instead of raising DuplicatedStudyError.
# The sampler receives its own seed so that the sequence of suggested
# hyperparameters is reproducible.
study = optuna.create_study(
    sampler=optuna.samplers.TPESampler(seed=0),
    direction="maximize",
    study_name=STUDY_NAME,
    storage=STUDY_DB_NAME,
    load_if_exists=True,
)

# Stop after 10 trials or 600 seconds, whichever comes first.
study.optimize(objective, n_trials=10, timeout=600, callbacks=[mlflc],)

[I 2023-10-24 15:07:32,736] A new study created in RDB with name: churn_model
  _warn_prf(average, modifier, msg_start, len(result))
[I 2023-10-24 15:07:33,371] Trial 0 finished with value: 0.7557831615372861 and parameters: {'learning_rate': 0.0033036940127322287, 'depth': 7, 'l2_leaf_reg': 0.782998040417358, 'random_strength': 2.047038605997796}. Best is trial 0 with value: 0.7557831615372861.
  _warn_prf(average, modifier, msg_start, len(result))
[I 2023-10-24 15:07:35,294] Trial 1 finished with value: 0.8157934119686224 and parameters: {'learning_rate': 0.030985469108463864, 'depth': 3, 'l2_leaf_reg': 4.3912842977807856, 'random_strength': 4.420541317229445}. Best is trial 1 with value: 0.8157934119686224.
  _warn_prf(average, modifier, msg_start, len(result))
[I 2023-10-24 15:07:37,181] Trial 2 finished with value: 0.8059932648980886 and parameters: {'learning_rate': 0.06612073619991429, 'depth': 2, 'l2_leaf_reg': 1.4381690694349036, 'random_strength': 2.760524313457521}. Best is 

In [16]:
# Summarize the search so far: number of trials stored in the study and the
# parameter set of its best trial.
best_params = study.best_params
print(f"Number of finished trials: {len(study.trials)}")
print(f"Best params: {best_params}")

Number of finished trials: 10
Best params: {'learning_rate': 0.030985469108463864, 'depth': 3, 'l2_leaf_reg': 4.3912842977807856, 'random_strength': 4.420541317229445}


##### 7. Можем продолжить наше обучение с той точки, где мы остановились

In [17]:
# Everything after "///" in the storage URL is the path of the SQLite file.
study_db_path = STUDY_DB_NAME.split("///", 1)[1]
assert os.path.exists(study_db_path)

# Re-open the persisted study by name from the same storage.
loaded_study = optuna.load_study(storage=STUDY_DB_NAME, study_name=STUDY_NAME)

# The reloaded study must contain exactly the trials of the in-memory one.
assert len(loaded_study.trials) == len(study.trials)

In [18]:
# Continue the reloaded study with up to 10 more trials (or 600 seconds),
# reusing the same objective and MLflow callback.
loaded_study.optimize(objective, n_trials=10, timeout=600, callbacks=[mlflc],)

  _warn_prf(average, modifier, msg_start, len(result))
[I 2023-10-24 15:07:55,742] Trial 10 finished with value: 0.8109966290207813 and parameters: {'learning_rate': 0.02421741839725348, 'depth': 4, 'l2_leaf_reg': 4.7850523230821205, 'random_strength': 4.999843567352545}. Best is trial 1 with value: 0.8157934119686224.
  _warn_prf(average, modifier, msg_start, len(result))
[I 2023-10-24 15:07:57,716] Trial 11 finished with value: 0.817358179191067 and parameters: {'learning_rate': 0.025089665785433292, 'depth': 4, 'l2_leaf_reg': 4.874594167919647, 'random_strength': 4.987681820966244}. Best is trial 11 with value: 0.817358179191067.
  _warn_prf(average, modifier, msg_start, len(result))
[I 2023-10-24 15:07:59,667] Trial 12 finished with value: 0.8120157185512282 and parameters: {'learning_rate': 0.024889330634369492, 'depth': 4, 'l2_leaf_reg': 4.956936565456401, 'random_strength': 4.132477060855784}. Best is trial 11 with value: 0.817358179191067.
  _warn_prf(average, modifier, msg_sta

In [19]:
# Summarize the continued search: trial count and best parameters of the reloaded study.
new_best_params = loaded_study.best_params
print(f"Number of finished trials: {len(loaded_study.trials)}")
print(f"Best params: {new_best_params}")

Number of finished trials: 20
Best params: {'learning_rate': 0.028645594831218812, 'depth': 3, 'l2_leaf_reg': 3.1452137749925777, 'random_strength': 3.4437692936077338}
