#### В начале студент загружает модели и данные. Эти этапы можно посмотреть в предыдущих ноутбуках

In [None]:
from collections import defaultdict
import os

import psycopg
import pandas as pd
from numpy import random, array, median
import mlflow
from mlflow.utils.mlflow_tags import MLFLOW_PARENT_RUN_ID
import optuna
from optuna.integration.mlflow import MLflowCallback
from catboost import CatBoostClassifier
from scipy.stats import uniform, loguniform, randint
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import (
    roc_auc_score,
    f1_score,
    precision_score,
    recall_score,
    confusion_matrix,
    log_loss,
)

### Начнём с Optuna с параллельным логированием в MLflow

In [None]:
# Optuna study: storage URL and the name the study is stored under.
STUDY_NAME = "student_model"
STUDY_DB_NAME = "sqlite:///local.study.db"

# MLflow: experiment, run and registered-model names.
REGISTRY_MODEL_NAME = "student_model"
RUN_NAME = "fs"
EXPERIMENT_NAME = "churn_nikolaistepanov"

# MLflow tracking server address.
TRACKING_SERVER_PORT = 5000
TRACKING_SERVER_HOST = "127.0.0.1"

In [None]:
# S3 endpoint and credentials exported as environment variables
# (the values are placeholders to be filled in).
os.environ.update(
    {
        "MLFLOW_S3_ENDPOINT_URL": "...",
        "AWS_ACCESS_KEY_ID": "...",
        "AWS_SECRET_ACCESS_KEY": "...",
    }
)

In [None]:
# Example objective definition for the Optuna study
def objective(trial: optuna.Trial) -> float:
    """Cross-validate a CatBoost classifier for one Optuna trial.

    Samples ``learning_rate``, ``depth``, ``l2_leaf_reg`` and
    ``random_strength`` from the trial, then fits and scores the model on
    each fold of a 2-fold stratified split of the module-level ``X_train``
    and ``y_train`` (indexed with ``.iloc``, so presumably pandas objects
    prepared in an earlier cell -- confirm).

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Median ROC AUC across the folds. The fold medians of the other
        metrics (``err1``, ``err2``, ``precision``, ``recall``, ``f1``,
        ``logloss``) are stored on the trial as user attributes.
    """
    param = {
        "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.1, log=True),
        "depth": trial.suggest_int("depth", 1, 12),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 0.1, 5),
        "random_strength": trial.suggest_float("random_strength", 0.1, 5),
        "loss_function": "Logloss",
        "task_type": "CPU",
        "random_seed": 0,
        "iterations": 300,
        "verbose": False,
    }

    model = CatBoostClassifier(**param)

    skf = StratifiedKFold(n_splits=2)

    metrics = defaultdict(list)
    for train_index, val_index in skf.split(X_train, y_train):
        train_x = X_train.iloc[train_index]
        train_y = y_train.iloc[train_index]
        val_x = X_train.iloc[val_index]
        val_y = y_train.iloc[val_index]

        model.fit(train_x, train_y)

        prediction = model.predict(val_x)
        probas = model.predict_proba(val_x)[:, 1]

        # For a binary problem ravel() yields (tn, fp, fn, tp); with
        # normalize='all' each entry is a share of all validation samples.
        # Type I error is the false-positive share, type II error is the
        # false-negative share.
        _, err1, err2, _ = confusion_matrix(val_y, prediction, normalize='all').ravel()

        metrics["err1"].append(err1)
        metrics["err2"].append(err2)
        metrics["auc"].append(roc_auc_score(val_y, probas))
        metrics["precision"].append(precision_score(val_y, prediction))
        metrics["recall"].append(recall_score(val_y, prediction))
        metrics["f1"].append(f1_score(val_y, prediction))
        # Log loss is defined on predicted probabilities, not on hard labels.
        metrics["logloss"].append(log_loss(val_y, probas))

    # Aggregate every metric over the folds; plain floats keep the values
    # serialisable by the study storage.
    summary = {name: float(median(values)) for name, values in metrics.items()}

    # Keep the secondary metrics with the trial instead of discarding them.
    for name, value in summary.items():
        if name != "auc":
            trial.set_user_attr(name, value)

    return summary["auc"]

In [None]:
# Point the MLflow client at the same tracking server that the Optuna
# callback uses, so the parent run and the per-trial runs share one store.
tracking_uri = f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}"
mlflow.set_tracking_uri(tracking_uri)

# get_experiment_by_name returns None for an unknown experiment; create it
# in that case instead of failing with AttributeError on `.experiment_id`.
experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
if experiment is None:
    experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)
else:
    experiment_id = experiment.experiment_id

# Open (and immediately close) a parent run; only its id is needed so the
# trial runs can be tagged as its children.
with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    run_id = run.info.run_id

# Callback passed to study.optimize; each run it starts goes into the same
# experiment and carries the parent-run tag.
mlflc = MLflowCallback(
    tracking_uri=tracking_uri,
    metric_name="AUC",
    create_experiment=False,
    mlflow_kwargs={
        "experiment_id": experiment_id,
        "tags": {MLFLOW_PARENT_RUN_ID: run_id},
    },
)

In [None]:
# Create the study in persistent storage. load_if_exists=True lets this cell
# be re-run: an existing study with the same name is resumed instead of
# raising DuplicatedStudyError.
study = optuna.create_study(
    sampler=optuna.samplers.TPESampler(),
    direction="maximize",
    study_name=STUDY_NAME,
    storage=STUDY_DB_NAME,
    load_if_exists=True,
)

# Run up to 10 trials or 600 seconds, whichever comes first; the MLflow
# callback is invoked after each trial.
study.optimize(objective, n_trials=10, timeout=600, callbacks=[mlflc])

In [None]:
# Report how many trials the study holds and the best parameter set found.
best_params = study.best_params
print(
    f"Number of finished trials: {len(study.trials)}",
    f"Best params: {best_params}",
    sep="\n",
)

### Пример для [Random Search](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html)

In [None]:
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform
# Randomized search over the regularisation strength and penalty type of a
# logistic regression fitted on the iris dataset.
iris = load_iris()
logistic = LogisticRegression(
    solver="saga",
    tol=1e-2,
    max_iter=200,
    random_state=0,
)
distributions = {
    "C": uniform(loc=0, scale=4),
    "penalty": ["l2", "l1"],
}
clf = RandomizedSearchCV(logistic, distributions, random_state=0)
search = clf.fit(iris.data, iris.target)
search.best_params_

### Пример для [Halving Search](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.HalvingGridSearchCV.html)

In [None]:
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.experimental import enable_halving_search_cv  # noqa
from sklearn.model_selection import HalvingGridSearchCV


# Successive-halving grid search for a random forest on the iris dataset;
# the number of trees is the resource that grows between iterations.
X, y = load_iris(return_X_y=True)
clf = RandomForestClassifier(random_state=0)
param_grid = dict(max_depth=[3, None], min_samples_split=[5, 10])
search = HalvingGridSearchCV(
    clf,
    param_grid,
    resource="n_estimators",
    max_resources=10,
    random_state=0,
)
search = search.fit(X, y)
search.best_params_

### Студент делает минимум два варианта отбора признаков и затем логирует модель

In [None]:
pip_requirements = "../../requirements.txt"

# Use the tracking server given by TRACKING_SERVER_HOST / TRACKING_SERVER_PORT
# explicitly, so this cell does not depend on an earlier cell having set it.
mlflow.set_tracking_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")

# get_experiment_by_name returns None for an unknown experiment; create it
# in that case instead of failing with AttributeError on `.experiment_id`.
experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
if experiment is None:
    experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)
else:
    experiment_id = experiment.experiment_id

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    run_id = run.info.run_id

    # NOTE(review): `metrics` is not defined at module level in the visible
    # code (it only exists as a local inside `objective`); the student is
    # presumably expected to build a {name: value} dict first -- confirm.
    mlflow.log_metrics(metrics)
    model_final = ...  # placeholder: the student's final model goes here
    model_info = mlflow.catboost.log_model(
        cb_model=model_final,
        artifact_path="models",
        registered_model_name=REGISTRY_MODEL_NAME,
        await_registration_for=60,  # optional
        pip_requirements=pip_requirements,
        # signature=signature,
        # input_example=input_example,
        # metadata=metadata,
    )