In [58]:
import os

import psycopg
import pandas as pd
from numpy import random
import mlflow
from catboost import CatBoostClassifier
from scipy.stats import uniform, loguniform, randint
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import (
    roc_auc_score,
    f1_score,
    precision_score,
    recall_score,
    confusion_matrix,
    log_loss,
)

In [22]:
random.seed(0)

##### 1. Определим глобальные перменные

In [23]:
TABLE_NAME = "users_churn"

TRACKING_SERVER_HOST = "127.0.0.1"
TRACKING_SERVER_PORT = 5000

EXPERIMENT_NAME = "churn_nikolaistepanov"
RUN_NAME = "model_random_search"
REGISTRY_MODEL_NAME = "churn_model_nikolaistepanov_prepared"

##### 2. Заберем данные

In [24]:
connection = {"sslmode": "verify-full", "target_session_attrs": "read-write"}
postgres_credentials = {
    "host": os.getenv("POSTGRES_HOST"),
    "port": os.getenv("POSTGRES_PORT"),
    "dbname": os.getenv("POSTGRES_DBNAME"),
    "user": os.getenv("POSTGRES_USER"),
    "password": os.getenv("POSTGRES_PASSWORD"),
}

connection.update(postgres_credentials)

In [25]:
with psycopg.connect(**connection) as conn:

    with conn.cursor() as cur:
        cur.execute(f"SELECT * FROM {TABLE_NAME}")
        data = cur.fetchall()
        columns = [col[0] for col in cur.description]

df = pd.DataFrame(data, columns=columns)

##### 3. Разделим данные на train, test по `begin_date`

In [26]:
features = ["monthly_charges", "total_charges", "senior_citizen"]
target = "target"

split_column = "begin_date"
stratify_column = ["type"]
test_size = 0.2

In [27]:
df = df.sort_values(by=[split_column])

In [28]:
X_train, X_test, y_train, y_test = train_test_split(
    df[features],
    df[target],
    test_size=test_size,
    shuffle=False,
)

In [29]:
print(f"Размер выборки для обучения: {X_train.shape}")
print(f"Размер выборки для теста: {X_test.shape}")

Размер выборки для обучения: (5634, 3)
Размер выборки для теста: (1409, 3)


##### 4. Определим гиперпараметры модели, которые хотим подобрать, и диапазоны возможных значений для них

In [59]:
loss_function = "Logloss"
task_type = 'CPU'
random_seed = 0
iterations = 300
verbose = False

param_distributions = {
    "learning_rate": loguniform(0.001, 0.1),
    "depth": randint(2, 5),
    "l2_leaf_reg": uniform(0.1, 5),
    "random_strength": uniform(0.1, 5),
}

In [60]:
model = CatBoostClassifier(
    iterations=iterations, 
    loss_function=loss_function, 
    random_seed=random_seed, 
    task_type=task_type,
    verbose=verbose,
)

##### 5. Определим объект класса `RandomizedSearchCV`

In [61]:
cv = RandomizedSearchCV(
    estimator=model, 
    param_distributions=param_distributions, 
    n_jobs=-1, 
    cv=2,
    n_iter=20,
)

##### 6. Запустим побор гиперпараметров

In [62]:
%%time

clf = cv.fit(X_train, y_train)

CPU times: user 452 ms, sys: 463 ms, total: 914 ms
Wall time: 2.17 s


##### 7. Соберем аналитику по подбору и залогируем в `MLFlow`

In [63]:
os.environ["MLFLOW_S3_ENDPOINT_URL"] = "..."
os.environ["AWS_ACCESS_KEY_ID"] = "..."
os.environ["AWS_SECRET_ACCESS_KEY"] = "..."

In [64]:
mlflow.set_tracking_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")
mlflow.set_registry_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")

In [65]:
cv_results = pd.DataFrame(clf.cv_results_)

best_params = clf.best_params_

In [66]:
cv_results.head(2)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_depth,param_l2_leaf_reg,param_learning_rate,param_random_strength,params,split0_test_score,split1_test_score,mean_test_score,std_test_score,rank_test_score
0,0.406601,0.048693,0.005595,0.002256,4,3.165317,0.063782,0.596402,"{'depth': 4, 'l2_leaf_reg': 3.165317289420662,...",0.785942,0.371672,0.578807,0.207135,19
1,0.358771,0.007257,0.005258,0.001884,3,3.433834,0.021942,1.151913,"{'depth': 3, 'l2_leaf_reg': 3.4338335772283384...",0.785942,0.478523,0.632233,0.15371,14


In [67]:
model = CatBoostClassifier(
    iterations=iterations, 
    loss_function=loss_function, 
    random_seed=random_seed, 
    task_type=task_type,
    verbose=verbose,
    **best_params,
)

In [68]:
model.fit(X_train, y_train)

<catboost.core.CatBoostClassifier at 0x288c711d0>

In [69]:
prediction = model.predict(X_test)
probas = model.predict_proba(X_test)[:, 1]

In [70]:
metrics = {}

_, err1, _, err2 = confusion_matrix(y_test, prediction, normalize='all').ravel()
auc = roc_auc_score(y_test, probas)
precision = precision_score(y_test, prediction)
recall = recall_score(y_test, prediction)
f1 = f1_score(y_test, prediction)
logloss = log_loss(y_test, prediction)

metrics["err1"] = err1
metrics["err2"] = err2
metrics["auc"] = auc
metrics["precision"] = precision
metrics["recall"] = recall
metrics["f1"] = f1
metrics["logloss"] = logloss

metrics["mean_fit_time"] = cv_results["mean_fit_time"].mean()
metrics["std_fit_time"] = cv_results["std_fit_time"].mean()
metrics["mean_test_score"] = cv_results["mean_test_score"].mean()
metrics["std_test_score"] = cv_results["std_test_score"].mean()

metrics["best_score"] = clf.best_score_

best_params = clf.best_params_

In [74]:
metrics

{'err1': 0.0511000709723208,
 'err2': 0.11994322214336409,
 'auc': 0.7089373592291113,
 'precision': 0.7012448132780082,
 'recall': 0.2549019607843137,
 'f1': 0.3738938053097345,
 'logloss': 14.478855797189713,
 'mean_fit_time': 0.3171637415885925,
 'std_fit_time': 0.027821803092956544,
 'mean_test_score': 0.6784256301029464,
 'std_test_score': 0.10751686190983319,
 'best_score': 0.7518636847710329}

In [73]:
best_params

{'depth': 4,
 'l2_leaf_reg': 3.0325646740504157,
 'learning_rate': 0.0010970213817508296,
 'random_strength': 4.2447001460868155}

In [72]:
pip_requirements="../requirements.txt"
signature = mlflow.models.infer_signature(X_test, prediction)
input_example = X_test[:10]

experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    run_id = run.info.run_id
    
    mlflow.log_metrics(metrics)
    mlflow.log_params(best_params)
    cv_info = mlflow.sklearn.log_model(cv, artifact_path="cv")
    model_info = mlflow.catboost.log_model(
        cb_model=model, 
        artifact_path="models",
        signature=signature,
        input_example=input_example,
        registered_model_name=REGISTRY_MODEL_NAME,
        await_registration_for=60,
        pip_requirements=pip_requirements,
    )

  inputs = _infer_schema(model_input) if model_input is not None else None
Registered model 'churn_model_nikolaistepanov_prepared' already exists. Creating a new version of this model...
2023/10/23 15:34:52 INFO mlflow.tracking._model_registry.client: Waiting up to 60 seconds for model version to finish creation. Model name: churn_model_nikolaistepanov_prepared, version 5
Created version '5' of model 'churn_model_nikolaistepanov_prepared'.
