In [2]:
import os

import psycopg
import pandas as pd
import mlflow
from sklearn.metrics import (
    roc_auc_score,
    f1_score,
    precision_score,
    recall_score,
    confusion_matrix,
    log_loss,
)


* 'schema_extra' has been renamed to 'json_schema_extra'


##### 1. Определим глобальные переменные

In [3]:
# --- Data source ---
# Name of the database table this notebook reads.
TABLE_NAME = "users_churn"

# --- MLflow server ---
# Host and port used to build the tracking/registry URL.
TRACKING_SERVER_HOST, TRACKING_SERVER_PORT = "127.0.0.1", 5000

# --- MLflow naming ---
# Experiment the run is attached to, display name of the run, and the name
# under which the model is registered.
EXPERIMENT_NAME = "churn_nikolaistepanov"
RUN_NAME = "model_0_registry"
REGISTRY_MODEL_NAME = "churn_model_nikolaistepanov"

##### 2. Заберем данные из базы данных и сформируем `dataframe`

In [4]:
# Credentials come from the environment; a variable that is not set yields None.
postgres_credentials = {
    param: os.getenv(env_var)
    for param, env_var in (
        ("host", "POSTGRES_HOST"),
        ("port", "POSTGRES_PORT"),
        ("dbname", "POSTGRES_DBNAME"),
        ("user", "POSTGRES_USER"),
        ("password", "POSTGRES_PASSWORD"),
    )
}

# Full set of keyword arguments for psycopg.connect: fixed session options
# first, then the credentials read above.
connection = {
    "sslmode": "verify-full",
    "target_session_attrs": "read-write",
    **postgres_credentials,
}

In [5]:
# Read every row of TABLE_NAME and remember the column names reported by the cursor.
# NOTE(review): the query has no ORDER BY, while the later train/test split is
# positional — consider ordering explicitly if the split must be reproducible.
with psycopg.connect(**connection) as conn, conn.cursor() as cur:
    cur.execute(f"SELECT * FROM {TABLE_NAME}")
    data = cur.fetchall()
    # The first item of each description entry is used as the column name.
    columns = [description[0] for description in cur.description]

# Assemble the fetched rows into a dataframe with the original column names.
df = pd.DataFrame(data, columns=columns)

##### 3. Достаем модель


P.S. Сейчас, чтобы не блокироваться в ожидании модели от Вани, обучим модель самостоятельно на простых фичах и будем использовать ее
как бейзлайн.

P.P.S. В 4-м ноутбуке поменяем данные, чтобы получить другие метрики. Это нужно, чтобы потом можно было показать функционал сравнения метрик в `MLflow`.

In [7]:
%%time

from catboost import CatBoostClassifier


# Baseline classifier trained on three simple features.
model = CatBoostClassifier(max_depth=4, iterations=512, verbose=False)
features = ["monthly_charges", "total_charges", "senior_citizen"]
target = "target"

# Percentage of rows held out for testing. The split is positional: the leading
# rows form the training set and the trailing `percent`% form the test set.
percent = 40
# Index of the first test row (= number of training rows). Pure integer
# arithmetic avoids float rounding near whole-number boundaries.
train_size = df.shape[0] * (100 - percent) // 100
# Backward-compatible alias: despite its name, this value has always been the
# train/test boundary (number of training rows), not the size of the test set.
test_size = train_size

X_train, X_test = df[features][:train_size], df[features][train_size:]
y_train, y_test = df[target][:train_size], df[target][train_size:]


# The last feature ("senior_citizen") is passed to CatBoost as categorical.
model.fit(X_train, y_train, cat_features=[features[-1]])

CPU times: user 655 ms, sys: 708 ms, total: 1.36 s
Wall time: 590 ms


<catboost.core.CatBoostClassifier at 0x103e6cfd0>

##### 4. Прогоним модель на тестовых данных и залогируем метрики с моделью

In [8]:
# Predicted labels for the held-out rows.
prediction = model.predict(X_test)

# Keep column 1 of the probability matrix — presumably the positive-class
# probability (TODO confirm class order).
class_probabilities = model.predict_proba(X_test)
probas = class_probabilities[:, 1]

In [9]:
# The flattened 2x2 confusion matrix is ordered (tn, fp, fn, tp); with
# normalize="all" every entry is a fraction of all test samples.
# err1 is the false-positive share (type I error) and err2 the false-negative
# share (type II error), so the 2nd and 3rd entries are the ones to keep.
_, err1, err2, _ = confusion_matrix(y_test, prediction, normalize="all").ravel()

# Ranking quality is computed from probabilities, label metrics from hard predictions.
auc = roc_auc_score(y_test, probas)
precision = precision_score(y_test, prediction)
recall = recall_score(y_test, prediction)
f1 = f1_score(y_test, prediction)
# Log loss is defined on predicted probabilities; feeding hard 0/1 labels
# would only measure clipped, maximally confident predictions.
logloss = log_loss(y_test, probas)

# Metrics logged to MLflow for this run, keyed by metric name.
metrics = {
    "err1": err1,
    "err2": err2,
    "auc": auc,
    "precision": precision,
    "recall": recall,
    "f1": f1,
    "logloss": logloss,
}

##### 5. Создадим окружение модели:
- библиотеки: `pip_requirements`
- входные/выходные данные: `signature`
- пример входных данных: `input_example`

In [10]:
# Path to the requirements file logged with the model
# (conda_env or extra_pip_requirements could be used instead).
pip_requirements = "../requirements.txt"

# The first ten test rows are stored as the input example.
input_example = X_test[:10]

# Input/output schema inferred from the test features and the predicted labels.
signature = mlflow.models.infer_signature(X_test, prediction)

  inputs = _infer_schema(model_input) if model_input is not None else None


##### 6. Подключимся к MLflow и зарегистрируем модель

In [18]:
# S3 endpoint and access keys exported through the process environment.
# The "..." values are redacted placeholders.
# NOTE(review): real values should be supplied outside the notebook and never
# committed with it.
os.environ.update(
    {
        "MLFLOW_S3_ENDPOINT_URL": "...",
        "AWS_ACCESS_KEY_ID": "...",
        "AWS_SECRET_ACCESS_KEY": "...",
    }
)

In [12]:
# Tracking and the model registry are pointed at the same HTTP endpoint,
# so the URL is built once and reused.
server_uri = f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}"
mlflow.set_tracking_uri(server_uri)
mlflow.set_registry_uri(server_uri)

In [13]:
# get_experiment_by_name returns None for an unknown name, so create the
# experiment on first use instead of failing with AttributeError.
experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
if experiment is None:
    experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)
else:
    experiment_id = experiment.experiment_id

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    # Keep the run id available for later lookups.
    run_id = run.info.run_id

    mlflow.log_metrics(metrics)
    # Log the CatBoost model under "models" and register it under
    # REGISTRY_MODEL_NAME, waiting up to 60 seconds for the registration.
    model_info = mlflow.catboost.log_model(
        cb_model=model,
        artifact_path="models",
        signature=signature,
        input_example=input_example,
        registered_model_name=REGISTRY_MODEL_NAME,
        await_registration_for=60,
        pip_requirements=pip_requirements,
    )

Successfully registered model 'churn_model_nikolaistepanov'.
2023/10/19 17:19:32 INFO mlflow.tracking._model_registry.client: Waiting up to 60 seconds for model version to finish creation. Model name: churn_model_nikolaistepanov, version 1
Created version '1' of model 'churn_model_nikolaistepanov'.


##### 6.1. Проверим, что модель работает

In [14]:
# Round-trip check: load the model back from the URI reported by log_model
# and score the same test rows with it.
logged_model_uri = model_info.model_uri
loaded_model = mlflow.catboost.load_model(model_uri=logged_model_uri)
model_predictions = loaded_model.predict(X_test)

In [15]:
# Comparing a dtype with the builtin `int` resolves to NumPy's default integer,
# whose width depends on platform and NumPy version; checking the dtype kind
# accepts any signed/unsigned integer width.
assert model_predictions.dtype.kind in "iu", (
    f"expected integer class labels, got dtype {model_predictions.dtype}"
)

# Show the first ten predicted labels as a quick sanity check.
print(model_predictions[:10])

[0 0 1 0 1 0 0 0 1 0]
