In [14]:
import os

import mlflow
import pandas as pd
import psycopg
from catboost import CatBoostClassifier
from sklearn.metrics import (
    roc_auc_score,
    f1_score,
    precision_score,
    recall_score,
    confusion_matrix,
    log_loss,
)

##### 1. Определим глобальные переменные

In [15]:
# --- MLflow tracking / registry server (used to build the http:// URIs) ---
TRACKING_SERVER_HOST = "127.0.0.1"
TRACKING_SERVER_PORT = 5000

# --- Names under which this notebook logs its run and registers the model ---
EXPERIMENT_NAME = "churn_nikolaistepanov"
RUN_NAME = "model_0_versioning"
REGISTRY_MODEL_NAME = "churn_model_nikolaistepanov"

# --- Database table that is read into the dataframe ---
TABLE_NAME = "users_churn"

##### 2. Заберем данные из базы данных и сформируем `dataframe`

In [16]:
# Credentials are read from the environment so that no secret is stored in the notebook;
# a variable that is not set yields None for that key.
postgres_credentials = dict(
    host=os.getenv("POSTGRES_HOST"),
    port=os.getenv("POSTGRES_PORT"),
    dbname=os.getenv("POSTGRES_DBNAME"),
    user=os.getenv("POSTGRES_USER"),
    password=os.getenv("POSTGRES_PASSWORD"),
)

# Full keyword set passed to psycopg.connect(): fixed session options first,
# then the environment-provided credentials.
connection = {
    "sslmode": "verify-full",
    "target_session_attrs": "read-write",
    **postgres_credentials,
}

In [17]:
# Read the whole table in one query; both the connection and the cursor are closed
# when the `with` block exits. Column names are taken from the cursor metadata.
with psycopg.connect(**connection) as conn, conn.cursor() as cur:
    cur.execute(f"SELECT * FROM {TABLE_NAME}")
    data = cur.fetchall()
    columns = [column_meta[0] for column_meta in cur.description]

df = pd.DataFrame(data, columns=columns)

##### 3. Достаем модель


P.S. Сейчас, чтобы не блокироваться из-за модели от Вани, обучим модель самостоятельно на простых фичах и будем использовать её
как бейзлайн.

P.P.S. В 5-м ноутбуке поменяем данные, чтобы получить другие метрики. Это нужно, чтобы потом можно было показать функциональность сравнения метрик в `MLflow`.

In [18]:
%%time

from catboost import CatBoostClassifier


model = CatBoostClassifier(max_depth=4, iterations=512, verbose = False)
features = ["monthly_charges", "total_charges", "senior_citizen"]
target = "target"

percent = 60
test_size = int(df.shape[0] - df.shape[0]/100 * percent)

X_train, X_test = df[features][:test_size], df[features][test_size:]
y_train, y_test = df[target][:test_size], df[target][test_size:]


model.fit(X_train, y_train, cat_features=[features[-1]])

CPU times: user 495 ms, sys: 610 ms, total: 1.11 s
Wall time: 379 ms


<catboost.core.CatBoostClassifier at 0x14fecda50>

##### 4. Прогоним модель на тестовых данных и залогируем метрики с моделью

In [19]:
# Hard class labels for the held-out rows.
prediction = model.predict(X_test)

# Per-class probabilities; column 1 is kept as the score used by the metrics below.
class_probabilities = model.predict_proba(X_test)
probas = class_probabilities[:, 1]

In [20]:
# For binary labels confusion_matrix(...).ravel() yields (tn, fp, fn, tp).
# With normalize="all" each cell is a share of all test rows, so:
#   err1 - false-positive share (type I error)
#   err2 - false-negative share (type II error)
_, err1, err2, _ = confusion_matrix(y_test, prediction, normalize="all").ravel()

# Threshold-free / probabilistic metrics are computed from the predicted probabilities.
auc = roc_auc_score(y_test, probas)
# log loss must be fed probabilities; hard 0/1 labels would only measure clipped extremes.
logloss = log_loss(y_test, probas)

# Label-based metrics are computed from the hard predictions.
precision = precision_score(y_test, prediction)
recall = recall_score(y_test, prediction)
f1 = f1_score(y_test, prediction)

# Dictionary that is later passed to mlflow.log_metrics().
metrics = {
    "err1": err1,
    "err2": err2,
    "auc": auc,
    "precision": precision,
    "recall": recall,
    "f1": f1,
    "logloss": logloss,
}

##### 5. Создадим окружение модели:
- библиотеки: `pip_requirements`
- входные/выходные данные: `signature`
- пример входных данных: `input_example`

In [22]:
# Path to the requirements file logged with the model
# (conda_env or extra_pip_requirements can be used instead).
pip_requirements = "../requirements.txt"

# Input/output schema inferred from the test features and the model's predictions.
signature = mlflow.models.infer_signature(model_input=X_test, model_output=prediction)

# First ten test rows, stored next to the model as an example of valid input.
input_example = X_test.head(10)

##### 6. Подключимся к MLflow и зарегистрируем 2-ю версию модели

In [89]:
# Artifact-store settings exported as environment variables for this kernel.
# NOTE(review): the "..." values look like redacted placeholders - supply the real
# endpoint and keys (ideally from outside the notebook) before running this cell.
os.environ.update(
    {
        "MLFLOW_S3_ENDPOINT_URL": "...",
        "AWS_ACCESS_KEY_ID": "...",
        "AWS_SECRET_ACCESS_KEY": "...",
    }
)

In [24]:
# Tracking and model registry are served from the same host and port,
# so the URL is built once and used for both.
mlflow_server_url = f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}"

mlflow.set_tracking_uri(mlflow_server_url)
mlflow.set_registry_uri(mlflow_server_url)

In [25]:
# get_experiment_by_name() returns None for an unknown name, so create the experiment
# on first use instead of failing with AttributeError on `.experiment_id`.
experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
if experiment is None:
    experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)
else:
    experiment_id = experiment.experiment_id

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    run_id = run.info.run_id

    mlflow.log_metrics(metrics)

    # Logging with `registered_model_name` also registers the model: if the name already
    # exists a new version is created, and MLflow waits up to `await_registration_for`
    # seconds for that version to finish creation.
    model_info = mlflow.catboost.log_model(
        cb_model=model,
        artifact_path="models",
        signature=signature,
        input_example=input_example,
        registered_model_name=REGISTRY_MODEL_NAME,
        await_registration_for=60,
        pip_requirements=pip_requirements,
    )

Registered model 'churn_model_nikolaistepanov' already exists. Creating a new version of this model...
2023/10/19 17:34:03 INFO mlflow.tracking._model_registry.client: Waiting up to 60 seconds for model version to finish creation. Model name: churn_model_nikolaistepanov, version 2
Created version '2' of model 'churn_model_nikolaistepanov'.


#### Look at the log above

##### 7. Изменим `stage` у каждой модели

In [26]:
# Registry client used by the cells below to list, re-stage and rename model versions.
client = mlflow.MlflowClient()

In [67]:
# All registered versions of the model, looked up by its registry name.
models = client.search_model_versions(
    filter_string=f"name = '{REGISTRY_MODEL_NAME}'",
)

In [81]:
# NOTE(review): "model 1" / "model 2" are simply the last and second-to-last entries of the
# search result, so this relies on the order in which the server returns versions - confirm.
last_entry = models[-1]
model_name_1 = last_entry.name
model_version_1 = last_entry.version
model_stage_1 = last_entry.current_stage

second_to_last_entry = models[-2]
model_name_2 = second_to_last_entry.name
model_version_2 = second_to_last_entry.version
model_stage_2 = second_to_last_entry.current_stage

In [82]:
# Show the stage of both versions before any transition is applied
# (one print call, newline-separated, yields the same two output lines).
print(
    f"Текущий stage модели 1: {model_stage_1}",
    f"Текущий stage модели 2: {model_stage_2}",
    sep="\n",
)

Текущий stage модели 1: None
Текущий stage модели 2: None


In [83]:
# Move "model 1" to production ...
client.transition_model_version_stage(
    name=model_name_1,
    version=model_version_1,
    stage="production",
)
# ... and "model 2" to staging; this last call's return value is the cell output.
client.transition_model_version_stage(
    name=model_name_2,
    version=model_version_2,
    stage="staging",
)

<ModelVersion: aliases=[], creation_timestamp=1697726042787, current_stage='Staging', description='', last_updated_timestamp=1697727542949, name='churn_model_nikolaistepanov', run_id='1bf428a8ab16490e83dbd2283b92c512', run_link='', source='s3://s3-student-mle-case/7/1bf428a8ab16490e83dbd2283b92c512/artifacts/models', status='READY', status_message='', tags={}, user_id='', version='2'>

##### 7. Переименуем нашу модель

In [84]:
# Rename the registered model by appending the "_prepared" suffix to its current name.
# Re-running this cell looks up the old name again, which no longer exists after the rename.
prepared_model_name = f"{REGISTRY_MODEL_NAME}_prepared"
client.rename_registered_model(name=REGISTRY_MODEL_NAME, new_name=prepared_model_name)

In [85]:
# Fetch the versions again, this time under the new "_prepared" name.
models = client.search_model_versions(
    filter_string=f"name = '{REGISTRY_MODEL_NAME}_prepared'",
)

##### 8. Проверим наши изменения

In [87]:
# Re-read the stages from the refreshed search result, using the same positions as before
# (last entry -> "model 1", second-to-last entry -> "model 2").
last_entry = models[-1]
model_stage_1 = last_entry.current_stage

second_to_last_entry = models[-2]
model_stage_2 = second_to_last_entry.current_stage

In [88]:
# Show the stages after the transition and rename, to confirm the changes were applied.
print(
    f"Текущий stage модели 1: {model_stage_1}",
    f"Текущий stage модели 2: {model_stage_2}",
    sep="\n",
)

Текущий stage модели 1: Production
Текущий stage модели 2: Staging
