##### 1. Импортируем нужные библиотеки

In [1]:
import os

import psycopg
import pandas as pd
import numpy as np

##### 1. Определим глобальные переменные

In [2]:
# Name of the database table that the SELECT query in this notebook reads from.
TABLE_NAME = "users_churn"

##### 2. Заберем данные из базы данных и сформируем `dataframe`

In [3]:
# Maps each connection parameter to the environment variable that supplies it,
# so no credential is ever written into the notebook itself.
_env_by_param = {
    "host": "POSTGRES_HOST",
    "port": "POSTGRES_PORT",
    "dbname": "POSTGRES_DBNAME",
    "user": "POSTGRES_USER",
    "password": "POSTGRES_PASSWORD",
}
postgres_credentials = {
    param: os.getenv(env_name) for param, env_name in _env_by_param.items()
}

# Final keyword arguments for the connection: fixed TLS/session options first,
# followed by the credentials read from the environment.
connection = {
    "sslmode": "verify-full",
    "target_session_attrs": "read-write",
    **postgres_credentials,
}

In [4]:
# Read the whole table into memory: open the connection and a cursor in one
# `with` statement so both are released as soon as the rows are fetched.
with psycopg.connect(**connection) as conn, conn.cursor() as cur:
    cur.execute(f"SELECT * FROM {TABLE_NAME}")
    # The first item of every description entry is the column name.
    columns = [column_meta[0] for column_meta in cur.description]
    data = cur.fetchall()

# Build the dataframe from the fetched rows, labelled with the column names.
df = pd.DataFrame(data, columns=columns)

##### 2. Посмотрим, какие методы можно использовать для генерации новых признаков:

##### 4.3 Проверим, что загруженный `ColumnTransformer` работает точно так же, как и оригинальный

In [95]:
# Model inputs: every column of transformed_df, then the three date-difference
# columns, then a fixed list of columns taken from df under their own names.
features = [
    *transformed_df.columns,
    "days_diff", "diff_years", "diff_months",
    'monthly_charges', 'total_charges', 'online_security', 'online_backup',
    'device_protection', 'tech_support', 'streaming_tv', 'streaming_movies',
    'senior_citizen', 'partner', 'dependents', 'multiple_lines',
]
target = "target"

# CatBoost settings, kept as separate module-level names.
loss_function = "Logloss"
task_type = 'CPU'
random_seed = 0
iterations = 400
verbose = False
max_depth = 3

model = CatBoostClassifier(
    loss_function=loss_function,
    iterations=iterations,
    max_depth=max_depth,
    random_seed=random_seed,
    task_type=task_type,
    verbose=verbose,
)

# Hold out 20% of the rows for testing; shuffle=False keeps the rows in their
# existing order when splitting.
test_size = 0.2
X_train, X_test, y_train, y_test = train_test_split(
    df[features], df[target], test_size=test_size, shuffle=False
)

In [96]:
# Preview the first two rows of the dataframe.
df.head(2)

Unnamed: 0,id,customer_id,begin_date,end_date,type,paperless_billing,payment_method,monthly_charges,total_charges,internet_service,...,cat__payment_method_Credit card (automatic),cat__payment_method_Electronic check,cat__payment_method_Mailed check,cat__internet_service_Fiber optic,cat__internet_service_No data,cat__gender_Male,current_date,days_diff,diff_years,diff_months
0,1,7590-VHVEG,2020-01-01,NaT,Month-to-month,1,Electronic check,29.85,29.85,DSL,...,0.0,1.0,0.0,0.0,0.0,0.0,2023-10-31 13:55:36.785985,1399,3,9
1,2,5575-GNVDE,2017-04-01,NaT,One year,0,Mailed check,56.95,1889.5,DSL,...,0.0,0.0,1.0,0.0,0.0,1.0,2023-10-31 13:55:36.785985,2404,6,6


In [97]:
# Report the shape of the training and the test feature sets.
print(f"Размер выборки для обучения: {X_train.shape}")
print(f"Размер выборки для теста: {X_test.shape}")

Размер выборки для обучения: (5634, 51)
Размер выборки для теста: (1409, 51)


In [98]:
# Fit the model on the training features and labels.
model.fit(X_train, y_train)

<catboost.core.CatBoostClassifier at 0x2905cb510>

In [101]:
# Hard class predictions for the test set.
prediction = model.predict(X_test)
# Keep only column 1 of the predicted probabilities
# (presumably the positive class — confirm against model.classes_).
probas = model.predict_proba(X_test)[:, 1]

In [103]:
# Evaluate the fitted model on the hold-out set and collect the results in
# `metrics`, a plain name -> value dict.

# For a binary problem confusion_matrix(...).ravel() yields (tn, fp, fn, tp).
# With normalize='all' every cell is a share of all test samples, so:
#   err1 = false-positive share (type I error),
#   err2 = false-negative share (type II error).
_, err1, err2, _ = confusion_matrix(y_test, prediction, normalize='all').ravel()

# Threshold-based metrics use the hard labels ...
precision = precision_score(y_test, prediction)
recall = recall_score(y_test, prediction)
f1 = f1_score(y_test, prediction)

# ... while ranking / probabilistic metrics need the predicted probabilities:
# log-loss computed on hard 0/1 labels only measures clipped errors and does
# not reflect the model's confidence.
auc = roc_auc_score(y_test, probas)
logloss = log_loss(y_test, probas)

metrics = {
    "err1": err1,
    "err2": err2,
    "auc": auc,
    "precision": precision,
    "recall": recall,
    "f1": f1,
    "logloss": logloss,
}

In [105]:
# Log the evaluation metrics, the preprocessor and the trained CatBoost model
# to MLflow, registering the model under REGISTRY_MODEL_NAME.
pip_requirements = "../requirements.txt"
# Model signature inferred from the test features and the hard predictions.
signature = mlflow.models.infer_signature(X_test, prediction)
# First ten test rows are stored with the model as an input example.
input_example = X_test[:10]

# get_experiment_by_name returns None for an unknown name, which previously
# failed with AttributeError on `.experiment_id`; create the experiment on
# first use so the cell also works against a fresh tracking server.
experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
if experiment is None:
    experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)
else:
    experiment_id = experiment.experiment_id

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    run_id = run.info.run_id

    mlflow.log_metrics(metrics)
    cv_info = mlflow.sklearn.log_model(preprocessor, artifact_path="preprocessor")
    model_info = mlflow.catboost.log_model(
        cb_model=model,
        artifact_path="models",
        signature=signature,
        input_example=input_example,
        registered_model_name=REGISTRY_MODEL_NAME,
        # Wait up to 60 seconds for the new model version to be created.
        await_registration_for=60,
        pip_requirements=pip_requirements,
    )

  inputs = _infer_schema(model_input) if model_input is not None else None
Registered model 'churn_model_nikolaistepanov_prepared' already exists. Creating a new version of this model...
2023/10/31 13:59:04 INFO mlflow.tracking._model_registry.client: Waiting up to 60 seconds for model version to finish creation. Model name: churn_model_nikolaistepanov_prepared, version 9
Created version '9' of model 'churn_model_nikolaistepanov_prepared'.
