##### 1. Установим нужные библиотеки

In [1]:
# %pip install autofeat==2.1.2

In [23]:
import os

import psycopg
import pandas as pd
import numpy as np
import mlflow
from autofeat import AutoFeatClassifier
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    roc_auc_score,
    f1_score,
    precision_score,
    recall_score,
    confusion_matrix,
    log_loss,
)


* 'schema_extra' has been renamed to 'json_schema_extra'


##### 1. Определим глобальные переменные

In [26]:
# Database table that is queried for the churn dataset.
TABLE_NAME = "users_churn"

# Host and port used to build the MLflow tracking / registry URL.
TRACKING_SERVER_HOST, TRACKING_SERVER_PORT = "127.0.0.1", 5000

# MLflow experiment name, run name and registered-model name for this notebook.
EXPERIMENT_NAME, RUN_NAME = "churn_nikolaistepanov", "auto_feature_engineering"
REGISTRY_MODEL_NAME = "churn_model_nikolaistepanov_prepared"

In [6]:
# Raise pandas' display limits so wide frames are not truncated when rendered.
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 64)

##### 2. Заберем данные из базы данных и сформируем `dataframe`

In [7]:
# Credentials are read from the environment so that no secret lives in the notebook.
postgres_credentials = dict(
    host=os.getenv("POSTGRES_HOST"),
    port=os.getenv("POSTGRES_PORT"),
    dbname=os.getenv("POSTGRES_DBNAME"),
    user=os.getenv("POSTGRES_USER"),
    password=os.getenv("POSTGRES_PASSWORD"),
)

# Full keyword set handed to psycopg.connect: fixed session options first,
# followed by the environment-provided credentials.
connection = {
    "sslmode": "verify-full",
    "target_session_attrs": "read-write",
    **postgres_credentials,
}

In [8]:
# Fetch every row of TABLE_NAME; both the connection and the cursor are
# released when the `with` statement exits.
with psycopg.connect(**connection) as conn, conn.cursor() as cur:
    cur.execute(f"SELECT * FROM {TABLE_NAME}")
    data = cur.fetchall()
    # The first item of each cursor description entry is the column name.
    columns = [column_meta[0] for column_meta in cur.description]

df = pd.DataFrame(data, columns=columns)

##### 3. Разделим данные на train, test по `begin_date`

In [9]:
# Columns passed to the feature generator as categorical.
cat_features = [
    # billing
    "paperless_billing",
    "payment_method",
    # internet service and its add-ons
    "internet_service",
    "online_security",
    "online_backup",
    "device_protection",
    "tech_support",
    "streaming_tv",
    "streaming_movies",
    # customer profile
    "gender",
    "senior_citizen",
    "partner",
    "dependents",
    # phone
    "multiple_lines",
]
# Numeric columns.
num_features = ["monthly_charges", "total_charges"]

# Model inputs (categorical columns first, then numeric) and the label column.
features = [*cat_features, *num_features]
target = "target"

# Split settings: rows are ordered by `split_column` before the hold-out is cut.
split_column = "begin_date"
stratify_column = ["type"]  # NOTE(review): not referenced by the cells visible here
test_size = 0.2

In [10]:
# Order rows by the split column so the unshuffled split below is time-based.
df = df.sort_values(split_column)

In [11]:
# shuffle=False keeps the row order of `df`, so the trailing `test_size`
# share of the rows becomes the test set.
X_train, X_test, y_train, y_test = train_test_split(
    df[features], df[target], shuffle=False, test_size=test_size
)

In [12]:
# Report the (rows, columns) shape of both splits, one per line.
print(
    f"Размер выборки для обучения: {X_train.shape}",
    f"Размер выборки для теста: {X_test.shape}",
    sep="\n",
)

Размер выборки для обучения: (5634, 16)
Размер выборки для теста: (1409, 16)


##### 4. Определим список трансформаций к нашим признакам

In [13]:
# Transformations the feature generator is allowed to apply to the inputs.
transformations = ("1/", "1+", "1-", "exp", "log", "abs", "sqrt", "2^")

##### 5. Запустим генератор признаков

In [14]:
# Feature generator: a single feature-engineering step over the listed
# transformations, with the categorical columns declared explicitly.
afc = AutoFeatClassifier(
    feateng_steps=1,
    transformations=transformations,
    categorical_cols=cat_features,
    n_jobs=-1,
)

In [15]:
%%time


# Fit the feature generator on the training split and keep the transformed
# training frame; %%time reports how long the step took.
X_train_features = afc.fit_transform(X_train, y_train)

CPU times: user 9.2 s, sys: 584 ms, total: 9.79 s
Wall time: 8.56 s


In [28]:
# Preview the first two rows of the generated training features.
X_train_features.head(2)

Unnamed: 0,monthly_charges,total_charges,cat_paperless_billing_0,cat_paperless_billing_1,cat_payment_method_Bank transfer (automatic),cat_payment_method_Credit card (automatic),cat_payment_method_Electronic check,cat_payment_method_Mailed check,cat_internet_service_DSL,cat_internet_service_Fiber optic,cat_internet_service_No data,cat_online_security_0,cat_online_security_1,cat_online_backup_0,cat_online_backup_1,cat_device_protection_0,cat_device_protection_1,cat_tech_support_0,cat_tech_support_1,cat_streaming_tv_0,cat_streaming_tv_1,cat_streaming_movies_0,cat_streaming_movies_1,cat_gender_Female,cat_gender_Male,cat_senior_citizen_0,cat_senior_citizen_1,cat_partner_0,cat_partner_1,cat_dependents_0,cat_dependents_1,cat_multiple_lines_0,cat_multiple_lines_1,1/total_charges
0,104.15,7689.95,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.00013
1,117.8,8684.8,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.000115


In [16]:
# Apply the generator fitted on the training split to the test split
# (transform only, no refit).
X_test_features = afc.transform(X_test)

##### 5. Обучим модель на автоматически сгенерированных признаках

In [17]:
# CatBoost hyper-parameters.
iterations, max_depth = 400, 3
loss_function, task_type = "Logloss", "CPU"
random_seed = 0
verbose = False

# Classifier trained on the automatically generated features.
model = CatBoostClassifier(
    loss_function=loss_function,
    task_type=task_type,
    iterations=iterations,
    max_depth=max_depth,
    random_seed=random_seed,
    verbose=verbose,
)

In [18]:
# Train the classifier on the generated training features.
model.fit(X_train_features, y_train)

<catboost.core.CatBoostClassifier at 0x28b18b490>

In [19]:
# Column 1 of predict_proba is kept as the score; predict gives the hard labels.
probas = model.predict_proba(X_test_features)[:, 1]
prediction = model.predict(X_test_features)

##### 6. Посчитаем метрики и залогируем в `MLFlow`

In [20]:
# For binary labels confusion_matrix(...).ravel() yields (tn, fp, fn, tp).
# With normalize="all" each cell is a share of all test rows, so:
#   err1 = false-positive share (type I error)
#   err2 = false-negative share (type II error)
_tn, err1, err2, _tp = confusion_matrix(y_test, prediction, normalize="all").ravel()

# Metrics computed from the predicted scores.
auc = roc_auc_score(y_test, probas)
# log_loss expects probabilities; hard 0/1 labels would be clipped and
# every mistake would get the maximum penalty.
logloss = log_loss(y_test, probas)

# Metrics computed from the hard class labels.
precision = precision_score(y_test, prediction)
recall = recall_score(y_test, prediction)
f1 = f1_score(y_test, prediction)

# Dictionary of test-set metrics, keyed by metric name.
metrics = {
    "err1": err1,
    "err2": err2,
    "auc": auc,
    "precision": precision,
    "recall": recall,
    "f1": f1,
    "logloss": logloss,
}

In [21]:
# Set the S3 endpoint and AWS credential variables for this process.
# NOTE(review): the "..." values look like redacted placeholders - confirm
# before running.
os.environ.update(
    {
        "MLFLOW_S3_ENDPOINT_URL": "...",
        "AWS_ACCESS_KEY_ID": "...",
        "AWS_SECRET_ACCESS_KEY": "...",
    }
)

In [24]:
# Tracking and model registry are served from the same URL.
mlflow_server_uri = f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}"
mlflow.set_tracking_uri(mlflow_server_uri)
mlflow.set_registry_uri(mlflow_server_uri)

In [27]:
# Inputs for model logging: requirements file, input/output signature inferred
# from the test features and predictions, and the first ten test rows as an
# input example.
pip_requirements = "../requirements.txt"
signature = mlflow.models.infer_signature(X_test_features, prediction)
input_example = X_test_features[:10]

# get_experiment_by_name returns None for an unknown experiment; create it on
# first use instead of failing with AttributeError on `.experiment_id`.
experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
if experiment is None:
    experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)
else:
    experiment_id = experiment.experiment_id

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    run_id = run.info.run_id

    mlflow.log_metrics(metrics)
    # The fitted feature generator is logged next to the model.
    afc_info = mlflow.sklearn.log_model(afc, artifact_path="afc")
    # Log the CatBoost model and register it as a new version in the registry.
    model_info = mlflow.catboost.log_model(
        cb_model=model,
        artifact_path="models",
        signature=signature,
        input_example=input_example,
        registered_model_name=REGISTRY_MODEL_NAME,
        await_registration_for=60,
        pip_requirements=pip_requirements,
    )

Registered model 'churn_model_nikolaistepanov_prepared' already exists. Creating a new version of this model...
2023/11/01 13:30:55 INFO mlflow.tracking._model_registry.client: Waiting up to 60 seconds for model version to finish creation. Model name: churn_model_nikolaistepanov_prepared, version 10
Created version '10' of model 'churn_model_nikolaistepanov_prepared'.
