In [13]:
import os
import numpy as np
import pandas as pd
import mlflow
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import (
    OneHotEncoder, 
    SplineTransformer, 
    QuantileTransformer, 
    RobustScaler,
    PolynomialFeatures,
    KBinsDiscretizer,
)
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.pipeline import FeatureUnion
from sklearn.metrics import (
    roc_auc_score, precision_score, recall_score, f1_score, log_loss, confusion_matrix,
)
from mlflow.models.signature import ModelSignature
from mlflow.types.schema import Schema, ColSpec

import psycopg
from dotenv import load_dotenv
load_dotenv()

from sklearn.linear_model import LinearRegression
from autofeat import AutoFeatClassifier
from sklearn.impute import SimpleImputer

In [8]:
# Configuration cell: validates required secrets, points MLflow at the local
# tracking server and defines the experiment / run / registry names.

# Variables that must already be present in the environment (typically loaded
# from .env by load_dotenv()). Re-assigning os.environ[name] = os.getenv(name)
# changes nothing when the variable is set and raises an opaque
# "TypeError: str expected, not NoneType" when it is not, so check explicitly
# and fail with a message that names what is missing.
REQUIRED_ENV_VARS = (
    "DB_DESTINATION_HOST",
    "DB_DESTINATION_PORT",
    "DB_DESTINATION_NAME",
    "DB_DESTINATION_USER",
    "DB_DESTINATION_PASSWORD",
    "AWS_ACCESS_KEY_ID",      # key id of the bucket MLflow is connected to
    "AWS_SECRET_ACCESS_KEY",  # secret key of the bucket MLflow is connected to
)
missing_env_vars = [name for name in REQUIRED_ENV_VARS if os.getenv(name) is None]
if missing_env_vars:
    raise RuntimeError(
        "Missing required environment variables: " + ", ".join(missing_env_vars)
    )

os.environ["MLFLOW_S3_ENDPOINT_URL"] = "https://storage.yandexcloud.net"  # Yandex Cloud bucket endpoint

# Global settings: MLflow is served locally.
TRACKING_SERVER_HOST = "127.0.0.1"
TRACKING_SERVER_PORT = 5000

registry_uri = f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}"
tracking_uri = f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}"

mlflow.set_tracking_uri(tracking_uri)
# registry_uri was computed but never applied; set it explicitly so the model
# registry location does not silently depend on the tracking URI default.
mlflow.set_registry_uri(registry_uri)

# Names of the experiment, the run inside it, and the registered model.
EXPERIMENT_NAME = "real_churn_sergey_sh_final"
RUN_NAME = "feature_autogen"
REGISTRY_MODEL_NAME = "churn_model_sergey_sh"
FS_ASSETS = "fs_assets"

In [9]:
# Load the churn table from Postgres into a pandas DataFrame.

# SSL / session options first, credentials merged in below.
connection = {"sslmode": "require", "target_session_attrs": "read-write"}

# Connection credentials, read from the DB_DESTINATION_* environment variables.
postgres_credentials = {
    key: os.environ[f"DB_DESTINATION_{suffix}"]
    for key, suffix in (
        ("host", "HOST"),
        ("port", "PORT"),
        ("dbname", "NAME"),
        ("user", "USER"),
        ("password", "PASSWORD"),
    )
}
# None of the credentials may be an empty string.
assert "" not in postgres_credentials.values()

connection.update(postgres_credentials)

# Name of the table that holds our data.
TABLE_NAME = "clean_users_churn"

# Both context managers are released when the block exits, even on error,
# so neither the connection nor the cursor is left open.
with psycopg.connect(**connection) as conn, conn.cursor() as cur:
    # Select every row of TABLE_NAME.
    cur.execute(f"SELECT * FROM {TABLE_NAME}")

    # Fetch all rows returned by the query.
    data = cur.fetchall()

    # Column names are the first item of each cursor description entry.
    columns = [description[0] for description in cur.description]

# Build the DataFrame from the fetched rows and the column names.
df = pd.DataFrame(data, columns=columns)
df.head()

Unnamed: 0,id,customer_id,begin_date,end_date,type,paperless_billing,payment_method,monthly_charges,total_charges,internet_service,...,device_protection,tech_support,streaming_tv,streaming_movies,gender,senior_citizen,partner,dependents,multiple_lines,target
0,1,7795-CFOCW,2016-05-01,NaT,One year,No,Bank transfer (automatic),42.3,1840.75,DSL,...,Yes,Yes,No,No,Male,0,No,No,No,0
1,2,9237-HQITU,2019-09-01,2019-11-01,Month-to-month,Yes,Electronic check,70.7,151.65,Fiber optic,...,No,No,No,No,Female,0,No,No,No,1
2,3,9305-CDSKC,2019-03-01,2019-11-01,Month-to-month,Yes,Electronic check,99.65,820.5,Fiber optic,...,Yes,No,Yes,Yes,Female,0,No,No,Yes,1
3,4,1452-KIOVK,2018-04-01,NaT,Month-to-month,Yes,Credit card (automatic),89.1,1949.4,Fiber optic,...,No,No,Yes,No,Male,0,No,Yes,Yes,0
4,5,6713-OKOMC,2019-04-01,NaT,Month-to-month,No,Mailed check,29.75,301.9,DSL,...,No,No,No,No,Female,0,No,No,No,0


In [10]:
# Chronological train/test split followed by automatic feature generation
# with AutoFeatClassifier.

split_column = "begin_date"
test_size = 0.2

cat_features = [
    'paperless_billing',
    'payment_method',
    'internet_service',
    'online_security',
    'online_backup',
    'device_protection',
    'tech_support',
    'streaming_tv',
    'streaming_movies',
    'gender',
    'senior_citizen',
    'partner',
    'dependents',
    'multiple_lines',
]
num_features = ["monthly_charges", "total_charges"]
target = ['target']

features = cat_features + num_features

# Sort by the split column and split without shuffling, so the test set is
# the last `test_size` share of rows in begin_date order.
df = df.sort_values(by=[split_column])

X_train, X_test, y_train, y_test = train_test_split(
    df[features],
    df[target],
    test_size=test_size,
    shuffle=False,
)
# Sanity check: the split must not drop or duplicate rows.
assert len(X_train) + len(X_test) == len(df)

transformations = ('1/', 'log', 'abs', 'sqrt')

afc = AutoFeatClassifier(categorical_cols=cat_features, transformations=transformations, feateng_steps=1, n_jobs=-1)

# y_train is a single-column DataFrame (target is a list); flatten it to 1-D
# for fitting to avoid the column-vector DataConversionWarning. y_train and
# y_test themselves are left unchanged for downstream cells.
X_train_features = afc.fit_transform(X_train, y_train.to_numpy().ravel())
X_test_features = afc.transform(X_test)

  y = column_or_1d(y, warn=True)
  if np.max(np.abs(correlations[c].ravel()[:i])) < 0.9:


In [12]:
# Log the fitted AutoFeatClassifier as an MLflow sklearn model.
artifact_path = "afc"

# get_experiment_by_name returns None for an unknown experiment, so create it
# on first use instead of failing with AttributeError on `.experiment_id`.
experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
if experiment is None:
    experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)
else:
    experiment_id = experiment.experiment_id

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    run_id = run.info.run_id

    afc_info = mlflow.sklearn.log_model(afc, artifact_path=artifact_path)

2025-09-25 15:13:05,759 INFO: Found credentials in environment variables.


In [16]:
# Train a random forest on the auto-generated features, evaluate it on the
# test split, and log metrics, data artifacts and both models to MLflow.

# Fixed seed so the forest, and every metric logged below, is reproducible.
RANDOM_STATE = 42

model = RandomForestClassifier(random_state=RANDOM_STATE)
# y_train is a single-column DataFrame; flatten it to 1-D for fitting to avoid
# the column-vector DataConversionWarning.
model.fit(X_train_features, np.ravel(y_train))

# Hard labels feed the threshold metrics; positive-class probabilities feed
# AUC and log loss.
prediction = model.predict(X_test_features)
probas = model.predict_proba(X_test_features)[:, 1]

# Dictionary with all metrics.
metrics = {}

# With normalize='all' each cell is a share of all test samples; ravel() of the
# 2x2 matrix yields (tn, fp, fn, tp).
# err1 - type I error (false positives), err2 - type II error (false negatives).
_, err1, err2, _ = confusion_matrix(y_test, prediction, normalize='all').ravel()
auc = roc_auc_score(y_test, probas)
precision = precision_score(y_test, prediction)
recall = recall_score(y_test, prediction)
f1 = f1_score(y_test, prediction)
# Log loss is computed on probabilities; hard 0/1 labels would be clipped and
# give a meaningless value.
logloss = log_loss(y_test, probas)

metrics["err1"] = err1
metrics["err2"] = err2
metrics["auc"] = auc
metrics["precision"] = precision
metrics["recall"] = recall
metrics["f1"] = f1
metrics["logloss"] = logloss

# NOTE(review): pip_requirements and metadata are defined but not passed to
# log_model - confirm whether they should be (requirements.txt must exist).
pip_requirements = "requirements.txt"
metadata =  {'model_type': 'model_with_autogen_features'}

# The forest is fitted on the autofeat output, so the signature and the input
# example are taken from the engineered features rather than the raw columns
# (which include string categoricals).
signature = mlflow.models.infer_signature(X_test_features, prediction)
input_example = X_train_features.head(2)

experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
if experiment is None:
    experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)
else:
    experiment_id = experiment.experiment_id

temp_files = ['columns.txt', 'users_churn.csv']
try:
    with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
        # Example artifacts: the column list and a CSV dump of the dataset.
        with open('columns.txt', 'w') as f:
            f.writelines([col + '\n' for col in df.columns])
        df.to_csv("users_churn.csv", index=False)
        mlflow.log_artifact("columns.txt", "dataframe")
        mlflow.log_artifact("users_churn.csv", "dataframe")

        mlflow.set_tags({
            "project": "logiruem_model",
            "team": "data_science",
            "version": "4.0"
        })

        # Log the metrics once per run.
        mlflow.log_metrics(metrics)

        # Log the classifier and register it in the model registry.
        mlflow.sklearn.log_model(
            sk_model=model,
            artifact_path="model_pipeline",
            signature=signature,
            input_example=input_example,
            registered_model_name=REGISTRY_MODEL_NAME
        )

        # Log the fitted feature generator alongside the classifier.
        afc_info = mlflow.sklearn.log_model(afc, artifact_path='afc')
finally:
    # Remove the temporary files even if logging failed part-way.
    for filename in temp_files:
        if os.path.exists(filename):
            os.remove(filename)

  return fit_method(estimator, *args, **kwargs)
  inputs = _infer_schema(model_input) if model_input is not None else None
Registered model 'churn_model_sergey_sh' already exists. Creating a new version of this model...
2025/09/25 15:35:08 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: churn_model_sergey_sh, version 5
Created version '5' of model 'churn_model_sergey_sh'.


In [17]:
# Retrain the random forest on the auto-generated features and log
# parameters, metrics, feature importances and the model to MLflow.
# All names used here are imported in the notebook's import cell.

# Fixed seed so the forest, and every metric logged below, is reproducible.
RANDOM_STATE = 42

model = RandomForestClassifier(random_state=RANDOM_STATE)
# y_train is a single-column DataFrame; flatten it to 1-D for fitting to avoid
# the column-vector DataConversionWarning.
model.fit(X_train_features, np.ravel(y_train))

# Hard labels for the threshold metrics, positive-class probabilities for
# AUC and log loss.
prediction = model.predict(X_test_features)
probas = model.predict_proba(X_test_features)[:, 1]

# Metric computation
metrics = {}

# Confusion matrix (rows: true class, columns: predicted class).
cm = confusion_matrix(y_test, prediction)
total = len(y_test)
# Both errors are shares of ALL test samples, not the FPR / FNR rates
# (those would divide by the number of actual negatives / positives).
err1 = cm[0, 1] / total if total > 0 else 0  # false positives / all samples
err2 = cm[1, 0] / total if total > 0 else 0  # false negatives / all samples

precision = precision_score(y_test, prediction)
recall = recall_score(y_test, prediction)
f1 = f1_score(y_test, prediction)
auc = roc_auc_score(y_test, probas)
logloss = log_loss(y_test, probas)

metrics["err1"] = err1
metrics["err2"] = err2
metrics["auc"] = auc
metrics["precision"] = precision
metrics["recall"] = recall
metrics["f1"] = f1
metrics["logloss"] = logloss

# MLflow settings
# NOTE(review): these overwrite the notebook-wide EXPERIMENT_NAME / RUN_NAME /
# REGISTRY_MODEL_NAME with "boston_*" values although the model is trained on
# the churn data - looks like a copy-paste leftover; confirm the intended names.
# Re-running earlier cells after this one will use these values.
EXPERIMENT_NAME = "boston_housing_experiment"
RUN_NAME = "random_forest_with_features"
REGISTRY_MODEL_NAME = "boston_rf_model"

# Get the experiment, creating it on first use.
experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
if experiment is None:
    experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)
else:
    experiment_id = experiment.experiment_id

temp_files = ['feature_importance.csv', 'model_info.txt']
try:
    # Artifact 1: feature importances, most important first.
    feature_importance = pd.DataFrame({
        'feature': X_train_features.columns,
        'importance': model.feature_importances_
    }).sort_values('importance', ascending=False)

    feature_importance.to_csv('feature_importance.csv', index=False)

    # Artifact 2: short plain-text description of the model.
    with open('model_info.txt', 'w') as f:
        f.write("RandomForestClassifier\n")
        f.write(f"Number of features: {X_train_features.shape[1]}\n")
        f.write(f"Number of estimators: {model.n_estimators}\n")

    with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
        # Model hyper-parameters.
        mlflow.log_params({
            "n_estimators": model.n_estimators,
            "max_depth": model.max_depth,
            "random_state": model.random_state
        })

        # Metrics
        mlflow.log_metrics(metrics)

        # Artifacts
        mlflow.log_artifact("feature_importance.csv", "model_info")
        mlflow.log_artifact("model_info.txt", "model_info")

        mlflow.set_tags({
            "project": "boston_housing",
            "team": "data_science",
            "model_type": "random_forest",
            "version": "1.0"
        })

        # Signature: one "double" column per engineered feature, "long" output
        # for the predicted class label.
        input_schema = Schema([ColSpec("double", name) for name in X_train_features.columns])
        output_schema = Schema([ColSpec("long")])
        signature = ModelSignature(inputs=input_schema, outputs=output_schema)

        # Log the model and register it in the model registry.
        mlflow.sklearn.log_model(
            sk_model=model,
            artifact_path="model",
            signature=signature,
            input_example=X_train_features.head(2),
            registered_model_name=REGISTRY_MODEL_NAME
        )
finally:
    # Remove the temporary files even if logging failed part-way.
    for filename in temp_files:
        if os.path.exists(filename):
            os.remove(filename)

print("MLflow run completed successfully!")

  return fit_method(estimator, *args, **kwargs)
Successfully registered model 'boston_rf_model'.
2025/09/25 15:41:48 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: boston_rf_model, version 1


MLflow run completed successfully!


Created version '1' of model 'boston_rf_model'.
