In [5]:
import os

import psycopg
import pandas as pd
import mlflow
from catboost import CatBoostClassifier
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV

# Database credentials are read from environment variables, so no secret is
# stored in the notebook; a missing variable raises KeyError here.
postgres_credentials = dict(
    host=os.environ["DB_DESTINATION_HOST"],
    port=os.environ["DB_DESTINATION_PORT"],
    dbname=os.environ["DB_DESTINATION_NAME"],
    user=os.environ["DB_DESTINATION_USER"],
    password=os.environ["DB_DESTINATION_PASSWORD"],
)

# Keyword arguments for psycopg.connect: transport options first, then credentials.
connection = {
    "sslmode": "require",
    "target_session_attrs": "read-write",
    **postgres_credentials,
}

# Source table with the churn data.
TABLE_NAME = "users_churn"

# Address of the MLflow tracking / registry server.
TRACKING_SERVER_HOST = "127.0.0.1"
TRACKING_SERVER_PORT = 5000

# MLflow experiment, run and registered-model names used when logging.
EXPERIMENT_NAME = "churn_nikolaistepanov_myown"
RUN_NAME = "feature_selection"
REGISTRY_MODEL_NAME = "hp_tuning_model"
FS_ASSETS = "fs_assets"  # not referenced in the visible cells


# Read every row of TABLE_NAME; column names are taken from the cursor
# description (first field of each entry) so the frame mirrors the table layout.
with psycopg.connect(**connection) as conn, conn.cursor() as cur:
    cur.execute(f"SELECT * FROM {TABLE_NAME}")
    columns = [column_meta[0] for column_meta in cur.description]
    data = cur.fetchall()

df = pd.DataFrame(data, columns=columns)

# Text answers "Yes"/"No" are recoded to the integers 1/0.
yes_no_map = {"Yes": 1, "No": 0}

for col in df.columns:
    # Only object-dtype columns can hold the "Yes"/"No" strings.
    if df[col].dtype != "object":
        continue
    unique_vals = set(df[col].dropna().unique())
    # An all-null column produces an empty set, and an empty set is a subset of
    # anything. Require at least one observed value so such columns are left
    # untouched instead of being recast by .map().
    if unique_vals and unique_vals.issubset(yes_no_map):
        df[col] = df[col].map(yes_no_map)


df.head()

Unnamed: 0,id,customer_id,begin_date,end_date,type,paperless_billing,payment_method,monthly_charges,total_charges,internet_service,...,device_protection,tech_support,streaming_tv,streaming_movies,gender,senior_citizen,partner,dependents,multiple_lines,target
0,17,8191-XWSZG,2015-10-01,NaT,One year,0,Mailed check,20.65,1022.95,,...,,,,,Female,0,0,0,0.0,0
1,59,3957-SQXML,2017-04-01,NaT,Two year,0,Credit card (automatic),24.95,894.3,,...,,,,,Female,0,1,1,1.0,0
2,147,6837-BJYDQ,2019-11-01,NaT,One year,0,Mailed check,19.6,61.35,,...,,,,,Male,0,0,0,0.0,0
3,481,0486-LGCCH,2019-03-01,NaT,Two year,0,Mailed check,19.65,225.75,,...,,,,,Male,0,1,1,0.0,0
4,1001,8357-EQXFO,2019-04-01,2019-11-01,Month-to-month,1,Electronic check,95.35,660.9,Fiber optic,...,1.0,0.0,1.0,1.0,Female,0,0,0,0.0,1


In [6]:
# Model inputs and the label column.
features = ["monthly_charges", "total_charges", "senior_citizen"]
target = "target"

# Split settings. `stratify_column` is defined but not used in this cell.
split_column = "begin_date"
stratify_column = "target"
test_size = 0.2

# Order rows by `split_column`; together with shuffle=False below this makes
# the last `test_size` share of the sorted rows the test split.
df = df.sort_values(by=[split_column])

X_train, X_test, y_train, y_test = train_test_split(
    df[features],
    df[target],
    shuffle=False,
    test_size=test_size,
)

print(f"Размер выборки для обучения: {X_train.shape}")
print(f"Размер выборки для теста: {X_test.shape}")

# CatBoost settings that stay fixed for every candidate in the search.
loss_function = "Logloss"
task_type = 'CPU'
random_seed = 0
iterations = 300
verbose = False

# Hyper-parameter grid: two values per parameter, eight combinations in total.
params = dict(
    depth=[4, 6],
    learning_rate=[0.05, 0.1],
    l2_leaf_reg=[3, 10],
)

model = CatBoostClassifier(
    verbose=verbose,
    random_seed=random_seed,
    task_type=task_type,
    loss_function=loss_function,
    iterations=iterations,
)

# Exhaustive search scored by ROC AUC with 2-fold cross-validation on all cores.
cv = GridSearchCV(model, params, scoring="roc_auc", cv=2, n_jobs=-1)

# fit() returns the search object itself, so `clf` and `cv` are the same object.
clf = cv.fit(X_train, y_train)

Размер выборки для обучения: (5634, 3)
Размер выборки для теста: (1409, 3)


In [7]:
from sklearn.metrics import (
    confusion_matrix,
    roc_auc_score,
    precision_score,
    recall_score,
    f1_score,
    log_loss
)

# Point both the tracking client and the model registry at the same MLflow server.
mlflow.set_tracking_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")
mlflow.set_registry_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")

# Per-candidate cross-validation statistics of the finished search.
cv_results = pd.DataFrame(clf.cv_results_)

best_params = clf.best_params_

# Train a fresh model on the whole training split using the fixed settings
# plus the best hyper-parameters found by the search.
model_best = CatBoostClassifier(
    iterations=iterations,
    loss_function=loss_function,
    task_type=task_type,
    random_seed=random_seed,
    verbose=verbose,
    **best_params
)

model_best.fit(X_train, y_train)

prediction = model_best.predict(X_test)
probas = model_best.predict_proba(X_test)[:, 1]  # probability of the positive class

# quality metrics on the test split
metrics = {}

# For binary labels ravel() yields (tn, fp, fn, tp); with normalize='all' each
# value is a share of all test samples. err1 is the false-positive share and
# err2 the false-negative share (the previous unpacking stored tp as err2).
_, err1, err2, _ = confusion_matrix(y_test, prediction, normalize='all').ravel()
auc = roc_auc_score(y_test, probas)  # area under the ROC curve
precision = precision_score(y_test, prediction)
recall = recall_score(y_test, prediction)
f1 = f1_score(y_test, prediction)
# Log loss is defined on predicted probabilities; passing hard 0/1 labels
# would score every mistake as a fully confident wrong prediction.
logloss = log_loss(y_test, probas)

# store the metrics in the dictionary
metrics["err1"] = err1
metrics["err2"] = err2
metrics["auc"] = auc
metrics["precision"] = precision
metrics["recall"] = recall
metrics["f1"] = f1
metrics["logloss"] = logloss

# additional metrics taken from the cross-validation results
# (averaged over all candidates, except best_score)

metrics['mean_fit_time'] = cv_results['mean_fit_time'].mean()
metrics['std_fit_time'] = cv_results['std_fit_time'].mean()
metrics["mean_test_score"] = cv_results["mean_test_score"].mean()
metrics['std_test_score'] = cv_results['std_test_score'].mean()
metrics['best_score'] = clf.best_score_

In [8]:
pip_requirements = "./requirements.txt"  # passed to log_model as the model's pip requirements
signature = mlflow.models.infer_signature(X_test, prediction)
input_example = X_test[:10]

# get_experiment_by_name returns None for an unknown name; create the
# experiment in that case instead of failing on `.experiment_id`.
experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
if experiment is None:
    experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)
else:
    experiment_id = experiment.experiment_id

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    # Tuned hyper-parameters first, then the settings that were held fixed.
    mlflow.log_params(best_params)

    mlflow.log_param("iterations", iterations)
    mlflow.log_param("loss_function", loss_function)
    mlflow.log_param("task_type", task_type)
    mlflow.log_param("random_seed", random_seed)

    mlflow.log_metrics(metrics)
    # The fitted search object is stored as a separate artifact.
    cv_info = mlflow.sklearn.log_model(cv, artifact_path='cv')

    # Log the final model and register it under REGISTRY_MODEL_NAME.
    model_info = mlflow.catboost.log_model(
        cb_model=model_best,
        artifact_path="models",
        registered_model_name=REGISTRY_MODEL_NAME,
        signature=signature,
        input_example=input_example,
        pip_requirements=pip_requirements
    )

    run_id = run.info.run_id
    print("RUN ID:", run_id)

  inputs = _infer_schema(model_input) if model_input is not None else None
Successfully registered model 'hp_tuning_model'.
2026/02/12 09:39:28 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: hp_tuning_model, version 1


RUN ID: 86c00e507fc8482694bb0543a31231f3


Created version '1' of model 'hp_tuning_model'.


In [16]:
from scipy.stats import randint, uniform
# MLflow names for the randomized-search run and its registered model.
EXPERIMENT_NAME = "churn_nikolaistepanov_myown"
RUN_NAME = 'model_random_search'
REGISTRY_MODEL_NAME = "hp_tuning_model_random"

# Model inputs and the label column.
features = ["monthly_charges", "total_charges", "senior_citizen"]
target = "target"

# Split settings. `stratify_column` is defined but not used in this cell.
split_column = "begin_date"
stratify_column = "target"
test_size = 0.2

# Order rows by `split_column`; with shuffle=False the last `test_size`
# share of the sorted rows becomes the test split.
df = df.sort_values(by=[split_column])

X_train, X_test, y_train, y_test = train_test_split(df[features], df[target], test_size=test_size, shuffle=False)

print(f"Размер выборки для обучения: {X_train.shape}")
print(f"Размер выборки для теста: {X_test.shape}")

# CatBoost settings that stay fixed for every sampled candidate.
loss_function = "Logloss"
task_type = 'CPU'
random_seed = 0
iterations = 300
verbose = False

# Sampling distributions. scipy's randint(low, high) excludes `high`, and
# uniform(loc, scale) covers [loc, loc + scale].
param_distributions = {
    "depth": randint(3, 10),
    "learning_rate": uniform(0.01, 0.29),
    "l2_leaf_reg": randint(1, 20),
    "random_strength": uniform(0, 2),
    "bagging_temperature": uniform(0, 1),
    "min_data_in_leaf": randint(1, 50)
}

model = CatBoostClassifier(
    iterations=iterations,
    loss_function=loss_function,
    task_type=task_type,
    random_seed=random_seed,
    verbose=verbose
)

# random_state fixes which 20 candidates are drawn from the distributions;
# without it every execution of the cell would evaluate a different set.
cv = RandomizedSearchCV(
    estimator=model,
    n_iter=20,
    param_distributions=param_distributions,
    scoring="roc_auc",
    cv=2,
    n_jobs=-1,
    random_state=random_seed
)

# fit() returns the search object itself, so `clf` and `cv` are the same object.
clf = cv.fit(X_train, y_train)


# Point both the tracking client and the model registry at the same MLflow server.
mlflow.set_tracking_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")
mlflow.set_registry_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")

# Per-candidate cross-validation statistics of the finished search.
cv_results = pd.DataFrame(clf.cv_results_)

best_params = clf.best_params_

# Train a fresh model on the whole training split using the fixed settings
# plus the best hyper-parameters found by the search.
model = CatBoostClassifier(
    iterations=iterations,
    loss_function=loss_function,
    task_type=task_type,
    random_seed=random_seed,
    verbose=verbose,
    **best_params
)

model.fit(X_train, y_train)

prediction = model.predict(X_test)
probas = model.predict_proba(X_test)[:, 1]  # probability of the positive class

# quality metrics on the test split
metrics = {}

# For binary labels ravel() yields (tn, fp, fn, tp); with normalize='all' each
# value is a share of all test samples. err1 is the false-positive share and
# err2 the false-negative share (the previous unpacking stored tp as err2).
_, err1, err2, _ = confusion_matrix(y_test, prediction, normalize='all').ravel()
auc = roc_auc_score(y_test, probas)  # area under the ROC curve
precision = precision_score(y_test, prediction)
recall = recall_score(y_test, prediction)
f1 = f1_score(y_test, prediction)
# Log loss is defined on predicted probabilities; passing hard 0/1 labels
# would score every mistake as a fully confident wrong prediction.
logloss = log_loss(y_test, probas)

# store the metrics in the dictionary
metrics["err1"] = err1
metrics["err2"] = err2
metrics["auc"] = auc
metrics["precision"] = precision
metrics["recall"] = recall
metrics["f1"] = f1
metrics["logloss"] = logloss

# additional metrics taken from the cross-validation results
# (averaged over all candidates, except best_score)

metrics['mean_fit_time'] = cv_results['mean_fit_time'].mean()
metrics['std_fit_time'] = cv_results['std_fit_time'].mean()
metrics["mean_test_score"] = cv_results["mean_test_score"].mean()
metrics['std_test_score'] = cv_results['std_test_score'].mean()
metrics['best_score'] = clf.best_score_

pip_requirements = "./requirements.txt"  # file with the dependencies
signature = mlflow.models.infer_signature(X_test, prediction)
input_example = X_test[:10]


# get_experiment_by_name returns None for an unknown name; create the
# experiment in that case instead of failing on `.experiment_id`.
experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
if experiment is None:
    experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)
else:
    experiment_id = experiment.experiment_id

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    # Tuned hyper-parameters first, then the settings that were held fixed.
    mlflow.log_params(best_params)

    mlflow.log_param("iterations", iterations)
    mlflow.log_param("loss_function", loss_function)
    mlflow.log_param("task_type", task_type)
    mlflow.log_param("random_seed", random_seed)

    mlflow.log_metrics(metrics)
    # The fitted search object is stored as a separate artifact.
    cv_info = mlflow.sklearn.log_model(cv, artifact_path='cv')

    # Log the final model and register it under REGISTRY_MODEL_NAME.
    model_info = mlflow.catboost.log_model(
        cb_model=model,
        artifact_path="models",
        registered_model_name=REGISTRY_MODEL_NAME,
        signature=signature,
        input_example=input_example,
        pip_requirements=pip_requirements
    )

    run_id = run.info.run_id
    print("RUN ID:", run_id)

Размер выборки для обучения: (5634, 3)
Размер выборки для теста: (1409, 3)


  inputs = _infer_schema(model_input) if model_input is not None else None
Successfully registered model 'hp_tuning_model_random'.
2026/02/12 09:50:33 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: hp_tuning_model_random, version 1


RUN ID: 352730e373bb45888fb19c70e64fbc0a


Created version '1' of model 'hp_tuning_model_random'.
