In [1]:
import os
import psycopg2 as psycopg
import pandas as pd
from sklearn.model_selection import train_test_split
from dotenv import load_dotenv
from autofeat import AutoFeatClassifier
# Pull variables from the local .env file, letting them override anything
# already present in the process environment.
load_dotenv(override=True)

# Fixed libpq options; the credentials read below are merged into this dict.
connection = dict(sslmode="require", target_session_attrs="read-write")

# A missing variable raises KeyError here, so a misconfigured environment
# fails before any connection attempt.
postgres_credentials = dict(
    host=os.environ["DB_DESTINATION_HOST"],
    port=os.environ["DB_DESTINATION_PORT"],
    dbname=os.environ["DB_DESTINATION_NAME"],
    user=os.environ["DB_DESTINATION_USER"],
    password=os.environ["DB_DESTINATION_PASSWORD"],
)

# Variables that are set but empty are rejected as well.
assert "" not in postgres_credentials.values()

connection.update(postgres_credentials)

# Notebook-wide constants.
TABLE_NAME = "users_churn"
TRACKING_SERVER_HOST = "127.0.0.1"
TRACKING_SERVER_PORT = 5000
YOUR_NAME = "Slava"

# Read the whole source table into memory.
# psycopg2's connection context manager only commits/rolls back the
# transaction on exit; it does not close the connection. Close it explicitly
# so repeated runs of this cell do not leak server connections.
conn = psycopg.connect(**connection)
try:
    with conn:
        with conn.cursor() as cur:
            cur.execute(f"SELECT * FROM {TABLE_NAME}")
            data = cur.fetchall()
            # cursor.description holds one entry per result column; item 0 is its name.
            columns = [col[0] for col in cur.description]
finally:
    conn.close()

df = pd.DataFrame(data, columns=columns)


# Encoding applied to string columns whose only non-null values are Yes/No.
yes_no_map = {"Yes": 1, "No": 0}

for col in df.columns:
    # Only object-dtype columns are candidates for the Yes/No encoding.
    if df[col].dtype != "object":
        continue
    unique_vals = set(df[col].dropna().unique())
    # Nulls were dropped above, so they do not block the conversion;
    # .map() leaves them as NaN in the result.
    if unique_vals <= {"Yes", "No"}:
        df[col] = df[col].map(yes_no_map)


# Preview the first rows of the converted frame.
df.head()

Unnamed: 0,id,customer_id,begin_date,end_date,type,paperless_billing,payment_method,monthly_charges,total_charges,internet_service,...,device_protection,tech_support,streaming_tv,streaming_movies,gender,senior_citizen,partner,dependents,multiple_lines,target
0,17,8191-XWSZG,2015-10-01,NaT,One year,0,Mailed check,20.65,1022.95,,...,,,,,Female,0,0,0,0.0,0
1,59,3957-SQXML,2017-04-01,NaT,Two year,0,Credit card (automatic),24.95,894.3,,...,,,,,Female,0,1,1,1.0,0
2,147,6837-BJYDQ,2019-11-01,NaT,One year,0,Mailed check,19.6,61.35,,...,,,,,Male,0,0,0,0.0,0
3,481,0486-LGCCH,2019-03-01,NaT,Two year,0,Mailed check,19.65,225.75,,...,,,,,Male,0,1,1,0.0,0
4,1001,8357-EQXFO,2019-04-01,2019-11-01,Month-to-month,1,Electronic check,95.35,660.9,Fiber optic,...,1.0,0.0,1.0,1.0,Female,0,0,0,0.0,1


In [2]:
# Names used when opening MLflow runs later in the notebook.
RUN_NAME = "auto_feature_engineering"
EXPERIMENT_NAME = "churn_nikolaistepanov_myown"

In [3]:
split_column = "begin_date"
test_size = 0.2

# Sort BEFORE deriving features/target. With shuffle=False the split simply
# cuts the rows in their current order, so the ordering must already be
# chronological when `features` and `target` are taken from `df`; sorting
# afterwards would leave the split in the original (unsorted) row order.
df = df.sort_values(by=[split_column])

features = df.drop(columns=["id", "customer_id", "target"])
target = df["target"]

# The first (1 - test_size) share of rows goes to train, the rest to test.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=test_size,
    shuffle=False,
)

# The date columns were only needed for ordering; drop them from the model inputs.
X_train = X_train.drop(columns=["begin_date", "end_date"])
X_test = X_test.drop(columns=["begin_date", "end_date"])




In [4]:
from sklearn.preprocessing import LabelEncoder

# Columns handled as categorical, both for imputation/encoding here and as
# `categorical_cols` for AutoFeat below.
cat_features = [
    'paperless_billing',
    'payment_method',
    'internet_service',
    'online_security',
    'online_backup',
    'device_protection',
    'tech_support',
    'streaming_tv',
    'streaming_movies',
    'gender',
    'senior_citizen',
    'partner',
    'dependents',
    'multiple_lines',
    "type"
]
num_features = ["monthly_charges", "total_charges"]

# ---------
# Numeric NaNs → median (statistic taken from the training split only)
# ---------
for col in num_features:
    median_val = X_train[col].median()
    X_train[col] = X_train[col].fillna(median_val)
    X_test[col]  = X_test[col].fillna(median_val)

# ---------
# Categorical NaNs → mode
# ---------
# This has to run before label encoding: otherwise missing values in string
# columns are handed to LabelEncoder and there is nothing left for the mode
# fill to replace afterwards.
for col in cat_features:
    mode_val = X_train[col].mode()[0]
    X_train[col] = X_train[col].fillna(mode_val)
    X_test[col]  = X_test[col].fillna(mode_val)

# ---------
# String categoricals → integer codes
# ---------
# The encoder is fitted on the training split and reused for the test split.
# NOTE(review): le.transform raises on a category that is absent from the
# training split — confirm this cannot happen for this data.
for col in cat_features:
    if X_train[col].dtype == "object":
        le = LabelEncoder()
        X_train[col] = le.fit_transform(X_train[col])
        X_test[col] = le.transform(X_test[col])

# NOTE(review): this rebinds `features` (a DataFrame in the split cell) to a
# list of column names; the list is not used further in this cell.
features = cat_features + num_features

# Transformations AutoFeat may apply when generating candidate features.
transformations = ("1/", "log", "abs", "sqrt")

afc = AutoFeatClassifier(categorical_cols=cat_features, transformations=transformations, feateng_steps=1, n_jobs=-1)

# Fit the feature generator on the training split only, then apply it to test.
X_train_features = afc.fit_transform(X_train, y_train)
X_test_features = afc.transform(X_test)

In [7]:

import mlflow

# TRACKING_SERVER_HOST / TRACKING_SERVER_PORT are defined once in the first
# cell; they are not repeated here so the two copies cannot drift apart.
mlflow.set_tracking_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")


artifact_path = "afc"

# get_experiment_by_name returns None when the experiment does not exist yet;
# create it in that case instead of failing with an AttributeError on None.
experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
if experiment is None:
    experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)
else:
    experiment_id = experiment.experiment_id

# Log the fitted AutoFeat transformer as an artifact of its own run.
with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    run_id = run.info.run_id

    afc_info = mlflow.sklearn.log_model(afc, artifact_path=artifact_path)

2026-02-10 08:50:25,047 INFO: Found credentials in environment variables.


In [10]:
from catboost import CatBoostClassifier

model = CatBoostClassifier(verbose=False)

# Train on the AutoFeat output (X_train_features from afc.fit_transform), not
# on the raw X_train: otherwise the generated features are never used, even
# though this model is registered under an "auto features" name.
# Score it with X_test_features, which went through the same transformer.
model.fit(X_train_features, y_train)

<catboost.core.CatBoostClassifier at 0x7fd9006d6bc0>

In [11]:

# Name under which the CatBoost model is registered in the MLflow Model Registry.
REGISTRY_MODEL_NAME = "cb_model_auto_features"

# Open a run in the same experiment, then log and register the trained model.
with mlflow.start_run(experiment_id=experiment_id, run_name=RUN_NAME) as run:
    run_id = run.info.run_id

    # Wait up to 60 seconds for the new model version to finish registering.
    model_info = mlflow.catboost.log_model(
        cb_model=model,
        artifact_path="models",
        registered_model_name=REGISTRY_MODEL_NAME,
        await_registration_for=60,
    )

Successfully registered model 'cb_model_auto_features'.
2026/02/10 09:16:49 INFO mlflow.tracking._model_registry.client: Waiting up to 60 seconds for model version to finish creation. Model name: cb_model_auto_features, version 1
Created version '1' of model 'cb_model_auto_features'.
