In [None]:
import os

import psycopg
import pandas as pd
import mlflow
from autofeat import AutoFeatClassifier
from catboost import CatBoostClassifier
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    roc_auc_score,
    f1_score,
    precision_score,
    recall_score,
    confusion_matrix,
    log_loss,
)
import matplotlib.pyplot as plt

In [None]:
# Create the artifacts directory; exist_ok=True keeps the cell re-runnable
# (a plain os.mkdir raises FileExistsError on the second run).
os.makedirs("ASSETS", exist_ok=True)

In [None]:
# Location of the preprocessed dataset. It can be overridden through the
# PREPROCESSED_PATH environment variable so the notebook is not tied to a
# single machine; without the override the original path is used.
PREPROCESSED_PATH = os.environ.get(
    "PREPROCESSED_PATH",
    "/Users/nikolaistepanov/YandexPracticum/data/preprocessed_realty.xlsx",
)

features = []  # list of features selected by the student
target = ''  # name of the student's target column

In [None]:
# Base estimator that is re-fitted on every candidate feature subset.
# A fixed seed keeps the cross-validated scores reproducible between runs.
estimator = RandomForestClassifier(n_estimators=300, random_state=42)

# The number of features and the metric are chosen by the student.
# The metric has to be a classification scorer because the estimator is a
# classifier; 'mean_absolute_error' is a regression metric and is not a valid
# scikit-learn scorer name.
# NOTE(review): 'roc_auc' assumes a binary target — use e.g. 'accuracy' or
# 'roc_auc_ovr' for a multiclass target.
selector_kwargs = dict(
    k_features=10,
    floating=False,
    scoring='roc_auc',
    cv=4,
    n_jobs=-1,
)

# Example: forward feature selection
sfs = SFS(estimator, forward=True, **selector_kwargs)

# Example: backward feature selection
sbs = SFS(estimator, forward=False, **selector_kwargs)

In [None]:
# Choose the reader from the file extension: the configured path may point to
# an Excel workbook, which pd.read_csv cannot parse.
if str(PREPROCESSED_PATH).lower().endswith((".xlsx", ".xls")):
    df = pd.read_excel(PREPROCESSED_PATH)
else:
    df = pd.read_csv(PREPROCESSED_PATH)

### Next, the student should split the data into train/val/test. Another approach is possible, but in any case it is good if they explain why they chose that particular split

In [None]:
# Hold-out split of the selected feature columns and the target column.
# Both settings below are examples; the student may choose other values.
X_train, X_test, y_train, y_test = train_test_split(
    df[features], df[target],
    shuffle=False,  # keep the original row order
    test_size=0.2,  # share of rows reserved for the test set
)

In [None]:
# Show the (rows, columns) shape of the training and test feature frames.
print(
    f"Размер выборки для обучения: {X_train.shape}",
    f"Размер выборки для теста: {X_test.shape}",
    sep="\n",
)

In [None]:
# Fit forward selection on the training split produced by train_test_split
# (the previously referenced X_train_features is not defined anywhere).
sfs = sfs.fit(X_train, y_train)
# Names of the selected features (column names, since a DataFrame was fitted).
top_sfs = sfs.k_feature_names_

In [None]:
# Report the indices of the features picked by forward selection together
# with the score stored for that subset.
print(
    '\nSequential Forward Selection (k=10):',
    sfs.k_feature_idx_,
    'CV Score:',
    sfs.k_score_,
    sep='\n',
)

In [None]:
# One row per selection step: the metric dict is keyed by step, so the frame
# built from it is transposed to turn the steps into rows.
sfs_df = pd.DataFrame(sfs.get_metric_dict()).transpose()

In [None]:
# Preview the first selection step only.
sfs_df.head(n=1)

In [None]:
# Persist the forward-selection metrics next to the other run artifacts.
sfs_df.to_csv(path_or_buf="ASSETS/sfs.csv")

In [None]:
# Plot the score of every forward-selection step with a std-dev band.
fig = plot_sfs(sfs.get_metric_dict(), kind='std_dev')

plt.title('Sequential Forward Selection (w. StdDev)')
plt.grid()

# Save BEFORE plt.show(): once the figure has been shown, pyplot no longer has
# it as the current figure and savefig would write an empty canvas.
# The file goes to "ASSETS", the directory that is created at the top of the
# notebook and later uploaded with mlflow.log_artifacts.
plt.savefig("ASSETS/sfs.png")
plt.show()

In [None]:
%%time


sbs = sbs.fit(X_train_features, y_train)
top_sbs = sbs.k_feature_names_

In [None]:
# Report the indices of the features kept by backward selection together
# with the score stored for that subset.
print(
    '\nSequential Backward Selection (k=10):',
    sbs.k_feature_idx_,
    'CV Score:',
    sbs.k_score_,
    sep='\n',
)

In [None]:
# One row per selection step: the metric dict is keyed by step, so the frame
# built from it is transposed to turn the steps into rows.
sbs_df = pd.DataFrame(sbs.get_metric_dict()).transpose()

In [None]:
# Persist the backward-selection metrics next to the other run artifacts.
sbs_df.to_csv(path_or_buf="ASSETS/sbs.csv")

In [None]:
# Plot the score of every backward-selection step with a std-dev band.
fig = plot_sfs(sbs.get_metric_dict(), kind='std_dev')

# This figure shows the backward run, so the title must say so.
plt.title('Sequential Backward Selection (w. StdDev)')
plt.grid()

# Save BEFORE plt.show(): once the figure has been shown, pyplot no longer has
# it as the current figure and savefig would write an empty canvas.
# The file goes to "ASSETS", the directory that is created at the top of the
# notebook and later uploaded with mlflow.log_artifacts.
plt.savefig("ASSETS/sbs.png")
plt.show()

In [None]:
# Features chosen by BOTH selectors. Sorted so the column order is stable:
# iteration order of a set of strings can differ between kernel sessions.
interc_features = sorted(set(top_sbs) & set(top_sfs))

In [None]:
# Features chosen by AT LEAST ONE selector. Sorted so the column order is
# stable: iteration order of a set of strings can differ between kernel sessions.
union_features = sorted(set(top_sbs) | set(top_sfs))

In [None]:
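### Next, the student trains their model on the selected features and logs it to MLflow.
### The logging cell below expects this step to define `model_interc` (the fitted CatBoost model
### trained on `interc_features`) and `prediction_interc` (its predictions for the test rows).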

In [None]:
# Address of the MLflow tracking / registry server.
TRACKING_SERVER_HOST = "127.0.0.1"
TRACKING_SERVER_PORT = 5000

# Names used when logging the run and registering the model.
EXPERIMENT_NAME = "churn_nikolaistepanov"
RUN_NAME = "fs"
REGISTRY_MODEL_NAME = "churn_model_nikolaistepanov_prepared"
# Directory with feature-selection artifacts. It must be the same directory
# that the notebook creates, writes the CSV files into and uploads with
# mlflow.log_artifacts, i.e. "ASSETS".
FS_ASSETS = "ASSETS"

In [None]:
# Artifact-store settings read by MLflow from the environment.
# setdefault keeps values that are already exported (shell, .env loader, CI)
# instead of overwriting them with the placeholders below.
# NOTE(review): do not commit real credentials here — export them in the
# environment and leave the placeholders as they are.
os.environ.setdefault("MLFLOW_S3_ENDPOINT_URL", "...")
os.environ.setdefault("AWS_ACCESS_KEY_ID", "...")
os.environ.setdefault("AWS_SECRET_ACCESS_KEY", "...")

In [None]:
# Tracking and model registry are served from the same address, so the URL is
# built once and passed to both setters.
mlflow_server_url = f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}"
mlflow.set_tracking_uri(mlflow_server_url)
mlflow.set_registry_uri(mlflow_server_url)

In [None]:
# NOTE(review): placeholder value — replace it with metrics actually computed
# on the test split before logging.
metrics_interc = {"auc": 0.9}
pip_requirements = "../requirements.txt"

# Test rows restricted to the features selected by both selectors.
# X_test comes from train_test_split (X_test_features is not defined anywhere).
X_test_interc = X_test[interc_features]
signature = mlflow.models.infer_signature(X_test_interc, prediction_interc)
input_example = X_test_interc[:10]

# get_experiment_by_name returns None for an unknown experiment, so create
# the experiment on first use instead of failing with AttributeError.
experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
if experiment is None:
    experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)
else:
    experiment_id = experiment.experiment_id

with mlflow.start_run(run_name=f"{RUN_NAME}_intersection", experiment_id=experiment_id) as run:
    run_id = run.info.run_id

    # Model parameters: log_params requires a dict, so log the parameters the
    # model was configured with (the student may extend this dict).
    mlflow.log_params(model_interc.get_params())
    mlflow.log_metrics(metrics_interc)
    # Upload the feature-selection CSV files and plots.
    mlflow.log_artifacts("ASSETS")
    model_info = mlflow.catboost.log_model(
        cb_model=model_interc,
        artifact_path="models",
        signature=signature,
        input_example=input_example,
        registered_model_name=REGISTRY_MODEL_NAME,
        await_registration_for=60,
        pip_requirements=pip_requirements,
    )