In [None]:
from urllib.parse import quote

import mlflow
import numpy as np
import pandas as pd
from mlflow.data.pandas_dataset import PandasDataset
from sklearn.base import ClassifierMixin
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
%store -r X y
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42, stratify=y
)

In [None]:
# Fit the scaler on the training split only, then apply the same fitted
# transformation to both splits so no test-set statistics reach training.
std_scaler = StandardScaler().fit(X_train)
X_train = std_scaler.transform(X_train)
X_test = std_scaler.transform(X_test)

# Wrap the scaled splits as MLflow datasets so runs can record their inputs.
dataset_train = mlflow.data.from_numpy(
    X_train,
    targets=y_train.to_numpy(),
    name="credit-score-classification-train",
)
dataset_test = mlflow.data.from_numpy(
    X_test,
    targets=y_test.to_numpy(),
    name="credit-score-classification-test",
)

In [None]:
# Configure access to the MLflow tracking database.
# "../database.env" is read as KEY=VALUE lines whose first three values are,
# in order: user, password, database name.
with open("../database.env", "r") as file:
    # Keep the text after the first "=" so a value may itself contain "=".
    # strip() removes the line ending ("\n" or "\r\n") on every line, including
    # the last one, and blank lines are skipped.
    env_values = [line.partition("=")[2].strip() for line in file if line.strip()]

if len(env_values) < 3:
    raise ValueError("../database.env must define user, password and database name")
user, password, db = env_values[:3]

# Show only the non-secret values: the password must not end up in the saved
# notebook output.
print(user, db)

# Percent-encode the credentials so characters such as "@", ":" or "/" cannot
# break the structure of the connection URI.
sql_string = f"postgresql://{quote(user, safe='')}:{quote(password, safe='')}@0.0.0.0/{db}"
mlflow.set_tracking_uri(sql_string)

In [None]:
class SklearnModel:
    """Bundle an estimator with a short display name and its search grid.

    Attributes:
        model: The scikit-learn classifier instance to tune.
        shortname: Short label for the model (used as the MLflow run name).
        parameters: Hyper-parameter grid handed to the grid search, or None
            when no grid was supplied.
    """

    def __init__(self, model: ClassifierMixin, shortname, parameters=None):
        self.model, self.shortname, self.parameters = model, shortname, parameters

In [None]:
from sklearn.linear_model import LogisticRegression, RidgeClassifier, ElasticNet

# Models to tune, each paired with its hyper-parameter search space.
# The search space is a list of grids so that only valid solver/penalty
# combinations are tried: "liblinear" does not support the elastic-net
# penalty, and "saga" needs an explicit l1_ratio for it. A single combined
# grid would make every elastic-net candidate fail to fit.
models = [
    SklearnModel(
        LogisticRegression(random_state=42),
        "logreg",
        [
            {
                "C": [0.01, 0.1, 1, 10, 100],
                "solver": ["liblinear", "saga"],  # both support l1 and l2
                "penalty": ["l1", "l2"],  # l1 = Lasso, l2 = Ridge
                "max_iter": [100, 200, 500],
            },
            {
                "C": [0.01, 0.1, 1, 10, 100],
                "solver": ["saga"],  # the only listed solver with elastic-net support
                "penalty": ["elasticnet"],
                # Mix between l1 (1.0) and l2 (0.0); these values are a
                # starting point and can be tuned.
                "l1_ratio": [0.25, 0.5, 0.75],
                "max_iter": [100, 200, 500],
            },
        ],
    ),
]

In [None]:
import mlflow.sklearn

def run_experiment(experiment_name, suffix: str):
    """Grid-search every entry of the global ``models`` list and log it to MLflow.

    For each model a separate MLflow run (named after ``model.shortname``) is
    started inside the experiment. The run records the best hyper-parameters
    found by a 5-fold, accuracy-scored grid search, the training and evaluation
    datasets, the test-set metrics, and the best estimator itself.

    Relies on notebook globals: ``models``, ``X_train``, ``y_train``,
    ``X_test``, ``y_test``, ``dataset_train`` and ``dataset_test``.

    Args:
        experiment_name: Name of the MLflow experiment; it is created when it
            does not exist yet.
        suffix: Appended to ``"credit-scoring-"`` to form the name under which
            each best model is logged.

    Returns:
        None. All results are written to the MLflow tracking backend.
    """
    # Look the experiment up first and create it only when it is missing.
    # Catching every MlflowException from create_experiment would also hide
    # unrelated backend errors and then fail on a None lookup result.
    experiment = mlflow.get_experiment_by_name(experiment_name)
    if experiment is None:
        experiment_id = mlflow.create_experiment(experiment_name)
    else:
        experiment_id = experiment.experiment_id

    for model in models:
        with mlflow.start_run(experiment_id=experiment_id, run_name=model.shortname):
            grid_search = GridSearchCV(
                estimator=model.model,
                # SklearnModel defaults parameters to None; an empty grid
                # evaluates the estimator once with its own settings.
                param_grid=model.parameters or {},
                cv=5,
                scoring='accuracy',
                n_jobs=-1,
            )

            # Train the model and find the best parameters
            grid_search.fit(X_train, y_train)
            mlflow.log_params(grid_search.best_params_)
            best_model = grid_search.best_estimator_

            mlflow.log_input(dataset_train, context="training")
            mlflow.log_input(dataset_test, context="evaluation")

            # Evaluate the refitted best estimator on the held-out split.
            y_pred = best_model.predict(X_test)
            test_metrics = {
                "accuracy": accuracy_score(y_test, y_pred),
                "precision": precision_score(y_test, y_pred, average='weighted'),
                "recall": recall_score(y_test, y_pred, average='weighted'),
                "f1_score": f1_score(y_test, y_pred, average='weighted'),
            }
            for metric_name, metric_value in test_metrics.items():
                mlflow.log_metric(metric_name, metric_value)

            # The input example must look like what the model was trained on:
            # rows of the scaled training array, not the raw unscaled X.
            mlflow.sklearn.log_model(
                best_model, f"credit-scoring-{suffix}", input_example=X_train[:5]
            )

In [None]:
# Tune every configured model and log the runs to the MLflow experiment named
# below; "linmodels" becomes the suffix of the logged model name
# ("credit-scoring-linmodels").
run_experiment("Linear Models Comparison (with GridSearch)", "linmodels")