In [21]:
import mlflow
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from mlflow.data.pandas_dataset import PandasDataset
from sklearn.base import ClassifierMixin
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score
from sklearn.linear_model import LogisticRegression, RidgeClassifier, ElasticNet
from tqdm import tqdm
from mlflow.models import infer_signature
import mlflow.sklearn

In [23]:
# %store -r X y
# X_train, X_test, y_train, y_test = train_test_split(
#     X, y, test_size=0.33, random_state=42, stratify=y
# )

In [24]:
std_scaler = StandardScaler().set_output(transform="pandas")
X_train = std_scaler.fit_transform(X_train)
X_test = std_scaler.transform(X_test)

# dataset_train = mlflow.data.from_pandas(
#     X_train, targets=y_train, name="credit-score-classification-train"
# )
# dataset_test = mlflow.data.from_pandas(
#     X_test, targets=y_test, name="credit-score-classification-test"
# )

In [25]:
# Указываем доступ в БД
# with open("../database.env", "r") as file:
with open("database.env", "r") as file:    
    lines = file.readlines()
    user = lines[0].split("=")[-1][:-1]
    password = lines[1].split("=")[-1][:-1]
    db = lines[2].split("=")[-1]
print(user, password, db)
sql_string = f"postgresql://{user}:{password}@postgres:5432/{db}"
mlflow.set_tracking_uri(sql_string)

mlflow_user magical_password mlflow_db


In [26]:
from sklearn.linear_model import LogisticRegression, RidgeClassifier, ElasticNet

models = [
    [
        
        LogisticRegression(random_state=42),
        {
            "C": [0.01, 0.1, 1, 10, 100],
            "solver": [
                "liblinear",
                "saga",
            ],  # solvers that support multiclass classification
            "penalty": ["l1", "l2", "elasticnet"],  # l1 = Lasso, l2 = Ridge
            "max_iter": [100, 200, 500],
        },
    
    ]
    
]

In [31]:
def run_experiment(experiment_name, suffix=None):
    try:
        experiment_id = mlflow.create_experiment(experiment_name)
    except mlflow.exceptions.MlflowException:
        experiment_id = mlflow.get_experiment_by_name(experiment_name).experiment_id
        
    for model in models:
        if suffix == None:
            run_n = model[0].__class__.__name__
        else:
            run_n = model[0].__class__.__name__+suffix
        with mlflow.start_run(experiment_id=experiment_id, run_name=run_n):
            mlflow.log_param("Model", model[0].__class__.__name__)
            grid_search = GridSearchCV(
                estimator=model[0],
                param_grid=model[1],
                cv=5,  
                scoring='accuracy',
                n_jobs=-1,
            )
            signature = infer_signature(X_test, y_test)
            # Train the model and find the best parameters
            grid_search.fit(X_train, y_train)
            mlflow.log_params(grid_search.best_params_)
            best_model = grid_search.best_estimator_
    
            # mlflow.log_input(dataset_train, context="training")
            # mlflow.log_input(dataset_test, context="evaluation")
            y_pred = best_model.predict(X_test)
            
            accuracy = accuracy_score(y_test, y_pred)
            precision = precision_score(y_test, y_pred, average='weighted')
            recall = recall_score(y_test, y_pred, average='weighted')
            f1 = f1_score(y_test, y_pred, average='weighted')
            model_uri = mlflow.sklearn.log_model(
                                best_model, "model", signature=signature
                            ).model_uri 
            mlflow.evaluate(
                model_uri,
                pd.concat([X_test, y_test], axis=1),
                targets="Credit_Score",
                model_type="classifier",
            )
            mlflow.log_metric("accuracy", accuracy)
            mlflow.log_metric("precision", precision)
            mlflow.log_metric("recall", recall)
            mlflow.log_metric("f1_score", f1)

In [32]:
run_experiment("Linear Models Comparison (with GridSearch on scaled)", "_linmodels")

150 fits failed out of a total of 450.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
75 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/conda/envs/mlflow_env/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/conda/envs/mlflow_env/lib/python3.12/site-packages/sklearn/base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/envs/mlflow_env/lib/python3.12/site-packages/sklearn/linear_model/_logistic.py", line 1172, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
             ^

<Figure size 1050x700 with 0 Axes>