In [7]:
import mlflow
from sklearn.model_selection import cross_val_score, KFold, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import (
    RandomForestClassifier,
    AdaBoostClassifier,
    GradientBoostingClassifier,
)
from sklearn.metrics import accuracy_score
from sklearn.dummy import DummyClassifier
from sklearn.preprocessing import StandardScaler
import pandas as pd
import logging
logging.getLogger("mlflow").setLevel(logging.DEBUG)

In [14]:
# Указываем доступ в БД
# with open("../database.env", "r") as file:
with open("database.env", "r") as file:    

    lines = file.readlines()
    user = lines[0].split("=")[-1][:-1]
    password = lines[1].split("=")[-1][:-1]
    db = lines[2].split("=")[-1]
print(user, password, db)
sql_string = f"postgresql://{user}:{password}@postgres:5432/{db}"
mlflow.set_tracking_uri(sql_string)
# mlflow.set_tracking_uri("../data/mlruns")
# ARTIFACT_LOCATION = "../data/mlruns"
ARTIFACT_LOCATION = "/app/mlflow_artifacts"

mlflow_user magical_password mlflow_db


# Without scaling

In [15]:
random_state = 42

models = [
    LogisticRegression(solver="saga", random_state=random_state, n_jobs=-1),
    GaussianNB(),
    KNeighborsClassifier(n_neighbors=3, n_jobs=-1),
     DecisionTreeClassifier(random_state=random_state),
     RandomForestClassifier(n_estimators=7, random_state=random_state, n_jobs=-1),
     MLPClassifier(
            hidden_layer_sizes=(45, 30, 15),
            solver="sgd",
            learning_rate_init=0.01,
            max_iter=500,
            random_state=random_state,
        ),
     AdaBoostClassifier(random_state=random_state),
     GradientBoostingClassifier(random_state=random_state),
     DummyClassifier(strategy="most_frequent", random_state=random_state),
]

print("Models appended...")

Models appended...


Loading the data:

In [16]:
%store -r X y

In [17]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42, stratify=y
)

In [18]:
from mlflow.models import infer_signature
def run_experiments(experiment_name):
    try:
        experiment_id = mlflow.create_experiment(experiment_name,   artifact_location=ARTIFACT_LOCATION)
    except mlflow.exceptions.MlflowException:
        experiment_id = mlflow.get_experiment_by_name(experiment_name).experiment_id
    for model in models:
        with mlflow.start_run(run_name=model.__class__.__name__, experiment_id=experiment_id) as parent_run:
            kfold = KFold(n_splits=5, random_state=42, shuffle=True)
            mlflow.log_param("Model", model.__class__.__name__)
            for fold, (train_index, test_index) in enumerate(kfold.split(X_train)):
                X_train_fold = X_train.iloc[train_index]
                X_test_fold = X_train.iloc[test_index]
                y_train_fold = y_train.iloc[train_index]
                y_test_fold = y_train.iloc[test_index]
                
                signature = infer_signature(X_train_fold, y_train_fold)
                model.fit(X_train_fold, y_train_fold)

                # Start nested MLflow run
                with mlflow.start_run(run_name=f"Fold {fold}", nested=True, experiment_id=experiment_id):
                    # Log metrics
                    model_uri = mlflow.sklearn.log_model(
                        model, "model", signature=signature
                    ).model_uri 
                    mlflow.evaluate(
                        model_uri,
                        pd.concat([X_test_fold, y_test_fold], axis=1),
                        targets="Credit_Score",
                        model_type="classifier",
                    )
            with mlflow.start_run(run_name=f"Final", nested=True, experiment_id=experiment_id):
                # Log metrics
                mlflow.evaluate(
                    model_uri,
                    pd.concat([X_test, y_test], axis=1),
                    targets="Credit_Score",
                    model_type="classifier",
                )
                

In [19]:
run_experiments("Default Models Comparison (without scaling)")

2024/08/28 16:40:04 INFO mlflow.models.evaluation.default_evaluator: Computing model predictions.
2024/08/28 16:40:04 INFO mlflow.models.evaluation.default_evaluator: The evaluation dataset is inferred as multiclass dataset, number of classes is inferred as 3. If this is incorrect, please specify the `label_list` parameter in `evaluator_config`.
2024/08/28 16:40:04 INFO mlflow.models.evaluation.default_evaluator: Testing metrics on first row...
2024/08/28 16:40:21 INFO mlflow.models.evaluation.default_evaluator: Computing model predictions.
2024/08/28 16:40:21 INFO mlflow.models.evaluation.default_evaluator: The evaluation dataset is inferred as multiclass dataset, number of classes is inferred as 3. If this is incorrect, please specify the `label_list` parameter in `evaluator_config`.
2024/08/28 16:40:21 INFO mlflow.models.evaluation.default_evaluator: Testing metrics on first row...
2024/08/28 16:40:39 INFO mlflow.models.evaluation.default_evaluator: Computing model predictions.
2024

<Figure size 1050x700 with 0 Axes>

# With scaling

In [None]:
std_scaler = StandardScaler()
X_train = std_scaler.fit_transform(X_train)
X_test = std_scaler.transform(X_test)
run_experiment("Model Comparison Experiment (with default params and scaling)")