In [None]:
import mlflow
from sklearn.model_selection import cross_val_score, KFold, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import (
    RandomForestClassifier,
    AdaBoostClassifier,
    GradientBoostingClassifier,
)
from sklearn.metrics import accuracy_score
from sklearn.dummy import DummyClassifier
from sklearn.preprocessing import StandardScaler

In [None]:
# Configure access to the database that backs the MLflow tracking server.
# The env file is read positionally: line 1 = user, line 2 = password,
# line 3 = database name, each written as KEY=VALUE.
with open("../database.env", "r") as file:
    lines = file.readlines()

# Split on the first "=" only, so a value that itself contains "=" stays intact,
# and strip surrounding whitespace/newlines instead of cutting the last character
# (which breaks on CRLF endings or a missing trailing newline).
user, password, db = (line.partition("=")[2].strip() for line in lines[:3])

# Deliberately do not print the password: cell output is stored in the notebook.
print(user, db)
sql_string = f"postgresql://{user}:{password}@0.0.0.0/{db}"
mlflow.set_tracking_uri(sql_string)

# Without scaling

In [None]:
random_state = 42

# Candidate classifiers as (display name, unfitted estimator) pairs, in the
# order in which they are evaluated.
# n_jobs=-1 is passed only to LogisticRegression, KNeighborsClassifier and
# RandomForestClassifier. GaussianNB, DecisionTreeClassifier, MLPClassifier,
# AdaBoostClassifier, GradientBoostingClassifier and DummyClassifier take no
# n_jobs argument and are therefore built without it.
models = [
    (
        "Logistic Regression",
        LogisticRegression(solver="saga", random_state=random_state, n_jobs=-1),
    ),
    ("Naive Bayes", GaussianNB()),
    (
        "K-Nearest Neighbour",
        KNeighborsClassifier(n_neighbors=3, n_jobs=-1),
    ),
    (
        "Decision Tree",
        DecisionTreeClassifier(random_state=random_state),
    ),
    (
        "Random Forest",
        RandomForestClassifier(n_estimators=7, random_state=random_state, n_jobs=-1),
    ),
    (
        "MLP:",
        MLPClassifier(
            hidden_layer_sizes=(45, 30, 15),
            solver="sgd",
            learning_rate_init=0.01,
            max_iter=500,
            random_state=random_state,
        ),
    ),
    (
        "AdaBoostClassifier",
        AdaBoostClassifier(random_state=random_state),
    ),
    (
        "GradientBoostingClassifier",
        GradientBoostingClassifier(random_state=random_state),
    ),
    # Baseline that always predicts the most frequent class.
    (
        "Dummy Classifier (mode)",
        DummyClassifier(strategy="most_frequent", random_state=random_state),
    ),
]
print("Models appended...")

Loading the data:

In [None]:
# Restore the variables X and y that were persisted earlier with IPython's %store magic.
# NOTE(review): presumably the feature matrix and the target saved by a previous notebook - confirm.
%store -r X y

In [None]:
# Hold out 33% of the samples as a test set. stratify=y keeps the class
# proportions of y the same in both splits. The seed reuses the notebook-wide
# `random_state` (defined with the model list) instead of repeating the literal.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=random_state, stratify=y
)

In [None]:
def run_experiment(experiment_name):
    """Evaluate every model in ``models`` and log its scores to MLflow.

    For each ``(name, model)`` pair the function computes a training-side
    score, fits the model on the whole training split, scores it on the test
    split and records one MLflow run named after the model.

    Reads the notebook globals ``models``, ``X_train``, ``X_test``,
    ``y_train`` and ``y_test`` at call time.

    Parameters
    ----------
    experiment_name : str
        Name of the MLflow experiment. It is created when no experiment with
        that name exists yet; otherwise the existing one is reused.

    Returns
    -------
    None

    Notes
    -----
    The ``train_accuracy`` metric is the mean 5-fold cross-validation accuracy
    on the training split for every model except "Dummy Classifier (mode)",
    for which it is the plain accuracy on the training split.
    """
    # Look the experiment up before creating it. Catching MlflowException
    # around create_experiment would also swallow unrelated failures and then
    # fail later with a confusing AttributeError on a None lookup result.
    experiment = mlflow.get_experiment_by_name(experiment_name)
    if experiment is None:
        experiment_id = mlflow.create_experiment(experiment_name)
    else:
        experiment_id = experiment.experiment_id

    # Loop-invariant: the integer seed makes the folds identical for every model.
    kfold = KFold(n_splits=5, random_state=0, shuffle=True)
    # Use the same flat 1-D target for cross-validation and for the final fit.
    y_train_flat = y_train.values.ravel()

    for name, model in models:
        if name != "Dummy Classifier (mode)":
            # cross_val_score works on clones, so `model` itself is still
            # unfitted here and is fitted on the full training split below.
            train_accuracy = cross_val_score(
                model,
                X_train,
                y_train_flat,
                cv=kfold,
                scoring="accuracy",
                n_jobs=-1,
            ).mean()
            model.fit(X_train, y_train_flat)
        else:
            model.fit(X_train, y_train_flat)
            train_accuracy = accuracy_score(y_train_flat, model.predict(X_train))
        test_accuracy = accuracy_score(y_test, model.predict(X_test))

        with mlflow.start_run(experiment_id=experiment_id, run_name=name):
            mlflow.log_param("model_name", name)
            mlflow.log_metric("train_accuracy", train_accuracy)
            mlflow.log_metric("test_accuracy", test_accuracy)

In [None]:
# Baseline run: X_train / X_test are passed through as produced by the split, without scaling.
run_experiment("Model Comparison Experiment (with default params and NO scaling)")

# With scaling

In [None]:
# Learn the scaling statistics from the training split only, then apply the
# same fitted scaler to both splits so no test-set statistics are used.
# Note: this rebinds the X_train / X_test globals that run_experiment reads.
std_scaler = StandardScaler().fit(X_train)
X_train, X_test = std_scaler.transform(X_train), std_scaler.transform(X_test)
run_experiment("Model Comparison Experiment (with default params and scaling)")