In [1]:
import mlflow
from sklearn.model_selection import cross_val_score, KFold, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import (
    RandomForestClassifier,
    AdaBoostClassifier,
    GradientBoostingClassifier,
)
from sklearn.metrics import accuracy_score
from sklearn.dummy import DummyClassifier
from sklearn.preprocessing import StandardScaler
import pandas as pd
from mlflow.models import infer_signature
# import logging

# logging.getLogger("mlflow").setLevel(logging.DEBUG)

In [2]:
# Указываем доступ в БД
# with open("../database.env", "r") as file:
with open("database.env", "r") as file:    

    lines = file.readlines()
    user = lines[0].split("=")[-1][:-1]
    password = lines[1].split("=")[-1][:-1]
    db = lines[2].split("=")[-1]
print(user, password, db)
sql_string = f"postgresql://{user}:{password}@postgres:5432/{db}"
mlflow.set_tracking_uri(sql_string)
# mlflow.set_tracking_uri("../data/mlruns")
# ARTIFACT_LOCATION = "../data/mlruns"
ARTIFACT_LOCATION = "/app/data/mlflow_artifacts"

mlflow_user magical_password mlflow_db


# Without scaling

In [3]:
random_state = 42

models = [
    LogisticRegression(solver="saga", random_state=random_state, n_jobs=-1),
    GaussianNB(),
    KNeighborsClassifier(n_neighbors=3, n_jobs=-1),
    DecisionTreeClassifier(random_state=random_state),
    RandomForestClassifier(n_estimators=7, random_state=random_state, n_jobs=-1),
    MLPClassifier(
        hidden_layer_sizes=(45, 30, 15),
        solver="sgd",
        learning_rate_init=0.01,
        max_iter=500,
        random_state=random_state,
    ),
    AdaBoostClassifier(random_state=random_state),
    GradientBoostingClassifier(random_state=random_state),
    DummyClassifier(strategy="most_frequent", random_state=random_state)
]

print("Models appended...")

Models appended...


Loading the data:

In [18]:
processed = pd.read_pickle("data/processed_df.pkl")
X = processed.drop(columns=["Credit_Score"])
y = processed["Credit_Score"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42, stratify=y
)

In [19]:
X

Unnamed: 0,Age,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,Delay_from_due_date,Num_of_Delayed_Payment,Changed_Credit_Limit,...,Credit_Mix_Standard,Credit_Mix__,Payment_of_Min_Amount_No,Payment_of_Min_Amount_Yes,Payment_Behaviour_High_spent_Large_value_payments,Payment_Behaviour_High_spent_Medium_value_payments,Payment_Behaviour_High_spent_Small_value_payments,Payment_Behaviour_Low_spent_Large_value_payments,Payment_Behaviour_Low_spent_Medium_value_payments,Payment_Behaviour_Low_spent_Small_value_payments
0,23,19114.12,1824.843333,3,4,3,4,3,7,11.27,...,False,True,True,False,False,False,True,False,False,False
3,23,19114.12,1824.843333,3,4,3,4,5,4,6.27,...,False,False,True,False,False,False,False,False,False,True
4,23,19114.12,1824.843333,3,4,3,4,6,4,11.27,...,False,False,True,False,False,True,False,False,False,False
5,23,19114.12,1824.843333,3,4,3,4,8,4,9.27,...,False,False,True,False,False,False,False,False,False,False
6,23,19114.12,1824.843333,3,4,3,4,3,4,11.27,...,False,False,True,False,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99994,25,39628.99,3359.415833,4,6,7,2,20,6,9.50,...,False,True,True,False,False,True,False,False,False,False
99995,25,39628.99,3359.415833,4,6,7,2,23,7,11.50,...,False,True,True,False,True,False,False,False,False,False
99996,25,39628.99,3359.415833,4,6,7,2,18,7,11.50,...,False,True,True,False,False,True,False,False,False,False
99998,25,39628.99,3359.415833,4,6,7,2,20,1,11.50,...,False,False,True,False,False,False,False,True,False,False


In [5]:
# %store -r X y
# X_train, X_test, y_train, y_test = train_test_split(
#     X, y, test_size=0.33, random_state=42, stratify=y
# )

In [12]:
def run_experiments(experiment_name, suffix=None):
    try:
        experiment_id = mlflow.create_experiment(experiment_name,   artifact_location=ARTIFACT_LOCATION)
    except mlflow.exceptions.MlflowException:
        experiment_id = mlflow.get_experiment_by_name(experiment_name).experiment_id
    for model in models:
        if suffix == None:
            run_n = model.__class__.__name__
        else:
            run_n = model.__class__.__name__+suffix
        with mlflow.start_run(run_name=run_n, experiment_id=experiment_id) as parent_run:
            kfold = KFold(n_splits=5, random_state=42, shuffle=True)
            mlflow.log_param("Model", model.__class__.__name__)
            mlflow.log_params(model.model.get_params())
            for fold, (train_index, test_index) in enumerate(kfold.split(X_train)):
                X_train_fold = X_train.iloc[train_index]
                X_test_fold = X_train.iloc[test_index]
                y_train_fold = y_train.iloc[train_index]
                y_test_fold = y_train.iloc[test_index]
                
                signature = infer_signature(X_test_fold, y_test_fold)
                model.fit(X_train_fold, y_train_fold)

                # Start nested MLflow run
                with mlflow.start_run(run_name=f"Fold {fold}", nested=True, experiment_id=experiment_id):
                    mlflow.log_param("Model", model.__class__.__name__)
                    # Log metrics
                    model_uri = mlflow.sklearn.log_model(
                        model, "model", signature=signature
                    ).model_uri 
                    mlflow.evaluate(
                        model_uri,
                        pd.concat([X_test_fold, y_test_fold], axis=1),
                        targets="Credit_Score",
                        model_type="classifier",
                    )
                    mlflow.log_params(model.model.get_params())
            signature = infer_signature(X_test, y_test)
            model.fit(X_train, y_train)
            model_uri = mlflow.sklearn.log_model(
                    model, "model", signature=signature
                ).model_uri 
            mlflow.evaluate(
                model_uri,
                pd.concat([X_test, y_test], axis=1),
                targets="Credit_Score",
                model_type="classifier",
            )

In [8]:
run_experiments("Default Models Comparison (without scaling)")

2024/08/29 13:18:19 INFO mlflow.models.evaluation.default_evaluator: Computing model predictions.
2024/08/29 13:18:19 INFO mlflow.models.evaluation.default_evaluator: The evaluation dataset is inferred as multiclass dataset, number of classes is inferred as 3. If this is incorrect, please specify the `label_list` parameter in `evaluator_config`.
2024/08/29 13:18:19 INFO mlflow.models.evaluation.default_evaluator: Testing metrics on first row...
2024/08/29 13:18:30 INFO mlflow.models.evaluation.default_evaluator: Computing model predictions.
2024/08/29 13:18:30 INFO mlflow.models.evaluation.default_evaluator: The evaluation dataset is inferred as multiclass dataset, number of classes is inferred as 3. If this is incorrect, please specify the `label_list` parameter in `evaluator_config`.
2024/08/29 13:18:31 INFO mlflow.models.evaluation.default_evaluator: Testing metrics on first row...
2024/08/29 13:18:42 INFO mlflow.models.evaluation.default_evaluator: Computing model predictions.
2024

<Figure size 1050x700 with 0 Axes>

# With scaling

In [24]:
std_scaler = StandardScaler().set_output(transform="pandas")
X_train = std_scaler.fit_transform(X_train)
X_test = std_scaler.transform(X_test)
X_train

Unnamed: 0,Age,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,Delay_from_due_date,Num_of_Delayed_Payment,Changed_Credit_Limit,...,Credit_Mix_Standard,Credit_Mix__,Payment_of_Min_Amount_No,Payment_of_Min_Amount_Yes,Payment_Behaviour_High_spent_Large_value_payments,Payment_Behaviour_High_spent_Medium_value_payments,Payment_Behaviour_High_spent_Small_value_payments,Payment_Behaviour_Low_spent_Large_value_payments,Payment_Behaviour_Low_spent_Medium_value_payments,Payment_Behaviour_Low_spent_Small_value_payments
21218,0.061845,0.683500,0.573484,-0.953959,0.252577,-1.579269,0.231387,0.410329,-0.905534,-0.613394,...,-0.847324,-0.502556,1.387300,-1.081249,-0.354210,-0.476633,-0.372896,-0.349210,2.627905,-0.608213
55646,1.359144,0.961410,1.041592,-2.154176,-0.245518,-1.579269,-1.513686,-1.566132,-0.738044,0.141082,...,-0.847324,-0.502556,1.387300,-1.081249,-0.354210,-0.476633,2.681716,-0.349210,-0.380531,-0.608213
53697,0.061845,1.639857,1.566141,-0.153815,-0.743614,1.831133,0.231387,-0.933665,0.434385,-0.086534,...,1.180187,-0.502556,-0.720825,-1.081249,2.823185,-0.476633,-0.372896,-0.349210,-0.380531,-0.608213
3401,-0.586804,0.410584,0.451641,-0.153815,-0.245518,-0.168068,-0.641150,-1.091781,-0.403065,0.375064,...,1.180187,-0.502556,1.387300,-1.081249,2.823185,-0.476633,-0.372896,-0.349210,-0.380531,-0.608213
7661,-0.772133,-0.576694,-0.461440,0.246257,0.750673,-0.991269,-0.204881,-0.617431,-0.570554,0.991060,...,1.180187,-0.502556,-0.720825,-1.081249,-0.354210,-0.476633,-0.372896,-0.349210,-0.380531,-0.608213
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4050,-0.494140,-0.401277,-0.398861,0.646330,0.750673,1.125532,-0.641150,0.726563,1.606813,-1.380603,...,-0.847324,-0.502556,-0.720825,0.924856,-0.354210,-0.476633,-0.372896,2.863604,-0.380531,-0.608213
74581,0.617831,-0.818893,-0.715745,0.646330,0.750673,0.890332,2.412728,0.568446,1.104344,-0.521074,...,-0.847324,-0.502556,-0.720825,0.924856,2.823185,-0.476633,-0.372896,-0.349210,-0.380531,-0.608213
60633,-1.235454,-0.941403,-0.910977,1.846546,2.244961,1.125532,1.540192,2.465849,1.104344,2.643266,...,-0.847324,-0.502556,-0.720825,0.924856,-0.354210,-0.476633,-0.372896,-0.349210,-0.380531,1.644162
37654,1.266480,-0.211094,-0.100994,-0.953959,-0.245518,-1.344069,-1.513686,-0.538372,-1.575493,0.023294,...,-0.847324,-0.502556,1.387300,-1.081249,2.823185,-0.476633,-0.372896,-0.349210,-0.380531,-0.608213


In [25]:
run_experiments("Model Comparison Experiment (with default params and scaling)", "_with scaling")

2024/08/29 13:43:16 INFO mlflow.models.evaluation.default_evaluator: Computing model predictions.
2024/08/29 13:43:16 INFO mlflow.models.evaluation.default_evaluator: The evaluation dataset is inferred as multiclass dataset, number of classes is inferred as 3. If this is incorrect, please specify the `label_list` parameter in `evaluator_config`.
2024/08/29 13:43:17 INFO mlflow.models.evaluation.default_evaluator: Testing metrics on first row...
2024/08/29 13:43:44 INFO mlflow.models.evaluation.default_evaluator: Computing model predictions.
2024/08/29 13:43:44 INFO mlflow.models.evaluation.default_evaluator: The evaluation dataset is inferred as multiclass dataset, number of classes is inferred as 3. If this is incorrect, please specify the `label_list` parameter in `evaluator_config`.
2024/08/29 13:43:44 INFO mlflow.models.evaluation.default_evaluator: Testing metrics on first row...
2024/08/29 13:44:11 INFO mlflow.models.evaluation.default_evaluator: Computing model predictions.
2024