# Training pipeline

In [1]:
import mlflow

# Connection settings for experiment tracking, named so they are easy to spot
# and change in one place.
TRACKING_URI = "http://localhost:5000"
EXPERIMENT_NAME = "bank-deposit"

mlflow.set_tracking_uri(TRACKING_URI)
# Kept as the last expression of the cell so the returned Experiment object
# is rendered as the cell output.
mlflow.set_experiment(EXPERIMENT_NAME)

<Experiment: artifact_location='mlflow-artifacts:/845276055540234593', creation_time=1761777281289, experiment_id='845276055540234593', last_update_time=1761777281289, lifecycle_stage='active', name='bank-deposit', tags={}>

In [2]:
from dataset import load_dataset
# Tag key attached to every MLflow run to record which kind of model it trained.
MODEL_TYPE_TAG = "model_type"

# Fixed seed reused by the data splits, the CV folds and the models in this
# notebook. To experiment with a different seed on each execution, draw it
# with `random.randint(0, 10**4)` instead.
random_state = 77
print(f"Current random state {random_state}")

Current random state 77


In [None]:
from sklearn.model_selection import train_test_split
# Load the feature matrix (X_data) and the labels (y_data) from the training CSV.
X_data, y_data = load_dataset("../data/train.csv")

# First split: 70% train / 30% hold-out, keeping the class ratio of y_data.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X_data,
    y_data,
    test_size=0.3,
    stratify=y_data,
    random_state=random_state
)

# Second split: halve the hold-out into test and validation sets
# (15% / 15% of the full data).
# The stratification labels must be the hold-out labels: passing the full
# `y_data` here does not match the number of rows being split and makes
# scikit-learn raise a ValueError for inconsistent sample counts.
# Distinct hold-out names are used so the inputs of this split are not
# overwritten by its outputs.
X_test, X_val, y_test, y_val = train_test_split(
    X_holdout,
    y_holdout,
    test_size=0.5,
    stratify=y_holdout,
    random_state=random_state
)

  encoded['education'] = encoded['education'].replace({
  encoded['housing'] = encoded['housing'].replace({"no": False, "yes": True})
  encoded['loan'] = encoded['loan'].replace({"no": False, "yes": True})
  encoded['default'] = encoded['default'].replace({"no": False, "yes": True})


## Logistic Regression

Baseline (unweighted) logistic regression, tuned with a grid search over 5-fold stratified cross-validation.

In [5]:
from datetime import datetime
from pathlib import Path
import joblib
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    f1_score, precision_score, recall_score, accuracy_score
)

# Enable MLflow autologging for scikit-learn, including model artifacts.
mlflow.sklearn.autolog(log_models=True)

# Local directory where the fitted search object is dumped with joblib.
linear_regression_models_path = Path("..") / "models" / "linear_regression"
linear_regression_models_path.mkdir(parents=True, exist_ok=True)

# The timestamp is part of the run name, which is also reused as the dump file name.
str_time = f"{datetime.today():%Y-%m-%d %H:%M:%S}"
tags = {MODEL_TYPE_TAG: "logistic_regression"}

# Search space: regularisation strength and penalty type, liblinear solver only.
param = dict(
    C=[0.01, 0.1, 1, 10],
    penalty=['l1', 'l2'],
    solver=['liblinear'],
)
log_reg = LogisticRegression(intercept_scaling=True, dual=False, fit_intercept=True)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)

with mlflow.start_run(run_name="Logistic Regression " + str_time, tags=tags) as current_run:
    # Hyper-parameter search scored by ROC AUC on the training split.
    grid = GridSearchCV(log_reg, param_grid=param, scoring='roc_auc', cv=cv, n_jobs=-1)
    grid.fit(X_train, y_train)
    print(grid.best_score_, grid.best_params_)

    # Persist the whole fitted search object locally, named after the run.
    model_name = f"{current_run.info.run_name}.joblib"
    model_path = str(linear_regression_models_path / model_name)
    joblib.dump(grid, model_path)

    # Also log it to MLflow, using the run id as the model name.
    mlflow.sklearn.log_model(sk_model=grid, name=current_run.info.run_id)

    # Evaluate the refitted best estimator on the held-out test split.
    preds = grid.predict(X_test)
    acc = accuracy_score(y_test, preds)
    pre = precision_score(y_test, preds)
    recall = recall_score(y_test, preds)
    f1 = f1_score(y_test, preds)

    # Labels are left-aligned in an 11-character column so the values line up.
    for label, value in (
        ("Accuracy:", acc),
        ("Precision:", pre),
        ("Recall:", recall),
        ("F1:", f1),
    ):
        print(f"{label:<11}{value:.4f}")

    mlflow.log_metrics({
        "test_accuracy": float(acc),
        "test_precision": float(pre),
        "test_recall": float(recall),
        "test_f1": float(f1),
    })

    

2025/11/02 15:12:56 INFO mlflow.sklearn.utils: Logging the 5 best runs, 3 runs will be omitted.


🏃 View run charming-asp-853 at: http://localhost:5000/#/experiments/845276055540234593/runs/5dd27b523917404aa558887591fc8c60
🧪 View experiment at: http://localhost:5000/#/experiments/845276055540234593
🏃 View run selective-bird-382 at: http://localhost:5000/#/experiments/845276055540234593/runs/b4a9f035f9554acdb30244404643a45e
🧪 View experiment at: http://localhost:5000/#/experiments/845276055540234593
🏃 View run enthused-ox-580 at: http://localhost:5000/#/experiments/845276055540234593/runs/3eb343f7fb164b03b292313a417ba9b1
🧪 View experiment at: http://localhost:5000/#/experiments/845276055540234593
🏃 View run dazzling-shrike-674 at: http://localhost:5000/#/experiments/845276055540234593/runs/5a9ae54ff736488ab197d977c3b4fcc1
🧪 View experiment at: http://localhost:5000/#/experiments/845276055540234593
🏃 View run funny-deer-418 at: http://localhost:5000/#/experiments/845276055540234593/runs/b69a4056ce834fe4ba746e6d2a95b025
🧪 View experiment at: http://localh



Accuracy:  0.9033
Precision: 0.6539
Recall:    0.4155
F1:        0.5082
🏃 View run Logistic Regression 2025-11-02 15:11:19 at: http://localhost:5000/#/experiments/845276055540234593/runs/dc1e6eb04b0f412faabb5d860cb9c0a5
🧪 View experiment at: http://localhost:5000/#/experiments/845276055540234593


## Logistic regression with balanced weights

In [6]:
from datetime import datetime
from pathlib import Path
import joblib
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    f1_score, precision_score, recall_score, accuracy_score
)

# Turn on scikit-learn autologging in MLflow, with model artifacts included.
mlflow.sklearn.autolog(log_models=True)

# Folder that receives the joblib dump of the class-weighted search.
linear_regression_models_path = Path("..", "models", "linear_regression_balanced")
linear_regression_models_path.mkdir(parents=True, exist_ok=True)

# Run names carry a timestamp; the same name is reused for the dump file.
str_time = format(datetime.today(), "%Y-%m-%d %H:%M:%S")
tags = {MODEL_TYPE_TAG: "logistic_regression_weighted"}

# Same search space as the unweighted run; only the estimator differs
# (class_weight='balanced').
param = dict(
    C=[0.01, 0.1, 1, 10],
    penalty=['l1', 'l2'],
    solver=['liblinear'],
)
log_reg = LogisticRegression(
    class_weight='balanced',
    fit_intercept=True,
    intercept_scaling=True,
    dual=False,
)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)

with mlflow.start_run(run_name="Logistic Regression Weighted " + str_time, tags=tags) as current_run:
    # Grid search scored by ROC AUC over the stratified folds of the training split.
    grid = GridSearchCV(log_reg, param_grid=param, scoring='roc_auc', cv=cv, n_jobs=-1)
    grid.fit(X_train, y_train)
    print(grid.best_score_, grid.best_params_)

    # Store the fitted search object on disk under the run's name.
    model_name = f"{current_run.info.run_name}.joblib"
    model_path = str(linear_regression_models_path / model_name)
    joblib.dump(grid, model_path)

    # Log the same object to MLflow, named after the run id.
    mlflow.sklearn.log_model(sk_model=grid, name=current_run.info.run_id)

    # Score the refitted best estimator on the test split.
    preds = grid.predict(X_test)
    acc = accuracy_score(y_test, preds)
    pre = precision_score(y_test, preds)
    recall = recall_score(y_test, preds)
    f1 = f1_score(y_test, preds)

    # Each label is padded to 11 characters so the printed values are aligned.
    report = {"Accuracy:": acc, "Precision:": pre, "Recall:": recall, "F1:": f1}
    for label, value in report.items():
        print(f"{label:<11}{value:.4f}")

    mlflow.log_metrics({
        "test_accuracy": float(acc),
        "test_precision": float(pre),
        "test_recall": float(recall),
        "test_f1": float(f1),
    })

2025/11/02 15:15:17 INFO mlflow.sklearn.utils: Logging the 5 best runs, 3 runs will be omitted.


🏃 View run youthful-bee-941 at: http://localhost:5000/#/experiments/845276055540234593/runs/a49ed819f5464e268b04f9ce776aa777
🧪 View experiment at: http://localhost:5000/#/experiments/845276055540234593
🏃 View run awesome-stoat-622 at: http://localhost:5000/#/experiments/845276055540234593/runs/88cc91022bf4497d9a097a80a5881333
🧪 View experiment at: http://localhost:5000/#/experiments/845276055540234593
🏃 View run salty-mole-932 at: http://localhost:5000/#/experiments/845276055540234593/runs/9b700af982a843959d3cf5f83a0a04af
🧪 View experiment at: http://localhost:5000/#/experiments/845276055540234593
🏃 View run clumsy-sheep-497 at: http://localhost:5000/#/experiments/845276055540234593/runs/4f27411c409040819cceb8c170fe0b85
🧪 View experiment at: http://localhost:5000/#/experiments/845276055540234593
🏃 View run gregarious-asp-100 at: http://localhost:5000/#/experiments/845276055540234593/runs/b4c1bfcd959d43209a77a0b9ab831926
🧪 View experiment at: http://localho



Accuracy:  0.8595
Precision: 0.4547
Recall:    0.8433
F1:        0.5909
🏃 View run Logistic Regression Weighted 2025-11-02 15:13:37 at: http://localhost:5000/#/experiments/845276055540234593/runs/93ec3ac25e33480a86f3b4164ddbf0d5
🧪 View experiment at: http://localhost:5000/#/experiments/845276055540234593


## XGBoost

In [None]:
from datetime import datetime

from xgboost import XGBClassifier
from sklearn.metrics import (
    f1_score, precision_score, recall_score, accuracy_score,
    confusion_matrix
)
from sklearn.model_selection import GridSearchCV, StratifiedKFold

# Enable MLflow autologging for XGBoost training, including model artifacts.
mlflow.xgboost.autolog(log_models=True)

str_time = datetime.today().strftime("%Y-%m-%d %H:%M:%S")

tags = {MODEL_TYPE_TAG: "xgboost"}

# Bind the run to `current_run`: the model logged below is named after this
# run's id. Without the binding the name would come from a `current_run`
# left over from an earlier cell (wrong id), or raise NameError on a fresh
# kernel.
with mlflow.start_run(run_name=f"XGBoost {str_time}", tags=tags) as current_run:

    # Base estimator used only for the hyper-parameter search.
    xgb_model = XGBClassifier(
        objective='binary:logistic',
        n_jobs=7,
        random_state=random_state,
        eval_metric=["logloss", "auc"]
    )

    # Parameter grid to tune
    param_grid = {
        'n_estimators': [50, 200, 450],
        'max_depth': [3, 5, 10, 15],
        'learning_rate': [0.01, 0.1, 0.2],
        'subsample': [0.8, 1],
        'colsample_bytree': [0.8, 1]
    }

    # NOTE(review): this search is scored with 'accuracy' while the logistic
    # regression cells use 'roc_auc'; confirm the difference is intentional.
    # NOTE(review): n_jobs=3 search workers, each with an estimator using
    # n_jobs=7 threads; confirm this fits the machine.
    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)
    grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, scoring='accuracy', cv=kfold, n_jobs=3)
    grid_search.fit(X_train, y_train)

    best_params = grid_search.best_params_
    print("Best parameters:", best_params)

    # Final training: refit a fresh classifier with the best parameters,
    # this time with an eval_set containing both the training and the
    # validation split.
    bst = XGBClassifier(
        **best_params,
        objective='binary:logistic',
        n_jobs=-1,
        random_state=random_state,
        eval_metric=["logloss", "auc"]
    )

    bst.fit(
        X_train,
        y_train,
        eval_set=[(X_train, y_train), (X_val, y_val)]
    )

    print("Logging model...")
    mlflow.xgboost.log_model(xgb_model=bst, name=current_run.info.run_id, model_format="json")

    # Evaluate the final model on the held-out test split.
    preds = bst.predict(X_test)

    acc = accuracy_score(y_test, preds)
    f1 = f1_score(y_test, preds)
    pre = precision_score(y_test, preds)
    recall = recall_score(y_test, preds)

    print(f"Accuracy:  {acc:.4f}")
    print(f"Precision: {pre:.4f}")
    print(f"Recall:    {recall:.4f}")
    print(f"F1:        {f1:.4f}")

    mlflow.log_metrics({
        "test_accuracy": float(acc),
        "test_precision": float(pre),
        "test_recall": float(recall),
        "test_f1": float(f1)
    })

    cm = confusion_matrix(y_test, preds)
    print("Confusion matrix:\n", cm)

    

0.00s - make the debugger miss breakpoints. Please pass -Xfrozen_modules=off
0.00s - to python to disable frozen modules.
0.00s - Note: Debugging will proceed. Set PYDEVD_DISABLE_FILE_VALIDATION=1 to disable this validation.
0.00s - make the debugger miss breakpoints. Please pass -Xfrozen_modules=off
0.00s - to python to disable frozen modules.
0.00s - Note: Debugging will proceed. Set PYDEVD_DISABLE_FILE_VALIDATION=1 to disable this validation.
0.00s - make the debugger miss breakpoints. Please pass -Xfrozen_modules=off
0.00s - to python to disable frozen modules.
0.00s - Note: Debugging will proceed. Set PYDEVD_DISABLE_FILE_VALIDATION=1 to disable this validation.
 /home/riccardo/Projects/personale/bank-deposit-predictor/.venv/lib/python3.12/site-packages/pydevd_plugins/extensions/pydevd_plugin_omegaconf.py
 /home/riccardo/Projects/personale/bank-deposit-predictor/.venv/lib/python3.12/site-packages/pydevd_plugins/extensions/pydevd_plugin_omegaconf.py
 /home/riccardo/Projects/personal

## Final comparison
Let's review the results from the tested models: