# Training pipeline

In [1]:
import mlflow

# Send all tracking calls to the MLflow server on localhost and select the
# experiment that the runs in this notebook are recorded under.
tracking_uri = "http://localhost:5000"
experiment_name = "bank-deposit"

mlflow.set_tracking_uri(tracking_uri)
# Kept as the final expression so the cell still displays the Experiment object.
mlflow.set_experiment(experiment_name)

<Experiment: artifact_location='mlflow-artifacts:/845276055540234593', creation_time=1761777281289, experiment_id='845276055540234593', last_update_time=1761777281289, lifecycle_stage='active', name='bank-deposit', tags={}>

In [11]:
from dataset import load_dataset
# Fixed seed shared by the train/test split, the CV folds and the XGBoost model,
# so that re-running the notebook reproduces the same results.
# To draw a fresh seed instead, replace the assignment below with:
#   import random
#   random_state = random.randint(0, 10**4)
random_state = 77
print(f"Current random state {random_state}")

# Key of the MLflow run tag that records which kind of model a run trained.
MODEL_TYPE_TAG = "model_type"

Current random state 77


In [3]:
from sklearn.model_selection import train_test_split
# Load features and target, then hold out 20% of the rows as a test set.
X_data, y_data = load_dataset("../data/train.csv")

split_kwargs = {
    "test_size": 0.2,
    "stratify": y_data,  # stratify on the target so both parts keep its class ratio
    "random_state": random_state,
}
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, **split_kwargs)

  encoded['education'] = encoded['education'].replace({
  encoded['housing'] = encoded['housing'].replace({"no": False, "yes": True})
  encoded['loan'] = encoded['loan'].replace({"no": False, "yes": True})
  encoded['default'] = encoded['default'].replace({"no": False, "yes": True})


## Logistic Regression

Vanilla logistic regression (no class weighting), tuned with a simple stratified 5-fold cross-validated grid search.

In [None]:
from datetime import datetime
from pathlib import Path
import joblib
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    f1_score, precision_score, recall_score, accuracy_score
)

# Baseline logistic regression: grid-search C and the penalty type with
# stratified 5-fold CV (scored by ROC AUC), then evaluate the refit best model
# on the held-out test split. Everything is recorded in a single MLflow run.
mlflow.sklearn.autolog(log_models=True)

# NOTE: the directory is called "linear_regression" although it holds logistic
# regression models; the path is left unchanged so existing files stay in place.
linear_regression_models_path = Path("../models/linear_regression/")
linear_regression_models_path.mkdir(exist_ok=True, parents=True)

str_time = datetime.today().strftime("%Y-%m-%d %H:%M:%S")

# Tag the run with the model family trained in this cell.
tags = {MODEL_TYPE_TAG: "logistic_regression"}

with mlflow.start_run(run_name=f"Logistic Regression {str_time}", tags=tags) as current_run:
    # liblinear is the only solver in the grid; it supports both l1 and l2.
    param = {
        'C':[0.01,0.1,1,10],
        'penalty':['l1','l2'],
        'solver':['liblinear']
    }

    # intercept_scaling is a float parameter; 1 is numerically identical to the
    # `True` that was passed before, but has the documented type.
    log_reg = LogisticRegression(intercept_scaling=1, dual=False, fit_intercept=True)

    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)
    grid = GridSearchCV(log_reg, param_grid=param, scoring='roc_auc', cv=cv, n_jobs=-1)
    grid.fit(X_train, y_train)
    print(grid.best_score_, grid.best_params_)

    # Persist the whole fitted search object (not only the best estimator).
    # NOTE: the run name embeds a timestamp, so the file name contains spaces
    # and ':' characters, which are not valid in file names on Windows.
    model_name = f"{current_run.info.run_name}.joblib"
    model_path = str(linear_regression_models_path / model_name)

    joblib.dump(grid, model_path)

    # Use a fixed logged-model name: the run name contains ':' characters,
    # which MLflow does not accept in a logged-model name.
    mlflow.sklearn.log_model(
        sk_model=grid,
        name="logistic_regression"
    )

    # Hard class predictions from the best estimator found by the search.
    preds = grid.predict(X_test)

    acc = accuracy_score(y_test, preds)
    f1 = f1_score(y_test, preds)
    pre = precision_score(y_test, preds)
    recall = recall_score(y_test, preds)

    print(f"Accuracy:  {acc:.4f}")
    print(f"Precision: {pre:.4f}")
    print(f"Recall:    {recall:.4f}")
    print(f"F1:        {f1:.4f}")

    # Cast to plain floats before sending the metrics to MLflow.
    mlflow.log_metrics({
        "test_accuracy": float(acc),
        "test_precision": float(pre),
        "test_recall": float(recall),
        "test_f1": float(f1)
    })

    

2025/10/30 21:46:44 INFO mlflow.sklearn.utils: Logging the 5 best runs, 3 runs will be omitted.


🏃 View run learned-boar-497 at: http://localhost:5000/#/experiments/845276055540234593/runs/b071d2f8329646afbb1307e5ecd412ce
🧪 View experiment at: http://localhost:5000/#/experiments/845276055540234593
🏃 View run intelligent-toad-852 at: http://localhost:5000/#/experiments/845276055540234593/runs/5410d96eb1b44137b163f46988aec316
🧪 View experiment at: http://localhost:5000/#/experiments/845276055540234593
🏃 View run gifted-crow-878 at: http://localhost:5000/#/experiments/845276055540234593/runs/def4e8a768c4435fbfe58a9164459690
🧪 View experiment at: http://localhost:5000/#/experiments/845276055540234593
🏃 View run puzzled-roo-716 at: http://localhost:5000/#/experiments/845276055540234593/runs/8741b10dd4a644fcaa9d33a346e84591
🧪 View experiment at: http://localhost:5000/#/experiments/845276055540234593
🏃 View run resilient-sow-87 at: http://localhost:5000/#/experiments/845276055540234593/runs/083329150caf4f2eb8888a329df7f114
🧪 View experiment at: http://localh



🏃 View run loud-shad-432 at: http://localhost:5000/#/experiments/845276055540234593/runs/f963eb51144d42948424990b5b2bfdfa
🧪 View experiment at: http://localhost:5000/#/experiments/845276055540234593


## Logistic regression with balanced weights

In [None]:
from datetime import datetime
from pathlib import Path
import joblib
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    f1_score, precision_score, recall_score, accuracy_score
)

# Class-weighted logistic regression: same grid search as the unweighted cell
# (C and penalty, stratified 5-fold CV scored by ROC AUC) but with
# class_weight='balanced'. The refit best model is evaluated on the held-out
# test split and everything is recorded in a single MLflow run.
mlflow.sklearn.autolog(log_models=True)

# NOTE: the directory is called "linear_regression_balanced" although it holds
# logistic regression models; the path is left unchanged so existing files stay in place.
linear_regression_models_path = Path("../models/linear_regression_balanced/")
linear_regression_models_path.mkdir(exist_ok=True, parents=True)

str_time = datetime.today().strftime("%Y-%m-%d %H:%M:%S")

# Tag the run with the model family trained in this cell.
tags = {MODEL_TYPE_TAG: "logistic_regression"}

with mlflow.start_run(run_name=f"Logistic Regression Weighted {str_time}", tags=tags) as current_run:
    # liblinear is the only solver in the grid; it supports both l1 and l2.
    param = {
        'C':[0.01,0.1,1,10],
        'penalty':['l1','l2'],
        'solver':['liblinear']
    }

    # intercept_scaling is a float parameter; 1 is numerically identical to the
    # `True` that was passed before, but has the documented type.
    log_reg = LogisticRegression(
        intercept_scaling=1,
        dual=False,
        fit_intercept=True,
        class_weight='balanced'
    )

    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)
    grid = GridSearchCV(log_reg, param_grid=param, scoring='roc_auc', cv=cv, n_jobs=-1)
    grid.fit(X_train, y_train)
    print(grid.best_score_, grid.best_params_)

    # Persist the whole fitted search object (not only the best estimator).
    # NOTE: the run name embeds a timestamp, so the file name contains spaces
    # and ':' characters, which are not valid in file names on Windows.
    model_name = f"{current_run.info.run_name}.joblib"
    model_path = str(linear_regression_models_path / model_name)

    joblib.dump(grid, model_path)

    # Use a fixed logged-model name: the run name contains ':' characters,
    # which MLflow does not accept in a logged-model name.
    mlflow.sklearn.log_model(
        sk_model=grid,
        name="logistic_regression_balanced"
    )

    # Hard class predictions from the best estimator found by the search.
    preds = grid.predict(X_test)

    acc = accuracy_score(y_test, preds)
    f1 = f1_score(y_test, preds)
    pre = precision_score(y_test, preds)
    recall = recall_score(y_test, preds)

    print(f"Accuracy:  {acc:.4f}")
    print(f"Precision: {pre:.4f}")
    print(f"Recall:    {recall:.4f}")
    print(f"F1:        {f1:.4f}")

    # Cast to plain floats before sending the metrics to MLflow.
    mlflow.log_metrics({
        "test_accuracy": float(acc),
        "test_precision": float(pre),
        "test_recall": float(recall),
        "test_f1": float(f1)
    })

2025/10/30 21:57:34 INFO mlflow.sklearn.utils: Logging the 5 best runs, 3 runs will be omitted.


🏃 View run efficient-doe-270 at: http://localhost:5000/#/experiments/845276055540234593/runs/0d66396326a54aafa6761e9612eb547a
🧪 View experiment at: http://localhost:5000/#/experiments/845276055540234593
🏃 View run likeable-doe-573 at: http://localhost:5000/#/experiments/845276055540234593/runs/c220abd57c6a4835bb0053fa1a4c06b6
🧪 View experiment at: http://localhost:5000/#/experiments/845276055540234593
🏃 View run charming-quail-609 at: http://localhost:5000/#/experiments/845276055540234593/runs/dadda85c4b03403dbc06f0f0693acbba
🧪 View experiment at: http://localhost:5000/#/experiments/845276055540234593
🏃 View run colorful-frog-112 at: http://localhost:5000/#/experiments/845276055540234593/runs/ce78cf247dc8468ab5924ea0816cb6d3
🧪 View experiment at: http://localhost:5000/#/experiments/845276055540234593
🏃 View run merciful-yak-713 at: http://localhost:5000/#/experiments/845276055540234593/runs/ddbc0280284445af91218d706b2186dc
🧪 View experiment at: http://loca



🏃 View run caring-skunk-511 at: http://localhost:5000/#/experiments/845276055540234593/runs/0c29e9b19de04fb997e8ea1e722418a4
🧪 View experiment at: http://localhost:5000/#/experiments/845276055540234593


## XGBoost

In [12]:
from datetime import datetime

from xgboost import XGBClassifier
from sklearn.metrics import (
    f1_score, precision_score, recall_score, accuracy_score,
    confusion_matrix
)

# XGBoost: fit one fixed configuration inside a tracked MLflow run, log the
# model in JSON format, then report test-set metrics and the confusion matrix.
mlflow.xgboost.autolog(log_models=True)

str_time = datetime.today().strftime("%Y-%m-%d %H:%M:%S")

tags = {MODEL_TYPE_TAG: "xgboost"}

# Hyper-parameters of the single configuration trained in this cell.
xgb_params = {
    "n_estimators": 100,
    "max_depth": 5,
    "learning_rate": 0.05,
    "objective": 'binary:logistic',
    "n_jobs": 8,
    "random_state": random_state,
    "eval_metric": ["logloss", "auc"],
}

# Both splits are passed so each boosting round reports logloss/auc for the
# training data (validation_0) and the test data (validation_1).
eval_pairs = [(X_train, y_train), (X_test, y_test)]

with mlflow.start_run(run_name=f"XGBoost {str_time}", tags=tags):
    bst = XGBClassifier(**xgb_params)
    bst.fit(X_train, y_train, eval_set=eval_pairs)

    print("Logging model...")
    mlflow.xgboost.log_model(xgb_model=bst, name="model_json", model_format="json")

    # Hard class predictions on the held-out split.
    preds = bst.predict(X_test)

    acc = accuracy_score(y_test, preds)
    f1 = f1_score(y_test, preds)
    pre = precision_score(y_test, preds)
    recall = recall_score(y_test, preds)

    print(f"Accuracy:  {acc:.4f}")
    print(f"Precision: {pre:.4f}")
    print(f"Recall:    {recall:.4f}")
    print(f"F1:        {f1:.4f}")

    # Plain floats, keyed by the metric names recorded in MLflow.
    test_metrics = {
        "test_accuracy": float(acc),
        "test_precision": float(pre),
        "test_recall": float(recall),
        "test_f1": float(f1),
    }
    mlflow.log_metrics(test_metrics)

    # The confusion matrix is only printed, not logged.
    cm = confusion_matrix(y_test, preds)
    print("Confusion matrix:\n", cm)

    

[0]	validation_0-logloss:0.34762	validation_0-auc:0.92708	validation_1-logloss:0.34762	validation_1-auc:0.92610
[1]	validation_0-logloss:0.33185	validation_0-auc:0.93004	validation_1-logloss:0.33187	validation_1-auc:0.92905
[2]	validation_0-logloss:0.31874	validation_0-auc:0.93134	validation_1-logloss:0.31881	validation_1-auc:0.93032
[3]	validation_0-logloss:0.30757	validation_0-auc:0.93210	validation_1-logloss:0.30766	validation_1-auc:0.93108
[4]	validation_0-logloss:0.29780	validation_0-auc:0.93412	validation_1-logloss:0.29795	validation_1-auc:0.93307
[5]	validation_0-logloss:0.28917	validation_0-auc:0.93472	validation_1-logloss:0.28937	validation_1-auc:0.93367
[6]	validation_0-logloss:0.28149	validation_0-auc:0.93555	validation_1-logloss:0.28172	validation_1-auc:0.93432
[7]	validation_0-logloss:0.27454	validation_0-auc:0.93581	validation_1-logloss:0.27481	validation_1-auc:0.93471
[8]	validation_0-logloss:0.26832	validation_0-auc:0.93689	validation_1-logloss:0.26860	validation_1-auc:



Logging model...




Accuracy:  0.9212
Precision: 0.7209
Recall:    0.5623
F1:        0.6318
Confusion matrix:
 [[124397   3815]
 [  7673   9856]]
🏃 View run XGBoost 2025-10-30 22:06:34 at: http://localhost:5000/#/experiments/845276055540234593/runs/c4816900ea3146d8afea933a8c6916b7
🧪 View experiment at: http://localhost:5000/#/experiments/845276055540234593


## Final comparison
Let's review the results from the tested models: