In [13]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from mlflow.data.pandas_dataset import PandasDataset
from sklearn.base import ClassifierMixin
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score
from sklearn.linear_model import LogisticRegression, RidgeClassifier, ElasticNet
from tqdm import tqdm
import mlflow
from mlflow.models import infer_signature
from catboost import Pool, CatBoostClassifier

In [14]:
processed = pd.read_pickle("data/processed_df.pkl")
X = processed.drop(columns=["Credit_Score"])
y = processed["Credit_Score"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42, stratify=y
)
std_scaler = StandardScaler().set_output(transform="pandas")
X_train = std_scaler.fit_transform(X_train)
X_test = std_scaler.transform(X_test)

In [15]:
def run_catboost_experiment(experiment_name, catboost_params, suffix=None):
    try:
        experiment_id = mlflow.create_experiment(experiment_name)
    except mlflow.exceptions.MlflowException:
        experiment_id = mlflow.get_experiment_by_name(experiment_name).experiment_id
    
    if suffix is None:
        run_n = "CatBoost"
    else:
        run_n = "CatBoost" + suffix

    with mlflow.start_run(experiment_id=experiment_id, run_name=run_n):
        # mlflow.log_params(catboost_params)
        
        # Prepare the dataset for CatBoost
        train_pool = Pool(X_train, y_train)
        test_pool = Pool(X_test, y_test)
        base_params = {
            'logging_level':'Silent',
            'loss_function':'MultiClass',
            'l2_leaf_reg': 3,
            'random_strength': 1.2,
        }
        cat_param = CatBoostClassifier(**base_params)
        search = cat_param.grid_search(catboost_params, train_pool, cv=5)
        cat_param.fit(train_pool, use_best_model=True, eval_set=test_pool)

        # Extract the best model parameters (for logging)
        best_params = search['params']
        mlflow.log_params(best_params)

        # Train the best model on the full training set
        model = CatBoostClassifier(**(best_params | base_params))
        model.fit(train_pool, verbose=False)
        
        # Model evaluation
        y_pred = model.predict(X_test)
        
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average='weighted')
        recall = recall_score(y_test, y_pred, average='weighted')
        f1 = f1_score(y_test, y_pred, average='weighted')
        
        # Log model and evaluation metrics
        signature = infer_signature(X_test, y_test)
        model_uri = mlflow.catboost.log_model(
            model, "model", signature=signature
        ).model_uri
        
        mlflow.evaluate(
            model_uri,
            test_pool,
            targets="Credit_Score",
            model_type="classifier",
        )
        
        mlflow.log_metric("accuracy", accuracy)
        mlflow.log_metric("precision", precision)
        mlflow.log_metric("recall", recall)
        mlflow.log_metric("f1_score", f1)

In [16]:
run_catboost_experiment("GridSearchCV_", {
    'depth': [2, 3, 4],
    'learning_rate': [0.02, 0.03, 0.04],
    'iterations' : [100, 300, 500, 800],
    })



0:	loss: 0.7415884	best: 0.7415884 (0)	total: 605ms	remaining: 21.2s
1:	loss: 0.7137650	best: 0.7137650 (1)	total: 1.17s	remaining: 19.9s
2:	loss: 0.6980854	best: 0.6980854 (2)	total: 1.7s	remaining: 18.7s
3:	loss: 0.6830882	best: 0.6830882 (3)	total: 3.48s	remaining: 27.9s
4:	loss: 0.6738794	best: 0.6738794 (4)	total: 5.1s	remaining: 31.6s
5:	loss: 0.6673806	best: 0.6673806 (5)	total: 6.85s	remaining: 34.3s
6:	loss: 0.6710913	best: 0.6673806 (5)	total: 9.67s	remaining: 40s
7:	loss: 0.6630521	best: 0.6630521 (7)	total: 12.2s	remaining: 42.7s
8:	loss: 0.6570961	best: 0.6570961 (8)	total: 14.7s	remaining: 44.1s
9:	loss: 0.6609704	best: 0.6570961 (8)	total: 18.5s	remaining: 48.1s
10:	loss: 0.6537773	best: 0.6537773 (10)	total: 22.3s	remaining: 50.6s
11:	loss: 0.6486150	best: 0.6486150 (11)	total: 26.2s	remaining: 52.4s
12:	loss: 0.7183731	best: 0.6486150 (11)	total: 26.8s	remaining: 47.4s
13:	loss: 0.6898757	best: 0.6486150 (11)	total: 27.4s	remaining: 43s
14:	loss: 0.6768585	best: 0.6486

2024/08/29 15:50:16 INFO mlflow.models.evaluation.default_evaluator: Computing model predictions.
2024/08/29 15:50:16 INFO mlflow.tracking._tracking_service.client: 🏃 View run CatBoost at: http://0.0.0.0:5000/#/experiments/21/runs/6ad8f41ad7a146269d439347f2f8169c.
2024/08/29 15:50:16 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://0.0.0.0:5000/#/experiments/21.


ValueError: all the input arrays must have same number of dimensions, but the array at index 0 has 1 dimension(s) and the array at index 1 has 2 dimension(s)