## Importing Libraries











































































































































































In [1]:
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from mlflow.models import infer_signature
from pprint import pprint
from typing import Union, Dict, List, Tuple
import numpy as np
import joblib
import os
import math
import mlflow
import optuna
import warnings

# seed shared by every estimator created in this notebook
SEED = 42

# Location of the raw dataset that is logged as an MLflow artifact. The default
# is the original absolute path; set the OBESITY_DATASET_PATH environment
# variable to run the notebook on a machine where the file lives elsewhere.
DATASET_PATH = os.environ.get(
    "OBESITY_DATASET_PATH",
    "/media/greca/HD/Datasets/Obesity Dataset/ObesityDataSet.csv",
)

# directories (relative to this notebook) the pickled artifacts and feature
# arrays are loaded from
ARTIFACTS_OUTPUT_PATH = os.path.join("..", "models", "artifacts")
FEATURES_OUTPUT_PATH = os.path.join("..", "models", "features")

# names of the MLflow experiments used by the two stages of this notebook
FEATURE_SELECTION_EXPERIMENT_NAME = "feature-selection-experimentation"
HYPERPARAMETER_TUNING_EXPERIMENT_NAME = "hyperparameters-tuning-experimentation"

# NOTE(review): this silences every warning for the whole kernel session,
# including deprecation and convergence warnings - consider narrowing the filter
warnings.filterwarnings("ignore")

## Loading Essentials

In [2]:
# Training and validation arrays, read in the order train X/y then valid X/y.
# joblib.load unpickles the files, so only point these paths at trusted data.
X_train, y_train, X_valid, y_valid = (
    joblib.load(os.path.join(FEATURES_OUTPUT_PATH, file_name))
    for file_name in ("X_train.pkl", "y_train.pkl", "X_valid.pkl", "y_valid.pkl")
)

# Pickled artifacts saved next to the models.
# NOTE(review): presumably the feature scaler, the feature one-hot encoder and
# the label one-hot encoder - confirm against the notebook that produced them
sc, ohe, ohe_label = (
    joblib.load(os.path.join(ARTIFACTS_OUTPUT_PATH, file_name))
    for file_name in ("features_sc.pkl", "features_ohe.pkl", "label_ohe.pkl")
)

## Feature Selection Experimentation

In [3]:
# One untuned estimator per model family. Every estimator receives SEED, and
# the libraries that accept a verbosity setting are configured to stay quiet.
dt = DecisionTreeClassifier(random_state=SEED)
rf = RandomForestClassifier(verbose=0, random_state=SEED)
xg = XGBClassifier(random_state=SEED)
lg = LGBMClassifier(objective="multiclass", verbose=-1, random_state=SEED)
cb = CatBoostClassifier(allow_writing_files=False, verbose=0, random_seed=SEED)

In [4]:
def apply_feature_selection(
    model: Union[DecisionTreeClassifier, RandomForestClassifier, XGBClassifier, LGBMClassifier, CatBoostClassifier],
    number_features: int,
    X_train: np.ndarray,
    y_train: np.array,
    X_valid: np.ndarray,
    y_valid: np.array,
) -> Dict:
    """Select a fixed number of features and score the model on that subset.

    A ``SequentialFeatureSelector`` (3-fold cross-validation, otherwise default
    settings) is fitted on the training data. ``model`` is then fitted in
    place on the reduced training set and evaluated on both splits.

    Args:
        model: Estimator used both by the selector and for the final fit.
        number_features: How many features the selector must keep.
        X_train: Training features.
        y_train: Training targets.
        X_valid: Validation features.
        y_valid: Validation targets.

    Returns:
        Dict with the keys ``train_f1`` and ``valid_f1`` (weighted F1 scores),
        ``indexes_features`` (column indexes kept by the selector), ``model``
        (the same object that was passed in, now fitted) and
        ``model_signature`` (MLflow signature inferred from the reduced
        training inputs and the training predictions).
    """
    selector = SequentialFeatureSelector(
        model,
        n_features_to_select=number_features,
        cv=3
    )
    selector.fit(X=X_train, y=y_train)

    # positions of the columns kept by the selector
    kept_columns = np.flatnonzero(selector.get_support())

    X_train_subset = selector.transform(X_train)
    X_valid_subset = selector.transform(X_valid)

    # the selector only fits clones, so the model itself is fitted here
    model.fit(X_train_subset, y_train)

    train_pred = model.predict(X_train_subset)
    valid_pred = model.predict(X_valid_subset)

    # everything the caller logs to mlflow
    return {
        "train_f1": f1_score(y_true=y_train, y_pred=train_pred, average="weighted"),
        "valid_f1": f1_score(y_true=y_valid, y_pred=valid_pred, average="weighted"),
        "indexes_features": kept_columns,
        "model": model,
        "model_signature": infer_signature(
            model_input=X_train_subset,
            model_output=train_pred
        ),
    }

def set_configurations_mlflow(
    model: Union[DecisionTreeClassifier, RandomForestClassifier, XGBClassifier, LGBMClassifier, CatBoostClassifier],
    y_train: np.ndarray,
    y_valid: np.ndarray,
) -> Tuple[np.ndarray, np.ndarray, str, str]:
    """Prepare the targets and the MLflow settings that match a model type.

    Args:
        model: One of the supported estimators.
        y_train: Training targets. For decision tree, LightGBM and CatBoost
            they are reduced with ``np.argmax(..., axis=1)``, so they must be
            2-D (presumably one-hot encoded - confirm against the
            preprocessing notebook).
        y_valid: Validation targets, treated the same way as ``y_train``.

    Returns:
        Tuple ``(y_train, y_valid, run_name, flavor)`` where ``run_name`` is
        the MLflow run name for the model and ``flavor`` is the MLflow flavor
        used to log it (``sklearn``, ``xgboost``, ``lightgbm`` or ``catboost``).

    Raises:
        TypeError: If ``model`` is not an instance of a supported estimator.
    """
    # An elif chain guarantees exactly one configuration is chosen; an
    # unsupported model fails here instead of hitting an unbound variable.
    if isinstance(model, DecisionTreeClassifier):
        run_name, flavor, needs_class_indexes = "decision_tree", "sklearn", True
    elif isinstance(model, RandomForestClassifier):
        run_name, flavor, needs_class_indexes = "random_forest", "sklearn", False
    elif isinstance(model, XGBClassifier):
        run_name, flavor, needs_class_indexes = "xgboost", "xgboost", False
    elif isinstance(model, LGBMClassifier):
        run_name, flavor, needs_class_indexes = "lightgbm", "lightgbm", True
    elif isinstance(model, CatBoostClassifier):
        run_name, flavor, needs_class_indexes = "catboost", "catboost", True
    else:
        raise TypeError(f"Unsupported model type: {type(model).__name__}")

    # these models are trained on class indexes instead of the 2-D targets
    if needs_class_indexes:
        y_train = np.argmax(y_train, axis=1)
        y_valid = np.argmax(y_valid, axis=1)

    # disabling the current flavor's autolog; the options are shared by all flavors
    autolog_options = {
        "log_models": False,
        "log_model_signatures": False,
        "log_input_examples": True,
        "log_datasets": False,
        "silent": True,
        "disable": True,
    }
    if flavor == "sklearn":
        mlflow.sklearn.autolog(log_post_training_metrics=False, **autolog_options)
    elif flavor == "xgboost":
        mlflow.xgboost.autolog(**autolog_options)
    elif flavor == "lightgbm":
        mlflow.lightgbm.autolog(**autolog_options)
    # there is no autolog implemented for catboost

    return y_train, y_valid, run_name, flavor

def run_feature_selection_experiment(
    models: List,
    min_features: int,
    max_features: int,
    experiment_id: str
) -> None:
    """Run sequential feature selection for every model and log it to MLflow.

    Each model gets one parent run. Inside it, one nested run is created per
    feature count in ``[min_features, max_features]`` (both inclusive). Every
    nested run logs the fitted model, its parameters, the weighted F1 scores,
    the dataset/artifact/feature paths and the selected feature indexes.

    The training and validation data are not parameters: the function reads
    the notebook-level variables ``X_train``, ``y_train``, ``X_valid`` and
    ``y_valid``, so the loading cell must have been executed first.

    Args:
        models: Estimators supported by ``set_configurations_mlflow``.
        min_features: Smallest number of features to select.
        max_features: Largest number of features to select.
        experiment_id: MLflow experiment that receives the runs.
    """
    for model in models:
        # reshaping the target values (if needed) and setting some mlflow's configuration
        new_y_train, new_y_valid, run_name, flavor = set_configurations_mlflow(
            model=model,
            y_train=y_train,
            y_valid=y_valid
        )

        # starting a new run for the current model
        with mlflow.start_run(experiment_id=experiment_id, run_name=run_name):
            # print (not pprint): pprint shows the string's repr, i.e. with
            # surrounding quotes and a literal "\n" instead of a line break
            print(f"Starting the run for the {run_name} model!\n")

            for i, n_features in enumerate(range(min_features, max_features + 1)):
                # creating a nested run inside the model's main run
                with mlflow.start_run(
                    experiment_id=experiment_id,
                    run_name=f"{run_name}_experiment_{i}",
                    nested=True
                ):
                    # running the feature selection main function
                    results = apply_feature_selection(
                        model=model,
                        number_features=n_features,
                        X_train=X_train,
                        y_train=new_y_train,
                        X_valid=X_valid,
                        y_valid=new_y_valid
                    )
                    fitted_model = results["model"]

                    # logging the trained model with the mlflow module named
                    # after its flavor, plus the parameters it was built with
                    if flavor in ("sklearn", "xgboost", "lightgbm", "catboost"):
                        getattr(mlflow, flavor).log_model(
                            fitted_model,
                            run_name,
                            signature=results["model_signature"]
                        )
                        if flavor == "catboost":
                            # catboost exposes its full parameter set through get_all_params
                            mlflow.log_params(fitted_model.get_all_params())
                        else:
                            mlflow.log_params(fitted_model.get_params(deep=True))

                    # logging the training and validation scores
                    mlflow.log_metric("train_f1", results["train_f1"])
                    mlflow.log_metric("valid_f1", results["valid_f1"])

                    # logging the artifacts (original dataset, features, and encoders objects)
                    mlflow.log_artifact(DATASET_PATH)
                    mlflow.log_artifact(ARTIFACTS_OUTPUT_PATH)
                    mlflow.log_artifact(FEATURES_OUTPUT_PATH)

                    # logging the indexes of the best features as a plain list,
                    # so the stored value reads "[0, 6, 18]" and can be pasted
                    # back into python as-is
                    mlflow.log_param(
                        "indexes_features",
                        np.asarray(results["indexes_features"]).tolist()
                    )
                    

In [5]:
models = [dt, rf, xg, lg, cb]

# the experiments try every subset size from 20% to 50% of the feature count
min_features = math.floor(X_train.shape[1] * 0.2)
max_features = math.floor(X_train.shape[1] * 0.5)

# Reusing the experiment when it already exists: mlflow.create_experiment
# raises if the name is taken, which would break re-running this cell.
existing_experiment = mlflow.get_experiment_by_name(FEATURE_SELECTION_EXPERIMENT_NAME)
if existing_experiment is not None:
    experiment_id = existing_experiment.experiment_id
else:
    experiment_id = mlflow.create_experiment(
        name=FEATURE_SELECTION_EXPERIMENT_NAME,
        tags={"version": "v1"}
    )

# running the feature selection experiments
run_feature_selection_experiment(
    models=models,
    min_features=min_features,
    max_features=max_features,
    experiment_id=experiment_id
)

'Starting the run for the decision_tree model!\n'
'Starting the run for the random_forest model!\n'
'Starting the run for the xgboost model!\n'
'Starting the run for the lightgbm model!\n'
'Starting the run for the catboost model!\n'


## Hyperparameter Tuning

In [6]:
class Objective:
    """Optuna objective that trains one model family and logs each trial to MLflow.

    An instance is callable: Optuna passes it a trial, a model is built from
    the hyperparameters suggested for ``run_name``, fitted on the selected
    feature columns, and the weighted F1 score on the validation split is
    returned as the value to optimize.
    """

    # run names for which a search space is defined
    SUPPORTED_RUN_NAMES = ("decision_tree", "random_forest", "xgboost", "lightgbm", "catboost")

    def __init__(
        self,
        run_name: str,
        experiment_id: str,
        X_train: np.ndarray,
        y_train: np.ndarray,
        X_valid: np.ndarray,
        y_valid: np.ndarray,
        indexes: List
    ) -> None:
        """Store the data, keeping only the selected feature columns.

        Args:
            run_name: Model family; one of ``SUPPORTED_RUN_NAMES``.
            experiment_id: MLflow experiment that receives the trial runs.
            X_train: Training features (2-D array).
            y_train: Training targets. For ``decision_tree``, ``lightgbm`` and
                ``catboost`` they are reduced with ``np.argmax(..., axis=1)``
                (presumably one-hot encoded - confirm with the preprocessing).
            X_valid: Validation features (2-D array).
            y_valid: Validation targets, treated like ``y_train``.
            indexes: Column indexes of the features to keep.

        Raises:
            ValueError: If ``run_name`` is not supported.
        """
        # failing early: otherwise the error would only surface inside the
        # first trial, after an mlflow run has already been opened
        if run_name not in self.SUPPORTED_RUN_NAMES:
            raise ValueError(
                f"Unsupported run_name {run_name!r}; expected one of {self.SUPPORTED_RUN_NAMES}"
            )

        self.run_name = run_name
        self.experiment_id = experiment_id
        self.X_train = X_train
        self.y_train = y_train
        self.X_valid = X_valid
        self.y_valid = y_valid
        self.indexes = indexes

        # these models are trained on class indexes instead of the 2-D targets
        if self.run_name in ["decision_tree", "lightgbm", "catboost"]:
            self.y_train = np.argmax(self.y_train, axis=1)
            self.y_valid = np.argmax(self.y_valid, axis=1)

        # keeping only the selected feature columns
        self.X_train = self.X_train[:, self.indexes]
        self.X_valid = self.X_valid[:, self.indexes]

    def _build_model(self, trial: "optuna.trial.Trial"):
        """Return an unfitted model built from the hyperparameters suggested by ``trial``."""
        if self.run_name == "decision_tree":
            params = {
                "max_depth": trial.suggest_int("max_depth", 2, 32, step=2),
                "min_samples_split": trial.suggest_int("min_samples_split", 2, 8, step=1),
                "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 6, step=1),
                "min_weight_fraction_leaf": trial.suggest_float("min_weight_fraction_leaf", 0, 0.5, step=0.1),
                "max_leaf_nodes": trial.suggest_int("max_leaf_nodes", 2, 16, step=2),
                "random_state": SEED
            }
            return DecisionTreeClassifier(**params)

        elif self.run_name == "random_forest":
            params = {
                "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
                "max_depth": trial.suggest_int("max_depth", 10, 50),
                "min_samples_split": trial.suggest_int("min_samples_split", 2, 32),
                "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 32),
                "random_state": SEED
            }
            return RandomForestClassifier(**params)

        elif self.run_name == "xgboost":
            params = {
                "booster": trial.suggest_categorical("booster", ["gbtree", "gblinear", "dart"]),
                "lambda": trial.suggest_float("lambda", 1e-8, 1.0, log=True),
                "alpha": trial.suggest_float("alpha", 1e-8, 1.0, log=True),
                "random_state": SEED
            }
            return XGBClassifier(**params)

        elif self.run_name == "lightgbm":
            params = {
                "objective": "multiclass",
                "verbosity": -1,
                "random_state": SEED,
                "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
                "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
                "num_leaves": trial.suggest_int("num_leaves", 2, 256),
                "feature_fraction": trial.suggest_float("feature_fraction", 0.4, 1.0),
                "bagging_fraction": trial.suggest_float("bagging_fraction", 0.4, 1.0),
                "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
                "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
            }
            return LGBMClassifier(**params)

        elif self.run_name == "catboost":
            params = {
                "random_seed": SEED,
                "verbose": 0,
                "allow_writing_files": False,
                "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 0.1),
                "depth": trial.suggest_int("depth", 1, 12),
                "boosting_type": trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
                "bootstrap_type": trial.suggest_categorical(
                    "bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]
                )
            }
            return CatBoostClassifier(**params)

        # only reachable if run_name was changed after construction
        raise ValueError(f"Unsupported run_name {self.run_name!r}")

    def __call__(
        self,
        trial: "optuna.trial.Trial"
    ) -> float:
        """Train and evaluate one trial inside a nested MLflow run.

        Args:
            trial: Optuna trial used to sample the hyperparameters.

        Returns:
            Weighted F1 score of the fitted model on the validation split.
        """
        # naming the nested run after the model and the trial number, like the
        # "<run_name>_experiment_<i>" runs of the feature selection experiment
        with mlflow.start_run(
            experiment_id=self.experiment_id,
            run_name=f"{self.run_name}_trial_{trial.number}",
            nested=True
        ):
            model = self._build_model(trial)
            model.fit(X=self.X_train, y=self.y_train)

            # calculating the training f1 score
            train_prediction = model.predict(self.X_train)
            train_f1 = f1_score(
                y_true=self.y_train,
                y_pred=train_prediction,
                average="weighted"
            )

            # calculating the validation f1 score
            valid_prediction = model.predict(self.X_valid)
            valid_f1 = f1_score(
                y_true=self.y_valid,
                y_pred=valid_prediction,
                average="weighted"
            )

            # logging the training and validation scores
            mlflow.log_metric("train_f1", train_f1)
            mlflow.log_metric("valid_f1", valid_f1)

            # inferring the signature of the trained model
            signature = infer_signature(
                model_input=self.X_train,
                model_output=train_prediction
            )

            # saving the trained model with the matching mlflow flavor, together
            # with the parameters it was built with
            if self.run_name in ["decision_tree", "random_forest"]:
                # sklearn flavor
                mlflow.sklearn.log_model(
                    model,
                    self.run_name,
                    signature=signature
                )
                mlflow.log_params(model.get_params(deep=True))
            elif self.run_name == "xgboost":
                mlflow.xgboost.log_model(
                    model,
                    self.run_name,
                    signature=signature
                )
                mlflow.log_params(model.get_params())
            elif self.run_name == "lightgbm":
                mlflow.lightgbm.log_model(
                    model,
                    self.run_name,
                    signature=signature
                )
                mlflow.log_params(model.get_params())
            elif self.run_name == "catboost":
                mlflow.catboost.log_model(
                    model,
                    self.run_name,
                    signature=signature
                )
                mlflow.log_params(model.get_all_params())

        return valid_f1

In [7]:
# Reusing the tuning experiment when it already exists: mlflow.create_experiment
# raises if the name is taken, which would break re-running this cell.
hpt_experiment = mlflow.get_experiment_by_name(HYPERPARAMETER_TUNING_EXPERIMENT_NAME)
if hpt_experiment is not None:
    hpt_experiment_id = hpt_experiment.experiment_id
else:
    hpt_experiment_id = mlflow.create_experiment(
        name=HYPERPARAMETER_TUNING_EXPERIMENT_NAME,
        tags={"version": "v1"}
    )

#### Decision Tree

In [8]:
dt_run_name = "decision_tree"
# column indexes of the features the decision tree is tuned on
# NOTE(review): presumably copied from the best feature-selection run - confirm
dt_features_indexes = [0, 6, 18, 22, 30, 31, 35]

with mlflow.start_run(experiment_id=hpt_experiment_id, run_name=dt_run_name):
    objective = Objective(
        run_name=dt_run_name,
        experiment_id=hpt_experiment_id,
        X_train=X_train,
        y_train=y_train,
        X_valid=X_valid,
        y_valid=y_valid,
        indexes=dt_features_indexes
    )

    # seeding the sampler so the sequence of suggested hyperparameters (and
    # therefore the tuning result) is repeatable across notebook runs
    study = optuna.create_study(
        direction="maximize",
        sampler=optuna.samplers.TPESampler(seed=SEED)
    )
    study.optimize(objective, n_trials=100)

[I 2024-08-18 11:36:36,009] A new study created in memory with name: no-name-e30ed03b-9d5c-4f10-bc78-d770deca1f02
[I 2024-08-18 11:36:37,472] Trial 0 finished with value: 0.1386782580697094 and parameters: {'max_depth': 18, 'min_samples_split': 2, 'min_samples_leaf': 5, 'min_weight_fraction_leaf': 0.30000000000000004, 'max_leaf_nodes': 14}. Best is trial 0 with value: 0.1386782580697094.
[I 2024-08-18 11:36:38,910] Trial 1 finished with value: 0.4453364322868359 and parameters: {'max_depth': 2, 'min_samples_split': 6, 'min_samples_leaf': 5, 'min_weight_fraction_leaf': 0.0, 'max_leaf_nodes': 6}. Best is trial 1 with value: 0.4453364322868359.
[I 2024-08-18 11:36:40,329] Trial 2 finished with value: 0.14366693418642412 and parameters: {'max_depth': 2, 'min_samples_split': 7, 'min_samples_leaf': 3, 'min_weight_fraction_leaf': 0.5, 'max_leaf_nodes': 8}. Best is trial 1 with value: 0.4453364322868359.
[I 2024-08-18 11:36:41,771] Trial 3 finished with value: 0.1386782580697094 and parameters

#### Random Forest

In [9]:
rf_run_name = "random_forest"
# column indexes of the features the random forest is tuned on
# NOTE(review): presumably copied from the best feature-selection run - confirm
rf_features_indexes = [4, 7, 23, 24, 26, 30, 35]

with mlflow.start_run(experiment_id=hpt_experiment_id, run_name=rf_run_name):
    objective = Objective(
        run_name=rf_run_name,
        experiment_id=hpt_experiment_id,
        X_train=X_train,
        y_train=y_train,
        X_valid=X_valid,
        y_valid=y_valid,
        indexes=rf_features_indexes
    )

    # seeding the sampler so the sequence of suggested hyperparameters (and
    # therefore the tuning result) is repeatable across notebook runs
    study = optuna.create_study(
        direction="maximize",
        sampler=optuna.samplers.TPESampler(seed=SEED)
    )
    study.optimize(objective, n_trials=100)

[I 2024-08-18 11:39:01,868] A new study created in memory with name: no-name-d06a057c-8b49-4ef4-949a-dc836beb6a8f
[I 2024-08-18 11:39:06,778] Trial 0 finished with value: 0.9461334237536497 and parameters: {'n_estimators': 874, 'max_depth': 15, 'min_samples_split': 29, 'min_samples_leaf': 26}. Best is trial 0 with value: 0.9461334237536497.
[I 2024-08-18 11:39:08,834] Trial 1 finished with value: 0.9639100834903497 and parameters: {'n_estimators': 151, 'max_depth': 43, 'min_samples_split': 23, 'min_samples_leaf': 13}. Best is trial 1 with value: 0.9639100834903497.
[I 2024-08-18 11:39:11,959] Trial 2 finished with value: 0.9571859231539557 and parameters: {'n_estimators': 416, 'max_depth': 42, 'min_samples_split': 21, 'min_samples_leaf': 15}. Best is trial 1 with value: 0.9639100834903497.
[I 2024-08-18 11:39:13,930] Trial 3 finished with value: 0.9400480184690827 and parameters: {'n_estimators': 141, 'max_depth': 32, 'min_samples_split': 31, 'min_samples_leaf': 28}. Best is trial 1 wi

#### XGBoost

In [10]:
xgb_run_name = "xgboost"
# column indexes of the features xgboost is tuned on
# NOTE(review): presumably copied from the best feature-selection run - confirm
xg_features_indexes = [0, 1, 25, 28, 29, 30, 35]

with mlflow.start_run(experiment_id=hpt_experiment_id, run_name=xgb_run_name):
    objective = Objective(
        run_name=xgb_run_name,
        experiment_id=hpt_experiment_id,
        X_train=X_train,
        y_train=y_train,
        X_valid=X_valid,
        y_valid=y_valid,
        indexes=xg_features_indexes
    )

    # seeding the sampler so the sequence of suggested hyperparameters (and
    # therefore the tuning result) is repeatable across notebook runs
    study = optuna.create_study(
        direction="maximize",
        sampler=optuna.samplers.TPESampler(seed=SEED)
    )
    study.optimize(objective, n_trials=100)

[I 2024-08-18 11:46:44,343] A new study created in memory with name: no-name-70385929-f301-43d4-9b30-d21628f8b098
[I 2024-08-18 11:46:45,693] Trial 0 finished with value: 0.24642067440574905 and parameters: {'booster': 'gblinear', 'lambda': 2.7352345201801407e-08, 'alpha': 0.05015636062716765}. Best is trial 0 with value: 0.24642067440574905.
[I 2024-08-18 11:46:47,177] Trial 1 finished with value: 0.9792502367980368 and parameters: {'booster': 'gbtree', 'lambda': 2.7754746503634844e-06, 'alpha': 0.1390503921572048}. Best is trial 1 with value: 0.9792502367980368.
[I 2024-08-18 11:46:48,537] Trial 2 finished with value: 0.974459991209231 and parameters: {'booster': 'gbtree', 'lambda': 0.01823780890112637, 'alpha': 8.241519277313381e-07}. Best is trial 1 with value: 0.9792502367980368.
[I 2024-08-18 11:46:49,905] Trial 3 finished with value: 0.9745735893207031 and parameters: {'booster': 'gbtree', 'lambda': 1.0215036638452673e-07, 'alpha': 1.6367461562000383e-08}. Best is trial 1 with v

#### LightGBM

In [11]:
lg_run_name = "lightgbm"
# column indexes of the features lightgbm is tuned on
# NOTE(review): presumably copied from the best feature-selection run - confirm
lg_features_indexes = [0, 1, 4, 8, 11, 26, 30]

with mlflow.start_run(experiment_id=hpt_experiment_id, run_name=lg_run_name):
    objective = Objective(
        run_name=lg_run_name,
        experiment_id=hpt_experiment_id,
        X_train=X_train,
        y_train=y_train,
        X_valid=X_valid,
        y_valid=y_valid,
        indexes=lg_features_indexes
    )

    # seeding the sampler so the sequence of suggested hyperparameters (and
    # therefore the tuning result) is repeatable across notebook runs
    study = optuna.create_study(
        direction="maximize",
        sampler=optuna.samplers.TPESampler(seed=SEED)
    )
    study.optimize(objective, n_trials=100)

[I 2024-08-18 11:51:28,650] A new study created in memory with name: no-name-8f3d6300-6a42-4878-9f53-f2228da0f6f1
[I 2024-08-18 11:51:30,039] Trial 0 finished with value: 0.9613387071553067 and parameters: {'lambda_l1': 1.1815811552753403e-05, 'lambda_l2': 7.153812453020983e-06, 'num_leaves': 54, 'feature_fraction': 0.8805681615290555, 'bagging_fraction': 0.6320561933302578, 'bagging_freq': 6, 'min_child_samples': 94}. Best is trial 0 with value: 0.9613387071553067.
[I 2024-08-18 11:51:31,446] Trial 1 finished with value: 0.9517682709729492 and parameters: {'lambda_l1': 4.3240182903601654e-05, 'lambda_l2': 0.00011854427095107251, 'num_leaves': 220, 'feature_fraction': 0.474544518488729, 'bagging_fraction': 0.9306748631530094, 'bagging_freq': 6, 'min_child_samples': 36}. Best is trial 0 with value: 0.9613387071553067.
[I 2024-08-18 11:51:32,805] Trial 2 finished with value: 0.9678157965308484 and parameters: {'lambda_l1': 0.0010538967589520949, 'lambda_l2': 0.0051915678113138245, 'num_l

#### CatBoost

In [12]:
cb_run_name = "catboost"
# column indexes of the features catboost is tuned on
# NOTE(review): presumably copied from the best feature-selection run - confirm
cb_features_indexes = [3, 14, 18, 25, 26, 30, 35]

with mlflow.start_run(experiment_id=hpt_experiment_id, run_name=cb_run_name):
    objective = Objective(
        run_name=cb_run_name,
        experiment_id=hpt_experiment_id,
        X_train=X_train,
        y_train=y_train,
        X_valid=X_valid,
        y_valid=y_valid,
        indexes=cb_features_indexes
    )

    # seeding the sampler so the sequence of suggested hyperparameters (and
    # therefore the tuning result) is repeatable across notebook runs
    study = optuna.create_study(
        direction="maximize",
        sampler=optuna.samplers.TPESampler(seed=SEED)
    )
    study.optimize(objective, n_trials=100)

[I 2024-08-18 11:53:50,407] A new study created in memory with name: no-name-ba1bd251-221f-45b0-af99-4206439c63e6
[I 2024-08-18 11:53:52,247] Trial 0 finished with value: 0.9517188678414851 and parameters: {'colsample_bylevel': 0.041268266909325946, 'depth': 1, 'boosting_type': 'Plain', 'bootstrap_type': 'Bayesian'}. Best is trial 0 with value: 0.9517188678414851.
[I 2024-08-18 11:53:55,397] Trial 1 finished with value: 0.9645406474421601 and parameters: {'colsample_bylevel': 0.0941175372444998, 'depth': 2, 'boosting_type': 'Ordered', 'bootstrap_type': 'MVS'}. Best is trial 1 with value: 0.9645406474421601.
[I 2024-08-18 11:53:58,550] Trial 2 finished with value: 0.9645406474421601 and parameters: {'colsample_bylevel': 0.07613857498999393, 'depth': 3, 'boosting_type': 'Ordered', 'bootstrap_type': 'MVS'}. Best is trial 1 with value: 0.9645406474421601.
[I 2024-08-18 11:54:01,507] Trial 3 finished with value: 0.7675973902065887 and parameters: {'colsample_bylevel': 0.011143148290756499, 