This notebook implements a first working version of a machine learning model that predicts the age of an abalone.

In [None]:
from pathlib import Path
from typing import Tuple, Union

import mlflow
from mlflow.tracking import MlflowClient
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import xgboost as xgb

In [None]:
# Relative path of the csv dataset that is passed to read_transform_data.
DATA_PATH = Path("../data/abalone.csv")
# Name of the MLflow experiment the training run is recorded under.
EXPERIMENT = "xgb"
# Name under which the trained model is registered with MLflow.
REGISTERED_MODEL = "xgb_regressor"
# Columns that read_transform_data one-hot encodes.
CATEGORICAL_COLS = ["Sex"]

In [None]:
# Print the active MLflow tracking URI so it can be checked that runs are
# recorded locally before anything is logged.
print(f"tracking URI: '{mlflow.get_tracking_uri()}'")

In [None]:
def read_transform_data(path: Union[str, Path]) -> pd.DataFrame:
    """Load the csv at ``path`` and prepare it for modelling.

    The ``Rings`` column is replaced by an ``age`` column computed as
    ``Rings + 1.5``, and every column listed in ``CATEGORICAL_COLS`` is
    one-hot encoded with the first level of each column dropped.

    Parameters
    ----------
    path : str or pathlib.Path
           Location of the csv file.

    Returns
    -------
    df : pd.DataFrame
         Transformed dataframe containing ``age`` and the encoded columns,
         without ``Rings``.

    Raises
    ------
    KeyError
        If the csv has no ``Rings`` column.
    """

    df = pd.read_csv(path)

    # derive the target column from the ring count
    df["age"] = df["Rings"] + 1.5

    # one-hot encode the categorical columns; drop_first removes the indicator
    # that is fully determined by the remaining ones
    df = pd.get_dummies(df, columns=CATEGORICAL_COLS, prefix=CATEGORICAL_COLS, drop_first=True)

    # age is a direct function of Rings, so keeping Rings among the features
    # would leak the target
    return df.drop(columns="Rings")

In [None]:
def extract_x_y_split(
    df: pd.DataFrame,
    target: str = "age",
    *,
    test_size: float = 0.33,
    random_state: int = 42,
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series]:
    """Separate features from the target and split both into train and test sets.

    Parameters
    -----------
        df : pd.DataFrame
             The input DataFrame containing the dataset.

        target : str (optional)
                 The name of the target column in the DataFrame. Default is "age".

        test_size : float (optional, keyword-only)
                    Passed to ``train_test_split`` as ``test_size``.
                    Default is 0.33.

        random_state : int (optional, keyword-only)
                       Passed to ``train_test_split`` as ``random_state`` so that
                       the split is reproducible. Default is 42.

    Returns
    --------
        X_train : pd.DataFrame
                  The training feature set (X).
        X_test : pd.DataFrame
                 The testing feature set (X).
        y_train : pd.Series
                  The training target set (y).
        y_test : pd.Series
                 The testing target set (y).

    Raises
    ------
        KeyError
            If ``target`` is not a column of ``df``.
    """

    # every column except the target is a feature
    X = df.drop(columns=[target])
    y = df[target]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    return X_train, X_test, y_train, y_test

In [None]:
def train_model(X_train: pd.DataFrame, y_train: pd.Series) -> xgb.XGBRegressor:
    """Fit an XGBoost regressor, created with no explicit hyperparameters, on the training data.

    Parameters
    ----------
    X_train : pd.DataFrame
              Feature columns of the training set.
    y_train : pd.Series
              Target values of the training set.

    Returns
    -------
    model : xgb.XGBRegressor
            The fitted regressor.
    """

    # fit() hands back the estimator it was called on, so construction and
    # training can be chained into a single expression
    return xgb.XGBRegressor().fit(X_train, y_train)

In [None]:
def evaluate_model(y_true: pd.Series, y_pred: pd.Series) -> Tuple[float, float]:
    """Evaluate predictions with Root Mean Squared Error (RMSE) and R-squared (R2).

    Parameters
    -----------
        y_true : pd.Series
                 The true target values.
        y_pred : pd.Series
                 The predicted target values. In this notebook the caller
                 passes the output of ``model.predict``.

    Returns
    --------
        rmse : float
              The Root Mean Squared Error (RMSE) between the true and predicted values.
        r2 : float
             The R-squared (R2) score, which measures the goodness of fit of the model.
    """

    # Take the square root explicitly rather than passing ``squared=False``:
    # that keyword was deprecated in scikit-learn 1.4 and removed in 1.6, so the
    # old call raises TypeError on current releases. For a single target column
    # the result is the same value.
    rmse = float(mean_squared_error(y_true, y_pred) ** 0.5)
    r2 = float(r2_score(y_true, y_pred))
    return rmse, r2

In [None]:
# create an MlflowClient with its default arguments
client = MlflowClient()

# fetch the experiments returned by a search with no filters
experiments = client.search_experiments()
# bare expression: shows the list as the output of this notebook cell
experiments

In [None]:
# End-to-end training run: load the data, train the model, evaluate it, and
# record parameters, metrics and the model itself in MLflow.
mlflow.set_experiment(EXPERIMENT)

with mlflow.start_run() as run:
    # unique identifier of this MLflow run, needed to build the model URI below
    run_id = run.info.run_id

    # set tags
    mlflow.set_tag("Task_type", "Regression")

    # read and transform data
    df = read_transform_data(DATA_PATH)

    # split the data and record the size of each partition
    X_train, X_test, y_train, y_test = extract_x_y_split(df)
    mlflow.log_param("X_train_size", X_train.shape[0])
    mlflow.log_param("X_test_size", X_test.shape[0])

    # train the model and record the hyperparameters it was built with
    model = train_model(X_train, y_train)
    mlflow.log_params(model.get_params())

    # make prediction on X_test
    y_pred = model.predict(X_test)

    # evaluate on y_test; evaluate_model returns the RMSE (not the MSE),
    # which matches the "test_rmse" metric name
    rmse, r2 = evaluate_model(y_test, y_pred)
    mlflow.log_metric("test_rmse", rmse)
    mlflow.log_metric("test_r2", r2)

    # log the model as an artifact of this run
    mlflow.xgboost.log_model(model, "model")

    # register the logged artifact under the shared model name
    mlflow.register_model(f"runs:/{run_id}/model", REGISTERED_MODEL)