In this notebook, you should implement a first version of a working machine learning model to predict the age of an abalone.

A few guidelines:
- The model does not have to be complex. A simple linear regression model is enough.
- You should use MLflow to track your experiments. You can use the MLflow UI to compare your experiments.
- Do not push any MLflow data to the repository. Only the code to run the experiments is interesting and should be pushed.

In [None]:
import mlflow
from mlflow.tracking import MlflowClient

import pandas as pd
from pathlib import Path
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

import xgboost as xgb

In [None]:
# Sanity check: print the tracking URI MLflow is currently configured with,
# so it can be verified by eye that runs are recorded locally.
print(f"tracking URI: '{mlflow.get_tracking_uri()}'")

In [None]:
data_path = Path("../data/abalone.csv")


def read_transform_data(path):
    """Load the abalone CSV and prepare it for modelling.

    Args:
        path: Location of the CSV file; passed straight to ``pd.read_csv``.

    Returns:
        A DataFrame with an ``age`` column (``Rings + 1.5``), the ``Sex``
        column one-hot encoded (first level dropped) and ``Rings`` removed.
    """
    # read the csv from the argument, not from the module-level data_path
    df = pd.read_csv(path)
    # add age column
    df["age"] = df["Rings"] + 1.5
    # one-hot-encoding sex column
    df = pd.get_dummies(df, columns=["Sex"], prefix=["Sex"], drop_first=True)
    # age is computed directly from Rings, so keeping Rings among the features
    # would leak the target; drop() returns a new frame, so use its result
    return df.drop(columns="Rings")

In [None]:
def extract_x_y_split(
        df: pd.DataFrame,
        target: str = "age"
):
    """Separate the target column from the features and split both.

    Args:
        df: Frame holding the feature columns and the target column.
        target: Name of the column to predict.

    Returns:
        ``(X_train, X_test, y_train, y_test)`` with 33% of the rows held out
        for testing, using a fixed seed (42) so the split is reproducible.
    """
    labels = df[target]
    features = df.drop(columns=[target])

    parts = train_test_split(features, labels, test_size=0.33, random_state=42)
    # hand the four pieces back as a tuple, in train/test order
    return tuple(parts)

In [None]:
def train_model(X_train: pd.DataFrame,
                y_train: pd.DataFrame):
    """Fit an XGBoost regressor with default hyperparameters.

    Args:
        X_train: Training features.
        y_train: Training target values.

    Returns:
        The fitted ``xgb.XGBRegressor``.
    """
    # fit() hands back the estimator itself, so it can be returned directly
    return xgb.XGBRegressor().fit(X_train, y_train)

In [None]:
def evaluate_model(y_true: pd.DataFrame, y_pred: pd.DataFrame):
    """Score predictions with root mean squared error and R².

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted target values.

    Returns:
        ``(rmse, r2)``. The first element is the *root* of the mean squared
        error, i.e. it is expressed in the units of the target.
    """
    # The `squared=False` keyword was deprecated in scikit-learn 1.4 and is
    # rejected by later releases, so take the root explicitly. Rooting the
    # per-output MSE before averaging reproduces what `squared=False` returned.
    per_output_mse = mean_squared_error(y_true, y_pred, multioutput="raw_values")
    rmse = (per_output_mse ** 0.5).mean()
    r2 = r2_score(y_true, y_pred)
    return rmse, r2

In [None]:
# create instance of MlflowClient
client = MlflowClient()

# fetch the experiments the client can see; the bare expression on the last
# line displays them as the notebook cell's output
experiments = client.search_experiments()
experiments

In [None]:
# All runs below are recorded under the "xgb" experiment.
mlflow.set_experiment("xgb")

with mlflow.start_run() as run:
    # get unique identifier of MLflow run
    run_id = run.info.run_id

    # set tags
    mlflow.set_tag("Task_type", "Regression")

    # read and transform data
    df = read_transform_data(data_path)

    # splitting data
    X_train, X_test, y_train, y_test = extract_x_y_split(df)
    mlflow.log_param("X_train_size", X_train.shape[0])
    mlflow.log_param("X_test_size", X_test.shape[0])

    # train model and record the hyperparameters it was built with
    model = train_model(X_train, y_train)
    mlflow.log_params(model.get_params())

    # make prediction on X_test
    y_pred = model.predict(X_test)

    # evaluate on y_test; evaluate_model returns the root of the mean squared
    # error, so it is logged under an RMSE name rather than "test_mse"
    rmse, r2 = evaluate_model(y_test, y_pred)
    mlflow.log_metric("test_rmse", rmse)
    mlflow.log_metric("test_r2", r2)

    # Log your model
    mlflow.xgboost.log_model(model, "model")

    # Register your model from the artifact logged by this run
    mlflow.register_model(f"runs:/{run_id}/model", "xgb_regressor")