In this notebook, you should implement a first version of a working machine learning model to predict the age of an abalone.

A few guidelines:
- The model does not have to be complex. A simple linear regression model is enough.
- You should use MLflow to track your experiments. You can use the MLflow UI to compare your experiments.
- Do not push any MLflow data to the repository. Only the code to run the experiments is interesting and should be pushed.

In [1]:
# Step up one directory so the relative path "data/abalone.csv" used when loading the data resolves
# (presumably the notebook sits one level below the project root -- confirm).
# NOTE: not idempotent -- re-running this cell moves up another level.
cd ..

/Users/kaancaylan/Desktop/HEC/MLOPS/xhec-mlops-project-student


In [2]:
from sklearn.linear_model import LinearRegression
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import pandas as pd
import mlflow
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [3]:
def load_data(datapath):
    """Read the CSV found at ``datapath`` and return it as a DataFrame.

    Args:
        datapath: Anything ``pandas.read_csv`` accepts as its first argument
            (a path or a file-like object).

    Returns:
        The parsed ``pandas.DataFrame``.
    """
    frame = pd.read_csv(datapath)
    return frame


def preprocessing(df: pd.DataFrame):
    """Split the raw frame into a feature matrix and the age target.

    The target is computed as ``Rings + 1.5``. The feature matrix contains
    every input column except ``Rings`` and the derived ``age``: keeping
    either of them in ``X`` would hand the model its own target.

    Args:
        df: Input frame; must contain a ``Rings`` column. It is not modified.

    Returns:
        ``(X, y)`` where ``X`` is a new DataFrame of features and ``y`` is a
        Series named ``"age"``.

    Raises:
        KeyError: If ``df`` has no ``Rings`` column.
    """
    # Build the target without writing it into `df`, so the caller's frame
    # is left untouched and the target cannot leak into the features.
    y = (df["Rings"] + 1.5).rename("age")
    # errors="ignore" also removes a pre-existing "age" column, if any.
    X = df.drop(columns=["Rings", "age"], errors="ignore").copy()
    return X, y


def get_pipeline(numerical_cols, categorical_cols):
    """Assemble the unfitted preprocessing + linear-regression pipeline.

    Args:
        numerical_cols: Columns routed to a ``StandardScaler``.
        categorical_cols: Columns routed to a ``OneHotEncoder``.

    Returns:
        A ``Pipeline`` with two steps: ``"preprocessor"`` (a
        ``ColumnTransformer`` with the ``"num"`` and ``"cat"`` transformers)
        followed by ``"model"`` (a ``LinearRegression``).
    """
    column_steps = [
        ("num", StandardScaler(), numerical_cols),
        ("cat", OneHotEncoder(), categorical_cols),
    ]
    preprocessor = ColumnTransformer(transformers=column_steps)

    return Pipeline(
        steps=[
            ("preprocessor", preprocessor),
            ("model", LinearRegression()),
        ]
    )


def train_model(X, y):
    """Fit the pipeline from ``get_pipeline`` on ``X`` and ``y``.

    Numeric columns of ``X`` (dtype selected with ``np.number``) and
    ``object`` columns are detected from the frame itself and passed to
    ``get_pipeline`` as the numerical and categorical column lists.

    Args:
        X: Feature DataFrame.
        y: Target values.

    Returns:
        The fitted pipeline.
    """
    numeric_features = X.select_dtypes(include=np.number).columns
    categorical_features = X.select_dtypes(include="object").columns
    # Pipeline.fit returns the pipeline itself, so the chained call yields
    # the fitted estimator.
    fitted = get_pipeline(numeric_features, categorical_features).fit(X, y)
    return fitted


def evaluate_model(y_true: np.ndarray, y_pred: np.ndarray):
    """Return the root mean squared error between ``y_true`` and ``y_pred``.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted target values.

    Returns:
        The RMSE as a float.
    """
    # The `squared=False` keyword of mean_squared_error was deprecated in
    # scikit-learn 1.4 and removed in 1.6. Taking the square root of the MSE
    # gives the same value on every version.
    return np.sqrt(mean_squared_error(y_true, y_pred))

In [4]:
# Train, evaluate and register one model inside a single MLflow run of the
# "Abalone Age Prediction" experiment.
mlflow.set_experiment("Abalone Age Prediction")


version = 0
with mlflow.start_run() as run:
    mlflow.sklearn.autolog()
    run_id = run.info.run_id

    # Set tags for the run
    mlflow.set_tag("release.version", f"2.2.{version}")
    df = load_data("data/abalone.csv")
    X, y = preprocessing(df)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    model = train_model(X_train, y_train)
    mlflow.log_params(model.get_params())

    # Score on the held-out 20% split.
    y_pred = model.predict(X_test)
    rmse = evaluate_model(y_test, y_pred)

    mlflow.log_metric("Root mean Squared Error", rmse)

    # The artifact path used for logging must be the same one referenced in
    # the `runs:/` URI below; otherwise the registered version points at a
    # folder that was never written.
    artifact_path = f"model_{version}"
    mlflow.sklearn.log_model(model, artifact_path=artifact_path)

    result = mlflow.register_model(f"runs:/{run_id}/{artifact_path}", f"model_{version}")
    version += 1

Successfully registered model 'model_0'.
2023/10/23 11:31:58 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: model_0, version 1
Created version '1' of model 'model_0'.


In [6]:
# Create an MLflow client and fetch the experiments it returns from
# search_experiments().
client = mlflow.client.MlflowClient()

experiments = client.search_experiments()
# Bare last expression so the notebook displays the list.
experiments

[<Experiment: artifact_location='file:///Users/kaancaylan/Desktop/HEC/MLOPS/xhec-mlops-project-student/mlruns/426733563823885988', creation_time=1698053493059, experiment_id='426733563823885988', last_update_time=1698053493059, lifecycle_stage='active', name='Abalone Age Prediction', tags={}>,
 <Experiment: artifact_location='file:///Users/kaancaylan/Desktop/HEC/MLOPS/xhec-mlops-project-student/mlruns/0', creation_time=1698053492954, experiment_id='0', last_update_time=1698053492954, lifecycle_stage='active', name='Default', tags={}>]

In [7]:
# Start the MLflow UI on port 5002, listening on all interfaces (0.0.0.0).
# The server runs in the foreground, so this cell keeps running until it is interrupted.
!mlflow ui --host 0.0.0.0 --port 5002

[2023-10-23 11:33:33 +0200] [40696] [INFO] Starting gunicorn 21.2.0
[2023-10-23 11:33:33 +0200] [40696] [INFO] Listening at: http://0.0.0.0:5002 (40696)
[2023-10-23 11:33:33 +0200] [40696] [INFO] Using worker: sync
[2023-10-23 11:33:33 +0200] [40697] [INFO] Booting worker with pid: 40697
[2023-10-23 11:33:33 +0200] [40698] [INFO] Booting worker with pid: 40698
[2023-10-23 11:33:33 +0200] [40699] [INFO] Booting worker with pid: 40699
[2023-10-23 11:33:33 +0200] [40700] [INFO] Booting worker with pid: 40700
^C
[2023-10-23 11:34:22 +0200] [40696] [INFO] Handling signal: int
[2023-10-23 11:34:22 +0200] [40699] [INFO] Worker exiting (pid: 40699)
[2023-10-23 11:34:22 +0200] [40700] [INFO] Worker exiting (pid: 40700)
[2023-10-23 11:34:22 +0200] [40697] [INFO] Worker exiting (pid: 40697)
[2023-10-23 11:34:22 +0200] [40698] [INFO] Worker exiting (pid: 40698)
