# Model training: hands-on

Train a model by defining a training function and executing it as a batch job on the platform.

In [None]:
import digitalhub as dh
import os

# The project name is suffixed with the USER environment variable
# (a KeyError is raised if USER is not set).
user_name = os.environ["USER"]
project_name = f"mlflow-vision-{user_name}"
project = dh.get_or_create_project(project_name)

# Step 1: Create a basic training function
Convert the code into a training procedure that can run either locally or remotely on the server.

In [None]:
# TO SPEED UP: use a pre-built image
# image="???"

In [None]:
# Define the "train-model" Python function: entry point `train_model` in
# src/train0.py, with the training dependencies pinned to exact versions.
train_requirements = [
    "torch==2.9.1",
    "torchinfo==1.8.0",
    "torchmetrics==1.8.2",
    "torchvision==0.24.1",
]
train = project.new_function(
    "train-model",
    kind="python",
    python_version="PYTHON3_10",
    code_src="src/train0.py",
    handler="train_model",
    requirements=train_requirements,
)

In [None]:
# Execute the function as a job in the local environment (a single epoch).
local_params = {"epochs": 1, "batch_size": 64, "learning_rate": 1e-3}
train_run = train.run(action="job", parameters=local_params, local_execution=True)

## Run in the cluster

We can run our code as a batch job.
We'll need some resources to perform the training:
* disk space
* cpu?
* gpu?

In [None]:
# Build an image for the function so that the dependencies are installed only once.
# wait=True is passed so the call waits for the build (presumably until it completes).
build = train.run(
    action="build",
    wait=True,
)

In [None]:
# Run the job with a 10Gi ephemeral volume mounted at /data.
scratch_volume = {
    "volume_type": "ephemeral",
    "name": "volume-vision",
    "mount_path": "/data",
    "spec": {"size": "10Gi"},
}
cluster_params = {"epochs": 3, "batch_size": 32}
train_run = train.run(
    action="job",
    volumes=[scratch_volume],
    parameters=cluster_params,
    # wait=True,  # uncomment to wait for the run (presumably blocks until it finishes)
)

# Step 2: Add metrics

We add a metric function and log the metrics in our run. We also add an evaluation function and measure the model metrics.

In [None]:
# Define the "train-model" function again, this time from src/train1.py
# (same handler name and the same pinned dependencies).
train_requirements = [
    "torch==2.9.1",
    "torchinfo==1.8.0",
    "torchmetrics==1.8.2",
    "torchvision==0.24.1",
]
train = project.new_function(
    "train-model",
    kind="python",
    python_version="PYTHON3_10",
    code_src="src/train1.py",
    handler="train_model",
    requirements=train_requirements,
)

In [None]:
# Execute the function as a local job with a single epoch.
local_params = {"epochs": 1, "batch_size": 64, "learning_rate": 1e-3}
train_run = train.run(action="job", parameters=local_params, local_execution=True)

## Exercise

Create several executions with different hyperparameters and compare them:
```
train_run = train.run(
    action="job",
    parameters={...},
    local_execution=True,
)
```

# Step 3: Store the model as an MLflow model

We use MLflow to structure and package the model artifact. Besides metrics, we also collect the model "signature" - the definition of the inputs and outputs of the inference function that is associated with this model.

We also use the platform as a model registry to store the model and its metadata.

In [None]:
# Define the "train-model" function again, this time from src/train.py
# (same handler name and the same pinned dependencies).
train_requirements = [
    "torch==2.9.1",
    "torchinfo==1.8.0",
    "torchmetrics==1.8.2",
    "torchvision==0.24.1",
]
train = project.new_function(
    "train-model",
    kind="python",
    python_version="PYTHON3_10",
    code_src="src/train.py",
    handler="train_model",
    requirements=train_requirements,
)

In [None]:
# Execute the function as a local job with a single epoch.
local_params = {"epochs": 1, "batch_size": 64, "learning_rate": 1e-3}
train_run = train.run(action="job", parameters=local_params, local_execution=True)