# Step 1: Create a basic training function
Convert the code into a training procedure that can run either locally or remotely on the server.

In [4]:
import digitalhub as dh
import os

# Get or create the project; its name is suffixed with the USER environment variable.
project = dh.get_or_create_project("mlflow-vision-" + os.environ["USER"])

In [None]:
# TO SPEED UP: use a custom image.
# "???" is a placeholder; replace it with the image to use.
image = "???"

In [12]:
# Define the "train-model" function (kind "python"): handler train_model,
# code taken from src/train0.py, with pinned package requirements.
train = project.new_function(
    "train-model",
    kind="python",
    python_version="PYTHON3_10",
    code_src="src/train0.py",
    handler="train_model",
    requirements=[
        "torch==2.9.1",
        "torchinfo==1.8.0",
        "torchmetrics==1.8.2",
        "torchvision==0.24.1",
    ],
)

In [5]:
# Optional, left disabled: uncomment to run the function's "build" action.
# build = train.run(action="build")

In [13]:
# Run the function as a "job" with local_execution=True, for a single epoch.
train_run = train.run(
    action="job",
    parameters={
        "epochs": 1,
        "batch_size": 64,
        "learning_rate": 1e-3,
    },
    local_execution=True,
)

2025-11-28 11:40:24,776 - INFO - Validating task.
2025-11-28 11:40:24,778 - INFO - Validating run.
2025-11-28 11:40:24,778 - INFO - Starting task.
2025-11-28 11:40:24,779 - INFO - Configuring execution.
2025-11-28 11:40:24,782 - INFO - Composing function arguments.
2025-11-28 11:40:25,119 - INFO - Executing run.


Epoch 1 -------------------------------
loss: 2.335743 [0 / 938]
loss: 2.086574 [100 / 938]
loss: 1.649300 [200 / 938]
loss: 1.345437 [300 / 938]
loss: 0.975903 [400 / 938]
loss: 0.879724 [500 / 938]
loss: 0.861404 [600 / 938]
loss: 0.707937 [700 / 938]
loss: 0.760996 [800 / 938]
loss: 0.782689 [900 / 938]


2025-11-28 11:41:52,805 - INFO - Collecting outputs.
2025-11-28 11:41:52,806 - INFO - Task completed, returning run status.


## Run on Kubernetes: a volume is needed to store data

In [None]:
# Run the job without local_execution, requesting a 10Gi persistent volume
# claim named "volume-vision" mounted at /data.
# NOTE(review): learning_rate is not passed here, unlike the local runs;
# presumably the handler's default applies. Confirm against the handler's signature.
train_run = train.run(
    action="job",
    volumes=[
        {
            "volume_type": "persistent_volume_claim",
            "name": "volume-vision",
            "mount_path": "/data",
            "spec": {"size": "10Gi"},
        },
    ],
    parameters={"epochs": 3, "batch_size": 32},
)

# Step 2: Add metrics

We add a metric function and log the metrics in our run. We also add an evaluation function and measure the model metrics.

In [14]:
# Define "train-model" again, this time with code taken from src/train1.py;
# the handler name and pinned requirements are the same as before.
train = project.new_function(
    "train-model",
    kind="python",
    python_version="PYTHON3_10",
    code_src="src/train1.py",
    handler="train_model",
    requirements=[
        "torch==2.9.1",
        "torchinfo==1.8.0",
        "torchmetrics==1.8.2",
        "torchvision==0.24.1",
    ],
)

In [15]:
# Run the function as a "job" with local_execution=True, for a single epoch.
train_run = train.run(
    action="job",
    parameters={
        "epochs": 1,
        "batch_size": 64,
        "learning_rate": 1e-3,
    },
    local_execution=True,
)

2025-11-28 11:49:48,628 - INFO - Validating task.
2025-11-28 11:49:48,629 - INFO - Validating run.
2025-11-28 11:49:48,630 - INFO - Starting task.
2025-11-28 11:49:48,630 - INFO - Configuring execution.
2025-11-28 11:49:48,633 - INFO - Composing function arguments.
2025-11-28 11:49:48,706 - INFO - Executing run.


Epoch 1 -------------------------------
loss: 2.308895 accuracy: 0.062500 [0 / 938]
loss: 2.288398 accuracy: 0.125000 [100 / 938]
loss: 2.257147 accuracy: 0.343750 [200 / 938]
loss: 2.235667 accuracy: 0.343750 [300 / 938]
loss: 2.179476 accuracy: 0.531250 [400 / 938]
loss: 2.139218 accuracy: 0.546875 [500 / 938]
loss: 2.023072 accuracy: 0.562500 [600 / 938]
loss: 1.868633 accuracy: 0.578125 [700 / 938]
loss: 1.665142 accuracy: 0.671875 [800 / 938]
loss: 1.404152 accuracy: 0.687500 [900 / 938]


2025-11-28 11:51:40,212 - INFO - Collecting outputs.
2025-11-28 11:51:40,213 - INFO - Task completed, returning run status.


Eval metrics:  Accuracy: 0.68, Avg loss: 1.330133 


## Exercise: create different executions with different hyperparameters and compare them

# Step 3: Store the model as an MLflow model

We use MLflow to structure and package the model artifact. Besides the metrics, we also collect the model "signature": the definition of the inference function that is associated with this model.

We also use the platform as a model registry to store the model and its metadata.

In [18]:
# Define "train-model" with code taken from src/train.py; the handler name
# and pinned requirements are the same as in the earlier steps.
train = project.new_function(
    "train-model",
    kind="python",
    python_version="PYTHON3_10",
    code_src="src/train.py",
    handler="train_model",
    requirements=[
        "torch==2.9.1",
        "torchinfo==1.8.0",
        "torchmetrics==1.8.2",
        "torchvision==0.24.1",
    ],
)

In [19]:
# Run the function as a "job" with local_execution=True, for a single epoch.
train_run = train.run(
    action="job",
    parameters={
        "epochs": 1,
        "batch_size": 64,
        "learning_rate": 1e-3,
    },
    local_execution=True,
)

2025-11-28 11:59:21,224 - INFO - Validating task.
2025-11-28 11:59:21,225 - INFO - Validating run.
2025-11-28 11:59:21,226 - INFO - Starting task.
2025-11-28 11:59:21,226 - INFO - Configuring execution.
2025-11-28 11:59:21,230 - INFO - Composing function arguments.
2025-11-28 11:59:21,306 - INFO - Executing run.


Epoch 1 -------------------------------
loss: 2.322660 accuracy: 0.000000 [0 / 938]
loss: 2.251162 accuracy: 0.156250 [100 / 938]
loss: 2.094125 accuracy: 0.406250 [200 / 938]
loss: 1.879757 accuracy: 0.578125 [300 / 938]
loss: 1.376790 accuracy: 0.625000 [400 / 938]
loss: 1.113296 accuracy: 0.671875 [500 / 938]
loss: 0.993981 accuracy: 0.718750 [600 / 938]
loss: 0.794922 accuracy: 0.781250 [700 / 938]
loss: 0.829920 accuracy: 0.718750 [800 / 938]
loss: 0.822783 accuracy: 0.750000 [900 / 938]
Eval metrics:  Accuracy: 0.73, Avg loss: 0.756330 


2025-11-28 12:01:16,136 - INFO - Collecting outputs.
INFO:digitalhub-core:Collecting outputs.
2025-11-28 12:01:16,138 - INFO - Task completed, returning run status.
INFO:digitalhub-core:Task completed, returning run status.
