### Install Dependencies

In [1]:
%pip install mlflow torchmetrics torchinfo
%pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128

Collecting mlflow
  Downloading mlflow-2.22.0-py3-none-any.whl.metadata (30 kB)
Collecting torchmetrics
  Downloading torchmetrics-1.7.1-py3-none-any.whl.metadata (21 kB)
Collecting torchinfo
  Downloading torchinfo-1.8.0-py3-none-any.whl.metadata (21 kB)
Collecting mlflow-skinny==2.22.0 (from mlflow)
  Downloading mlflow_skinny-2.22.0-py3-none-any.whl.metadata (31 kB)
Collecting Flask<4 (from mlflow)
  Downloading flask-3.1.0-py3-none-any.whl.metadata (2.7 kB)
Collecting Jinja2<4,>=3.0 (from mlflow)
  Using cached jinja2-3.1.6-py3-none-any.whl.metadata (2.9 kB)
Collecting alembic!=1.10.0,<2 (from mlflow)
  Downloading alembic-1.15.2-py3-none-any.whl.metadata (7.3 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting markdown<4,>=3.3 (from mlflow)
  Using cached markdown-3.8-py3-none-any.whl.metadata (5.1 kB)
Collecting

### Import Required Packages

In [26]:
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchinfo import summary
from torchmetrics import Accuracy
from torchvision import datasets
from torchvision.transforms import ToTensor
import numpy as np

import mlflow
from mlflow.types import Schema, TensorSpec
from mlflow.models import ModelSignature

In [3]:
# Both FashionMNIST splits live under the relative "data" directory and are
# requested with download=True; ToTensor() is applied to every image.
training_data = datasets.FashionMNIST("data", train=True, transform=ToTensor(), download=True)

test_data = datasets.FashionMNIST("data", train=False, transform=ToTensor(), download=True)

100.0%
100.0%
100.0%
100.0%


In [4]:
# Sanity check: shape of the first training image and the size of each split.
print(
    f"Image size: {training_data[0][0].shape}",
    f"Size of training dataset: {len(training_data)}",
    f"Size of test dataset: {len(test_data)}",
    sep="\n",
)

Image size: torch.Size([1, 28, 28])
Size of training dataset: 60000
Size of test dataset: 10000


In [20]:
# Wrap each split in a DataLoader that yields mini-batches of 64 samples.
# No shuffle argument is passed, so the DataLoader's default ordering applies.
train_dataloader = DataLoader(dataset=training_data, batch_size=64)
test_dataloader = DataLoader(dataset=test_data, batch_size=64)

In [5]:
class ImageClassifier(nn.Module):
  """Small convolutional network that produces 10 output scores per input image."""

  def __init__(self):
      super().__init__()
      # Two 3x3 convolution + ReLU stages, then flatten and project to the
      # 10 class scores. LazyLinear is given only its output size.
      layers = [
          nn.Conv2d(in_channels=1, out_channels=8, kernel_size=3),
          nn.ReLU(),
          nn.Conv2d(in_channels=8, out_channels=16, kernel_size=3),
          nn.ReLU(),
          nn.Flatten(),
          nn.LazyLinear(out_features=10),  # 10 classes in total.
      ]
      self.model = nn.Sequential(*layers)

  def forward(self, x):
      """Pass `x` through the sequential stack and return its output."""
      scores = self.model(x)
      return scores

### Open a Terminal to start the MLflow Tracking Server

In [None]:
# Starts the MLflow tracking server on 127.0.0.1:8081.
# NOTE(review): the recorded output shows this cell being interrupted (^C);
# presumably the command keeps running until it is stopped, so run it from a
# separate terminal, as the section heading says, rather than from this cell.
! mlflow server --host 127.0.0.1 --port 8081

^C


INFO:waitress:Serving on http://127.0.0.1:8081
Running the mlflow server failed. Please see the logs above for details.


### Set the Tracking Server URI

In [None]:
# Point MLflow at the tracking server address used in this notebook.
mlflow.set_tracking_uri("http://127.0.0.1:8081")

### Create a new MLflow Experiment

In [7]:
# Select the experiment by name; the recorded run shows MLflow creating it
# when it does not exist yet.
mlflow.set_experiment(experiment_name="/Experiments/mlflow-pytorch-quickstart")

2025/05/09 19:40:59 INFO mlflow.tracking.fluent: Experiment with name '/Experiments/mlflow-pytorch-quickstart' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///c:/Users/reach/Documents/AI/MLOps/MLFlow/Projects/Logistic_Regression/mlruns/975146762046751879', creation_time=1746790859334, experiment_id='975146762046751879', last_update_time=1746790859334, lifecycle_stage='active', name='/Experiments/mlflow-pytorch-quickstart', tags={}>

In [8]:
# Use the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [9]:
def train(dataloader, model, loss_fn, metrics_fn, optimizer, epoch):
  """Train the model on a single pass of the dataloader.

  On every 100th batch the current loss and accuracy are printed and logged
  to MLflow as numeric metrics, using the global batch index as the step.

  Args:
      dataloader: an instance of `torch.utils.data.DataLoader`, containing the training data.
      model: an instance of `torch.nn.Module`, the model to be trained.
      loss_fn: a callable, the loss function.
      metrics_fn: a callable, the metrics function.
      optimizer: an instance of `torch.optim.Optimizer`, the optimizer used for training.
      epoch: an integer, the current epoch number, counted from 0; it offsets
          the logged step so steps keep increasing from one epoch to the next.
  """
  num_batches = len(dataloader)
  model.train()
  for batch, (X, y) in enumerate(dataloader):
      X, y = X.to(device), y.to(device)

      pred = model(X)
      loss = loss_fn(pred, y)
      accuracy = metrics_fn(pred, y)

      # Backpropagation.
      loss.backward()
      optimizer.step()
      optimizer.zero_grad()

      if batch % 100 == 0:
          loss, current = loss.item(), batch
          # Global batch index. The previous `batch // 100 * (epoch + 1)`
          # restarted at 0 every epoch and reused step values across epochs,
          # so later epochs overlapped earlier ones on the metric timeline.
          step = epoch * num_batches + batch
          # Log numbers rather than pre-formatted strings so the full
          # precision of the values reaches the tracking store.
          mlflow.log_metric("loss", loss, step=step)
          mlflow.log_metric("accuracy", float(accuracy), step=step)
          print(f"loss: {loss:2f} accuracy: {accuracy:2f} [{current} / {len(dataloader)}]")

In [11]:
def evaluate(dataloader, model, loss_fn, metrics_fn, epoch):
  """Evaluate the model on a single pass of the dataloader.

  The loss and metric are computed per batch with gradients disabled, averaged
  over the number of batches, logged to MLflow as numeric metrics at step
  `epoch`, and printed.

  Args:
      dataloader: an instance of `torch.utils.data.DataLoader`, containing the eval data.
      model: an instance of `torch.nn.Module`, the model to be evaluated.
      loss_fn: a callable, the loss function.
      metrics_fn: a callable, the metrics function.
      epoch: an integer, the current epoch number; used as the MLflow step.
  """
  num_batches = len(dataloader)
  model.eval()
  eval_loss, eval_accuracy = 0, 0
  with torch.no_grad():
      for X, y in dataloader:
          X, y = X.to(device), y.to(device)
          pred = model(X)
          eval_loss += loss_fn(pred, y).item()
          eval_accuracy += metrics_fn(pred, y)

  # Mean of the per-batch values: every batch counts equally, whatever its size.
  eval_loss /= num_batches
  eval_accuracy /= num_batches
  # Log numbers rather than pre-formatted strings so the full precision of
  # the values reaches the tracking store.
  mlflow.log_metric("eval_loss", eval_loss, step=epoch)
  mlflow.log_metric("eval_accuracy", float(eval_accuracy), step=epoch)

  print(f"Eval metrics: Accuracy: {eval_accuracy:.2f}, Avg loss: {eval_loss:2f} ")

In [12]:
# Number of passes over the training set.
epochs = 3

# Build the network first so the optimizer can be handed its parameters.
model = ImageClassifier().to(device)
optimizer = torch.optim.SGD(params=model.parameters(), lr=1e-3)

# Loss and accuracy metric for the 10-class problem; the metric is moved to
# the same device as the model.
loss_fn = nn.CrossEntropyLoss()
metric_fn = Accuracy(num_classes=10, task="multiclass").to(device)

In [27]:

# Set the model signature. The dataset samples are single-channel 28x28 images
# and the first layer is Conv2d(1, ...), so a batch is (batch, 1, 28, 28); the
# model returns 10 scores per sample. -1 marks the variable batch dimension.
input_schema = Schema([TensorSpec(np.dtype(np.float32), (-1, 1, 28, 28))])
output_schema = Schema([TensorSpec(np.dtype(np.float32), (-1, 10))])
signature = ModelSignature(inputs=input_schema, outputs=output_schema)

# Start an MLflow run
with mlflow.start_run() as run:
  # Read the hyperparameters back from the live objects so the logged values
  # cannot drift from what is actually used for training.
  params = {
      "epochs": epochs,
      "learning_rate": optimizer.defaults["lr"],
      "batch_size": train_dataloader.batch_size,
      "loss_function": loss_fn.__class__.__name__,
      "metric_function": metric_fn.__class__.__name__,
      "optimizer": optimizer.__class__.__name__,
  }
  # Log training parameters.
  mlflow.log_params(params)

  # Log model summary
  with open('model_summary.txt', "w", encoding="utf-8") as f:
      f.write(str(summary(model)))
  mlflow.log_artifact('model_summary.txt')

  for t in range(epochs):
      print(f"Epoch {t + 1}-------------------------------")
      train(train_dataloader, model, loss_fn, metric_fn, optimizer, epoch=t)
      # Pass the real epoch index: with a constant 0 every epoch's eval
      # metrics were logged at the same step.
      evaluate(test_dataloader, model, loss_fn, metric_fn, epoch=t)

  # Save the trained model to MLflow.
  mlflow.pytorch.log_model(model, "model", signature=signature)

Epoch 1-------------------------------
loss: 0.447348 accuracy: 0.812500 [0 / 938]
loss: 0.569907 accuracy: 0.734375 [100 / 938]
loss: 0.374351 accuracy: 0.859375 [200 / 938]
loss: 0.650207 accuracy: 0.765625 [300 / 938]
loss: 0.581302 accuracy: 0.703125 [400 / 938]
loss: 0.580364 accuracy: 0.796875 [500 / 938]
loss: 0.576874 accuracy: 0.765625 [600 / 938]
loss: 0.619337 accuracy: 0.796875 [700 / 938]
loss: 0.674312 accuracy: 0.734375 [800 / 938]
loss: 0.526939 accuracy: 0.796875 [900 / 938]
Eval metrics: Accuracy: 0.79, Avg loss: 0.572654 
Epoch 2-------------------------------
loss: 0.436815 accuracy: 0.812500 [0 / 938]
loss: 0.546893 accuracy: 0.734375 [100 / 938]
loss: 0.361582 accuracy: 0.859375 [200 / 938]
loss: 0.624199 accuracy: 0.781250 [300 / 938]
loss: 0.568069 accuracy: 0.703125 [400 / 938]
loss: 0.562512 accuracy: 0.781250 [500 / 938]
loss: 0.554829 accuracy: 0.796875 [600 / 938]
loss: 0.616821 accuracy: 0.812500 [700 / 938]
loss: 0.680242 accuracy: 0.750000 [800 / 938]
lo



Eval metrics: Accuracy: 0.79, Avg loss: 0.561819 




### Register the model

In [29]:
# Register the "model" artifact of the run above under the name "CNNModel",
# then report the name and version of the resulting registry entry.
model_uri = f"runs:/{run.info.run_id}/model"
mv = mlflow.register_model(model_uri=model_uri, name="CNNModel")
print(f"Name: {mv.name}", f"Version: {mv.version}", sep="\n")

Name: CNNModel
Version: 2


Registered model 'CNNModel' already exists. Creating a new version of this model...
Created version '2' of model 'CNNModel'.


### Stop the MLflow Tracking Server

In [None]:
# Run the following commands in a Windows terminal, not in this notebook cell
# (they are shell commands, not Python).
# 1) Find the PID of the process listening on port 8081:
#      netstat -ano | findstr :8081
# 2) Force-stop that process, replacing <PID> with the number reported in step 1:
#      taskkill /PID <PID> /F