In [4]:
from dvclive import Live

In [25]:
# Sets the current active experiment to the "Apple_Models" experiment and
# returns the Experiment metadata
experiment = mlflow.set_experiment("fmnist_pytorch_Models2")

# Define a run name for this iteration of training.
# If this is not set, a unique name will be auto-generated for your run.
run_name = "fmnist_nn_test_s4"

# Define an artifact path that the model will be saved to.
#artifact_path = "fmnist_nn"

In [9]:
import torch

from torch import nn
from torch.utils.data import DataLoader
from torchinfo import summary
from torchmetrics import Accuracy
from torchvision import datasets
from torchvision.transforms import ToTensor
from pathlib import Path



In [7]:
# Download training data from open datasets.
training_data = datasets.FashionMNIST(
    root="data",
    train=True,
    download=True,
    transform=ToTensor(),
)

# Create data loaders.
train_dataloader = DataLoader(training_data, batch_size=64)

Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz to data/FashionMNIST/raw/train-images-idx3-ubyte.gz


100%|██████████| 26421880/26421880 [00:17<00:00, 1522995.61it/s]


Extracting data/FashionMNIST/raw/train-images-idx3-ubyte.gz to data/FashionMNIST/raw

Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz to data/FashionMNIST/raw/train-labels-idx1-ubyte.gz


100%|██████████| 29515/29515 [00:00<00:00, 253310.01it/s]


Extracting data/FashionMNIST/raw/train-labels-idx1-ubyte.gz to data/FashionMNIST/raw

Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz to data/FashionMNIST/raw/t10k-images-idx3-ubyte.gz


100%|██████████| 4422102/4422102 [00:03<00:00, 1376838.26it/s]


Extracting data/FashionMNIST/raw/t10k-images-idx3-ubyte.gz to data/FashionMNIST/raw

Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz to data/FashionMNIST/raw/t10k-labels-idx1-ubyte.gz


100%|██████████| 5148/5148 [00:00<00:00, 55364812.80it/s]


Extracting data/FashionMNIST/raw/t10k-labels-idx1-ubyte.gz to data/FashionMNIST/raw



In [8]:
# Get cpu or gpu for training.
device = "cuda" if torch.cuda.is_available() else "cpu"


# Define the model.
class NeuralNetwork(nn.Module):
    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(28 * 28, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, 10),
        )

    def forward(self, x):
        x = self.flatten(x)
        logits = self.linear_relu_stack(x)
        return logits



In [13]:

def train(dataloader, model, loss_fn, metrics_fn, optimizer, live):
    model.train()
    for batch, (X, y) in enumerate(dataloader):
        X, y = X.to(device), y.to(device)

        pred = model(X)
        loss = loss_fn(pred, y)
        accuracy = metrics_fn(pred, y)

        # Backpropagation.
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        if batch % 100 == 0:
            loss, current = loss.item(), batch
            # live.log_metric("loss", f"{loss:3f}", step=(batch // 100))
            # live.log_metric("accuracy", f"{accuracy:3f}", step=(batch // 100))
            print(
                f"loss: {loss:3f} accuracy: {accuracy:3f} [{current} / {len(dataloader)}]"
            )
        
        live.next_step()


epochs = 5
loss_fn = nn.CrossEntropyLoss()
metric_fn = Accuracy(task="multiclass", num_classes=10).to(device)
model = NeuralNetwork().to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)

with Live("results/train", report="notebook", save_dvc_exp=False) as live:
    params = {
        "epochs": epochs,
        "learning_rate": 1e-3,
        "batch_size": 64,
        "loss_function": loss_fn.__class__.__name__,
        "metric_function": metric_fn.__class__.__name__,
        "optimizer": "SGD",
    }
    # Log training parameters.
    live.log_params(params)

    for t in range(epochs):
        print(f"Epoch {t+1}\n-------------------------------")
        train(train_dataloader, model, loss_fn, metric_fn, optimizer, live)

    models_dir = Path("models")
    models_dir.mkdir(exist_ok=True)
    torch.save(model,  (models_dir / "model.pth").absolute())

    # Save the trained model to MLflow.
    live.log_artifact(
      "models/model.pt",
      type="model",
      name="fmnist model",
      desc="fmnist trained model1",
      labels=["fmnist", "DL"],
    )


# DVC Report

params.yaml

|   epochs |   learning_rate |   batch_size | loss_function    | metric_function    | optimizer   |
|----------|-----------------|--------------|------------------|--------------------|-------------|
|        5 |           0.001 |           64 | CrossEntropyLoss | MulticlassAccuracy | SGD         |

metrics.json

|   step |
|--------|
|   4689 |


Epoch 1
-------------------------------
loss: 2.302391 accuracy: 0.125000 [0 / 938]
loss: 2.289646 accuracy: 0.187500 [100 / 938]
loss: 2.267756 accuracy: 0.250000 [200 / 938]
loss: 2.261332 accuracy: 0.234375 [300 / 938]
loss: 2.235206 accuracy: 0.296875 [400 / 938]
loss: 2.199858 accuracy: 0.343750 [500 / 938]
loss: 2.218194 accuracy: 0.281250 [600 / 938]
loss: 2.171138 accuracy: 0.375000 [700 / 938]
loss: 2.182675 accuracy: 0.250000 [800 / 938]
loss: 2.136302 accuracy: 0.359375 [900 / 938]
Epoch 2
-------------------------------
loss: 2.155216 accuracy: 0.281250 [0 / 938]
loss: 2.139678 accuracy: 0.296875 [100 / 938]
loss: 2.076361 accuracy: 0.437500 [200 / 938]
loss: 2.090477 accuracy: 0.421875 [300 / 938]
loss: 2.014881 accuracy: 0.531250 [400 / 938]
loss: 1.953932 accuracy: 0.546875 [500 / 938]
loss: 1.988659 accuracy: 0.390625 [600 / 938]
loss: 1.894318 accuracy: 0.484375 [700 / 938]
loss: 1.918155 accuracy: 0.390625 [800 / 938]
loss: 1.824279 accuracy: 0.546875 [900 / 938]
Epoc



Converting pt model to ONNX format and registering to MLFLOW

In [40]:
input = torch.randn(1,1,28,28).to(device)
onnx_program = torch.onnx.dynamo_export(model,input)



In [41]:
onnx_program.save("fmnist.onnx")

In [45]:
import onnx
onnx_model = onnx.load("fmnist.onnx")
onnx.checker.check_model(onnx_model)

In [47]:
mlflow.onnx.log_model(onnx_model, "pytorch_onnx_fmnist_model")




<mlflow.models.model.ModelInfo at 0x7f16ba83cca0>

Converting ONNX to TensorRT format

In [1]:
import numpy as np

BATCH_SIZE=2
PRECISION = np.float32

In [51]:
!trtexec --onnx=fmnist.onnx --saveEngine=fmnist_engine.trt

/bin/bash: line 1: trtexec: command not found


In [2]:
from onnx_helper import ONNXClassifierWrapper
N_CLASSES = 10 # Our ResNet-50 is trained on a 1000 class ImageNet task
trt_model = ONNXClassifierWrapper("fmnist_engine.trt", [BATCH_SIZE, N_CLASSES], target_dtype = PRECISION)

2024-05-31 09:56:31.593689: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-05-31 09:56:31.649800: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


[05/31/2024-09:56:36] [TRT] [W] CUDA lazy loading is not enabled. Enabling it can significantly reduce device memory usage and speed up TensorRT initialization. See "Lazy Loading" section of CUDA documentation https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#lazy-loading


In [3]:
BATCH_SIZE=2
dummy_input_batch = np.zeros((BATCH_SIZE, 28, 28,1), dtype = PRECISION)


In [4]:
predictions = trt_model.predict(dummy_input_batch)


[05/31/2024-09:56:37] [TRT] [E] 3: [executionContext.cpp::enqueueV3::2491] Error Code 3: API Usage Error (Parameter check failed at: runtime/api/executionContext.cpp::enqueueV3::2491, condition: (mContext.profileObliviousBindings.at(profileObliviousIndex)) != nullptr )


In [59]:
mlflow.onnx.log_model(trt_model, "pytorch_onnx_trt_fmnist_model")

AttributeError: 'ONNXClassifierWrapper' object has no attribute 'graph'

In [35]:
#import mlflow.deployments
import mlflow.deployments


mlflow.deployments.create

deployment = mlflow.deployments.create_deployment("fmnist_deployment_trt","models:/fmnist_model/1","triton")

AttributeError: module 'mlflow.deployments' has no attribute 'create_deployment'