# Level 5: Production System & Optimization

**Objective**: Build a production-ready system with deployment optimization.

**Techniques**:
- **Model Quantization**: Compressing model weights to INT8 to reduce size and speed up CPU inference.
- **ONNX Export**: Converting PyTorch model to ONNX format for cross-platform deployment.
- **Benchmarking**: Measuring Latency (<100ms goal) and Model Size.

In [1]:
import os
import time
import numpy as np
import torch
import torch.nn as nn
from torchvision import models, datasets, transforms
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split

try:
    import onnx
    import onnxruntime
except ImportError:
    print("Installing ONNX libraries...")
    !pip install onnx onnxruntime
    import onnx
    import onnxruntime

device = torch.device("cpu") 
print(f"Using device: {device}")

Using device: cpu


## 1. Load Trained EfficientNet (Level 3)

In [2]:
def build_efficientnet(num_classes=102):
    """Build an EfficientNet-B0 whose final classifier layer has `num_classes` outputs.

    The network is created without pretrained weights; the caller is expected
    to load trained weights afterwards (e.g. with `load_state_dict`).

    Args:
        num_classes: Output size of the replacement `nn.Linear` head.
            Defaults to 102, the value this notebook was written for.

    Returns:
        The torchvision EfficientNet-B0 module with `classifier[1]` replaced
        by a freshly initialised linear layer.
    """
    # `weights=None` is the current spelling of the deprecated
    # `pretrained=False` argument.
    model = models.efficientnet_b0(weights=None)
    # Keep the existing input width of the head; only the output size changes.
    num_ftrs = model.classifier[1].in_features
    model.classifier[1] = nn.Linear(num_ftrs, num_classes)
    return model

# Instantiate the architecture, then look for the Level 3 checkpoint in the
# directories this notebook may be launched from.
model = build_efficientnet()
possible_paths = [
    '../level_3/models/level_3_efficientnet.pth',
    'models/level_3_efficientnet.pth',
    '../models/level_3_efficientnet.pth',
]

# The first candidate that exists on disk wins; None means nothing was found.
model_path = next(
    (candidate for candidate in possible_paths if os.path.exists(candidate)),
    None,
)

if model_path is None:
    print("WARNING: Trained model not found. Using random weights for demonstration.")
else:
    print(f"Loading model from {model_path}")
    model.load_state_dict(torch.load(model_path, map_location=device))

# Put the model in evaluation mode before quantization, export and timing.
model.eval()



Loading model from ../level_3/models/level_3_efficientnet.pth


EfficientNet(
  (features): Sequential(
    (0): Conv2dNormActivation(
      (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): SiLU(inplace=True)
    )
    (1): Sequential(
      (0): MBConv(
        (block): Sequential(
          (0): Conv2dNormActivation(
            (0): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
            (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): SiLU(inplace=True)
          )
          (1): SqueezeExcitation(
            (avgpool): AdaptiveAvgPool2d(output_size=1)
            (fc1): Conv2d(32, 8, kernel_size=(1, 1), stride=(1, 1))
            (fc2): Conv2d(8, 32, kernel_size=(1, 1), stride=(1, 1))
            (activation): SiLU(inplace=True)
            (scale_activation): Sigmoid()
          )
          (2): Conv2dNormActivat

## 2. Model Compression: Dynamic Quantization
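We convert the weights of the `nn.Linear` layers from Float32 to Int8. This reduces memory usage, but only modestly here: only the linear layers are quantized, so the saved model shrinks from 16.85 MB to 16.46 MB in the run below.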

In [3]:
# Dynamic quantization: only the nn.Linear modules are converted to int8;
# every other layer of `model` is left as it is.
quantized_model = torch.quantization.quantize_dynamic(
    model, {nn.Linear}, dtype=torch.qint8
)

# Size of the original checkpoint on disk, if one was found earlier.
if model_path:
    print(f"Original Model Size: {os.path.getsize(model_path)/1e6:.2f} MB")
else:
    print("N/A")

# Persist the quantized weights and report the size of the resulting file.
os.makedirs('models', exist_ok=True)
torch.save(quantized_model.state_dict(), 'models/level_5_quantized.pth')
print(f"Quantized Model Size: {os.path.getsize('models/level_5_quantized.pth')/1e6:.2f} MB")

For migrations of users: 
1. Eager mode quantization (torch.ao.quantization.quantize, torch.ao.quantization.quantize_dynamic), please migrate to use torchao eager mode quantize_ API instead 
2. FX graph mode quantization (torch.ao.quantization.quantize_fx.prepare_fx,torch.ao.quantization.quantize_fx.convert_fx, please migrate to use torchao pt2e quantization API instead (prepare_pt2e, convert_pt2e) 
3. pt2e quantization has been migrated to torchao (https://github.com/pytorch/ao/tree/main/torchao/quantization/pt2e) 
see https://github.com/pytorch/ao/issues/2259 for more details
  quantized_model = torch.quantization.quantize_dynamic(


Original Model Size: 16.85 MB
Quantized Model Size: 16.46 MB


## 3. ONNX Export
ONNX is a standard exchange format for deployment. Note that the model exported here is the original Float32 model, not the quantized one.

In [4]:
# Destination file for the exported graph, and a single-image example input
# handed to the exporter (also reused later for the PyTorch benchmarks).
onnx_path = "models/level_5_model.onnx"
dummy_input = torch.randn(1, 3, 224, 224)

torch.onnx.export(
    model,
    dummy_input,
    onnx_path,
    input_names=['input'],
    output_names=['output'],
    # Dimension 0 is declared dynamic under the name 'batch_size' for both
    # the input and the output.
    dynamic_axes={
        'input': {0: 'batch_size'},
        'output': {0: 'batch_size'},
    },
    verbose=False,
)
print(f"Model exported to {onnx_path}")

  torch.onnx.export(


Applied 98 of general pattern rewrite rules.
Model exported to models/level_5_model.onnx


## 4. Latency Benchmarking
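Testing single-image inference speed on CPU (target: < 100 ms per inference). The pass/fail check at the end is applied to the ONNX Runtime result.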

In [5]:
def benchmark(model, input_tensor, name="Model", runs=100, warmup=10):
    """Measure the average latency of calling `model(input_tensor)`.

    Args:
        model: Callable invoked as `model(input_tensor)`.
        input_tensor: Input passed unchanged to every call.
        name: Label used in the printed report line.
        runs: Number of timed calls; must be positive.
        warmup: Number of untimed calls made before timing starts.

    Returns:
        Average time per timed call, in milliseconds.

    Raises:
        ValueError: If `runs` is not positive.
    """
    if runs <= 0:
        raise ValueError(f"runs must be a positive integer, got {runs}")

    with torch.no_grad():
        # Untimed warm-up calls so one-off start-up costs are not measured.
        for _ in range(warmup):
            model(input_tensor)

        # perf_counter() is monotonic and high-resolution; time.time() is
        # wall-clock and can jump if the system clock is adjusted.
        start = time.perf_counter()
        for _ in range(runs):
            model(input_tensor)
        elapsed = time.perf_counter() - start

    avg_time = elapsed / runs * 1000  # seconds -> milliseconds
    print(f"{name} Inference Time: {avg_time:.2f} ms")
    return avg_time

def benchmark_onnx(path, name="ONNX", runs=100, warmup=10,
                   input_shape=(1, 3, 224, 224)):
    """Measure the average ONNX Runtime latency of the model stored at `path`.

    A random float32 array of shape `input_shape` is fed to the model's first
    input on every call.

    Args:
        path: Path of the ONNX model file to load.
        name: Label used in the printed report line.
        runs: Number of timed calls; must be positive.
        warmup: Number of untimed calls made before timing starts.
        input_shape: Shape of the random input array.

    Returns:
        Average time per timed call, in milliseconds.

    Raises:
        ValueError: If `runs` is not positive.
    """
    if runs <= 0:
        raise ValueError(f"runs must be a positive integer, got {runs}")

    session = onnxruntime.InferenceSession(path)
    input_name = session.get_inputs()[0].name
    feed = {input_name: np.random.randn(*input_shape).astype(np.float32)}

    # Untimed warm-up calls so one-off start-up costs are not measured.
    for _ in range(warmup):
        session.run(None, feed)

    # perf_counter() is monotonic and high-resolution; time.time() is
    # wall-clock and can jump if the system clock is adjusted.
    start = time.perf_counter()
    for _ in range(runs):
        session.run(None, feed)
    elapsed = time.perf_counter() - start

    avg_time = elapsed / runs * 1000  # seconds -> milliseconds
    print(f"{name} Inference Time: {avg_time:.2f} ms")
    return avg_time

# Time the three variants with the same single-image workload.
print("--- Benchmarking on CPU ---")
t_orig = benchmark(model, dummy_input, name="Original PyTorch")
t_quant = benchmark(quantized_model, dummy_input, name="Quantized PyTorch")
t_onnx = benchmark_onnx(onnx_path, name="ONNX Runtime")

# Only the ONNX Runtime latency is compared against the 100 ms target.
print("\n--- Results ---")
print(
    "Success: < 100ms Inference Time achieved!"
    if t_onnx < 100
    else "Warning: > 100ms. Consider smaller model or GPU."
)

--- Benchmarking on CPU ---
Original PyTorch Inference Time: 188.23 ms
Quantized PyTorch Inference Time: 55.68 ms
ONNX Runtime Inference Time: 12.17 ms

--- Results ---
Success: < 100ms Inference Time achieved!
