# Level 5: Production System & Optimization

**Objective**: Build a production-ready system with deployment optimization.

**Techniques**:
- **Model Quantization**: Compressing model weights to INT8 to reduce size and speed up CPU inference.
- **ONNX Export**: Converting PyTorch model to ONNX format for cross-platform deployment.
- **Benchmarking**: Measuring Latency (<100ms goal) and Model Size.

In [None]:
import os
import time
import numpy as np
import torch
import torch.nn as nn
from torchvision import models, datasets, transforms
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split

# ONNX tooling is imported lazily-installed: try the import first and only
# fall back to installing the packages when they are missing.
try:
    import onnx
    import onnxruntime
except ImportError:
    import importlib
    import subprocess
    import sys

    print("Installing ONNX libraries...")
    # Invoke pip through the interpreter that runs this kernel so the packages
    # land in the same environment the retry imports below will search.
    subprocess.check_call(
        [sys.executable, "-m", "pip", "install", "onnx", "onnxruntime"]
    )
    # Packages installed while the process is running may be missed by the
    # import system's cached directory listings; refresh them before retrying.
    importlib.invalidate_caches()
    import onnx
    import onnxruntime

# Inference device is pinned to the CPU.
device = torch.device("cpu")
print(f"Using device: {device}")

## 1. Load Trained EfficientNet (Level 3)

In [None]:
def build_efficientnet(num_classes=102):
    """Build an untrained EfficientNet-B0 with a resized classification head.

    Args:
        num_classes: Number of output logits for the final linear layer.
            Defaults to 102, the value this notebook's checkpoint expects.

    Returns:
        The torchvision EfficientNet-B0 module with ``classifier[1]`` replaced
        by a fresh ``nn.Linear`` producing ``num_classes`` outputs.
    """
    # ``weights=None`` is the current spelling of the deprecated
    # ``pretrained=False``: no pretrained weights are downloaded.
    model = models.efficientnet_b0(weights=None)
    # Keep the head's input width, change only its output width.
    head = model.classifier[1]
    model.classifier[1] = nn.Linear(head.in_features, num_classes)
    return model

model = build_efficientnet()

# Candidate checkpoint locations, probed in order; the first existing one wins.
possible_paths = [
    '../level_3/models/level_3_efficientnet.pth',
    'models/level_3_efficientnet.pth',
    '../models/level_3_efficientnet.pth',
]
model_path = next(
    (candidate for candidate in possible_paths if os.path.exists(candidate)),
    None,
)

if model_path is None:
    # No checkpoint on disk: keep the freshly initialised weights.
    print("WARNING: Trained model not found. Using random weights for demonstration.")
else:
    print(f"Loading model from {model_path}")
    state_dict = torch.load(model_path, map_location=device)
    model.load_state_dict(state_dict)

# Switch to inference mode; as the cell's last expression this also displays the model.
model.eval()

## 2. Model Compression: Dynamic Quantization
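We convert the weights of the model's `nn.Linear` layers from Float32 to Int8 (dynamic quantization). This reduces memory usage for the quantized layers; all other layers stay in Float32.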

In [None]:
# Dynamic quantization: only nn.Linear modules are converted, using qint8.
quantized_model = torch.quantization.quantize_dynamic(
    model,
    {nn.Linear},
    dtype=torch.qint8,
)

# Report the on-disk size of the source checkpoint, if one was found.
# The label is always printed so the missing-checkpoint case stays readable.
if model_path:
    original_size = f"{os.path.getsize(model_path) / 1e6:.2f} MB"
else:
    original_size = "N/A"
print(f"Original Model Size: {original_size}")

# Save the quantized weights and report the resulting file size.
quantized_path = 'models/level_5_quantized.pth'
os.makedirs(os.path.dirname(quantized_path), exist_ok=True)
torch.save(quantized_model.state_dict(), quantized_path)
print(f"Quantized Model Size: {os.path.getsize(quantized_path) / 1e6:.2f} MB")

## 3. ONNX Export
ONNX is a standard model exchange format for deployment.

In [None]:
# Example input used to trace the model during export (and reused for benchmarking).
dummy_input = torch.randn(1, 3, 224, 224)
onnx_path = "models/level_5_model.onnx"

# Create the output directory here so this cell does not depend on an
# earlier cell having created it.
os.makedirs(os.path.dirname(onnx_path), exist_ok=True)

torch.onnx.export(
    model,
    dummy_input,
    onnx_path,
    verbose=False,
    input_names=['input'],
    output_names=['output'],
    # Mark dimension 0 of both tensors as variable so the exported graph
    # is not fixed to the dummy input's batch size of 1.
    dynamic_axes={'input': {0: 'batch_size'}, 'output': {0: 'batch_size'}},
)
print(f"Model exported to {onnx_path}")

## 4. Latency Benchmarking
Testing inference speed (Target < 100ms).

In [None]:
def benchmark(model, input_tensor, name="Model", runs=100, warmup=10):
    """Measure the average latency of calling ``model(input_tensor)``.

    Args:
        model: Callable invoked as ``model(input_tensor)``.
        input_tensor: The single argument passed to ``model`` on every call.
        name: Label used in the printed report.
        runs: Number of timed calls; must be positive.
        warmup: Number of untimed calls made before timing starts.

    Returns:
        Average time per timed call, in milliseconds.

    Raises:
        ValueError: If ``runs`` is not positive.
    """
    if runs <= 0:
        raise ValueError(f"runs must be a positive integer, got {runs}")

    with torch.no_grad():
        # Untimed warm-up calls so one-off start-up costs are not measured.
        for _ in range(warmup):
            model(input_tensor)

        # perf_counter is a monotonic, high-resolution clock meant for
        # measuring short durations; time.time() can jump with clock changes.
        start = time.perf_counter()
        for _ in range(runs):
            model(input_tensor)
        elapsed = time.perf_counter() - start

    avg_time = elapsed / runs * 1000  # seconds -> milliseconds
    print(f"{name} Inference Time: {avg_time:.2f} ms")
    return avg_time

def benchmark_onnx(path, name="ONNX", runs=100, warmup=10,
                   input_shape=(1, 3, 224, 224)):
    """Measure the average ONNX Runtime latency for the model at ``path``.

    Args:
        path: Path to the ``.onnx`` file to load.
        name: Label used in the printed report.
        runs: Number of timed ``session.run`` calls; must be positive.
        warmup: Number of untimed calls made before timing starts.
        input_shape: Shape of the random float32 array fed to the model's
            first input.

    Returns:
        Average time per timed call, in milliseconds.

    Raises:
        ValueError: If ``runs`` is not positive.
    """
    if runs <= 0:
        raise ValueError(f"runs must be a positive integer, got {runs}")

    # Name the provider explicitly: onnxruntime builds that ship more than
    # one execution provider require it, and this benchmark targets the CPU.
    session = onnxruntime.InferenceSession(
        path, providers=["CPUExecutionProvider"]
    )
    input_name = session.get_inputs()[0].name
    feed = {input_name: np.random.randn(*input_shape).astype(np.float32)}

    # Untimed warm-up calls so one-off start-up costs are not measured.
    for _ in range(warmup):
        session.run(None, feed)

    # perf_counter is a monotonic, high-resolution clock meant for
    # measuring short durations.
    start = time.perf_counter()
    for _ in range(runs):
        session.run(None, feed)
    elapsed = time.perf_counter() - start

    avg_time = elapsed / runs * 1000  # seconds -> milliseconds
    print(f"{name} Inference Time: {avg_time:.2f} ms")
    return avg_time

# Time all three variants on the same dummy input / exported file.
print("--- Benchmarking on CPU ---")
t_orig = benchmark(model, dummy_input, name="Original PyTorch")
t_quant = benchmark(quantized_model, dummy_input, name="Quantized PyTorch")
t_onnx = benchmark_onnx(onnx_path, name="ONNX Runtime")

# Only the ONNX Runtime latency is compared against the 100 ms target.
print("\n--- Results ---")
print(
    "Success: < 100ms Inference Time achieved!"
    if t_onnx < 100
    else "Warning: > 100ms. Consider smaller model or GPU."
)