In [None]:
import os
import sys

# Register both relative locations of the project sources on the import path,
# in the same order as before ("../src" first, then "src").
sys.path.extend(("../src", "src"))

# Set ahead of the cell that imports torch.
os.environ.update({'CUDA_LAUNCH_BLOCKING': "1"})

In [None]:
import sys
import os

# list.append() accepts exactly one item, so the two candidate source roots
# are registered one by one. Skipping entries that are already present keeps
# sys.path from growing every time this cell is re-run.
for _src_root in ("../src", "../../src"):
    if _src_root not in sys.path:
        sys.path.append(_src_root)

# Set before torch is imported further down in this cell.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

import json
from functools import partial
from pathlib import Path
from PIL import Image

import numpy
import copy
import time

import torch, re
import torch.nn as nn
import torchvision
from torch.utils.data import ConcatDataset
from torch.utils.data import Dataset, random_split, DataLoader
import torchvision.transforms as T

from packaging.version import Version

import glob
from contextlib import redirect_stdout
from pathlib import Path
print(torch.cuda.is_available())

In [None]:
# MODELS - https://github.com/pytorch/vision/tree/main/torchvision/models/detection
from torchvision.models.detection import (
    fasterrcnn_resnet50_fpn_v2, FasterRCNN_ResNet50_FPN_V2_Weights,
    fasterrcnn_mobilenet_v3_large_fpn, FasterRCNN_MobileNet_V3_Large_FPN_Weights,
    fasterrcnn_mobilenet_v3_large_320_fpn, FasterRCNN_MobileNet_V3_Large_320_FPN_Weights,
    ssd300_vgg16, SSD300_VGG16_Weights,
    ssdlite320_mobilenet_v3_large, SSDLite320_MobileNet_V3_Large_Weights,
    retinanet_resnet50_fpn_v2, RetinaNet_ResNet50_FPN_V2_Weights,
    fcos_resnet50_fpn, FCOS_ResNet50_FPN_Weights
)
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.models.detection.retinanet import RetinaNetClassificationHead
from torchvision.models.detection.fcos import FCOSHead

In [None]:
from src.quant.fp16 import load_calib_images


In [None]:
# ---------------- Dataset locations ----------------------------------------------
data_dir = '../data/images_v1_v2'
coco_path = '../data/annotations_v1_v2/coco_v1_v2.json'

# Sampled annotation file: the fraction (0.2 -> 20% annotations) is part of its name.
aug_perc = 0.2
sample_coco_path = f"../data/annotations_v1_v2/coco_v1_v2_{aug_perc}.json"

# ---------------- Data pipeline ---------------------------------------------------
image_size = (256, 256)  # other sizes noted: (128, 128), (512, 512)
batch_size, val_percent = 2, 0.1
num_classes = 2  # a single object class still needs 2: the object + background

# ---------------- Optim/training hyperparams -----------------------------------
num_epochs, print_freq = 12, 100
learning_rate, momentum, weight_decay = 0.001, 0.9, 1e-4

# ---------------- Quant hyperparams ----------------------------------------------
calib_images_dir = "../data/images_v1_v2"  # <-- change to your path

In [None]:
# Checkpoint of the trained detector and the class count its box head was built with.
model_path = Path("../models/frcnn_mobilenet/frcnn_mobilenet_epoch10.pth")
num_classes = 2  # <-- set to your dataset (background + N objects)

print(f"\033[91m{model_path}\033[0m")

# Build the architecture. weights=None: no pretrained detection weights are
# requested here; the checkpoint loaded below supplies the trained parameters.
model = fasterrcnn_mobilenet_v3_large_fpn(weights=None)

# Swap the box predictor for one sized to `num_classes`, reusing the input
# width of the predictor that the constructor created.
in_feat = model.roi_heads.box_predictor.cls_score.in_features
model.roi_heads.box_predictor = FastRCNNPredictor(in_feat, num_classes)

# Load weights from the path defined at the top of this cell (the previous code
# referenced an undefined variable and raised NameError).
# strict=True makes any key mismatch between checkpoint and model an error.
state = torch.load(model_path, map_location="cpu")
model.load_state_dict(state, strict=True)
model.eval()

In [None]:
# Pick the device first; the report lines below are printed in the original order.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Read at most 128 images from the calibration folder via the project helper.
calib_imgs = load_calib_images(calib_images_dir, max_imgs=128)

print(f"Loaded {len(calib_imgs)} images for calibration.")
print(f"[FP16] device: {device}")

In [None]:
# Parsed torch version with any "+<local build tag>" suffix removed.
TORCH_VER = Version(re.sub(r'\+.*$', '', torch.__version__))

# 1) Import locations
prepare_fx = convert_fx = prepare_qat_fx = None
quantize_dynamic = None

# The FX graph-mode helpers have lived under different module paths over time.
# Try the torch.ao location first and fall back to the legacy torch.quantization
# one. Only ImportError is handled, so unrelated failures are not hidden.
try:
    from torch.ao.quantization.quantize_fx import prepare_fx, convert_fx, prepare_qat_fx
except ImportError:
    try:
        # NOTE(review): this package does not appear to re-export these helpers;
        # kept only as a fallback from the previous version of this cell - confirm and drop.
        from torch.ao.quantization.fx import prepare_fx, convert_fx, prepare_qat_fx
    except ImportError:
        from torch.quantization.quantize_fx import prepare_fx, convert_fx, prepare_qat_fx  # old path

# Dynamic quant moved too:
try:
    from torch.ao.quantization import quantize_dynamic
except ImportError:
    from torch.quantization import quantize_dynamic

# 2) qconfig / qconfig_mapping differences
# - Newer torch: get_default_qconfig_mapping(backend)
# - Older torch: get_default_qconfig(backend), passed as qconfig_dict={"": qconfig}
qconfig_mapping = None
qconfig_dict = None

# Prefer qnnpack, but only switch to it when this torch build actually supports
# it; otherwise keep whatever engine is already active instead of failing on
# the assignment. `backend` always names the engine that is in effect.
if "qnnpack" in torch.backends.quantized.supported_engines:
    torch.backends.quantized.engine = "qnnpack"
else:
    print(f"[Quant FX Compat] qnnpack not supported by this build; "
          f"keeping engine={torch.backends.quantized.engine}")
backend = torch.backends.quantized.engine

try:
    from torch.ao.quantization import get_default_qconfig_mapping
    qconfig_mapping = get_default_qconfig_mapping(backend)
except Exception:
    # Deliberately broad: if the mapping API is missing *or* rejects this
    # backend, fall back to the older single-qconfig API.
    try:
        # Older API
        from torch.quantization import get_default_qconfig
    except ImportError:
        from torch.ao.quantization import get_default_qconfig
    qconfig = get_default_qconfig(backend)
    qconfig_dict = {"": qconfig}

def fx_prepare(module, example_inputs):
    """
    Version-agnostic wrapper around the ``prepare_fx`` imported by this notebook.

    Args:
        module: float module to be prepared (observers inserted) for quantization.
        example_inputs: a single example input; it is wrapped in a 1-tuple
            before being handed to ``prepare_fx``.

    Returns:
        Whatever ``prepare_fx`` returns for ``module``.

    The qconfig argument is ``qconfig_mapping`` when it was created, otherwise
    the legacy ``qconfig_dict``. ``example_inputs`` is forwarded only when the
    imported ``prepare_fx`` declares that parameter, because older signatures
    do not take it and would raise ``TypeError``.
    """
    import inspect  # local import keeps this helper self-contained

    qconfig_arg = qconfig_mapping if qconfig_mapping is not None else qconfig_dict
    accepts_example_inputs = "example_inputs" in inspect.signature(prepare_fx).parameters
    if accepts_example_inputs:
        return prepare_fx(module, qconfig_arg, example_inputs=(example_inputs,))
    # older API: prepare_fx(module, qconfig_dict) with no example inputs
    return prepare_fx(module, qconfig_arg)

def fx_convert(prepared):
    """Hand a prepared module to ``convert_fx`` and return its result."""
    converted = convert_fx(prepared)
    return converted

# One-line summary of the quantization setup chosen above.
_qconfig_kind = 'qconfig_mapping' if qconfig_mapping is not None else 'qconfig_dict'
print(
    f"[Quant FX Compat] torch=={torch.__version__} | engine={torch.backends.quantized.engine} | "
    f"uses {_qconfig_kind}"
)


In [None]:
# 2.1 Extract the body we want to quantize (leave FPN + heads float).
# Note: `float_model` is another name for `model`, not a copy of it.
float_model = model  # original float model (eval() was called when it was loaded)
float_body = float_model.backbone.body     # the `body` submodule of the detector backbone

# 2.2 Thin module wrapper that forwards its input straight to a backbone body
class BodyWrapper(nn.Module):
    """Holds a backbone ``body`` module and delegates ``forward`` to it."""

    def __init__(self, body: nn.Module):
        """Store ``body`` as the ``body`` submodule."""
        super().__init__()
        self.body = body

    def forward(self, x):
        """Return ``self.body(x)``.

        Per the original note, ``x`` is expected to be a float32 tensor of
        shape [N, 3, H, W] with values in [0, 1].
        """
        features = self.body(x)
        return features

# Work on a *copy* so the original model is not mutated unless we succeed
float_body_copy = copy.deepcopy(float_body).eval()


def _to_backbone_input(img):
    """Turn one CHW image into the batched tensor the backbone body receives.

    Inside the detector the body never sees raw images: `float_model.transform`
    runs first. Calibrating on raw images would record activation ranges for
    inputs the quantized body will not get at inference time, so the same
    transform is applied here. `transform` returns (ImageList, targets); the
    batched tensor is `ImageList.tensors`.
    """
    image_list, _ = float_model.transform([img])
    return image_list.tensors


# 2.3 Prepare FX graph for static PTQ
if not len(calib_imgs):
    print("WARNING: no calibration images loaded; observers will only see a random tensor.")
example_img = calib_imgs[0] if len(calib_imgs) else torch.rand(3, 640, 640)  # fallback fake size
with torch.no_grad():
    example_batch = _to_backbone_input(example_img)

prepared = fx_prepare(float_body_copy, example_batch)
prepared.eval()

# 2.4 Calibration: run representative images through the prepared module,
# one at a time so only a single transformed batch is held in memory.
with torch.inference_mode():
    for img in calib_imgs:
        prepared(_to_backbone_input(img))

# 2.5 Convert to quantized module
quantized_body = fx_convert(prepared)

# 2.6 Reattach to a copy of the original model (the float model stays intact)
quantized_model = copy.deepcopy(float_model).eval()
quantized_model.backbone.body = quantized_body
quantized_model.eval()

print("Static/PTQ: quantized backbone attached.")


In [None]:
# Dynamic INT8 quantization is requested for nn.Linear modules only, so it is
# applied to the two ROI-head submodules. The RPN head's `cls_logits` and
# `bbox_pred` are convolution layers in torchvision, which the {nn.Linear}
# filter leaves untouched - the earlier calls on them were no-ops, so they
# are dropped and the message below no longer claims RPN layers were quantized.
for _head_name in ("box_head", "box_predictor"):
    _float_head = getattr(quantized_model.roi_heads, _head_name)
    _quant_head = quantize_dynamic(_float_head, {nn.Linear}, dtype=torch.qint8)
    setattr(quantized_model.roi_heads, _head_name, _quant_head)

print("Dynamic: quantized ROI head linear layers.")


In [None]:
def run_infer(m, imgs, warmup=3, iters=10):
    """Measure the average per-image forward time of ``m``.

    Args:
        m: model called as ``m([img])``; it is switched to eval mode.
        imgs: non-empty sequence of images.
        warmup: number of untimed calls on ``imgs[0]`` before timing starts.
        iters: upper bound on timed calls; at most ``len(imgs)`` are made,
            one per image, in order.

    Returns:
        Elapsed seconds divided by the number of timed calls.

    Raises:
        ValueError: if ``imgs`` is empty or ``iters`` is not positive, since
            no average can be computed.
    """
    if len(imgs) == 0:
        raise ValueError("run_infer needs at least one image")
    n_timed = min(iters, len(imgs))
    if n_timed <= 0:
        raise ValueError("iters must be a positive integer")

    m.eval()
    with torch.inference_mode():
        # warmup
        for _ in range(warmup):
            m([imgs[0]])
        # perf_counter is the clock intended for measuring short durations;
        # time.time() can jump when the system clock is adjusted.
        start = time.perf_counter()
        for i in range(n_timed):
            m([imgs[i]])
        elapsed = time.perf_counter() - start
    return elapsed / n_timed

# Benchmark inputs: the first 8 calibration images, or 8 random tensors when
# no calibration images were loaded.
if len(calib_imgs):
    test_imgs = calib_imgs[:8]
else:
    test_imgs = [torch.rand(3,640,640) for _ in range(8)]

# Float reference: an independent deep copy of the original model.
baseline_model = copy.deepcopy(model).eval()

# Time the float model first, then the quantized one, on identical inputs.
t_float = run_infer(baseline_model, test_imgs)
t_quant = run_infer(quantized_model, test_imgs)

print(f"Avg CPU latency — float:  {t_float:.3f}s/img")
print(f"Avg CPU latency — quant:  {t_quant:.3f}s/img")

# Structural sanity check: compare output tensor shapes on the first image.
with torch.inference_mode():
    o_float = baseline_model([test_imgs[0]])[0]
    o_quant = quantized_model([test_imgs[0]])[0]

for k in ("boxes", "scores", "labels"):
    float_shape, quant_shape = o_float[k].shape, o_quant[k].shape
    print(k, float_shape, "->", quant_shape)


In [None]:
save_dir = Path("../models/frcnn_mobilenet_quantized")
save_dir.mkdir(parents=True, exist_ok=True)

# Each output path is defined once so the log lines always name the file that
# was really written (the messages used to print different file names).
scripted_path = save_dir / "model_scripted_quant_1.pt"
state_dict_path = save_dir / "model_quantized_state_dict_1.pth"

# Save the state_dict first: it does not depend on TorchScript, so it is still
# written even if scripting the quantized model fails below.
torch.save(quantized_model.state_dict(), state_dict_path)
print("Saved quantized state_dict:", state_dict_path)

# 5a. TorchScript export (best for CPU-only inference deployment).
# Any scripting error is raised here, after the state_dict is already on disk.
scripted = torch.jit.script(quantized_model)
torch.jit.save(scripted, str(scripted_path))
print("Saved TorchScript quantized model:", scripted_path)