In [None]:
# Environment bootstrap: expose the project sources on the import path and
# configure CUDA before any CUDA-using library is imported.
import os
import sys

# Register both candidate locations of the source tree, in this order.
sys.path.extend(["../src", "src"])

# CUDA_LAUNCH_BLOCKING=1 asks CUDA to run kernel launches synchronously, which
# makes device-side errors surface at the call that caused them (debug aid).
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

In [None]:
# Environment bootstrap for this cell: make the project sources importable and
# configure CUDA before torch is imported further down.
import os
import sys

# list.append() takes exactly one item, so passing two paths in a single call
# raises TypeError. Add each candidate source directory individually, skipping
# entries that are already present so re-running the cell does not pile up
# duplicates on sys.path.
sys.path.extend(
    src_dir for src_dir in ("../src", "../../src") if src_dir not in sys.path
)

# CUDA_LAUNCH_BLOCKING=1 asks CUDA to run kernel launches synchronously, which
# makes device-side errors surface at the call that caused them (debug aid).
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

import json
from functools import partial
from pathlib import Path
from PIL import Image

import numpy
import copy
import time

import torch
import torch.nn as nn
import torchvision
from torch.utils.data import ConcatDataset
from torch.utils.data import Dataset, random_split, DataLoader
import torchvision.transforms as T

import glob
from contextlib import redirect_stdout
from pathlib import Path
print(torch.cuda.is_available())

In [None]:
# MODELS - https://github.com/pytorch/vision/tree/main/torchvision/models/detection
from torchvision.models.detection import (
    fasterrcnn_resnet50_fpn_v2, FasterRCNN_ResNet50_FPN_V2_Weights,
    fasterrcnn_mobilenet_v3_large_fpn, FasterRCNN_MobileNet_V3_Large_FPN_Weights,
    fasterrcnn_mobilenet_v3_large_320_fpn, FasterRCNN_MobileNet_V3_Large_320_FPN_Weights,
    ssd300_vgg16, SSD300_VGG16_Weights,
    ssdlite320_mobilenet_v3_large, SSDLite320_MobileNet_V3_Large_Weights,
    retinanet_resnet50_fpn_v2, RetinaNet_ResNet50_FPN_V2_Weights,
    fcos_resnet50_fpn, FCOS_ResNet50_FPN_Weights
)
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.models.detection.retinanet import RetinaNetClassificationHead
from torchvision.models.detection.fcos import FCOSHead

In [None]:
from src.quant.fp16 import load_calib_images, norms_to_fp32, linears_to_half, run_quant_infer

In [None]:
# ---------------- Data locations -------------------------------------------------
data_dir = "../data/images_v1_v2"
coco_path = "../data/annotations_v1_v2/coco_v1_v2.json"

# Sub-sampled annotation file: the fraction (0.2 -> 20% of the annotations)
# is embedded in the file name.
aug_perc = 0.2
sample_coco_path = f"../data/annotations_v1_v2/coco_v1_v2_{aug_perc}.json"

# ---------------- Dataset / loader settings --------------------------------------
image_size = (256, 256)  # alternatives tried: (128, 128), (512, 512)
batch_size = 2
val_percent = 0.1
# One object class plus the implicit background class -> 2 classes in total.
num_classes = 2

# ---------------- Optim/training hyperparams -------------------------------------
num_epochs = 12
learning_rate = 1e-3
momentum = 0.9
weight_decay = 1e-4
print_freq = 100

# ---------------- Quant hyperparams ----------------------------------------------
# Directory the calibration images are read from  <-- change to your path
calib_images_dir = "../data/images_v1_v2"

In [None]:
# Your paths / hyperparams
# Checkpoint holding the fine-tuned Faster R-CNN (MobileNetV3-Large FPN) weights.
model_path = Path("../models/frcnn_mobilenet/frcnn_mobilenet_epoch10.pth")
num_classes = 2  # <-- set to your dataset (background + N objects)

# Echo the checkpoint path, wrapped in ANSI colour escapes so it stands out.
print(f"\033[91m{model_path}\033[0m")

# Build base model: no detection weights are requested because the trained
# checkpoint is loaded below. The box predictor is replaced so that its output
# size matches `num_classes`.
model = fasterrcnn_mobilenet_v3_large_fpn(weights=None)
in_feat = model.roi_heads.box_predictor.cls_score.in_features
model.roi_heads.box_predictor = FastRCNNPredictor(in_feat, num_classes)

# Load weights from `model_path` (the path defined and printed above). This
# cell previously read `kd_horz_clrjtr_rot_model_path`, a name that is never
# defined in the notebook, so it failed with NameError on a fresh kernel.
# NOTE(review): torch.load unpickles the file -- only load checkpoints you trust.
state = torch.load(model_path, map_location="cpu")
# strict=True: fail loudly if the checkpoint keys do not match the architecture.
model.load_state_dict(state, strict=True)
model.eval()

In [None]:
# Read the calibration images from `calib_images_dir`; `max_imgs=128` is
# presumably an upper bound on how many are loaded (see src/quant/fp16.py).
calib_imgs = load_calib_images(calib_images_dir, max_imgs=128)
print(f"Loaded {len(calib_imgs)} images for calibration.")

# Use the GPU when one is visible to torch, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"[FP16] device: {device}")

#### Option 1
Convert only the backbone to half precision; keep the normalization layers in float32.

In [None]:
# Float32 reference. Module.eval()/.to() return the module itself, so
# `float_model` is the same object as `model`, now in eval mode on `device`.
model.eval()
float_model = model.to(device)

# The half-precision conversion is applied to an independent deep copy, so the
# float32 weights above are left untouched.
fp16_model = copy.deepcopy(float_model)
fp16_model.eval()
fp16_model.to(device)

# Cast the convolutional backbone body to float16 ...
fp16_model.backbone.body.half()

# ... then hand the same sub-module to the project helper, which (per its name
# and the message below) presumably restores normalization layers to float32.
norms_to_fp32(fp16_model.backbone.body)
print("FP16: backbone converted to half; norms kept in float32.")

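#### Option 2
Convert the backbone and the `Linear` layers to half precision; keep the normalization layers in float32. This cell rebuilds `fp16_model` from `model`, replacing the Option 1 result.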

In [None]:
# Option 2 starts from a fresh deep copy of the float32 model, so it does not
# build on whatever Option 1 produced.
fp16_model = copy.deepcopy(model).eval().to(device)

# Cast the backbone body to float16 first. Without this step the backbone
# stays in float32, which contradicts the "backbone + Linear in half" message
# printed below and leaves norms_to_fp32 with nothing to undo.
fp16_model.backbone.body.half()
# Project helper; presumably restores normalization layers to float32
# (per its name and the printed message) -- confirm in src/quant/fp16.py.
norms_to_fp32(fp16_model.backbone.body)

# Project helper; presumably casts the nn.Linear layers found under the given
# sub-module to float16 -- confirm in src/quant/fp16.py.
linears_to_half(fp16_model.roi_heads)
linears_to_half(fp16_model.rpn.head)

print("FP16 ready: backbone + Linear in half, norms in float32.")

In [None]:
# Inference uses autocast: one smoke-test forward pass through the FP16 model.
# Fall back to a random 3x640x640 tensor when no calibration image was loaded
# (same fallback as the benchmark cell), instead of failing with IndexError.
warmup_img = calib_imgs[0] if len(calib_imgs) else torch.rand(3, 640, 640)

# Autocast is only switched on when running on CUDA, matching the sanity check
# in the benchmark cell; on a CPU-only machine the context is a no-op.
with torch.inference_mode(), torch.autocast(
    device_type="cuda", dtype=torch.float16, enabled=(device.type == "cuda")
):
    _ = fp16_model([warmup_img.to(device)])

In [None]:
# Benchmark inputs: the first eight calibration images, or eight random
# 3x640x640 tensors when no calibration image was loaded.
if len(calib_imgs):
    test_imgs = calib_imgs[:8]
else:
    test_imgs = [torch.rand(3,640,640) for _ in range(8)]

# Separate float32 copy of the model to compare against.
baseline_model = copy.deepcopy(model)
baseline_model.eval()
baseline_model.to(device)

# run_quant_infer is a project helper; judging by the messages below it
# presumably returns the average latency in seconds per image.
t_float = run_quant_infer(baseline_model, test_imgs)
t_fp16 = run_quant_infer(fp16_model, test_imgs)

print(f"Avg latency — float32: {t_float:.3f}s/img")
print(f"Avg latency — fp16:    {t_fp16:.3f}s/img")

# Structural sanity check: push one image through both models (autocast is
# only active on CUDA) and compare the shapes of the detection outputs.
with torch.inference_mode():
    img0 = test_imgs[0].to(device)
    with torch.autocast(
        device_type="cuda",
        dtype=torch.float16,
        enabled=(device.type=='cuda'),
    ):
        o_float = baseline_model([img0])[0]
        o_fp16  = fp16_model([img0])[0]

for key in ("boxes", "scores", "labels"):
    print(key, o_float[key].shape, "->", o_fp16[key].shape)

In [None]:
# Output directory for the exported FP16 artefacts (created if missing).
save_dir = Path("../models/frcnn_mobilenet_fp16")
save_dir.mkdir(parents=True, exist_ok=True)

# Export from a CPU deep copy. Calling .cpu() on `fp16_model` itself would move
# the shared in-memory model off the GPU as a side effect of saving, so
# re-running the inference/benchmark cells afterwards would hit a device
# mismatch.
export_model = copy.deepcopy(fp16_model).cpu().eval()

# TorchScript export, scripted on CPU for portability.
scripted_path = save_dir / "model_scripted_fp16.pt"
scripted = torch.jit.script(export_model)
torch.jit.save(scripted, scripted_path)
print("Saved TorchScript FP16 model:", scripted_path)

# State dict preserves FP16 weights in the converted submodules
state_dict_path = save_dir / "model_fp16_state_dict.pth"
torch.save(export_model.state_dict(), state_dict_path)
print("Saved FP16 state_dict:", state_dict_path)