In [None]:
import torch
from ultralytics import YOLO
from models.yolo_vit import HybridKeypointNet
from models.utils import *

# Pick the GPU when one is available, otherwise fall back to CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# --- Build the YOLO11 backbone + hybrid keypoint network ---------------------
yolo11 = YOLO('yolo11l-seg.pt')  # Or yolo11m-seg.pt, yolo11x-seg.pt, etc.
# Keep only the first 12 entries of the YOLO layer sequence as the backbone.
backbone_seq = yolo11.model.model[:12]
# YoloBackbone is not imported by name here; presumably it comes from the
# `models.utils` star import — verify.
backbone = YoloBackbone(backbone_seq, selected_indices=list(range(12)))

# Probe the backbone once with a dummy 128x128 input to discover the size of
# dimension 1 of every returned feature map (the channel count, assuming NCHW).
input_dummy = torch.randn(1, 3, 128, 128)
with torch.no_grad():
    feats = backbone(input_dummy)
in_channels_list = [f.shape[1] for f in feats]

keypoint_net = HybridKeypointNet(backbone, in_channels_list)
model = keypoint_net

# Freeze the backbone parameters.
for param in model.backbone.parameters():
    param.requires_grad = False

model = model.to(device)
# This notebook only runs inference: switch the whole network to eval mode so
# that any dropout / normalization layers behave deterministically.
model.eval()

compiled_model = torch.compile(model)
# The weights are loaded through the compiled wrapper, so the checkpoint keys
# are presumably those of a compiled module — TODO confirm against the
# training script.
# NOTE(review): torch.load unpickles the file; only load checkpoints from a
# trusted source (consider passing weights_only=True).
compiled_model.load_state_dict(torch.load('models/keypoint_model_vit.pth', map_location=device))

In [None]:
import cv2
import numpy as np
import os
import matplotlib.pyplot as plt

# Run the keypoint model on every .jpg in INPUT_DIR and write a copy of the
# (resized) image with the predicted keypoints drawn on it to OUTPUT_DIR.
INPUT_DIR = "results-bed-images"
OUTPUT_DIR = "predicted-bed-keypoints"
INPUT_SIZE = (128, 128)  # same spatial size as the dummy input used to build the model

# cv2.imwrite does not raise when the target folder is missing, it just
# returns False — make sure the folder exists up front.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Sorted so the processing order is reproducible across runs and platforms.
img_files = sorted(os.listdir(INPUT_DIR))
for img_file in img_files:
    if not img_file.endswith('.jpg'):
        continue

    image_path = os.path.join(INPUT_DIR, img_file)
    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread signals an unreadable/corrupt file by returning None.
        print(f"Skipping unreadable image: {image_path}")
        continue

    # Preprocess: BGR -> RGB, scale to [0, 1], resize to the model input size.
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    img = img / 255
    img = cv2.resize(img, INPUT_SIZE)  # Resize to match model

    # HWC -> CHW, add a batch dimension, convert to float32 on the target device.
    chw = np.ascontiguousarray(np.transpose(img, (2, 0, 1)))
    images = torch.from_numpy(chw).float().unsqueeze(0).to(device)

    # Inference only: no autograd graph needed.
    with torch.no_grad():
        outputs = compiled_model(images)
        coords = soft_argmax(outputs)

    # The batch holds exactly one image; take its tensor and keypoints.
    # Each keypoint row is read as (x, y).
    # NOTE(review): the coordinates are drawn directly on the 128x128 image,
    # which assumes soft_argmax returns pixel units at input resolution —
    # TODO confirm.
    kp = coords[0].cpu().numpy()
    canvas = np.transpose(images[0].cpu().numpy(), (1, 2, 0))
    # Convert RGB to BGR for OpenCV, then to an explicit 8-bit image so the
    # JPEG writer receives the depth it natively supports.
    canvas = cv2.cvtColor(canvas, cv2.COLOR_RGB2BGR)
    canvas = np.clip(np.rint(canvas * 255), 0, 255).astype(np.uint8)

    # Render the predicted keypoints on the image as filled red dots.
    for i in range(kp.shape[0]):
        cv2.circle(canvas, (int(kp[i][0]), int(kp[i][1])), 1, (0, 0, 255), -1)

    out_path = os.path.join(OUTPUT_DIR, img_file)
    if not cv2.imwrite(out_path, canvas):
        print(f"Failed to write: {out_path}")