In [5]:
from lib.utils.learning import load_backbone
from lib.utils.tools import get_config
import os
import subprocess
from lib.model.model_action import ActionNet

def getActionNet(yaml):
    args = get_config(yaml)
    model_backbone = load_backbone(args)
    # if args.finetune:
    #     chk_filename = os.path.join(opts.pretrained, opts.selection)
    #     print('Loading backbone', chk_filename)
    #     checkpoint = torch.load(chk_filename, map_location=lambda storage, loc: storage)['model_pos']
    #     model_backbone = load_pretrained_weights(model_backbone, checkpoint)
    # if args.partial_train:
    #     model_backbone = partial_train_layers(model_backbone, args.partial_train)
    return   ActionNet(backbone=model_backbone, dim_rep=args.dim_rep, num_classes=args.action_classes, dropout_ratio=args.dropout_ratio, version=args.model_version, hidden_dim=args.hidden_dim, num_joints=args.num_joints)


def get_gpu_memory_usage():
    """Returns GPU memory usage in GB (first device only)."""
    result = subprocess.check_output(
        ['nvidia-smi', '--query-gpu=memory.used', '--format=csv,nounits,noheader']
    )
    memory_used = int(result.decode().split('\n')[0])
    return memory_used / 1024  # Convert MB to GB

In [19]:
import torch
import time
import cv2
import numpy as np
from ultralytics import YOLO

# Load model (adjust path if needed)
model = YOLO("yolo11n-pose.pt")
model.eval()

# Load dummy image (or use webcam frame)
img = np.random.randint(0, 255, (640, 640, 3), dtype=np.uint8)
img_tensor = torch.from_numpy(img).permute(2, 0, 1).unsqueeze(0).float() / 255.0  # Shape: [1, 3, 640, 640]

model_action = getActionNet('./configs/action/MB_ft_NTU60_xsub_lite_temp_comp.yaml').cuda()
model_action.eval()

input_tensor = torch.randn(1, 2, 243, 17, 3).cuda()
# Warm-up
with torch.no_grad():
    for _ in range(10):
        _ = model('bus.jpg')
        _ = model_action(input_tensor)

# Measure FPS
num_iters = 100
torch.cuda.synchronize()
start_time = time.time()

with torch.no_grad():
    for _ in range(num_iters):
        _ = model('bus.jpg')
        _ = model_action(input_tensor)
        torch.cuda.synchronize()

end_time = time.time()
elapsed = end_time - start_time
fps = num_iters / elapsed

print(f"YOLOv11 Pose Estimator FPS: {fps:.2f}")

True

image 1/1 /home/jovianto/MotionBERT/bus.jpg: 640x480 4 persons, 14.9ms
Speed: 3.0ms preprocess, 14.9ms inference, 1.7ms postprocess per image at shape (1, 3, 640, 480)

image 1/1 /home/jovianto/MotionBERT/bus.jpg: 640x480 4 persons, 17.0ms
Speed: 2.9ms preprocess, 17.0ms inference, 2.8ms postprocess per image at shape (1, 3, 640, 480)

image 1/1 /home/jovianto/MotionBERT/bus.jpg: 640x480 4 persons, 15.7ms
Speed: 6.1ms preprocess, 15.7ms inference, 2.3ms postprocess per image at shape (1, 3, 640, 480)

image 1/1 /home/jovianto/MotionBERT/bus.jpg: 640x480 4 persons, 14.4ms
Speed: 3.2ms preprocess, 14.4ms inference, 3.6ms postprocess per image at shape (1, 3, 640, 480)

image 1/1 /home/jovianto/MotionBERT/bus.jpg: 640x480 4 persons, 14.5ms
Speed: 3.0ms preprocess, 14.5ms inference, 11.1ms postprocess per image at shape (1, 3, 640, 480)

image 1/1 /home/jovianto/MotionBERT/bus.jpg: 640x480 4 persons, 16.2ms
Speed: 2.9ms preprocess, 16.2ms inference, 3.0ms postprocess per image at sha

In [23]:
import torch
import time
import cv2
import numpy as np
from ultralytics import YOLO

# Load model (adjust path if needed)
model = YOLO("yolo11s-pose.pt").cuda()
model.eval()

# Load dummy image (or use webcam frame)
img = np.random.randint(0, 255, (640, 640, 3), dtype=np.uint8)
img_tensor = torch.from_numpy(img).permute(2, 0, 1).unsqueeze(0).float() / 255.0  # Shape: [1, 3, 640, 640]

model_action = getActionNet('./configs/action/MB_ft_NTU60_xsub_lite_temp_comp.yaml').cuda()
model_action.eval()

input_tensor = torch.randn(1, 2, 243, 17, 3).cuda()
# Warm-up
with torch.no_grad():
    for _ in range(10):
        _ = model('bus.jpg')
        _ = model_action(input_tensor)

# Measure FPS
num_iters = 100
torch.cuda.synchronize()
start_time = time.time()

with torch.no_grad():
    for _ in range(num_iters):
        _ = model('bus.jpg')
        _ = model_action(input_tensor)
        torch.cuda.synchronize()

end_time = time.time()
elapsed = end_time - start_time
fps = num_iters / elapsed

print(f"YOLOv11 Pose Estimator FPS: {fps:.2f}")

True

image 1/1 /home/jovianto/MotionBERT/bus.jpg: 640x480 4 persons, 12.8ms
Speed: 2.9ms preprocess, 12.8ms inference, 1.8ms postprocess per image at shape (1, 3, 640, 480)

image 1/1 /home/jovianto/MotionBERT/bus.jpg: 640x480 4 persons, 12.3ms
Speed: 2.5ms preprocess, 12.3ms inference, 2.1ms postprocess per image at shape (1, 3, 640, 480)

image 1/1 /home/jovianto/MotionBERT/bus.jpg: 640x480 4 persons, 17.7ms
Speed: 4.6ms preprocess, 17.7ms inference, 3.1ms postprocess per image at shape (1, 3, 640, 480)

image 1/1 /home/jovianto/MotionBERT/bus.jpg: 640x480 4 persons, 14.1ms
Speed: 2.4ms preprocess, 14.1ms inference, 2.8ms postprocess per image at shape (1, 3, 640, 480)

image 1/1 /home/jovianto/MotionBERT/bus.jpg: 640x480 4 persons, 17.6ms
Speed: 2.2ms preprocess, 17.6ms inference, 2.1ms postprocess per image at shape (1, 3, 640, 480)

image 1/1 /home/jovianto/MotionBERT/bus.jpg: 640x480 4 persons, 34.3ms
Speed: 3.0ms preprocess, 34.3ms inference, 7.6ms postprocess per image at shap

In [24]:
import torch
import time
import cv2
import numpy as np
from ultralytics import YOLO

# Load model (adjust path if needed)
model = YOLO("yolo11n-pose.pt").cuda()
model.eval()

# Load dummy image (or use webcam frame)
img = np.random.randint(0, 255, (640, 640, 3), dtype=np.uint8)
img_tensor = torch.from_numpy(img).permute(2, 0, 1).unsqueeze(0).float() / 255.0  # Shape: [1, 3, 640, 640]

model_action = getActionNet('./configs/action/MB_ft_NTU60_xsub_lite_temp_comp.yaml').cuda()
model_action.eval()

input_tensor = torch.randn(1, 2, 243, 17, 3).cuda()
# Warm-up
with torch.no_grad():
    for _ in range(10):
        _ = model('bus.jpg')
        _ = model_action(input_tensor)

# Measure FPS
num_iters = 100
torch.cuda.synchronize()
start_time = time.time()

with torch.no_grad():
    for _ in range(num_iters):
        _ = model('bus.jpg')
        _ = model_action(input_tensor)
        torch.cuda.synchronize()

end_time = time.time()
elapsed = end_time - start_time
fps = num_iters / elapsed

print(f"YOLOv11 Pose Estimator FPS: {fps:.2f}")

True

image 1/1 /home/jovianto/MotionBERT/bus.jpg: 640x480 4 persons, 13.6ms
Speed: 2.0ms preprocess, 13.6ms inference, 4.5ms postprocess per image at shape (1, 3, 640, 480)

image 1/1 /home/jovianto/MotionBERT/bus.jpg: 640x480 4 persons, 13.4ms
Speed: 2.5ms preprocess, 13.4ms inference, 2.5ms postprocess per image at shape (1, 3, 640, 480)

image 1/1 /home/jovianto/MotionBERT/bus.jpg: 640x480 4 persons, 16.2ms
Speed: 1.8ms preprocess, 16.2ms inference, 2.1ms postprocess per image at shape (1, 3, 640, 480)

image 1/1 /home/jovianto/MotionBERT/bus.jpg: 640x480 4 persons, 23.3ms
Speed: 2.9ms preprocess, 23.3ms inference, 1.8ms postprocess per image at shape (1, 3, 640, 480)

image 1/1 /home/jovianto/MotionBERT/bus.jpg: 640x480 4 persons, 12.6ms
Speed: 1.8ms preprocess, 12.6ms inference, 3.1ms postprocess per image at shape (1, 3, 640, 480)

image 1/1 /home/jovianto/MotionBERT/bus.jpg: 640x480 4 persons, 12.9ms
Speed: 1.8ms preprocess, 12.9ms inference, 2.7ms postprocess per image at shap

In [27]:
import torch
import time
import cv2
import numpy as np
from ultralytics import YOLO

# Load model (adjust path if needed)
model = YOLO("yolo11m-pose.pt").cuda()
model.eval()

# Load dummy image (or use webcam frame)
img = np.random.randint(0, 255, (640, 640, 3), dtype=np.uint8)
img_tensor = torch.from_numpy(img).permute(2, 0, 1).unsqueeze(0).float() / 255.0  # Shape: [1, 3, 640, 640]

model_action = getActionNet('./configs/action/MB_ft_NTU60_xsub_lite_temp_comp.yaml').cuda()
model_action.eval()

input_tensor = torch.randn(1, 2, 243, 17, 3).cuda()
# Warm-up
with torch.no_grad():
    for _ in range(10):
        _ = model('bus.jpg')
        _ = model_action(input_tensor)

# Measure FPS
num_iters = 100
torch.cuda.synchronize()
start_time = time.time()

with torch.no_grad():
    for _ in range(num_iters):
        _ = model('bus.jpg')
        _ = model_action(input_tensor)
        torch.cuda.synchronize()

end_time = time.time()
elapsed = end_time - start_time
fps = num_iters / elapsed

print(f"YOLOv11 Pose Estimator FPS: {fps:.2f}")

True

image 1/1 /home/jovianto/MotionBERT/bus.jpg: 640x480 4 persons, 39.4ms
Speed: 2.7ms preprocess, 39.4ms inference, 2.5ms postprocess per image at shape (1, 3, 640, 480)

image 1/1 /home/jovianto/MotionBERT/bus.jpg: 640x480 4 persons, 17.2ms
Speed: 3.1ms preprocess, 17.2ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 480)

image 1/1 /home/jovianto/MotionBERT/bus.jpg: 640x480 4 persons, 19.3ms
Speed: 2.5ms preprocess, 19.3ms inference, 3.0ms postprocess per image at shape (1, 3, 640, 480)

image 1/1 /home/jovianto/MotionBERT/bus.jpg: 640x480 4 persons, 15.7ms
Speed: 2.4ms preprocess, 15.7ms inference, 2.7ms postprocess per image at shape (1, 3, 640, 480)

image 1/1 /home/jovianto/MotionBERT/bus.jpg: 640x480 4 persons, 16.7ms
Speed: 1.9ms preprocess, 16.7ms inference, 2.4ms postprocess per image at shape (1, 3, 640, 480)

image 1/1 /home/jovianto/MotionBERT/bus.jpg: 640x480 4 persons, 12.2ms
Speed: 2.0ms preprocess, 12.2ms inference, 2.2ms postprocess per image at shap