In [4]:
import sys
import os

# Make the parent of the current working directory importable so the
# `dynamic_sam2` imports below resolve when the notebook is run from its own folder.
# The membership check keeps the cell idempotent: re-running it no longer
# appends a duplicate entry to sys.path each time.
parent_dir = os.path.abspath(os.path.join(os.getcwd(), ".."))
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

from dynamic_sam2.sam2_video_tracker import Sam2VideoTracker
from dynamic_sam2.object_detection import YOLODetectionModel

In [5]:
%%time
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

yolo_detector = YOLODetectionModel(
    model_path="yolo11x.pt",
    device="cuda",
    conf_threshold=0.3,
    iou_threshold=0.45
)

tracker = Sam2VideoTracker(
    video_path="../assets/bedroom.mp4",
    detection_model=yolo_detector,
    output_dir="tracking_results",
    frames_dir="temp_frames",
    check_interval=5,
    device="cuda",
    target_fps=5,
    target_resolution=(1280, 720),
    save_masks=False
)

obj = tracker.process_video()

INFO:Sam2VideoTracker:Loading SAM2 models...


Checkpoint path: /home/ubuntu/DynamicSAM2/checkpoints/sam2.1_hiera_large.pt


INFO:Sam2VideoTracker:
=== Starting Video Processing ===
DEBUG:Sam2VideoTracker:Prepared 5 frames for chunk 0-4
INFO:Sam2VideoTracker:Processing chunk: current_frame = 0, chunk_end = 4, frames in chunk = 5



image 1/1 /home/ubuntu/DynamicSAM2/examples/temp_frames/chunk_0_4/00000.jpg: 384x640 2 persons, 1 bed, 14.9ms
Speed: 1.0ms preprocess, 14.9ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)


DEBUG:Sam2VideoTracker:Creating masks for 3 boxes
frame loading (JPEG): 100%|██████████| 5/5 [00:00<00:00, 33.93it/s]
propagate in video: 100%|██████████| 5/5 [00:01<00:00,  3.47it/s]


image 1/1 /home/ubuntu/DynamicSAM2/examples/temp_frames/chunk_0_4/00004.jpg: 384x640 1 person, 1 bed, 14.9ms
Speed: 1.1ms preprocess, 14.9ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)



INFO:Sam2VideoTracker:
=== Starting Detection Merge ===
INFO:Sam2VideoTracker:Active SAM2 tracks after filtering: 3
INFO:Sam2VideoTracker:Filtered SAM2 tracks remaining: 3
INFO:Sam2VideoTracker:Active Track 1: ID 1, Box [228 134 631 610], Confidence: 0.932
INFO:Sam2VideoTracker:Active Track 2: ID 2, Box [233 156 392 512], Confidence: 0.912
INFO:Sam2VideoTracker:Active Track 3: ID 3, Box [  0 501 934 719], Confidence: 0.863
INFO:Sam2VideoTracker:DINO new detections: 2
INFO:Sam2VideoTracker:New Detection 1: Label person, Box [     224.62      132.52       633.2      615.95], Confidence: 0.930
INFO:Sam2VideoTracker:New Detection 2: Label bed, Box [     1.7064      384.73         946      711.18], Confidence: 0.777
INFO:Sam2VideoTracker:Updated confidence for tracked object 1: 0.932 -> 0.930
INFO:Sam2VideoTracker:Updated confidence for tracked object 3: 0.863 -> 0.777
INFO:Sam2VideoTracker:=== Merge Summary ===
INFO:Sam2VideoTracker:Initial active tracked objects: 3
INFO:Sam2VideoTracker:


image 1/1 /home/ubuntu/DynamicSAM2/examples/temp_frames/chunk_4_8/00008.jpg: 384x640 1 person, 1 bed, 14.9ms
Speed: 1.1ms preprocess, 14.9ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)



INFO:Sam2VideoTracker:
=== Starting Detection Merge ===
INFO:Sam2VideoTracker:Active SAM2 tracks after filtering: 3
INFO:Sam2VideoTracker:Filtered SAM2 tracks remaining: 3
INFO:Sam2VideoTracker:Active Track 1: ID 1, Box [309  24 606 616], Confidence: 0.930
INFO:Sam2VideoTracker:Active Track 2: ID 2, Box [335 369 483 558], Confidence: 0.912
INFO:Sam2VideoTracker:Active Track 3: ID 3, Box [  0 505 927 719], Confidence: 0.777
INFO:Sam2VideoTracker:DINO new detections: 2
INFO:Sam2VideoTracker:New Detection 1: Label person, Box [     302.03      22.252      608.18      619.61], Confidence: 0.934
INFO:Sam2VideoTracker:New Detection 2: Label bed, Box [     1.0859      395.45      936.19      711.89], Confidence: 0.719
INFO:Sam2VideoTracker:Updated confidence for tracked object 1: 0.930 -> 0.934
INFO:Sam2VideoTracker:Updated confidence for tracked object 3: 0.777 -> 0.719
INFO:Sam2VideoTracker:=== Merge Summary ===
INFO:Sam2VideoTracker:Initial active tracked objects: 3
INFO:Sam2VideoTracker:


image 1/1 /home/ubuntu/DynamicSAM2/examples/temp_frames/chunk_8_12/00012.jpg: 384x640 1 person, 1 bed, 14.9ms
Speed: 1.1ms preprocess, 14.9ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)



INFO:Sam2VideoTracker:
=== Starting Detection Merge ===
INFO:Sam2VideoTracker:Active SAM2 tracks after filtering: 3
INFO:Sam2VideoTracker:Filtered SAM2 tracks remaining: 3
INFO:Sam2VideoTracker:Active Track 1: ID 1, Box [291  53 517 622], Confidence: 0.934
INFO:Sam2VideoTracker:Active Track 2: ID 2, Box [335 415 493 570], Confidence: 0.912
INFO:Sam2VideoTracker:Active Track 3: ID 3, Box [  0 520 932 719], Confidence: 0.719
INFO:Sam2VideoTracker:DINO new detections: 2
INFO:Sam2VideoTracker:New Detection 1: Label person, Box [     284.42       52.88      519.89      627.94], Confidence: 0.931
INFO:Sam2VideoTracker:New Detection 2: Label bed, Box [     1.5168      396.25      941.68      711.88], Confidence: 0.837
INFO:Sam2VideoTracker:Updated confidence for tracked object 1: 0.934 -> 0.931
INFO:Sam2VideoTracker:Updated confidence for tracked object 3: 0.719 -> 0.837
INFO:Sam2VideoTracker:=== Merge Summary ===
INFO:Sam2VideoTracker:Initial active tracked objects: 3
INFO:Sam2VideoTracker:


image 1/1 /home/ubuntu/DynamicSAM2/examples/temp_frames/chunk_12_16/00016.jpg: 384x640 1 person, 1 bed, 15.0ms
Speed: 1.1ms preprocess, 15.0ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)



INFO:Sam2VideoTracker:
=== Starting Detection Merge ===
INFO:Sam2VideoTracker:Active SAM2 tracks after filtering: 3
INFO:Sam2VideoTracker:Filtered SAM2 tracks remaining: 3
INFO:Sam2VideoTracker:Active Track 1: ID 1, Box [ 53  10 393 568], Confidence: 0.931
INFO:Sam2VideoTracker:Active Track 2: ID 2, Box [173 208 429 552], Confidence: 0.912
INFO:Sam2VideoTracker:Active Track 3: ID 3, Box [  0 510 916 719], Confidence: 0.837
INFO:Sam2VideoTracker:DINO new detections: 2
INFO:Sam2VideoTracker:New Detection 1: Label person, Box [     51.353      14.969       393.9       574.5], Confidence: 0.933
INFO:Sam2VideoTracker:New Detection 2: Label bed, Box [     1.4018      401.31       921.8      712.07], Confidence: 0.789
INFO:Sam2VideoTracker:Updated confidence for tracked object 1: 0.931 -> 0.933
INFO:Sam2VideoTracker:Updated confidence for tracked object 3: 0.837 -> 0.789
INFO:Sam2VideoTracker:=== Merge Summary ===
INFO:Sam2VideoTracker:Initial active tracked objects: 3
INFO:Sam2VideoTracker:


image 1/1 /home/ubuntu/DynamicSAM2/examples/temp_frames/chunk_16_20/00020.jpg: 384x640 2 persons, 1 bed, 14.9ms
Speed: 1.1ms preprocess, 14.9ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)



INFO:Sam2VideoTracker:
=== Starting Detection Merge ===
INFO:Sam2VideoTracker:Active SAM2 tracks after filtering: 3
INFO:Sam2VideoTracker:Filtered SAM2 tracks remaining: 3
INFO:Sam2VideoTracker:Active Track 1: ID 1, Box [144 168 467 711], Confidence: 0.933
INFO:Sam2VideoTracker:Active Track 2: ID 2, Box [  0 219 229 550], Confidence: 0.912
INFO:Sam2VideoTracker:Active Track 3: ID 3, Box [  0 550 908 719], Confidence: 0.789
INFO:Sam2VideoTracker:DINO new detections: 3
INFO:Sam2VideoTracker:New Detection 1: Label person, Box [     142.11      162.71      469.77      712.07], Confidence: 0.934
INFO:Sam2VideoTracker:New Detection 2: Label person, Box [    0.78101      217.89      229.71      557.86], Confidence: 0.899
INFO:Sam2VideoTracker:New Detection 3: Label bed, Box [    0.48041      425.95       916.3       711.9], Confidence: 0.731
INFO:Sam2VideoTracker:Updated confidence for tracked object 1: 0.933 -> 0.934
INFO:Sam2VideoTracker:Updated confidence for tracked object 2: 0.912 -> 0.


image 1/1 /home/ubuntu/DynamicSAM2/examples/temp_frames/chunk_20_24/00024.jpg: 384x640 2 persons, 1 bed, 15.0ms
Speed: 1.1ms preprocess, 15.0ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)



INFO:Sam2VideoTracker:
=== Starting Detection Merge ===
INFO:Sam2VideoTracker:Active SAM2 tracks after filtering: 3
INFO:Sam2VideoTracker:Filtered SAM2 tracks remaining: 3
INFO:Sam2VideoTracker:Active Track 1: ID 1, Box [269  24 576 659], Confidence: 0.934
INFO:Sam2VideoTracker:Active Track 2: ID 2, Box [  0 392 156 711], Confidence: 0.899
INFO:Sam2VideoTracker:Active Track 3: ID 3, Box [  0 542 960 719], Confidence: 0.731
INFO:Sam2VideoTracker:DINO new detections: 3
INFO:Sam2VideoTracker:New Detection 1: Label person, Box [     266.14      25.259      577.34      669.05], Confidence: 0.938
INFO:Sam2VideoTracker:New Detection 2: Label person, Box [    0.26927      391.66      134.06      710.62], Confidence: 0.933
INFO:Sam2VideoTracker:New Detection 3: Label bed, Box [   0.042236      437.03      974.18      710.98], Confidence: 0.809
INFO:Sam2VideoTracker:Updated confidence for tracked object 1: 0.934 -> 0.938
INFO:Sam2VideoTracker:Updated confidence for tracked object 2: 0.899 -> 0.


image 1/1 /home/ubuntu/DynamicSAM2/examples/temp_frames/chunk_24_28/00028.jpg: 384x640 2 persons, 1 bed, 14.9ms
Speed: 1.1ms preprocess, 14.9ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)



INFO:Sam2VideoTracker:
=== Starting Detection Merge ===
INFO:Sam2VideoTracker:Active SAM2 tracks after filtering: 3
INFO:Sam2VideoTracker:Filtered SAM2 tracks remaining: 3
INFO:Sam2VideoTracker:Active Track 1: ID 1, Box [456 102 662 648], Confidence: 0.938
INFO:Sam2VideoTracker:Active Track 2: ID 2, Box [  0 113 234 705], Confidence: 0.933
INFO:Sam2VideoTracker:Active Track 3: ID 3, Box [   0  537 1018  719], Confidence: 0.809
INFO:Sam2VideoTracker:DINO new detections: 3
INFO:Sam2VideoTracker:New Detection 1: Label person, Box [    0.25798      113.25      236.32      707.85], Confidence: 0.940
INFO:Sam2VideoTracker:New Detection 2: Label person, Box [     451.45       99.11      666.29      655.89], Confidence: 0.921
INFO:Sam2VideoTracker:New Detection 3: Label bed, Box [     1.6567      432.35      1024.2      711.79], Confidence: 0.909
INFO:Sam2VideoTracker:Updated confidence for tracked object 2: 0.933 -> 0.940
INFO:Sam2VideoTracker:Updated confidence for tracked object 1: 0.938 -


image 1/1 /home/ubuntu/DynamicSAM2/examples/temp_frames/chunk_28_32/00032.jpg: 384x640 2 persons, 1 bed, 14.9ms
Speed: 1.1ms preprocess, 14.9ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)



INFO:Sam2VideoTracker:
=== Starting Detection Merge ===
INFO:Sam2VideoTracker:Active SAM2 tracks after filtering: 3
INFO:Sam2VideoTracker:Filtered SAM2 tracks remaining: 3
INFO:Sam2VideoTracker:Active Track 1: ID 1, Box [371  93 627 642], Confidence: 0.921
INFO:Sam2VideoTracker:Active Track 2: ID 2, Box [ 76  74 272 697], Confidence: 0.940
INFO:Sam2VideoTracker:Active Track 3: ID 3, Box [   0  550 1005  719], Confidence: 0.909
INFO:Sam2VideoTracker:DINO new detections: 3
INFO:Sam2VideoTracker:New Detection 1: Label person, Box [     365.91      94.164      632.93      651.15], Confidence: 0.931
INFO:Sam2VideoTracker:New Detection 2: Label bed, Box [     1.7675      444.53      1010.8      712.82], Confidence: 0.912
INFO:Sam2VideoTracker:New Detection 3: Label person, Box [     73.732      73.103      277.13      700.29], Confidence: 0.903
INFO:Sam2VideoTracker:Updated confidence for tracked object 1: 0.921 -> 0.931
INFO:Sam2VideoTracker:Updated confidence for tracked object 3: 0.909 -


image 1/1 /home/ubuntu/DynamicSAM2/examples/temp_frames/chunk_32_33/00033.jpg: 384x640 2 persons, 1 bed, 14.8ms
Speed: 1.1ms preprocess, 14.8ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)



INFO:Sam2VideoTracker:
=== Starting Detection Merge ===
INFO:Sam2VideoTracker:Active SAM2 tracks after filtering: 3
INFO:Sam2VideoTracker:Filtered SAM2 tracks remaining: 3
INFO:Sam2VideoTracker:Active Track 1: ID 1, Box [391  62 632 480], Confidence: 0.931
INFO:Sam2VideoTracker:Active Track 2: ID 2, Box [132   0 349 566], Confidence: 0.903
INFO:Sam2VideoTracker:Active Track 3: ID 3, Box [  0 559 999 719], Confidence: 0.912
INFO:Sam2VideoTracker:DINO new detections: 3
INFO:Sam2VideoTracker:New Detection 1: Label person, Box [     129.93           0      351.88      570.17], Confidence: 0.931
INFO:Sam2VideoTracker:New Detection 2: Label bed, Box [     1.7151       453.3      1004.5      712.64], Confidence: 0.911
INFO:Sam2VideoTracker:New Detection 3: Label person, Box [     389.05      63.989      636.86      485.42], Confidence: 0.860
INFO:Sam2VideoTracker:Updated confidence for tracked object 2: 0.903 -> 0.931
INFO:Sam2VideoTracker:Updated confidence for tracked object 3: 0.912 -> 0.

CPU times: user 29.1 s, sys: 1.5 s, total: 30.6 s
Wall time: 22.1 s


In [7]:
# Display the value returned by tracker.process_video().
# In the recorded output it is a dict keyed by integer object id; each entry
# holds 'frames' (frame index -> list of four integers), 'class' and 'confidence'.
# NOTE(review): the four integers look like a pixel box [x1, y1, x2, y2] — confirm.
obj

{1: {'frames': {0: [415, 0, 686, 528],
   1: [389, 119, 649, 635],
   2: [304, 0, 604, 538],
   3: [221, 58, 637, 602],
   4: [229, 135, 631, 610],
   5: [244, 71, 649, 591],
   6: [255, 123, 643, 635],
   7: [269, 19, 664, 581],
   8: [309, 24, 606, 615],
   9: [318, 123, 619, 640],
   10: [237, 5, 603, 578],
   11: [277, 91, 544, 637],
   12: [291, 53, 516, 621],
   13: [244, 0, 500, 595],
   14: [189, 120, 428, 698],
   15: [79, 10, 441, 617],
   16: [54, 11, 393, 568],
   17: [73, 157, 404, 670],
   18: [53, 0, 389, 589],
   19: [62, 0, 409, 592],
   20: [144, 168, 466, 710],
   21: [196, 0, 525, 642],
   22: [250, 12, 508, 687],
   23: [280, 169, 495, 719],
   24: [269, 24, 575, 659],
   25: [339, 65, 677, 660],
   26: [347, 142, 686, 691],
   27: [375, 10, 692, 594],
   28: [456, 102, 661, 648],
   29: [457, 109, 649, 642],
   30: [418, 22, 678, 490],
   31: [446, 127, 735, 643],
   32: [371, 93, 626, 642],
   33: [391, 62, 632, 480]},
  'class': 'person',
  'confidence': {0: 0.9