In [1]:
import sys
import os

# Make the parent of the current working directory importable so that the
# `dynamic_sam2` package imported below can be resolved when the notebook is
# run from its own sub-directory.
REPO_ROOT = os.path.abspath(os.path.join(os.getcwd(), ".."))
# Guard the append: re-running this cell must not keep stacking duplicate
# entries onto sys.path.
if REPO_ROOT not in sys.path:
    sys.path.append(REPO_ROOT)

from dynamic_sam2.sam2_video_tracker import Sam2VideoTracker
from dynamic_sam2.object_detection import DinoDetectionModel

In [2]:
%%time
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

dino_model = DinoDetectionModel(
    device="cuda",
    box_threshold=0.30,
    text_threshold=0.25
)

tracker = Sam2VideoTracker(
    video_path="../assets/bedroom.mp4",
    text_prompt="person <and> pillow <and> bed",
    detection_model=dino_model,
    output_dir="tracking_results",
    frames_dir="temp_frames",
    check_interval=25,
    device="cuda",
    target_fps=5,
    target_resolution=(1280, 720),
    save_masks=True
)

obj = tracker.process_video()



final text_encoder_type: bert-base-uncased




Checkpoint path: /home/ubuntu/DynamicSAM2/checkpoints/sam2.1_hiera_large.pt


INFO:Sam2VideoTracker:
=== Starting Video Processing ===
DEBUG:Sam2VideoTracker:Prepared 25 frames for chunk 0-24
INFO:Sam2VideoTracker:Processing chunk: current_frame = 0, chunk_end = 24, frames in chunk = 25
DEBUG:Sam2VideoTracker:Creating masks for 6 boxes
Falling back to all available kernels for scaled_dot_product_attention (which may have a slower speed).
frame loading (JPEG): 100%|██████████| 25/25 [00:00<00:00, 35.05it/s]
propagate in video: 100%|██████████| 25/25 [00:14<00:00,  1.69it/s]
INFO:Sam2VideoTracker:
=== Starting Detection Merge ===
INFO:Sam2VideoTracker:Active SAM2 tracks after filtering: 5
INFO:Sam2VideoTracker:Filtered SAM2 tracks remaining: 5
INFO:Sam2VideoTracker:Active Track 1: ID 1, Box [  0 392 156 711]
INFO:Sam2VideoTracker:Active Track 2: ID 2, Box [268  24 576 660]
INFO:Sam2VideoTracker:Active Track 3: ID 3, Box [704 438 902 605]
INFO:Sam2VideoTracker:Active Track 4: ID 4, Box [566 476 714 589]
INFO:Sam2VideoTracker:Active Track 5: ID 5, Box [  0 544 929 7

CPU times: user 36.5 s, sys: 3.29 s, total: 39.8 s
Wall time: 31 s


In [4]:
# Show the top-level keys of the result returned by process_video().
# The recorded output lists integer keys 1-5, which line up with the track IDs
# in the log above — presumably one entry per tracked object (TODO confirm).
obj.keys()

dict_keys([1, 2, 3, 4, 5])

In [6]:
# Per-frame entries stored under key 1: frame index -> list of four numbers
# (presumably a bounding box as [x1, y1, x2, y2] — TODO confirm against the tracker).
# NOTE(review): in the recorded output the last entry (frame 33) holds floats while
# every other entry holds ints — worth checking for an inconsistent conversion
# inside the library.
obj[1]['frames']

{0: [195, 180, 389, 539],
 1: [231, 179, 402, 504],
 2: [251, 325, 414, 589],
 3: [237, 224, 392, 560],
 4: [233, 156, 392, 512],
 5: [261, 295, 400, 583],
 6: [253, 256, 417, 579],
 7: [286, 189, 424, 508],
 8: [335, 366, 483, 558],
 9: [353, 325, 448, 560],
 10: [338, 235, 507, 497],
 11: [297, 365, 522, 548],
 12: [335, 415, 493, 570],
 13: [343, 292, 459, 510],
 14: [222, 309, 435, 500],
 15: [252, 427, 418, 573],
 16: [173, 209, 428, 553],
 17: [120, 212, 402, 502],
 18: [137, 367, 341, 632],
 19: [91, 253, 318, 589],
 20: [0, 219, 230, 550],
 21: [0, 358, 221, 667],
 22: [0, 237, 262, 608],
 23: [0, 218, 173, 610],
 24: [0, 392, 156, 711],
 25: [0, 314, 164, 690],
 26: [0, 67, 216, 656],
 27: [22, 0, 217, 600],
 28: [0, 114, 234, 705],
 29: [55, 0, 255, 650],
 30: [65, 0, 282, 565],
 31: [0, 62, 303, 682],
 32: [76, 74, 272, 697],
 33: [132.0, 0.0, 349.0, 567.0]}

In [8]:
# Label stored under key 1.
# NOTE(review): the recorded output is 'person <', which looks like the text prompt
# "person <and> pillow <and> bed" cut in the middle of its "<and>" separator —
# verify how the separator is parsed before relying on this field.
obj[1]['class']

'person <'