In [8]:
import sys
import os

# Make the parent of the current working directory importable so that the
# `dynamic_sam2` package imported below can be found when the notebook is run
# from its own sub-directory. The membership check keeps the cell idempotent:
# re-running it no longer appends a duplicate entry to sys.path each time.
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), ".."))
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

from dynamic_sam2.sam2_video_tracker import Sam2VideoTracker
from dynamic_sam2.object_detection import DinoDetectionModel

In [18]:
%%time
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

dino_model = DinoDetectionModel(
    text_prompt="person, pillow, bed, dresser, shelf",
    device="cuda",
    box_threshold=0.30,
    text_threshold=0.25
)

tracker = Sam2VideoTracker(
    video_path="../assets/bedroom.mp4",
    detection_model=dino_model,
    output_dir="tracking_results",
    frames_dir="temp_frames",
    check_interval=5,
    device="cuda",
    target_fps=5,
    target_resolution=(1280, 720),
    save_masks=True
)

obj = tracker.process_video()

final text_encoder_type: bert-base-uncased


INFO:Sam2VideoTracker:Loading SAM2 models...


Checkpoint path: /home/ubuntu/DynamicSAM2/checkpoints/sam2.1_hiera_large.pt


INFO:Sam2VideoTracker:
=== Starting Video Processing ===
DEBUG:Sam2VideoTracker:Prepared 5 frames for chunk 0-4
INFO:Sam2VideoTracker:Processing chunk: current_frame = 0, chunk_end = 4, frames in chunk = 5
DEBUG:Sam2VideoTracker:Creating masks for 8 boxes
frame loading (JPEG): 100%|██████████| 5/5 [00:00<00:00, 34.89it/s]
propagate in video: 100%|██████████| 5/5 [00:02<00:00,  2.21it/s]
INFO:Sam2VideoTracker:
=== Starting Detection Merge ===
INFO:Sam2VideoTracker:Active SAM2 tracks after filtering: 7
INFO:Sam2VideoTracker:Filtered SAM2 tracks remaining: 7
INFO:Sam2VideoTracker:Active Track 1: ID 1, Box [228 134 631 610], Confidence: 0.385
INFO:Sam2VideoTracker:Active Track 2: ID 2, Box [233 156 392 512], Confidence: 0.408
INFO:Sam2VideoTracker:Active Track 3: ID 3, Box [681 401 880 568], Confidence: 0.394
INFO:Sam2VideoTracker:Active Track 4: ID 4, Box [  0 499 910 719], Confidence: 0.320
INFO:Sam2VideoTracker:Active Track 5: ID 5, Box [555 439 694 552], Confidence: 0.339
INFO:Sam2Vide

CPU times: user 44.8 s, sys: 1.86 s, total: 46.7 s
Wall time: 32.1 s


In [10]:
# Show the top-level keys of the tracking result: one integer ID per tracked object.
obj.keys()

dict_keys([1, 2, 3, 4, 5, 6, 7, 8])

In [11]:
# Show which fields are stored for a single track (here track ID 1).
obj[1].keys()

dict_keys(['frames', 'class', 'confidence', 'masks'])

In [12]:
# Per-frame entries for track ID 1: a dict mapping frame index to a list of
# four integers (presumably an [x1, y1, x2, y2] bounding box in pixels — TODO confirm).
obj[1]['frames']

{0: [415, 0, 686, 528],
 1: [389, 119, 649, 635],
 2: [304, 0, 604, 538],
 3: [221, 58, 637, 602],
 4: [229, 135, 631, 610],
 5: [244, 71, 649, 591],
 6: [255, 123, 643, 635],
 7: [269, 19, 664, 581],
 8: [309, 24, 606, 615],
 9: [318, 123, 619, 640],
 10: [237, 5, 603, 578],
 11: [277, 91, 544, 637],
 12: [291, 53, 516, 621],
 13: [244, 0, 500, 595],
 14: [189, 120, 428, 698],
 15: [79, 10, 441, 617],
 16: [54, 11, 393, 568],
 17: [73, 157, 404, 670],
 18: [53, 0, 389, 589],
 19: [62, 0, 409, 592],
 20: [144, 168, 466, 710],
 21: [196, 0, 525, 642],
 22: [250, 12, 508, 687],
 23: [280, 169, 495, 719],
 24: [269, 24, 575, 659],
 25: [339, 65, 677, 660],
 26: [347, 142, 686, 691],
 27: [375, 10, 692, 594],
 28: [456, 102, 661, 648],
 29: [457, 109, 649, 642],
 30: [418, 22, 678, 490],
 31: [446, 127, 735, 643],
 32: [371, 93, 626, 642],
 33: [391, 62, 632, 480]}

In [13]:
# Class label assigned to track ID 1 (one of the phrases from the text prompt).
obj[1]['class']

'person'

In [17]:
# Per-frame confidence for track ID 2: a dict mapping frame index to a float.
# NOTE(review): the value is identical on every frame in the output below, so it
# looks like a single detection score copied per frame rather than a per-frame
# score — confirm against the tracker implementation.
obj[2]['confidence']

{0: 0.4075421094894409,
 1: 0.4075421094894409,
 2: 0.4075421094894409,
 3: 0.4075421094894409,
 4: 0.4075421094894409,
 5: 0.4075421094894409,
 6: 0.4075421094894409,
 7: 0.4075421094894409,
 8: 0.4075421094894409,
 9: 0.4075421094894409,
 10: 0.4075421094894409,
 11: 0.4075421094894409,
 12: 0.4075421094894409,
 13: 0.4075421094894409,
 14: 0.4075421094894409,
 15: 0.4075421094894409,
 16: 0.4075421094894409,
 17: 0.4075421094894409,
 18: 0.4075421094894409,
 19: 0.4075421094894409,
 20: 0.4075421094894409,
 21: 0.4075421094894409,
 22: 0.4075421094894409,
 23: 0.4075421094894409,
 24: 0.4075421094894409,
 25: 0.4075421094894409,
 26: 0.4075421094894409,
 27: 0.4075421094894409,
 28: 0.4075421094894409,
 29: 0.4075421094894409,
 30: 0.4075421094894409,
 31: 0.4075421094894409,
 32: 0.4075421094894409,
 33: 0.4075421094894409}