In [None]:
import cv2
import torch
import numpy as np
from PIL import Image
from sam3.sam3.model_builder import build_sam3_image_model
from sam3.sam3.model.sam3_image_processor import Sam3Processor

In [None]:
# Build the SAM3 image model with its default arguments, then wrap it in the
# processor whose set_image / set_text_prompt methods the per-frame loop calls.
model = build_sam3_image_model()
processor = Sam3Processor(model)

In [None]:
def to_numpy_mask(mask, frame_shape):
    """Convert a mask to a float32 NumPy array sized to the frame and scaled by its max.

    Args:
        mask: A ``torch.Tensor`` or anything ``np.asarray`` accepts. Singleton
            dimensions are squeezed away before the size check.
        frame_shape: Shape of the target frame; only the first two entries
            (height, width) are read.

    Returns:
        A float32 array. If the squeezed mask is not already ``(H, W)`` it is
        bilinearly resized to that size. Values are divided by ``max + 1e-6``.
    """
    H, W = frame_shape[:2]
    if isinstance(mask, torch.Tensor):
        # Cast on the torch side first: Tensor.numpy() cannot export dtypes
        # that NumPy lacks (e.g. bfloat16 produced under autocast).
        m = mask.detach().cpu().to(torch.float32).numpy()
    else:
        m = np.asarray(mask)
    m = np.squeeze(m).astype(np.float32)
    if m.shape != (H, W):
        m = cv2.resize(m, (W, H), interpolation=cv2.INTER_LINEAR)
    # The epsilon keeps an all-zero mask from dividing by zero.
    return m / (m.max() + 1e-6)

In [None]:
# NOTE(review): this cell defines to_numpy_mask a second time; an earlier cell
# already defines a function with the same name and signature, and this later
# definition is the one left in effect. Consider deleting one of the two cells.
def to_numpy_mask(mask, frame_shape):
    """Return ``mask`` as a float32 array of the frame's size, divided by its maximum.

    Args:
        mask: ``torch.Tensor`` or array-like; singleton axes are squeezed out.
        frame_shape: Target frame shape; only ``frame_shape[:2]`` (height,
            width) is used.

    Returns:
        float32 array, bilinearly resized to ``(H, W)`` when the squeezed mask
        has a different shape, then divided by ``max + 1e-6``.
    """
    H, W = frame_shape[:2]
    if isinstance(mask, torch.Tensor):
        # Tensor.numpy() rejects dtypes with no NumPy equivalent (bfloat16),
        # so convert to float32 before leaving torch.
        m = mask.detach().cpu().to(torch.float32).numpy()
    else:
        m = np.asarray(mask)
    m = np.squeeze(m).astype(np.float32)
    if m.shape != (H, W):
        m = cv2.resize(m, (W, H), interpolation=cv2.INTER_LINEAR)
    # Small epsilon so an all-zero mask yields zeros instead of NaNs.
    return m / (m.max() + 1e-6)

In [None]:
def visualize_mask_soft(frame_bgr, masks, color=(0, 255, 0), alpha=0.45, blur_k=21):
    """Tint the union of ``masks`` onto ``frame_bgr`` with feathered edges.

    Args:
        frame_bgr: ``(H, W, 3)`` image array (the notebook passes OpenCV BGR
            frames). It is not modified.
        masks: Iterable of masks, each a ``torch.Tensor`` or array-like.
            Singleton axes are squeezed; a mask that still has more than two
            axes is collapsed to 2-D with a max over its leading axes; masks
            not already ``(H, W)`` are resized. ``None`` is treated as empty.
            Mask values are clamped to ``[0, 1]``.
        color: Three channel values in the same channel order as ``frame_bgr``.
        alpha: Opacity of the tint where the mask is 1.
        blur_k: Gaussian kernel size used to soften mask edges. Values of 1 or
            less disable blurring; even values are bumped to the next odd one.

    Returns:
        A new uint8 array with the same shape as ``frame_bgr``.
    """
    H, W = frame_bgr.shape[:2]
    combined = np.zeros((H, W), dtype=np.float32)

    for mask in (masks if masks is not None else ()):
        if isinstance(mask, torch.Tensor):
            # Convert to float32 inside torch: Tensor.numpy() fails on dtypes
            # NumPy does not have (e.g. bfloat16).
            m = mask.detach().cpu().to(torch.float32).numpy()
        else:
            m = np.asarray(mask)
        m = np.squeeze(m).astype(np.float32)

        if m.size == 0:
            continue  # nothing to draw for an empty mask
        if m.ndim > 2:
            # Stacked masks: take their union instead of letting cv2.resize
            # misread the extra axis as image channels.
            m = m.reshape(-1, *m.shape[-2:]).max(axis=0)
        if m.shape != (H, W):
            m = cv2.resize(m, (W, H))

        # Clamp so logits or 0/255 masks cannot push the blend weight past 1.
        np.maximum(combined, np.clip(m, 0.0, 1.0), out=combined)

    k = int(blur_k)
    if k > 1:
        k |= 1  # cv2.GaussianBlur only accepts odd kernel sizes
        combined = cv2.GaussianBlur(combined, (k, k), 0)

    # Per-pixel blend weight in [0, 1], broadcast over the channel axis.
    weight = np.clip(combined * alpha, 0.0, 1.0)[..., np.newaxis]
    tint = np.asarray(color[:3], dtype=np.float32)
    blended = frame_bgr * (1.0 - weight) + tint * weight

    # Clip before the cast: out-of-range floats would wrap around in uint8.
    return np.clip(blended, 0, 255).astype(np.uint8)


In [None]:
# Run configuration. The values below look like placeholders — replace them
# with real paths and a real prompt before running the cells that follow.
# VIDEO_PATH is opened with cv2.VideoCapture.
VIDEO_PATH = "input_video_path"
# OUTPUT_PATH is handed to cv2.VideoWriter together with the "mp4v" fourcc.
# NOTE(review): presumably this should end in ".mp4" — confirm.
OUTPUT_PATH = "output_video_path"
# TEXT_PROMPT is passed unchanged as `prompt` to processor.set_text_prompt
# for every frame.
TEXT_PROMPT = "Object Prompts separated by Comma"

In [None]:
# Open the input video and fail fast if it cannot be read.
cap = cv2.VideoCapture(VIDEO_PATH)

if not cap.isOpened():
    raise OSError("Could not open video file: " + VIDEO_PATH)

# Mirror the input's frame rate and frame size in the output file.
fps = cap.get(cv2.CAP_PROP_FPS)
if not fps > 0:
    # The capture could not report a usable rate (0, negative or NaN);
    # fall back to a conventional default so the writer gets a valid value.
    fps = 30.0
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

fourcc = cv2.VideoWriter_fourcc(*"mp4v")
out = cv2.VideoWriter(OUTPUT_PATH, fourcc, fps, (width, height))

# VideoWriter does not raise on failure; without this check every later
# write() would be silently dropped.
if not out.isOpened():
    cap.release()
    raise OSError("Could not open video writer for: " + OUTPUT_PATH)

In [None]:
# Segment every frame with the text prompt, tint the returned masks onto the
# frame, and append the result to the output video.
try:
    while True:
        ret, frame_bgr = cap.read()
        if not ret:
            break  # no further frame could be read

        # OpenCV decodes frames as BGR; the processor is given an RGB PIL image.
        frame_rgb = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)
        pil_img = Image.fromarray(frame_rgb)

        state = processor.set_image(pil_img)
        output = processor.set_text_prompt(state=state, prompt=TEXT_PROMPT)

        masks = output["masks"]

        frame_bgr = visualize_mask_soft(frame_bgr, masks, alpha=0.4)

        out.write(frame_bgr)
finally:
    # Release both handles even if a frame fails, so the capture is closed and
    # the frames written so far are flushed to the output file.
    cap.release()
    out.release()
    try:
        cv2.destroyAllWindows()
    except cv2.error:
        # GUI-less OpenCV builds can raise here; no window was opened by this
        # loop, so there is nothing to clean up in that case.
        pass

print("Done! Saved:", OUTPUT_PATH)