In [None]:
print("Setting up system prerequisites...")

# Install system dependencies first
!apt-get update -qq
!apt-get install -y -qq build-essential python3-dev libglib2.0-0 libsm6 libxext6 libxrender-dev

print("✓ System dependencies installed\n")

In [None]:
# ============================================================================
# CELL 2: Install Core Packages
# ============================================================================
print("Installing core packages...")
print("This may take 5-8 minutes on first run.\n")

# Install PyTorch and torchvision first (if not already installed)
import torch
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")

# Install other dependencies
!pip install -q opencv-python pillow matplotlib numpy tqdm
!pip install -q supervision

print("✓ Core packages installed\n")

In [None]:
# ============================================================================
# CELL 3: Install SAM 2
# ============================================================================
print("Installing SAM 2...")

!pip install -q git+https://github.com/facebookresearch/segment-anything-2.git

print("✓ SAM 2 installed\n")


In [None]:
# ============================================================================
# CELL 4: Install GroundingDINO (Fixed Method)
# ============================================================================
print("Installing GroundingDINO...")
print("Using alternative installation method...\n")

import os
import subprocess
import sys

GROUNDING_REPO_URL = "https://github.com/IDEA-Research/GroundingDINO.git"
GROUNDING_REPO_DIR = "GroundingDINO"


def _run_step(command, cwd=None):
    """Run one install command and return True when it exits with status 0.

    IPython `!cmd` lines do not raise a Python exception when the command
    fails, so wrapping them in try/except cannot detect a failed install.
    Running the command through subprocess exposes the exit status.

    Args:
        command: Argument list passed to subprocess.run.
        cwd: Optional working directory for the command (avoids os.chdir, so
            the notebook's working directory is never left changed on error).
    """
    try:
        result = subprocess.run(command, cwd=cwd, capture_output=True, text=True)
    except OSError as exc:  # executable missing or not runnable
        print(f"  Could not run {command[0]}: {exc}")
        return False
    if result.returncode != 0:
        # Only the tail of stderr: build logs can be very long.
        print(result.stderr.strip()[-500:])
        return False
    return True


def _pip_install(*args, cwd=None):
    """Run `pip install -q <args>` with the interpreter backing this kernel."""
    return _run_step([sys.executable, "-m", "pip", "install", "-q", *args], cwd=cwd)


# Method 1: Try direct install with build isolation disabled
print("Attempting Method 1: Direct install...")
GROUNDING_INSTALLED = _pip_install("--no-build-isolation", f"git+{GROUNDING_REPO_URL}")
if GROUNDING_INSTALLED:
    print("✓ GroundingDINO installed successfully (Method 1)\n")
else:
    print("Method 1 failed, trying Method 2...\n")

# Method 2: Clone and install manually
if not GROUNDING_INSTALLED:
    print("Method 2: Manual installation from source...")

    # Reuse an existing checkout; otherwise clone. Each later step only runs
    # when the previous one succeeded.
    have_checkout = os.path.exists(GROUNDING_REPO_DIR) or _run_step(
        ["git", "clone", "-q", GROUNDING_REPO_URL, GROUNDING_REPO_DIR]
    )
    GROUNDING_INSTALLED = bool(
        have_checkout
        and _pip_install("-r", "requirements.txt", cwd=GROUNDING_REPO_DIR)
        and _pip_install("-e", ".", cwd=GROUNDING_REPO_DIR)
    )

    if GROUNDING_INSTALLED:
        # An editable install may not be importable until the kernel restarts;
        # putting the checkout on sys.path makes it importable right away.
        repo_path = os.path.abspath(GROUNDING_REPO_DIR)
        if repo_path not in sys.path:
            sys.path.insert(0, repo_path)
        print("✓ GroundingDINO installed successfully (Method 2)\n")
    else:
        print("Method 2 failed (see error output above)\n")

# Method 3: Fall back to OWL-ViT from Hugging Face Transformers
USE_FALLBACK = not GROUNDING_INSTALLED
if USE_FALLBACK:
    print("⚠ GroundingDINO installation failed.")
    print("Using fallback: OWL-ViT (Hugging Face Transformers)\n")
    if not _pip_install("transformers"):
        print("⚠ Could not install transformers; the fallback detector will fail to import.\n")

print("="*70)
if not USE_FALLBACK:
    print("✓ All packages installed successfully!")
else:
    print("✓ Packages installed with fallback detection model")
print("="*70 + "\n")


In [None]:
# ============================================================================
# CELL 5: Imports
# ============================================================================
import torch
import numpy as np
import cv2
from PIL import Image
import matplotlib.pyplot as plt
from typing import List, Tuple, Optional
from pathlib import Path
from tqdm.auto import tqdm
import warnings
warnings.filterwarnings('ignore')

# Select the compute device once; later cells reuse `device`.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Device: {device}")
if device == "cuda":
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
print()

# Detector imports: GroundingDINO when it was installed, otherwise OWL-ViT.
# An ImportError here flips USE_FALLBACK so the OWL-ViT import below runs.
if not USE_FALLBACK:
    try:
        from groundingdino.util.inference import (
            load_model as load_grounding_model,
            predict as grounding_predict,
            annotate,
        )
    except ImportError as e:
        print(f"⚠ GroundingDINO import failed: {e}")
        print("Switching to fallback model...")
        USE_FALLBACK = True
    else:
        print("✓ GroundingDINO imported successfully")

if USE_FALLBACK:
    from transformers import OwlViTProcessor, OwlViTForObjectDetection
    print("✓ Using OWL-ViT as fallback detector")

# SAM 2: model builders plus the image predictor wrapper.
from sam2.build_sam import build_sam2, build_sam2_video_predictor
from sam2.sam2_image_predictor import SAM2ImagePredictor

print("✓ SAM 2 imported successfully\n")

In [None]:
# ==============================================================
# CELL 6: Text-Driven Segmentation Pipeline
# ==============================================================

class TextDrivenSegmentation:
    """Text prompt -> bounding boxes (GroundingDINO or OWL-ViT) -> SAM 2 masks.

    The detector is chosen by `use_fallback`: GroundingDINO when False,
    OWL-ViT when True. Images are HxWx3 arrays; the cells in this notebook
    pass RGB arrays produced by cv2.cvtColor.
    """

    def __init__(self, sam2_predictor, device, use_fallback=False,
                 grounding_model=None, owl_processor=None, owl_model=None):
        """Store the predictor and the detector that matches `use_fallback`.

        Args:
            sam2_predictor: Object exposing `set_image` and `predict`
                (a SAM2ImagePredictor in this notebook).
            device: Device string forwarded to the detector.
            use_fallback: True to use OWL-ViT instead of GroundingDINO.
            grounding_model: GroundingDINO model. When None, the notebook
                global of the same name is used (original behavior).
            owl_processor: OWL-ViT processor; same global fallback.
            owl_model: OWL-ViT model; same global fallback.

        Raises:
            NameError: a required detector was neither passed in nor defined
                as a notebook global.
        """
        self.sam2_predictor = sam2_predictor
        self.device = device
        self.use_fallback = use_fallback
        if not use_fallback:
            self.grounding_model = self._resolve_model("grounding_model", grounding_model)
        else:
            self.owl_processor = self._resolve_model("owl_processor", owl_processor)
            self.owl_model = self._resolve_model("owl_model", owl_model)

    @staticmethod
    def _resolve_model(name, explicit):
        """Return `explicit`, or the global called `name` when it is None."""
        if explicit is not None:
            return explicit
        try:
            return globals()[name]
        except KeyError:
            raise NameError(
                f"name '{name}' is not defined; load the detection model first "
                f"or pass it to TextDrivenSegmentation explicitly"
            ) from None

    @staticmethod
    def _split_prompt(text_prompt: str):
        """Split a prompt into labels.

        Accepts both separators used in this notebook ("a . b . c" and
        "a, b, c"), strips surrounding spaces and stray dots (e.g. a trailing
        " ."), and drops empty entries.
        """
        pieces = text_prompt.replace(" . ", ",").split(",")
        cleaned = (piece.strip(" .") for piece in pieces)
        return [piece for piece in cleaned if piece]

    def detect_objects(self, image: np.ndarray, text_prompt: str,
                       box_threshold: float = 0.25, text_threshold: float = 0.25):
        """Detect the objects named in `text_prompt`.

        Args:
            image: HxWx3 image array.
            text_prompt: Labels, separated by " . " or ",".
            box_threshold: Minimum detection score.
            text_threshold: GroundingDINO phrase threshold (unused by OWL-ViT).

        Returns:
            (boxes, scores, labels) with boxes and scores as numpy arrays and
            labels as a list of strings. GroundingDINO boxes are converted to
            pixel xyxy; OWL-ViT boxes are returned as produced by
            `post_process_object_detection` for the image size.
        """
        if not self.use_fallback:
            import groundingdino.datasets.transforms as T
            from torchvision.ops import box_convert

            # grounding_predict moves `image` to the device and feeds it to the
            # model, so it needs the normalized tensor produced by
            # GroundingDINO's own preprocessing (same steps as its load_image
            # helper) rather than a PIL image.
            preprocess = T.Compose([
                T.RandomResize([800], max_size=1333),
                T.ToTensor(),
                T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
            ])
            image_tensor, _ = preprocess(Image.fromarray(image), None)

            boxes, logits, phrases = grounding_predict(
                model=self.grounding_model,
                image=image_tensor,
                caption=text_prompt,
                box_threshold=box_threshold,
                text_threshold=text_threshold,
                device=self.device
            )
            # Scale the normalized cxcywh boxes to pixels of the ORIGINAL image.
            h, w = image.shape[:2]
            boxes = boxes * torch.tensor([w, h, w, h], device=boxes.device)
            boxes = box_convert(boxes=boxes, in_fmt="cxcywh", out_fmt="xyxy")
            return boxes.cpu().numpy(), logits.cpu().numpy(), phrases

        texts = self._split_prompt(text_prompt)
        if not texts:
            # Nothing to look for: return an empty detection set.
            return np.zeros((0, 4), dtype=np.float32), np.zeros((0,), dtype=np.float32), []

        image_pil = Image.fromarray(image)
        inputs = self.owl_processor(text=texts, images=image_pil, return_tensors="pt")
        inputs = {k: v.to(self.device) for k, v in inputs.items()}
        with torch.no_grad():
            outputs = self.owl_model(**inputs)
        target_sizes = torch.tensor([image.shape[:2]]).to(self.device)
        results = self.owl_processor.post_process_object_detection(
            outputs=outputs, target_sizes=target_sizes, threshold=box_threshold
        )[0]
        boxes = results["boxes"].cpu().numpy()
        # Label indices refer to positions in `texts`.
        labels = [texts[idx] for idx in results["labels"].cpu().numpy()]
        return boxes, results["scores"].cpu().numpy(), labels

    def segment_from_boxes(self, image: np.ndarray, boxes: np.ndarray):
        """Return one mask per box, stacked along axis 0.

        Returns an empty array when `boxes` is empty (the predictor is not
        called in that case).
        """
        if len(boxes) == 0:
            return np.array([])
        self.sam2_predictor.set_image(image)
        masks_list = []
        for box in boxes:
            masks, _, _ = self.sam2_predictor.predict(
                point_coords=None,
                point_labels=None,
                box=box[None, :],
                multimask_output=False,
            )
            # multimask_output=False: keep the first (only) mask returned.
            masks_list.append(masks[0])
        return np.array(masks_list)

    def segment_from_text(self, image: np.ndarray, text_prompt: str, box_threshold=0.25, text_threshold=0.25):
        """Detect objects for `text_prompt`, then segment each detected box.

        Returns:
            (masks, boxes, labels); `masks` is an empty array when nothing
            was detected.
        """
        boxes, _, labels = self.detect_objects(image, text_prompt, box_threshold, text_threshold)
        if len(boxes) == 0:
            return np.array([]), boxes, labels
        masks = self.segment_from_boxes(image, boxes)
        return masks, boxes, labels

# NOTE(review): this line needs `sam2_predictor` (and the detector models the
# pipeline uses), which this notebook only creates in the later
# "CELL 7: Load Models" cell. On a fresh kernel with Run All it raises
# NameError; run the model-loading cell first or move this line below it.
# It also captures USE_FALLBACK as it is *now*; the loading cell may still
# flip that flag afterwards.
pipeline = TextDrivenSegmentation(sam2_predictor, device, USE_FALLBACK)


In [None]:
# ============================================================================
# CELL 7: Load Models
# ============================================================================
print("Loading models into memory...")

# Diagnostic only: show which YAML configs ship with the installed sam2 package.
# Any failure here is reported and then ignored.
print("Checking available SAM 2 configs...")
try:
    import sam2
    sam2_path = os.path.dirname(sam2.__file__)
    configs_path = os.path.join(sam2_path, "configs")
    print(f"SAM 2 package path: {sam2_path}")

    if os.path.exists(configs_path):
        print(f"Configs directory: {configs_path}")
        # Collect in os.walk order, then print paths relative to the configs dir.
        yaml_files = [
            os.path.join(folder, entry)
            for folder, _subdirs, entries in os.walk(configs_path)
            for entry in entries
            if entry.endswith('.yaml')
        ]
        for yaml_file in yaml_files:
            rel_path = os.path.relpath(yaml_file, configs_path)
            print(f"  Found config: {rel_path}")
except Exception as e:
    print(f"Could not list configs: {e}")

print()

# Load detection model
# NOTE(review): GROUNDING_CHECKPOINT is not defined in any cell visible here;
# it must be set before this cell runs, otherwise the NameError is caught
# below and the notebook silently switches to the fallback detector.
OWL_VIT_MODEL_ID = "google/owlvit-base-patch32"

if not USE_FALLBACK:
    print("Loading GroundingDINO...")
    try:
        if os.path.exists("GroundingDINO"):
            # A local clone exists (relative to the current working directory):
            # use the config file from the checkout.
            GROUNDING_CONFIG = "GroundingDINO/groundingdino/config/GroundingDINO_SwinT_OGC.py"
        else:
            # Otherwise look for the config inside the installed package.
            import groundingdino
            package_path = os.path.dirname(groundingdino.__file__)
            GROUNDING_CONFIG = os.path.join(package_path, "config", "GroundingDINO_SwinT_OGC.py")

        grounding_model = load_grounding_model(GROUNDING_CONFIG, GROUNDING_CHECKPOINT, device=device)
        print("✓ GroundingDINO loaded")
    except Exception as e:
        print(f"⚠ Failed to load GroundingDINO: {e}")
        print("Switching to fallback...")
        USE_FALLBACK = True

if USE_FALLBACK:
    # Import here as well: the imports cell only brings these names in when the
    # fallback was already selected at that point, so a switch made just above
    # would otherwise hit a NameError.
    from transformers import OwlViTProcessor, OwlViTForObjectDetection

    print("Loading OWL-ViT...")
    owl_processor = OwlViTProcessor.from_pretrained(OWL_VIT_MODEL_ID)
    owl_model = OwlViTForObjectDetection.from_pretrained(OWL_VIT_MODEL_ID).to(device)
    owl_model.eval()
    print("✓ OWL-ViT loaded")

# Load SAM 2
# NOTE(review): SAM2_CHECKPOINT is not defined in any cell visible here; it
# must point to a checkpoint matching SAM2_CONFIG below (SAM 2.1 Hiera-L).
print("Loading SAM 2 Image Predictor...")

from hydra import initialize_config_module
from hydra.core.global_hydra import GlobalHydra

# Importing `sam2` registers its bundled configs with Hydra. That global state
# must not be cleared: build_sam2 / build_sam2_video_predictor resolve config
# names through it. Re-register only when it is missing (for example after an
# earlier run of code that called GlobalHydra.instance().clear()).
if not GlobalHydra.instance().is_initialized():
    initialize_config_module("sam2", version_base="1.2")

# build_sam2 takes a config NAME, resolved relative to the sam2 package, so the
# "configs/" prefix is part of it. It composes and instantiates the config and
# loads the checkpoint itself; no manual SAM2Base construction is needed.
SAM2_CONFIG = "configs/sam2.1/sam2.1_hiera_l.yaml"

sam2_model = build_sam2(SAM2_CONFIG, SAM2_CHECKPOINT, device=device)
sam2_predictor = SAM2ImagePredictor(sam2_model)
print("✓ SAM 2 Image Predictor loaded")

print("Loading SAM 2 Video Predictor...")
sam2_video_predictor = build_sam2_video_predictor(SAM2_CONFIG, SAM2_CHECKPOINT, device=device)
print("✓ SAM 2 Video Predictor loaded")

print("\n✓ All models loaded!\n")


In [None]:
# ============================================================================
# CELL 7: Load Models
# ============================================================================
# NOTE(review): this is a second "CELL 7" that loads every model again.
# It relies on GROUNDING_CONFIG, GROUNDING_CHECKPOINT, SAM2_CONFIG and
# SAM2_CHECKPOINT already being defined; consider keeping only one of the two
# loading cells.
print("Loading models into memory...")

# Load detection model
if not USE_FALLBACK:
    print("Loading GroundingDINO...")
    try:
        grounding_model = load_grounding_model(GROUNDING_CONFIG, GROUNDING_CHECKPOINT, device=device)
        print("✓ GroundingDINO loaded")
    except Exception as e:
        print(f"⚠ Failed to load GroundingDINO: {e}")
        print("Switching to fallback...")
        USE_FALLBACK = True

if USE_FALLBACK:
    # Import here as well: the imports cell only brings these names in when the
    # fallback was already selected at that point, so a switch made just above
    # would otherwise hit a NameError.
    from transformers import OwlViTProcessor, OwlViTForObjectDetection

    print("Loading OWL-ViT...")
    owl_processor = OwlViTProcessor.from_pretrained("google/owlvit-base-patch32")
    owl_model = OwlViTForObjectDetection.from_pretrained("google/owlvit-base-patch32").to(device)
    owl_model.eval()
    print("✓ OWL-ViT loaded")

# Load SAM 2
print("Loading SAM 2 Image Predictor...")
sam2_model = build_sam2(SAM2_CONFIG, SAM2_CHECKPOINT, device=device)
sam2_predictor = SAM2ImagePredictor(sam2_model)
print("✓ SAM 2 Image Predictor loaded")

print("Loading SAM 2 Video Predictor...")
sam2_video_predictor = build_sam2_video_predictor(SAM2_CONFIG, SAM2_CHECKPOINT, device=device)
print("✓ SAM 2 Video Predictor loaded")

print("\n✓ All models loaded!\n")

In [None]:
# ============================================================================
# CELL 9: Example 1 - Dog Segmentation
# ============================================================================
print("="*70)
print("EXAMPLE 1: Segmenting a Dog")
print("="*70 + "\n")

!wget -q -O sample_dog.jpg https://images.unsplash.com/photo-1587300003388-59208cc962cb?w=800

image = cv2.imread("sample_dog.jpg")
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

text_prompt = "dog"
masks, boxes, labels = pipeline.segment_from_text(image, text_prompt, box_threshold=0.2)

visualize_results(image, masks, boxes, labels, text_prompt)

In [None]:
# CELL 10: Example 2 - Multiple Objects
# ============================================================================
print("\n" + "="*70)
print("EXAMPLE 2: Multiple Objects")
print("="*70 + "\n")

!wget -q -O sample_scene.jpg https://images.unsplash.com/photo-1555041469-a586c61ea9bc?w=800

image2 = cv2.imread("sample_scene.jpg")
image2 = cv2.cvtColor(image2, cv2.COLOR_BGR2RGB)

# Try multiple objects
if not USE_FALLBACK:
    text_prompt2 = "sofa . table . lamp"  # GroundingDINO format
else:
    text_prompt2 = "sofa, table, lamp"  # OWL-ViT format

masks2, boxes2, labels2 = pipeline.segment_from_text(image2, text_prompt2, box_threshold=0.15)

visualize_results(image2, masks2, boxes2, labels2, text_prompt2)

In [None]:
# ============================================================================
# CELL 11: Interactive Upload
# ============================================================================
print("\n" + "="*70)
print("INTERACTIVE: Upload Your Own Image")
print("="*70 + "\n")

# Colab-only helper; this cell does not work outside Google Colab.
from google.colab import files

print("Upload an image:")
uploaded = files.upload()

if uploaded:
    # Only the first uploaded file is used.
    filename = list(uploaded.keys())[0]
    user_image = cv2.imread(filename)
    if user_image is None:
        # cv2.imread returns None for files it cannot decode.
        raise ValueError(f"'{filename}' could not be read as an image.")
    user_image = cv2.cvtColor(user_image, cv2.COLOR_BGR2RGB)  # OpenCV loads BGR

    print("\nImage loaded!")
    print("\nEnter text prompt:")
    if not USE_FALLBACK:
        print("  Format: 'object1 . object2 . object3'")
        print("  Example: 'cat . dog . person'")
    else:
        print("  Format: 'object1, object2, object3'")
        print("  Example: 'cat, dog, person'")

    user_prompt = input("\nPrompt: ")

    print(f"\nSegmenting '{user_prompt}'...")
    user_masks, user_boxes, user_labels = pipeline.segment_from_text(
        user_image, user_prompt, box_threshold=0.15
    )

    # NOTE(review): visualize_results is not defined in any cell visible here.
    visualize_results(user_image, user_masks, user_boxes, user_labels, user_prompt)


In [None]:
# CELL 12: BONUS - Video Segmentation
# ============================================================================
print("\n" + "="*70)
print("BONUS: VIDEO OBJECT SEGMENTATION")
print("="*70 + "\n")

print("Downloading sample video...")
!wget -q -O sample_video.mp4 "https://github.com/facebookresearch/segment-anything-2/raw/main/notebooks/videos/bedroom.mp4"
print("✓ Video downloaded\n")

def extract_frames(video_path: str, output_dir: str = "video_frames") -> List[str]:
    """Write every frame of a video to `output_dir` as JPEG files.

    Frames are named `00000.jpg`, `00001.jpg`, ... because the SAM 2 video
    predictor, when given a folder of JPEGs, orders frames by converting each
    file name (without extension) to an integer; a `frame_` prefix makes that
    conversion fail.

    Args:
        video_path: Path of the video file to read.
        output_dir: Directory that receives the frames (created if missing).
            Frame files left there by an earlier extraction are removed first,
            so the directory only holds frames of this video.

    Returns:
        Paths of the written frames, in frame order. Empty when the video
        cannot be opened or has no frames.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Remove stale frames (numeric names, or the older "frame_<n>" names) so
    # they do not mix with the new ones. Other files are left untouched.
    for name in os.listdir(output_dir):
        stem, ext = os.path.splitext(name)
        legacy = stem.startswith("frame_") and stem[len("frame_"):].isdigit()
        if ext.lower() in (".jpg", ".jpeg") and (stem.isdigit() or legacy):
            os.remove(os.path.join(output_dir, name))

    cap = cv2.VideoCapture(video_path)
    frame_paths = []

    pbar = tqdm(desc="Extracting frames")
    try:
        while True:
            ret, frame = cap.read()
            if not ret:  # end of stream, or the video could not be read
                break

            frame_path = f"{output_dir}/{len(frame_paths):05d}.jpg"
            cv2.imwrite(frame_path, frame)
            frame_paths.append(frame_path)
            pbar.update(1)
    finally:
        # Release the capture and close the progress bar even on errors.
        cap.release()
        pbar.close()
    return frame_paths

print("Extracting frames...")
frame_paths = extract_frames("sample_video.mp4")
print(f"✓ Extracted {len(frame_paths)} frames\n")

# Fail with a clear message instead of an IndexError on frame_paths[0] below.
if not frame_paths:
    raise RuntimeError(
        "No frames were extracted from sample_video.mp4; check that the download succeeded."
    )

# Detect in first frame
first_frame = cv2.imread(frame_paths[0])
first_frame_rgb = cv2.cvtColor(first_frame, cv2.COLOR_BGR2RGB)

video_prompt = "bed"  # Adjust based on video
print(f"Detecting '{video_prompt}' in first frame...")

boxes, scores, labels = pipeline.detect_objects(first_frame_rgb, video_prompt, box_threshold=0.15)

if len(boxes) > 0:
    print(f"✓ Found {len(boxes)} object(s): {labels}\n")

    # A single object is tracked: seed it with the highest-scoring detection
    # rather than whichever box happens to come first.
    best_idx = int(np.argmax(scores))
    TRACKED_OBJ_ID = 1

    print("Propagating mask across video...")

    # Use the directory the frames were actually written to.
    # SAM 2 expects the JPEG frames in it to be named "<frame index>.jpg".
    frames_dir = os.path.dirname(frame_paths[0])
    inference_state = sam2_video_predictor.init_state(video_path=frames_dir)
    sam2_video_predictor.reset_state(inference_state)

    _, out_obj_ids, out_mask_logits = sam2_video_predictor.add_new_points_or_box(
        inference_state=inference_state,
        frame_idx=0,
        obj_id=TRACKED_OBJ_ID,
        box=boxes[best_idx],
    )

    # frame index -> {object id -> boolean mask (logits thresholded at 0)}
    video_segments = {}
    for out_frame_idx, out_obj_ids, out_mask_logits in sam2_video_predictor.propagate_in_video(inference_state):
        video_segments[out_frame_idx] = {
            out_obj_id: (out_mask_logits[i] > 0.0).cpu().numpy()
            for i, out_obj_id in enumerate(out_obj_ids)
        }

    print(f"✓ Segmented {len(video_segments)} frames\n")

    # Visualize 8 frames spread evenly over the video
    fig, axes = plt.subplots(2, 4, figsize=(20, 10))
    axes = axes.flatten()

    sample_indices = np.linspace(0, len(frame_paths)-1, 8, dtype=int)

    for idx, frame_idx in enumerate(sample_indices):
        frame = cv2.imread(frame_paths[frame_idx])
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

        if frame_idx in video_segments:
            # First channel of the tracked object's mask.
            mask = video_segments[frame_idx][TRACKED_OBJ_ID][0]
            overlay = frame.copy()
            # Blend masked pixels 50/50 with green.
            overlay[mask] = overlay[mask] * 0.5 + np.array([0, 255, 0]) * 0.5

            contours, _ = cv2.findContours(mask.astype(np.uint8),
                                          cv2.RETR_EXTERNAL,
                                          cv2.CHAIN_APPROX_SIMPLE)
            cv2.drawContours(overlay, contours, -1, (255, 255, 0), 2)
            axes[idx].imshow(overlay)
        else:
            axes[idx].imshow(frame)

        axes[idx].set_title(f"Frame {frame_idx}", fontsize=12, fontweight='bold')
        axes[idx].axis('off')

    plt.suptitle(f"Video Segmentation: '{video_prompt}'",
                fontsize=16, fontweight='bold')
    plt.tight_layout()
    plt.show()

    print("\n✓ Video segmentation complete!")

else:
    print("⚠ No objects detected in first frame. Try different prompt.")


In [None]:
# ============================================================================
# CELL 13: Summary
# ============================================================================
# Recap of which detector ended up being used, plus prompt/threshold tips.
rule = "="*70

print("\n" + rule)
print("SUMMARY")
print(rule)
print("✓ Using OWL-ViT (fallback) + SAM 2" if USE_FALLBACK else "✓ Using GroundingDINO + SAM 2")
for summary_line in (
    "✓ Text-driven image segmentation working",
    "✓ Video segmentation with temporal propagation",
    "\nTIPS:",
    "- Be specific in prompts: 'red car' > 'car'",
    "- Lower box_threshold (0.1-0.2) if detecting too few objects",
    "- Raise box_threshold (0.3-0.4) if too many false positives",
):
    print(summary_line)
print(rule)