In [None]:
import time
import psutil
from IPython.display import display, Javascript

# One-time setup of the notebook-wide timer and RAM baseline. The guard keeps
# the first measurements when this cell is executed again in the same kernel.
if 'start_time' not in globals():
    start_time, initial_ram = (
        time.time(),
        psutil.virtual_memory().used / (1024 ** 3),  # bytes -> GB
    )
    print("Tracking started for all cells...")

In [None]:
import torch

# Use the GPU when PyTorch can see one; otherwise run everything on the CPU.
if torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'

from ultralytics import SAM

# Load the "sam2.1_b.pt" checkpoint, then move the model to the chosen device.
sam = SAM("sam2.1_b.pt")
sam = sam.to(DEVICE)
print("SAM 2.1 model loaded on", DEVICE)

In [None]:
#Helper
import numpy as np
import matplotlib.pyplot as plt
def show_mask(mask, ax, random_color=False):
    """Draw ``mask`` on ``ax`` as a semi-transparent RGBA overlay.

    Args:
        mask: Array whose last two dimensions are (height, width); it is
            reshaped to (height, width, 1) before being coloured.
        ax: Object exposing ``imshow`` (e.g. a matplotlib Axes).
        random_color: When true, the RGB part of the colour is drawn from
            ``np.random.random``; otherwise a fixed blue is used.
    """
    alpha = np.array([0.6])  # overlay opacity, shared by both colour choices
    if random_color:
        rgb = np.random.random(3)
    else:
        rgb = np.array([30/255, 144/255, 255/255])
    rgba = np.concatenate([rgb, alpha], axis=0)

    height, width = mask.shape[-2:]
    # Broadcast the 4-vector over every pixel; zero mask pixels become all-zero RGBA.
    overlay = mask.reshape(height, width, 1) * rgba[np.newaxis, np.newaxis, :]
    ax.imshow(overlay)

def show_points(coords, labels, ax, marker_size=375):
    """Scatter prompt points on ``ax``: label 1 in green, label 0 in red.

    Args:
        coords: Array indexed as ``coords[:, 0]`` (x) and ``coords[:, 1]`` (y).
        labels: Array compared element-wise against 1 and 0 to select rows of ``coords``.
        ax: Object exposing ``scatter`` (e.g. a matplotlib Axes).
        marker_size: Value forwarded as the ``s`` argument of ``scatter``.
    """
    # Select both groups up front, then draw positives before negatives.
    groups = [
        (coords[labels == value], colour)
        for value, colour in ((1, 'green'), (0, 'red'))
    ]
    for points, colour in groups:
        ax.scatter(
            points[:, 0],
            points[:, 1],
            color=colour,
            marker='*',
            s=marker_size,
            edgecolor='white',
            linewidth=1.25,
        )

def show_anns(anns):
    """Overlay every annotation mask on the current matplotlib axes.

    Args:
        anns: Sequence of dicts providing an ``'area'`` value (used for
            ordering) and a 2-D ``'segmentation'`` array. Nothing is drawn
            when the sequence is empty.
    """
    if len(anns) == 0:
        return

    # Annotations are drawn in descending order of their 'area' value.
    by_area_desc = sorted(anns, key=lambda ann: ann['area'], reverse=True)
    axes = plt.gca()
    axes.set_autoscale_on(False)

    for ann in by_area_desc:
        seg = ann['segmentation']
        rgb = np.random.random((1, 3)).tolist()[0]
        # Solid colour image, one random RGB triple per annotation.
        tint = np.ones((seg.shape[0], seg.shape[1], 3))
        tint[:, :, :] = rgb
        # The mask scaled by 0.35 becomes the alpha channel of the overlay.
        axes.imshow(np.dstack((tint, seg*0.35)))

In [None]:
# Status echo only: prints the same message as the SAM loading cell and the
# current DEVICE value; nothing is loaded or moved by this cell.
print("SAM 2.1 model loaded on", DEVICE)

In [None]:
# Cell 5: Load your image
import cv2
image_path = "/content/1_v0Bm-HQxWtpbQ0Yq463uqw.jpg"
image = cv2.imread(image_path)
# cv2.imread signals a missing/unreadable file by returning None instead of
# raising, so fail here with a clear message rather than inside cvtColor.
if image is None:
    raise FileNotFoundError(f"Could not read image at {image_path}")
# OpenCV loads BGR; the rest of the notebook displays and crops `image` as RGB.
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

plt.figure(figsize=(10, 10))
plt.imshow(image)
plt.axis('off')
plt.title("Original Image")
plt.show()

In [None]:
# SAM 2.1: direct mask generation on the loaded image.
# `image` was converted to RGB for matplotlib, but Ultralytics documents numpy
# inputs as BGR (OpenCV convention), so convert back before inference.
results = sam(cv2.cvtColor(image, cv2.COLOR_RGB2BGR))
result = results[0]   # first (and only) result for a single image

if result.masks is None:
    # Nothing was segmented: keep downstream cells working with an empty stack.
    masks_tensor = None
    masks = np.zeros((0,) + image.shape[:2], dtype=bool)
else:
    masks_tensor = result.masks.data  # torch tensor (num_masks, H, W)
    masks = masks_tensor.cpu().numpy().astype(bool)  # numpy boolean masks

# Placeholder for filtering: every mask is currently kept unchanged.
filtered_masks = list(masks)

print(f"Total masks generated: {len(masks)}")
print(f"Number of high-confidence masks: {len(filtered_masks)}")


In [None]:
from ultralytics import YOLO
# Build the YOLO-World detector from its checkpoint and move it to DEVICE.
yolo = YOLO('yolov8m-world.pt')
yolo = yolo.to(DEVICE)

# Class vocabulary handed to the detector via set_classes().
yolo.set_classes([
    'person', 'car', 'bicycle', 'dog',
    'cat', 'bus', 'truck', 'objects',
])
print("YOLO-World model loaded with custom zero-shot classes.")


In [None]:
# Object detection function (using YOLO-World zero-shot classes)
def detect_objects(image, score_threshold=0.1):
    """Run the module-level ``yolo`` model on ``image`` and list its detections.

    Args:
        image: Forwarded unchanged to ``yolo.predict``.
            NOTE(review): Ultralytics documents numpy inputs as BGR - confirm
            that callers pass the channel order the model expects.
        score_threshold: Passed to ``yolo.predict`` as ``conf``.

    Returns:
        A list of dicts with keys ``'bbox'`` (``[x1, y1, x2, y2]`` truncated to
        int), ``'label'`` and ``'score'`` (float), ordered by descending score.
    """
    def _to_record(box):
        # Each iterated box carries a single row; take element 0 of every field.
        corners = box.xyxy.cpu().numpy()[0].tolist()
        score = box.conf.cpu().numpy()[0]
        class_index = int(box.cls.cpu().numpy()[0])
        # Use the model's class-name lookup when present, else the numeric id as text.
        if hasattr(yolo.model, 'names'):
            name = yolo.model.names[class_index]
        else:
            name = str(class_index)
        return {
            'bbox': [int(value) for value in corners],  # [x1,y1,x2,y2]
            'label': name,
            'score': float(score),
        }

    predictions = yolo.predict(image, conf=score_threshold, verbose=False)
    detections = [_to_record(box) for result in predictions for box in result.boxes]
    detections.sort(key=lambda det: det['score'], reverse=True)  # highest score first
    return detections


In [None]:
# Cell 13: Load and display input image
image_path = "/content/1_v0Bm-HQxWtpbQ0Yq463uqw.jpg"
image = cv2.imread(image_path)
# cv2.imread returns None for a missing/unreadable file instead of raising;
# stop here with a clear error rather than failing inside cvtColor.
if image is None:
    raise FileNotFoundError(f"Could not read image at {image_path}")
# Keep `image` in RGB order for matplotlib display and later cropping.
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

plt.figure(figsize=(10, 10))
plt.imshow(image)
plt.axis('off')
plt.title("Input Image")
plt.show()


In [None]:
# `image` is RGB (converted for display), while Ultralytics documents numpy
# inputs as BGR. Convert for inference only; box coordinates are unaffected.
detections = detect_objects(cv2.cvtColor(image, cv2.COLOR_RGB2BGR), score_threshold=0.15)

print(f"Detected {len(detections)} objects:")
for det in detections:
    print(f"- {det['label']} (confidence: {det['score']:.2f}): {det['bbox']}")


In [None]:
# Draw on a copy so the original RGB image stays untouched for later cells.
vis_image = image.copy()

for det in detections:
    x1, y1, x2, y2 = det['bbox']
    cv2.rectangle(vis_image, (x1, y1), (x2, y2), (255, 0, 0), 3)
    label = f"{det['label']} ({det['score']:.2f})"
    # Place the caption above the box, but never above the image's top edge:
    # for boxes touching the top, a baseline of y1-10 would be off-canvas.
    label_y = max(y1 - 10, 20)
    cv2.putText(vis_image, label, (x1, label_y),
                cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 255, 0), 2)

plt.figure(figsize=(15, 10))
plt.imshow(vis_image)
plt.axis('off')
plt.title("Detected Objects with YOLO-World")
plt.show()


In [None]:
import json
import cv2

# Persist the detection list as JSON next to the notebook.
with open("detections.json", "w") as f:
    json.dump(detections, f, indent=2)

# vis_image is RGB; cv2.imwrite expects BGR. imwrite reports failure through
# its boolean return value, so check it before claiming success.
if not cv2.imwrite("detected_objects.jpg", cv2.cvtColor(vis_image, cv2.COLOR_RGB2BGR)):
    raise OSError("Failed to write detected_objects.jpg")
print("Results saved to detections.json and detected_objects.jpg")


In [None]:
def calculate_iou(mask, bbox):
    """Intersection-over-Union between a mask and a filled bounding box.

    Args:
        mask: Array whose first two dimensions give the canvas size; entries
            greater than zero count as foreground.
        bbox: Four values ``(x1, y1, x2, y2)``; each is converted with ``int``.

    Returns:
        Intersection pixel count divided by the union pixel count, with the
        denominator floored at 1e-6 to avoid division by zero.
    """
    left, top, right, bottom = (int(value) for value in bbox)

    # Rasterise the box onto a blank canvas the same size as the mask.
    box_canvas = np.zeros(mask.shape[:2], dtype=np.uint8)
    cv2.rectangle(box_canvas, (left, top), (right, bottom), 255, -1)

    in_mask = mask > 0
    in_box = box_canvas > 0
    overlap = (in_mask & in_box).sum()
    combined = (in_mask | in_box).sum()
    return overlap / max(combined, 1e-6)

# The SAM masks are boolean arrays; store them as 0/1 uint8 for the OpenCV steps.
binary_masks = [m.astype(np.uint8) for m in filtered_masks]

# One group per detection: its label, integer box, and every mask whose IoU
# with that box exceeds 0.5.
object_groups = []
for det in detections:
    det_box = [int(x) for x in det['bbox']]
    matching_masks = [
        candidate for candidate in binary_masks
        if calculate_iou(candidate, det_box) > 0.5
    ]
    object_groups.append({
        'label': det['label'],
        'bbox': det_box,
        'masks': matching_masks,
    })


In [None]:
def mask_to_bbox(mask):
    """Convert a binary mask to bounding box coordinates [x1,y1,x2,y2].

    The box encloses *every* non-zero pixel. The earlier contour-based version
    used only the first contour, so a mask made of several disconnected
    regions (e.g. a union of masks) got a box around just one of them.

    Args:
        mask: 2-D array; non-zero (or True) entries are foreground.

    Returns:
        ``[x1, y1, x2, y2]`` as Python ints, with ``x2``/``y2`` exclusive
        (one past the last foreground column/row), or ``None`` when the mask
        has no foreground pixels.
    """
    foreground = np.asarray(mask) != 0
    rows = foreground.any(axis=1).nonzero()[0]  # indices of rows with foreground
    cols = foreground.any(axis=0).nonzero()[0]  # indices of columns with foreground
    if rows.size == 0:
        return None
    return [int(cols[0]), int(rows[0]), int(cols[-1]) + 1, int(rows[-1]) + 1]

final_objects = []

# For each detection with at least one matching mask: union the masks, take
# the bounding box of the union, and cut the masked region out of `image`.
for group in object_groups:
    if not group['masks']:
        continue

    # Pixel-wise OR across all of the group's masks (0/1 uint8 result).
    merged = np.bitwise_or.reduce(
        np.stack([part.astype(np.uint8) for part in group['masks']]), axis=0
    )
    tight_box = mask_to_bbox(merged > 0)
    if tight_box is None:
        continue

    # Clamp the box to the image bounds.
    img_h, img_w = image.shape[:2]
    left, top = max(0, tight_box[0]), max(0, tight_box[1])
    right, bottom = min(img_w, tight_box[2]), min(img_h, tight_box[3])

    if right > left and bottom > top:
        mask_window = merged[top:bottom, left:right]
        cropped = image[top:bottom, left:right]
        # Zero out every pixel of the crop that lies outside the merged mask.
        masked_crop = cv2.bitwise_and(cropped, cropped, mask=mask_window)

        final_objects.append({
            'label': group['label'],
            'bbox': [left, top, right, bottom],
            'merged_mask': mask_window,
            'cropped_image': masked_crop
        })

print(f"Successfully processed {len(final_objects)} objects")


In [None]:
# Preview each extracted object crop in its own small figure.
for i, obj in enumerate(final_objects):
    crop = obj['cropped_image']
    plt.figure(figsize=(4, 4))
    plt.imshow(crop)
    plt.title(f"{obj['label']} - Object {i+1}")
    plt.axis('off')
    plt.show()


In [None]:
import os
save_dir = "cropped_objects"
os.makedirs(save_dir, exist_ok=True)

for i, obj in enumerate(final_objects):
    cropped_img = obj['cropped_image']
    # The crops were cut from the RGB image, but cv2.imwrite expects BGR.
    # Previously the converted array was computed and then the unconverted one
    # was written, producing files with red and blue swapped.
    cropped_img_bgr = cv2.cvtColor(cropped_img, cv2.COLOR_RGB2BGR)
    filename = f"{obj['label']}_object_{i+1}.png"
    filepath = os.path.join(save_dir, filename)
    # imwrite reports failure via its return value rather than raising.
    if not cv2.imwrite(filepath, cropped_img_bgr):
        raise OSError(f"Failed to write {filepath}")
    print(f"Saved {filepath}")


In [None]:
from dam.describe_anything_model import DescribeAnythingModel


In [None]:
from PIL import Image
import torch
# Instantiate the Describe Anything Model wrapper from the "nvidia/DAM-3B" model path.
# NOTE(review): conv_mode and prompt_mode are passed as opaque strings here;
# confirm their meaning and valid values against the DAM documentation.
model = DescribeAnythingModel(
    model_path="nvidia/DAM-3B",
    conv_mode="v1",
    prompt_mode="full+crop",
)


In [None]:
# Re-open the input file as a PIL image in RGB mode.
# NOTE(review): this rebinds `image`, which earlier cells use as an RGB numpy
# array; re-running those cells after this one will see a PIL image instead.
image = Image.open("/content/1_v0Bm-HQxWtpbQ0Yq463uqw.jpg").convert("RGB")


In [None]:
import numpy as np
import cv2
from PIL import Image

def numpy_to_pil_mask(np_img):
    """Return ``np_img`` as a single-channel (mode "L") PIL image.

    A 3-D array with exactly three channels has its first and third channels
    swapped (BGR -> RGB) before the greyscale conversion; any other input is
    handed to PIL as-is.
    """
    has_three_channels = np_img.ndim == 3 and np_img.shape[2] == 3
    if has_three_channels:
        np_img = cv2.cvtColor(np_img, cv2.COLOR_BGR2RGB)
    return Image.fromarray(np_img).convert("L")

def embed_mask_in_full_image(mask_crop, bbox, full_shape):
    """Paste a cropped mask back into a zero-filled, full-size uint8 canvas.

    Args:
        mask_crop: Mask covering (roughly) the ``bbox`` region; it is resized
            with nearest-neighbour interpolation to the exact box size.
        bbox: ``(x1, y1, x2, y2)`` location of the crop inside the full image.
        full_shape: Shape of the canvas to create.

    Returns:
        A ``uint8`` array of shape ``full_shape`` that is zero outside ``bbox``.
    """
    left, top, right, bottom = bbox
    target_size = (right - left, bottom - top)  # cv2.resize takes (width, height)
    fitted = cv2.resize(mask_crop, target_size, interpolation=cv2.INTER_NEAREST)

    canvas = np.zeros(full_shape, dtype=np.uint8)
    canvas[top:bottom, left:right] = fitted
    return canvas

descriptions = []

# `image` arrives from the previous cell as a PIL RGB image; convert it to the
# BGR numpy layout that the rest of this cell works with.
if isinstance(image, Image.Image):
    image_np = np.array(image)  # RGB
    image = cv2.cvtColor(image_np, cv2.COLOR_RGB2BGR)

image_height, image_width = image.shape[:2]

# The full image and the prompt are identical for every object, so build them
# once instead of on every loop iteration.
image_pil = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
query = "<image> Describe the object in the highlighted region in detail."

for i, obj in enumerate(final_objects):
    bbox = obj['bbox']
    mask_crop = obj['merged_mask']  # mask cropped to the object's bbox
    full_mask = embed_mask_in_full_image(mask_crop, bbox, (image_height, image_width))
    # The merged masks hold 0/1 values. Scale to 0/255 so the region is an
    # unambiguous foreground in the "L"-mode mask image.
    # NOTE(review): DAM's examples appear to build masks as 0/255 - confirm.
    full_mask = np.where(full_mask > 0, 255, 0).astype(np.uint8)
    mask_pil = numpy_to_pil_mask(full_mask)

    description = model.get_description(
        image_pil=image_pil,
        mask_pil=mask_pil,
        query=query,
        temperature=0.2,
        top_p=0.9,
        num_beams=1,
        max_new_tokens=512,
    )

    print(f"Description for object {i+1} ({obj['label']}):\n{description}\n")
    descriptions.append(description)


In [None]:
# Summarise wall-clock time and the change in system-wide used RAM (GB)
# relative to the values recorded when tracking started.
total_time = time.time() - start_time
final_ram = psutil.virtual_memory().used / (1024 ** 3)
ram_used = final_ram - initial_ram

print(
    "\n" + "="*50,
    f"TOTAL PROCESSING TIME FOR ALL CELLS: {total_time:.2f} seconds",
    f"TOTAL RAM CONSUMPTION: {ram_used:.2f} GB",
    "="*50,
    sep="\n",
)

# Show GPU summary
# Shell escape: runs the `nvidia-smi` command-line tool and prints its output.
# NOTE(review): presumably only meaningful on hosts with an NVIDIA driver - confirm for CPU runs.
!nvidia-smi


In [None]:

import nbformat
def clean_notebook(input_path, output_path=None):
    """Remove the ``widgets`` entry from a notebook's metadata and save it.

    Args:
        input_path: Notebook file to read (parsed as nbformat version 4).
        output_path: Where to write the result; when ``None`` the input file
            is overwritten in place.
    """
    destination = input_path if output_path is None else output_path
    nb = nbformat.read(input_path, as_version=4)
    # Drop the widget state if present; a notebook without it is written back unchanged.
    nb.get('metadata', {}).pop('widgets', None)
    nbformat.write(nb, destination)

# Strip widget metadata from "SAM2.1.ipynb" in place (no output_path given).
# NOTE(review): the path is relative to the kernel's working directory; if this
# is the currently open notebook, a later save from the editor may overwrite it - confirm.
clean_notebook("SAM2.1.ipynb")