### Importing all necessary libraries

In [None]:
import os
import glob
import shutil
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from pycocotools import mask as cocomask
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval
from collections import defaultdict
import torch
from sam2.build_sam import build_sam2
from sam2.automatic_mask_generator import SAM2AutomaticMaskGenerator
from sam2.sam2_image_predictor import SAM2ImagePredictor
from sam2.build_sam import build_sam2_video_predictor

### Initializing variables for SAM2 model setup

In [None]:
# Machine-specific absolute paths to the SAM2 checkpoint file and the model
# config YAML; adjust them before running on another machine.
checkpoint = r"C:\Users\dell\Desktop\Assignment\sam2_hiera_tiny.pt"
model_cfg = r"C:\Users\dell\Desktop\Assignment\sam2\configs\sam2\sam2_hiera_t.yaml"

# Image predictor wrapped around its own SAM2 model built on the CPU.
# NOTE(review): not referenced by the code visible in this section.
predictor_prompt = SAM2ImagePredictor(build_sam2(model_cfg, checkpoint, device='cpu'))
# Second CPU model instance, built with apply_postprocessing=False, that backs
# the automatic mask generator.
# NOTE(review): mask_generator is not referenced by the code visible here.
sam2 = build_sam2(model_cfg, checkpoint, device='cpu', apply_postprocessing=False)
mask_generator = SAM2AutomaticMaskGenerator(sam2)
# Video predictor (CPU) used by track_item_boxes to propagate box prompts
# from one frame to the next.
predictor_vid = build_sam2_video_predictor(model_cfg, checkpoint, device='cpu')

### Creating temporary directory for processing

In [None]:
# Scratch directory that track_item_boxes creates, empties and then fills with
# the two frames it tracks between.
tempfolder = "./tempdir"

def create_if_not_exists(dirname):
    """Ensure that the directory ``dirname`` exists.

    Missing parent directories are created as well, and an already existing
    directory is left untouched.

    Args:
        dirname: Path of the directory to create.

    Raises:
        FileExistsError: If ``dirname`` exists but is not a directory.
    """
    # makedirs(exist_ok=True) avoids the race between "does it exist?" and
    # "create it", and also copes with nested paths that os.mkdir rejects.
    os.makedirs(dirname, exist_ok=True)

def cleardir(tempfolder):
    """Remove every entry inside ``tempfolder`` while keeping the folder itself.

    Regular files and symlinks are unlinked; sub-directories are removed
    recursively. Hidden entries (names starting with a dot) are removed too.
    A folder that does not exist is silently ignored.

    Args:
        tempfolder: Path of the directory to empty.
    """
    if not os.path.isdir(tempfolder):
        return

    # os.listdir is used instead of a glob pattern so that dotfiles are not
    # skipped and glob metacharacters in the folder name are taken literally.
    for entry in os.listdir(tempfolder):
        entry_path = os.path.join(tempfolder, entry)
        if os.path.isdir(entry_path) and not os.path.islink(entry_path):
            # os.unlink cannot remove directories.
            shutil.rmtree(entry_path)
        else:
            os.unlink(entry_path)

In [None]:
def show_mask(mask, ax, obj_id=None, random_color=False):
    """Draw ``mask`` on ``ax`` as a semi-transparent colored overlay.

    Args:
        mask: Array whose last two dimensions are (height, width) and whose
            total size equals height * width.
        ax: Object providing ``imshow`` (e.g. a matplotlib Axes).
        obj_id: Index into the "tab10" colormap; index 0 is used when None.
        random_color: When True, pick a random RGB color instead of using
            the colormap.
    """
    if not random_color:
        palette = plt.get_cmap("tab10")
        rgb = palette(0 if obj_id is None else obj_id)[:3]
        rgba = np.array([*rgb, 0.6])
    else:
        rgba = np.concatenate([np.random.random(3), np.array([0.6])], axis=0)

    # Broadcast the RGBA color over the mask; pixels where the mask is zero
    # become fully transparent.
    height, width = mask.shape[-2:]
    overlay = mask.reshape(height, width, 1) * rgba.reshape(1, 1, -1)
    ax.imshow(overlay)

### track_item_boxes for tracking objects between two images using SAM2

In [None]:
def track_item_boxes(imgpath1, imgpath2, img1boxclasslist, visualize=True):
    """Track boxed objects from a reference image into a second image.

    Both images are copied into the scratch folder ``tempfolder`` as a
    two-frame sequence (``00000.jpg`` and ``00001.jpg``). Every box is added
    as a prompt on frame 0 and the video predictor propagates the objects
    through the sequence.

    Args:
        imgpath1: Path of the reference image (frame 0) the boxes refer to.
        imgpath2: Path of the image (frame 1) in which to find the objects.
        img1boxclasslist: Iterable of ``([xmin, xmax, ymin, ymax], obj_id)``
            entries describing the prompts on the reference image.
        visualize: When True, plot the prompt boxes on the reference image
            and the predicted masks on the second image.

    Returns:
        Dict mapping frame index to ``{obj_id: mask}``, where each mask is
        the numpy array of ``mask_logits > 0`` for that object.
    """
    create_if_not_exists(tempfolder)
    cleardir(tempfolder)
    ref_frame_path = os.path.join(tempfolder, "00000.jpg")
    target_frame_path = os.path.join(tempfolder, "00001.jpg")
    shutil.copy(imgpath1, ref_frame_path)
    shutil.copy(imgpath2, target_frame_path)

    # Read the frames from the folder they were just written to, rather than
    # from a second hard-coded copy of the path.
    inference_state = predictor_vid.init_state(video_path=tempfolder)
    predictor_vid.reset_state(inference_state)
    ann_frame_idx = 0

    # Remember every prompt box so that all of them can be drawn later.
    prompt_boxes = []
    for (xmin, xmax, ymin, ymax), objectnumint in img1boxclasslist:
        # The predictor is given the box as [xmin, ymin, xmax, ymax].
        box = np.array([xmin, ymin, xmax, ymax], dtype=np.float32)
        prompt_boxes.append((xmin, ymin, xmax, ymax))
        predictor_vid.add_new_points_or_box(
            inference_state=inference_state,
            frame_idx=ann_frame_idx,
            obj_id=objectnumint,
            box=box,
        )

    video_segments = {}
    for out_frame_idx, out_obj_ids, out_mask_logits in predictor_vid.propagate_in_video(inference_state):
        print(f"\nDebug: Frame {out_frame_idx} has {len(out_obj_ids)} objects")  # Debug print
        print(f"Object IDs: {out_obj_ids}")  # Debug print

        video_segments[out_frame_idx] = {
            out_obj_id: (out_mask_logits[i] > 0.0).cpu().numpy()
            for i, out_obj_id in enumerate(out_obj_ids)
        }

    if visualize:
        _, ax = plt.subplots()
        ax.set_title("original image object ::")
        # Load pixels eagerly so the file handle can be closed right away.
        with Image.open(ref_frame_path) as ref_image:
            ax.imshow(np.array(ref_image))
        for bx_min, by_min, bx_max, by_max in prompt_boxes:
            rect = patches.Rectangle((bx_min, by_min), bx_max - bx_min, by_max - by_min,
                                     linewidth=1, edgecolor='r', facecolor='none')
            ax.add_patch(rect)
        plt.show()

        plt.figure(figsize=(6, 4))
        plt.title("detected object in test image ::")
        with Image.open(target_frame_path) as target_image:
            plt.imshow(np.array(target_image))
        # Frame 1 is the second image; tolerate it being absent from the output.
        for out_obj_id, out_mask in video_segments.get(1, {}).items():
            show_mask(out_mask, plt.gca(), obj_id=out_obj_id)
        plt.show()

    return video_segments

### extract_category_from_filename and group_files_by_category for extracting category names from filenames and grouping image/mask files by category

In [None]:
def extract_category_from_filename(filename):
    """Return the category prefix of a file name.

    The prefix is the part of the base name that precedes its last
    underscore; it is the empty string when the base name has no underscore.
    """
    category, _, _ = os.path.basename(filename).rpartition('_')
    return category

def group_files_by_category(data_dir):
    """Group the image and mask files of ``data_dir`` by product category.

    Images are the ``*.jpg`` files and masks the ``*_gt.png`` files directly
    inside ``data_dir``. A mask belongs to the image whose file stem equals
    the mask's base name with its last two underscore-separated parts removed
    (e.g. ``can_chowder_000001_1_gt.png`` -> ``can_chowder_000001``).

    Args:
        data_dir: Directory containing the images and masks.

    Returns:
        A defaultdict mapping category name to
        ``{'images': [image_path, ...], 'masks': [(image_path, mask_path), ...]}``.
        Files are listed in sorted order so results are reproducible.
    """
    # glob returns files in arbitrary order; sorting keeps the choice of the
    # first (reference) image per category deterministic.
    image_files = sorted(glob.glob(os.path.join(data_dir, "*.jpg")))
    mask_files = sorted(glob.glob(os.path.join(data_dir, "*_gt.png")))

    category_dict = defaultdict(lambda: {'images': [], 'masks': []})
    image_by_stem = {}

    for img in image_files:
        category = extract_category_from_filename(img)
        category_dict[category]['images'].append(img)
        image_by_stem[os.path.splitext(os.path.basename(img))[0]] = img

    for mask in mask_files:
        base_name = '_'.join(os.path.basename(mask).split('_')[:-2])

        # Exact stem match: a substring test against the full path could pair
        # a mask with an unrelated image (or match the directory name).
        img = image_by_stem.get(base_name)
        if img is None:
            # No image for this mask; skip it without creating an empty
            # category entry in the defaultdict.
            continue

        category = extract_category_from_filename(img)
        category_dict[category]['masks'].append((img, mask))

    return category_dict

### process_img_png_mask for extracting bounding box coordinates from mask

In [None]:
def process_img_png_mask(img_path, mask_path, visualize=False):
    """Compute the bounding box of the non-zero region of a mask image.

    Args:
        img_path: Path of the image the mask belongs to. It is only opened
            when ``visualize`` is True; otherwise it is used in messages only.
        mask_path: Path of the mask image; every non-zero pixel counts as
            foreground.
        visualize: When True, show the image next to the mask with the
            bounding box drawn on it.

    Returns:
        Tuple ``(xmin, xmax, ymin, ymax)`` of inclusive pixel indices as
        Python ints. ``(0, 0, 0, 0)`` is returned when the mask has no
        foreground pixel or when processing fails (a message is printed).
    """
    try:
        with Image.open(mask_path) as mask:
            mask_array = np.array(mask)

        # Reduce to a 2-D foreground map; a multi-channel mask counts a pixel
        # as foreground when any of its channels is non-zero.
        foreground = mask_array != 0
        if foreground.ndim == 3:
            foreground = foreground.any(axis=-1)

        # Covers both a zero-sized array and a mask that is entirely zero.
        if not foreground.any():
            print(f"Warning: Empty mask in {mask_path}")
            return 0, 0, 0, 0

        row_idx = np.flatnonzero(foreground.any(axis=1))
        col_idx = np.flatnonzero(foreground.any(axis=0))

        ymin, ymax = int(row_idx[0]), int(row_idx[-1])
        xmin, xmax = int(col_idx[0]), int(col_idx[-1])

        if visualize:
            with Image.open(img_path) as img:
                img_array = np.array(img)
            fig, ax = plt.subplots(1, 2, figsize=(10, 5))
            ax[0].imshow(img_array)
            ax[0].set_title("Original Image")
            ax[1].imshow(mask_array)
            rect = plt.Rectangle((xmin, ymin), xmax-xmin, ymax-ymin,
                               linewidth=1, edgecolor='r', facecolor='none')
            ax[1].add_patch(rect)
            ax[1].set_title("Mask with Bounding Box")
            plt.show()

        return xmin, xmax, ymin, ymax

    except Exception as e:
        # Deliberately best-effort: report the problem and hand back the
        # all-zero sentinel box instead of aborting the caller.
        print(f"Error processing {img_path}: {str(e)}")
        return 0, 0, 0, 0

### mask_to_bbox for converting binary mask to COCO-style bounding box [x,y,width,height]

In [None]:
def mask_to_bbox(mask):
    """Convert a binary mask to a COCO-style bounding box.

    Args:
        mask: 2-D array where non-zero means foreground. A 3-D array is also
            accepted: a leading singleton axis ``(1, H, W)`` is dropped, any
            other 3-D array is treated as ``(H, W, channels)`` and a pixel is
            foreground when any channel is non-zero.

    Returns:
        ``[x, y, width, height]`` as Python ints, where width and height
        count pixels inclusively (a single pixel gives width = height = 1).
        ``[0, 0, 0, 0]`` when the mask has no foreground pixel.
    """
    mask = np.asarray(mask)
    if mask.ndim == 3:
        mask = mask[0] if mask.shape[0] == 1 else mask.any(axis=-1)

    row_idx = np.flatnonzero(mask.any(axis=1))
    col_idx = np.flatnonzero(mask.any(axis=0))

    if row_idx.size == 0 or col_idx.size == 0:
        return [0, 0, 0, 0]  # Return empty bbox if no foreground pixels

    ymin, ymax = int(row_idx[0]), int(row_idx[-1])
    xmin, xmax = int(col_idx[0]), int(col_idx[-1])

    # Indices are inclusive, so the extent is max - min + 1. Without the +1 a
    # one-pixel mask at the origin would look exactly like the empty bbox.
    return [xmin, ymin, xmax - xmin + 1, ymax - ymin + 1]

### evaluate_product_category for evaluating performance on each product category

In [None]:
def _encode_binary_mask(mask):
    """Encode a 2-D binary mask as a COCO RLE dict with a text ``counts`` field."""
    rle = cocomask.encode(np.asfortranarray(mask.astype(np.uint8)))
    rle['counts'] = rle['counts'].decode('ascii')
    return rle


def evaluate_product_category(category_name, image_mask_pairs):
    """Evaluate box tracking for one product category with COCO bbox metrics.

    The first (image, mask) pair is the reference: the bounding box of its
    mask is the prompt. For every other pair the object is tracked from the
    reference image into that image, and the predicted bounding box is
    compared with the bounding box of that image's ground-truth mask.

    Args:
        category_name: Name of the category (used in printed messages only).
        image_mask_pairs: Sequence of ``(image_path, mask_path)`` tuples; at
            least two are required.

    Returns:
        The ``stats`` array of the ``COCOeval`` run, or None when there are
        fewer than two pairs, the reference mask yields no usable box, no
        prediction could be produced, or the COCO evaluation fails.
    """
    print(f"\nEvaluating {category_name}...")

    if not image_mask_pairs or len(image_mask_pairs) < 2:
        print("Error: Need at least 2 image-mask pairs for evaluation")
        return None

    first_img, first_mask = image_mask_pairs[0]
    print(f"Using reference image: {os.path.basename(first_img)}")
    print(f"Using reference mask: {os.path.basename(first_mask)}")

    xmin, xmax, ymin, ymax = process_img_png_mask(first_img, first_mask)
    print(f"Reference bounding box: x=[{xmin},{xmax}], y=[{ymin},{ymax}]")

    # process_img_png_mask returns an all-zero box on failure or empty mask;
    # prompting the tracker with a degenerate box would only yield garbage.
    if xmax <= xmin or ymax <= ymin:
        print("Error: Reference mask did not yield a usable bounding box")
        return None

    coco_gt = {"images": [], "annotations": [], "categories": [{"id": 1, "name": "product"}]}
    coco_dt = []

    total = len(image_mask_pairs) - 1
    for image_id, (img_path, mask_path) in enumerate(image_mask_pairs[1:], start=1):
        print(f"\nProcessing image {image_id}/{total}: {os.path.basename(img_path)}")

        try:
            with Image.open(mask_path) as gt_image:
                gt_mask = np.array(gt_image)

            # Normalise to a 2-D 0/1 uint8 mask so the RLE encoder always
            # receives a single binary mask regardless of the PNG mode.
            if gt_mask.ndim == 3:
                gt_mask = gt_mask.any(axis=-1)
            gt_mask = (gt_mask != 0).astype(np.uint8)

            gt_bbox = mask_to_bbox(gt_mask)
            print(f"Ground truth bbox: {gt_bbox}")

            coco_gt["images"].append({"id": image_id, "file_name": img_path})
            coco_gt["annotations"].append({
                "id": image_id,
                "image_id": image_id,
                "category_id": 1,
                "bbox": gt_bbox,
                "area": int(gt_bbox[2] * gt_bbox[3]),
                "iscrowd": 0,
                "segmentation": _encode_binary_mask(gt_mask)
            })

            video_segments = track_item_boxes(first_img, img_path, [([xmin, xmax, ymin, ymax], 1)], False)

            # Frame 1 is the target image, object id 1 the single prompt.
            frame_masks = video_segments.get(1) if video_segments else None
            pred_mask = frame_masks.get(1) if frame_masks else None
            if pred_mask is None:
                print("Warning: No valid segments found in frame 1")
                continue

            if pred_mask.ndim == 3 and pred_mask.shape[0] == 1:
                pred_mask = pred_mask[0]

            pred_bbox = mask_to_bbox(pred_mask)
            print(f"Predicted bbox: {pred_bbox}")

            if not pred_mask.any():  # Skip if prediction is empty
                print("Warning: Empty prediction - skipping")
                continue

            coco_dt.append({
                "image_id": image_id,
                "category_id": 1,
                "bbox": pred_bbox,
                "score": 1.0,
                "segmentation": _encode_binary_mask(pred_mask)
            })

        except Exception as e:
            # Best-effort per image: report and move on to the next pair.
            print(f"Error processing {img_path}: {str(e)}")
            continue

    if not coco_dt:
        print("Error: No valid predictions generated!")
        return None

    try:
        coco_gt_obj = COCO()
        coco_gt_obj.dataset = coco_gt
        coco_gt_obj.createIndex()

        coco_dt_obj = coco_gt_obj.loadRes(coco_dt)
        coco_eval = COCOeval(coco_gt_obj, coco_dt_obj, 'bbox')
        coco_eval.evaluate()
        coco_eval.accumulate()
        coco_eval.summarize()

        return coco_eval.stats

    except Exception as e:
        print(f"Error in COCO evaluation: {str(e)}")
        return None

In [None]:
def main(data_dir=r"C:\Users\dell\Desktop\Assignment\CMU10_3D\data_2D"):
    """Evaluate every product category found in ``data_dir`` and print a summary.

    Args:
        data_dir: Directory holding the images and ground-truth masks. The
            default is the original machine-specific dataset location.
    """
    category_dict = group_files_by_category(data_dir)

    results = {}
    for category, files in category_dict.items():
        # group_files_by_category already pairs each mask with its image, so
        # reuse that pairing (first mask per image) instead of re-matching
        # file names by substring for every image.
        first_mask_by_image = {}
        for img_path, mask in files['masks']:
            first_mask_by_image.setdefault(img_path, mask)

        image_mask_pairs = [
            (img, first_mask_by_image[img])
            for img in files['images']
            if img in first_mask_by_image
        ]

        if len(image_mask_pairs) < 2:
            print(f"Skipping {category} - needs at least 2 images with masks")
            continue

        stats = evaluate_product_category(category, image_mask_pairs)

        if stats is not None:
            results[category] = {
                "AP": stats[0],  # AP @ IoU=0.50:0.95
                "AP50": stats[1],  # AP @ IoU=0.50
                "AP75": stats[2],  # AP @ IoU=0.75
                "AR": stats[8]     # AR @ maxDets=100
            }

    print("\n=== Final Results ===")
    for product, metrics in results.items():
        print(f"\n{product}:")
        print(f"  AP: {metrics['AP']:.3f}")
        print(f"  AP50: {metrics['AP50']:.3f}")
        print(f"  AP75: {metrics['AP75']:.3f}")
        print(f"  AR: {metrics['AR']:.3f}")

if __name__ == "__main__":
    main()


Evaluating can_chowder...
Using reference image: can_chowder_000001.jpg
Using reference mask: can_chowder_000001_1_gt.png
Reference bounding box: x=[371,464], y=[150,290]

Processing image 1/49: can_chowder_000002.jpg
Ground truth bbox: [249, 148, 87, 130]
frame loading (JPEG): 100%|██████████| 2/2 [00:00<00:00,  9.73it/s]

Skipping the post-processing step due to the error above. You can still use SAM 2 and it's OK to ignore the error above, although some post-processing functionality may be limited (which doesn't affect the results in most cases; see https://github.com/facebookresearch/sam2/blob/main/INSTALL.md).
  pred_masks_gpu = fill_holes_in_mask_scores(
propagate in video:   0%|          | 0/2 [00:00<?, ?it/s]

Debug: Frame 0 has 1 objects
Object IDs: [1]
propagate in video: 100%|██████████| 2/2 [00:05<00:00,  2.66s/it]

Debug: Frame 1 has 1 objects
Object IDs: [1]
Predicted bbox: [249, 150, 82, 128]

Processing image 2/49: can_chowder_000003.jpg
Ground truth bbox: [398, 92, 10