In [None]:
# ✅ Install required dependencies
!pip install torch torchvision pycocotools opencv-python numpy


Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [None]:
import os

# ✅ Create dataset directory
!mkdir -p data/coco

# ✅ Download small COCO subset (5k images)
!wget -q http://images.cocodataset.org/zips/val2017.zip -P data/coco/
!unzip -q data/coco/val2017.zip -d data/coco/

# ✅ Download COCO Annotations
!wget -q http://images.cocodataset.org/annotations/annotations_trainval2017.zip -P data/coco/
!unzip -q data/coco/annotations_trainval2017.zip -d data/coco/

print("✅ COCO dataset downloaded and extracted.")


✅ COCO dataset downloaded and extracted.


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
import torchvision.models as models
from torchvision.ops import RoIAlign
from torch.utils.data import DataLoader, Dataset
import numpy as np
import cv2
import json
import time
from PIL import Image
from pycocotools.coco import COCO
import torch.nn.functional as F


In [None]:
# ✅ Select the compute device: CUDA when PyTorch can see a GPU, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# ✅ Turn on cuDNN benchmark mode (intended as a speed optimisation).
torch.backends.cudnn.benchmark = True

print(f"✅ Using device: {device}")


✅ Using device: cpu


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
import torchvision.models as models
from torchvision.ops import RoIAlign
from torch.utils.data import DataLoader, Dataset
import numpy as np
import cv2
import os
import json
import time
from PIL import Image
from pycocotools.coco import COCO

# ✅ Select the compute device: CUDA when PyTorch can see a GPU, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using Device: {device}")

# ✅ COCO Dataset Class (Optimized)
class COCODataset(Dataset):
    """COCO instance dataset that yields fixed-size images with boxes, masks and labels.

    Only the first 1000 image ids of the annotation file are used. Every image
    is resized to ``self.img_size``; boxes and masks are brought into that same
    resized pixel grid so all three stay aligned.

    Args:
        image_dir: Directory that contains the image files.
        annotation_file: Path to the COCO instances JSON file.
        transform: Optional callable applied to the resized PIL image.
    """

    def __init__(self, image_dir, annotation_file, transform=None):
        self.image_dir = image_dir
        self.coco = COCO(annotation_file)
        self.img_ids = list(self.coco.imgs.keys())[:1000]  # ✅ Use only 1000 images
        self.transform = transform
        self.img_size = (128, 128)  # ✅ (width, height) every sample is resized to

    def __len__(self):
        return len(self.img_ids)

    @staticmethod
    def _xywh_to_scaled_xyxy(bbox, scale_x, scale_y):
        """Convert a COCO ``[x, y, w, h]`` box to ``[x1, y1, x2, y2]`` in resized pixels."""
        x, y, w, h = bbox
        return [x * scale_x, y * scale_y, (x + w) * scale_x, (y + h) * scale_y]

    def _load_sample(self, idx):
        """Load the sample at ``idx``; return ``None`` when the image has no annotations."""
        img_id = self.img_ids[idx]
        anns = self.coco.loadAnns(self.coco.getAnnIds(imgIds=img_id))
        if not anns:
            return None

        img_info = self.coco.loadImgs(img_id)[0]
        img_path = os.path.join(self.image_dir, img_info['file_name'])
        # Context manager closes the underlying file once the pixels are converted.
        with Image.open(img_path) as img_file:
            image = img_file.convert("RGB")

        # Boxes are annotated in original-image pixels, so record the scale
        # factors before the image is resized.
        orig_w, orig_h = image.size
        target_w, target_h = self.img_size
        scale_x = target_w / orig_w
        scale_y = target_h / orig_h

        # ✅ Resize to 128x128 for faster processing
        image = image.resize(self.img_size, Image.BILINEAR)

        boxes, masks, labels = [], [], []
        for ann in anns:
            boxes.append(self._xywh_to_scaled_xyxy(ann['bbox'], scale_x, scale_y))

            # Nearest-neighbour keeps the mask strictly binary after resizing.
            mask = self.coco.annToMask(ann)
            mask = cv2.resize(mask, self.img_size, interpolation=cv2.INTER_NEAREST)
            masks.append(mask)

            labels.append(ann['category_id'])

        # ✅ Convert to tensors with correct shape
        boxes = torch.tensor(boxes, dtype=torch.float32).reshape(-1, 4)  # ✅ Shape (N, 4)
        masks = torch.tensor(np.array(masks), dtype=torch.uint8).unsqueeze(1)  # ✅ Shape (N, 1, H, W)
        labels = torch.tensor(labels, dtype=torch.int64).flatten()  # ✅ Shape (N,)

        if self.transform:
            image = self.transform(image)

        return image, boxes, masks, labels

    def __getitem__(self, idx):
        """Return ``(image, boxes, masks, labels)`` for ``idx``.

        Images without annotations are skipped by moving on to the next index
        (wrapping around). The search is bounded by the dataset length.

        Raises:
            RuntimeError: If none of the images has any annotation.
        """
        num_images = len(self.img_ids)
        for offset in range(num_images):
            sample = self._load_sample((idx + offset) % num_images)
            if sample is not None:
                return sample
        raise RuntimeError("COCODataset contains no annotated images.")

# ✅ Image preprocessing: the only step is conversion of the PIL image to a tensor.
transform = transforms.Compose([transforms.ToTensor()])

# ✅ Dataset over the val2017 images and their instance annotations.
train_dataset = COCODataset(
    image_dir="data/coco/val2017",
    annotation_file="data/coco/annotations/instances_val2017.json",
    transform=transform,
)

# ✅ Batches of 2, reshuffled each epoch. The collate function transposes a list
# of per-sample tuples into one tuple per field (images, boxes, masks, labels)
# without stacking, since each sample has its own number of instances.
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=2,
    shuffle=True,
    collate_fn=lambda samples: tuple(zip(*samples)),
)


Using Device: cpu
loading annotations into memory...
Done (t=1.73s)
creating index...
index created!


In [None]:
# ✅ Image preprocessing: the only step is conversion of the PIL image to a tensor.
transform = transforms.Compose([transforms.ToTensor()])

# ✅ Dataset over the val2017 images and their instance annotations.
train_dataset = COCODataset(
    image_dir="data/coco/val2017",
    annotation_file="data/coco/annotations/instances_val2017.json",
    transform=transform,
)

# ✅ Batches of 2, reshuffled each epoch. The collate function transposes a list
# of per-sample tuples into one tuple per field (images, boxes, masks, labels)
# without stacking, since each sample has its own number of instances.
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=2,
    shuffle=True,
    collate_fn=lambda samples: tuple(zip(*samples)),
)

print("✅ COCO dataset loaded into DataLoader.")


loading annotations into memory...
Done (t=0.85s)
creating index...
index created!
✅ COCO dataset loaded into DataLoader.


In [None]:
# ✅ Mask R-CNN Model Definition
class MaskRCNN(nn.Module):
    """Simplified Mask R-CNN-style network on a ResNet-18 trunk.

    The trunk is ResNet-18 without its average-pool and fully connected layers,
    so it outputs a 512-channel feature map. Classification and box regression
    run on RoI-aligned 7x7 crops of that map. The mask head runs on the whole
    feature map, not on individual RoIs.

    Args:
        num_classes: Number of outputs of the classification head; the box head
            emits ``num_classes * 4`` values per RoI.
        num_mask_classes: Number of output channels of the mask head. Defaults
            to 80, the value that was previously hard-coded.
    """

    def __init__(self, num_classes=91, num_mask_classes=80):
        super().__init__()
        # `weights=` replaces the deprecated `pretrained=True` flag.
        resnet = models.resnet18(weights=models.ResNet18_Weights.DEFAULT)  # ✅ Using ResNet-18 for speed
        self.backbone = nn.Sequential(*list(resnet.children())[:-2])

        # RPN layers are created (and saved in the state dict) but are not
        # called from forward(); proposals are supplied by the caller.
        self.rpn = nn.Conv2d(512, 256, kernel_size=3, stride=1, padding=1)
        self.rpn_cls = nn.Conv2d(256, 9*2, kernel_size=1)
        self.rpn_reg = nn.Conv2d(256, 9*4, kernel_size=1)

        # The ResNet trunk through layer4 downsamples by 32 (stem conv, max-pool
        # and three strided stages), so image-space boxes map onto the feature
        # map with a factor of 1/32, not 1/16.
        self.roi_align = RoIAlign(output_size=(7, 7), spatial_scale=1 / 32, sampling_ratio=-1)

        self.classifier = nn.Sequential(
            nn.Linear(512 * 7 * 7, 256),
            nn.ReLU(),
            nn.Linear(256, num_classes)
        )
        self.bbox_reg = nn.Sequential(
            nn.Linear(512 * 7 * 7, 256),
            nn.ReLU(),
            nn.Linear(256, num_classes * 4)
        )
        self.mask_branch = nn.Sequential(
            nn.Conv2d(512, 128, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Conv2d(128, 128, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Conv2d(128, num_mask_classes, kernel_size=1)
        )

    def forward(self, x, proposals):
        """Run the trunk and the three heads.

        Args:
            x: Batch of images fed to the ResNet trunk.
            proposals: Boxes handed to ``RoIAlign`` unchanged, in image pixel
                coordinates (for example a ``(K, 5)`` tensor of
                ``[batch_index, x1, y1, x2, y2]`` rows).

        Returns:
            Tuple ``(class_logits, bbox_deltas, mask_logits)``. The first two
            have one row per RoI. ``mask_logits`` has one entry per image and
            the spatial size of the trunk's feature map.
        """
        features = self.backbone(x)
        roi_pooled = self.roi_align(features, proposals)
        roi_pooled_flattened = roi_pooled.view(roi_pooled.size(0), -1)

        class_logits = self.classifier(roi_pooled_flattened)
        bbox_deltas = self.bbox_reg(roi_pooled_flattened)
        mask_logits = self.mask_branch(features)

        return class_logits, bbox_deltas, mask_logits

# ✅ Instantiate the network on the selected device.
model = MaskRCNN()
model = model.to(device)

# ✅ AdamW over every model parameter.
optimizer = optim.AdamW(
    params=model.parameters(),
    lr=2e-3,
    weight_decay=1e-4,
)

# ✅ One criterion per head: classification, box regression, mask.
loss_fn_cls = nn.CrossEntropyLoss()
loss_fn_bbox = nn.SmoothL1Loss()
loss_fn_mask = nn.BCEWithLogitsLoss()


In [None]:
# ✅ Load Model & Optimizer
# Builds a fresh model, so any weights trained before this cell ran are discarded.
model = MaskRCNN().to(device)
optimizer = optim.AdamW(model.parameters(), lr=0.002, weight_decay=0.0001)  # ✅ Using AdamW for faster convergence
loss_fn_cls = nn.CrossEntropyLoss()
loss_fn_bbox = nn.SmoothL1Loss()
loss_fn_mask = nn.BCEWithLogitsLoss()
# ✅ Gradient scaler for mixed precision training.
# torch.cuda.amp.GradScaler() is deprecated; torch.amp.GradScaler("cuda") is its
# replacement. It is enabled only when CUDA is available, so CPU runs no longer
# emit a warning.
scaler = torch.amp.GradScaler("cuda", enabled=torch.cuda.is_available())


  scaler = torch.cuda.amp.GradScaler()  # ✅ Enable Mixed Precision Training


In [None]:
# ✅ Training Loop (Optimized & Cleaned)
# Trains `model` for `epochs` passes over `train_loader`, using the ground-truth
# boxes of each image (at most 6 per image) as RoI proposals.
epochs = 3
accumulate_steps = 2  # ✅ Number of batches whose gradients are summed per optimizer step

for epoch in range(epochs):
    model.train()
    start_time = time.time()
    total_loss, correct_preds, total_preds = 0, 0, 0

    # Start each epoch with clean gradients so nothing accumulated earlier
    # (a previous epoch's partial window or an earlier run of this cell) leaks in.
    optimizer.zero_grad()

    for batch_idx, (images, boxes, masks, labels) in enumerate(train_loader):
        images = torch.stack([img.to(device) for img in images])
        boxes = [b.to(device) for b in boxes]
        masks = [m.to(device) for m in masks]
        labels = [l.to(device) for l in labels]

        # ✅ Build RoIs as rows of [batch_index, x1, y1, x2, y2]
        num_proposals = min(6, max(len(b) for b in boxes))
        roi_list, roi_label_list, roi_box_list = [], [], []

        for i, (b, l) in enumerate(zip(boxes, labels)):
            if len(b) == 0:
                continue
            b_idx = torch.full((b.shape[0], 1), i, dtype=torch.float32, device=device)
            roi_list.append(torch.cat((b_idx, b[:, :4]), dim=1)[:num_proposals])
            roi_label_list.append(l[:num_proposals])
            roi_box_list.append(b[:num_proposals])

        # Nothing to train on in this batch.
        if not roi_list:
            continue

        # Plain assignments instead of writing through locals(): that trick only
        # works at notebook top level and silently does nothing inside a function.
        proposals = torch.cat(roi_list, dim=0)
        proposal_labels = torch.cat(roi_label_list, dim=0)
        proposal_boxes = torch.cat(roi_box_list, dim=0)

        # ✅ Forward Pass
        class_logits, bbox_deltas, mask_logits = model(images, proposals)

        # ✅ Classification loss
        loss_cls = loss_fn_cls(class_logits, proposal_labels)

        # ✅ Box loss: take the 4 outputs belonging to the ground-truth class of
        # each RoI. Selecting by the predicted class would train whichever slot
        # the classifier currently happens to favour.
        num_classes = class_logits.shape[1]
        per_class_deltas = bbox_deltas.view(-1, num_classes, 4)
        roi_index = torch.arange(per_class_deltas.size(0), device=per_class_deltas.device)
        selected_bbox_deltas = per_class_deltas[roi_index, proposal_labels]
        # NOTE(review): the regression target is the raw box in pixel units, not
        # an offset relative to the proposal, so this term can dominate the total
        # loss. Confirm this is intended before relying on the box head.
        loss_bbox = loss_fn_bbox(selected_bbox_deltas, proposal_boxes)

        # ✅ Mask loss: the mask head returns one map per image, so build one
        # target per image: the union of that image's instance masks, resized
        # to the head's output resolution and broadcast over its channels.
        mask_size = tuple(mask_logits.shape[-2:])
        mask_targets = torch.stack([
            F.interpolate(
                m.float().amax(dim=0, keepdim=True),  # (1, 1, H, W) union of instances
                size=mask_size,
                mode="bilinear",
                align_corners=False,
            )[0]
            for m in masks
        ])
        mask_targets = mask_targets.expand(-1, mask_logits.shape[1], -1, -1)
        loss_mask = loss_fn_mask(mask_logits, mask_targets)

        # ✅ Total Loss
        loss = loss_cls + loss_bbox + loss_mask

        # ✅ Skip non-finite losses so they cannot poison the accumulated gradients
        if torch.isfinite(loss):
            # Scale so the summed gradient of a window is the mean over its batches.
            (loss / accumulate_steps).backward()

            total_loss += loss.item()

            # ✅ Track Accuracy
            predicted_classes = torch.argmax(class_logits, dim=1)
            correct_preds += (predicted_classes == proposal_labels).sum().item()
            total_preds += proposal_labels.numel()

        # Step on every window boundary, even if this batch was skipped, so a
        # skipped batch does not merge two windows into one.
        if (batch_idx + 1) % accumulate_steps == 0:
            optimizer.step()
            optimizer.zero_grad()

        if batch_idx % 10 == 0:
            print(f"🟢 Epoch {epoch+1}, Batch {batch_idx}/{len(train_loader)}: Loss {loss.item():.4f}")

    # Apply gradients left over from a trailing, incomplete accumulation window.
    if len(train_loader) % accumulate_steps != 0:
        optimizer.step()
        optimizer.zero_grad()

    # ✅ Compute Training Accuracy
    train_accuracy = correct_preds / total_preds if total_preds > 0 else 0.0
    epoch_time = time.time() - start_time

    print(f"✅ Epoch {epoch+1} Done: Loss {total_loss:.4f}, Train Acc: {train_accuracy:.4f}, Time: {epoch_time:.2f}s")


🟢 Epoch 1, Batch 0/500: Loss 171.1923
🟢 Epoch 1, Batch 10/500: Loss 295.4829
🟢 Epoch 1, Batch 20/500: Loss 356.3213
🟢 Epoch 1, Batch 30/500: Loss 321.7159
🟢 Epoch 1, Batch 40/500: Loss 298.6768
🟢 Epoch 1, Batch 50/500: Loss 300.3774
🟢 Epoch 1, Batch 60/500: Loss 230.6032
🟢 Epoch 1, Batch 70/500: Loss 298.3897
🟢 Epoch 1, Batch 80/500: Loss 249.2469
🟢 Epoch 1, Batch 90/500: Loss 256.9485
🟢 Epoch 1, Batch 100/500: Loss 259.0771
🟢 Epoch 1, Batch 110/500: Loss 249.7117
🟢 Epoch 1, Batch 120/500: Loss 295.4006
🟢 Epoch 1, Batch 130/500: Loss 266.6893
🟢 Epoch 1, Batch 140/500: Loss 361.4937
🟢 Epoch 1, Batch 150/500: Loss 266.8705
🟢 Epoch 1, Batch 160/500: Loss 158.3012
🟢 Epoch 1, Batch 170/500: Loss 459.1813
🟢 Epoch 1, Batch 180/500: Loss 235.3674
🟢 Epoch 1, Batch 190/500: Loss 306.2016
🟢 Epoch 1, Batch 200/500: Loss 231.2944
🟢 Epoch 1, Batch 210/500: Loss 252.7033
🟢 Epoch 1, Batch 220/500: Loss 215.6914
🟢 Epoch 1, Batch 230/500: Loss 362.9720
🟢 Epoch 1, Batch 240/500: Loss 274.4989
🟢 Epoch 1, 

In [None]:
# Persist only the learned parameters (the state dict), not the module object.
trained_weights = model.state_dict()
torch.save(obj=trained_weights, f="maskrcnn_trained.pth")
