# 1. Introduction
This notebook walks through the steps needed to convert the COCO dataset for object detection and segmentation into a folder-based layout. The images and their masks are stored in separate folders, and the resulting dataset is then split into three subsets: training, validation, and testing.

# 2. Importing Libraries

In [None]:
# Generic libraries
import os
import json
import urllib.request
import zipfile
from pathlib import Path

# Torchvision libraries
from torchvision import datasets

# Import custom libraries
from utils.classification_utils import set_seeds
from utils.coco_dataset_utils import COCO_2_ImgMsk, select_and_copy_samples, split_dataset

# Warnings
import warnings
# Set the TORCH_USE_CUDA_DSA environment variable and silence two specific warning sources
os.environ.update({"TORCH_USE_CUDA_DSA": "1"})
for noisy_category, noisy_module in (
    (UserWarning, "torch.autograd.graph"),
    (FutureWarning, "onnxscript.converter"),
):
    warnings.filterwarnings("ignore", category=noisy_category, module=noisy_module)

# Output directory, relative to the working directory; created if it does not exist yet
MODEL_DIR = Path("outputs")
os.makedirs(MODEL_DIR, exist_ok=True)

# Seed through the project helper so that runs are repeatable
set_seeds(42)

# 2. Downloading the COCO Dataset

In [None]:
# COCO 2017 archives to fetch; the insertion order is the order they are downloaded in
coco_urls = dict(
    val_images="http://images.cocodataset.org/zips/val2017.zip",
    test_images="http://images.cocodataset.org/zips/test2017.zip",
    train_images="http://images.cocodataset.org/zips/train2017.zip",
    annotations="http://images.cocodataset.org/annotations/annotations_trainval2017.zip",
)

# Local folder that receives the archives (created together with any missing parents)
dataset_dir = "d:/Repos/coco_dataset"
Path(dataset_dir).mkdir(parents=True, exist_ok=True)

# Download function
def download_coco(url, filename, target_dir=None):
    """Download ``url`` to ``filename`` inside the dataset folder, unless it is already there.

    The payload is first written to ``<filename>.part`` and only renamed to its
    final name once the transfer has finished. An interrupted download therefore
    never leaves a truncated file behind that a later run would mistake for a
    complete archive.

    Args:
        url: Address of the file to download.
        filename: Name of the file to create inside the target folder.
        target_dir: Folder to download into. Defaults to the notebook-level
            ``dataset_dir`` when omitted.

    Raises:
        Whatever ``urllib.request.urlretrieve`` raises on failure; the partial
        file is removed before the exception propagates.
    """
    base_dir = dataset_dir if target_dir is None else target_dir
    filepath = os.path.join(base_dir, filename)

    if os.path.exists(filepath):
        print(f"{filename} already exists.")
        return

    partial_path = filepath + ".part"
    print(f"Downloading {filename}...")
    try:
        urllib.request.urlretrieve(url, partial_path)
        # Rename only after the whole payload has been written.
        os.replace(partial_path, filepath)
    except BaseException:
        # Also covers a kernel interrupt (KeyboardInterrupt): drop the incomplete file.
        if os.path.exists(partial_path):
            os.remove(partial_path)
        raise
    print(f"Saved to {filepath}")

# Fetch every archive, naming each local file after the last segment of its URL
for url in coco_urls.values():
    filename = url.rpartition("/")[2]
    download_coco(url, filename)

In [None]:
# Unzip the downloaded archives, deleting each one after it has been extracted
PATH = Path(dataset_dir)

# Archive name -> top-level folder it is expected to unpack into.
# NOTE(review): folder names are inferred from the paths used later in this
# notebook (val2017, train2017, annotations); test2017 is assumed to follow
# the same pattern — confirm.
COCO_ARCHIVES = {
    "val2017.zip": "val2017",
    "annotations_trainval2017.zip": "annotations",
    "test2017.zip": "test2017",
    "train2017.zip": "train2017",
}


def extract_and_remove(zip_file, extract_dir, expected_dir):
    """Extract ``zip_file`` into ``extract_dir`` and delete the archive afterwards.

    Because the archive is removed after extraction, re-running the cell would
    otherwise fail on the missing file. A missing archive is skipped when
    ``expected_dir`` already exists, which means it was unpacked earlier.

    Args:
        zip_file: Path to the ``.zip`` archive.
        extract_dir: Folder the archive is extracted into.
        expected_dir: Folder the archive is expected to have produced.

    Returns:
        True if the archive was extracted, False if it was skipped.

    Raises:
        FileNotFoundError: If neither the archive nor ``expected_dir`` exists.
    """
    zip_file = Path(zip_file)
    if not zip_file.is_file():
        if Path(expected_dir).is_dir():
            print(f"{zip_file.name} already extracted, skipping.")
            return False
        raise FileNotFoundError(f"Archive not found: {zip_file}")

    with zipfile.ZipFile(zip_file, 'r') as zip_ref:
        zip_ref.extractall(extract_dir)

    # Only reached when extraction succeeded, so the archive is safe to drop.
    zip_file.unlink()
    return True


for archive_name, extracted_name in COCO_ARCHIVES.items():
    zip_file = PATH / archive_name
    extract_and_remove(zip_file, dataset_dir, PATH / extracted_name)

In [None]:
# Sanity check: open the extracted val2017 split with torchvision's CocoDetection
# and inspect the raw structure of a single sample.
# NOTE: these absolute paths point into the same folder as `dataset_dir`
# ("d:/Repos/coco_dataset"); keep them in sync if the dataset is moved.
# Define paths
IMAGES_PATH = r"D:\Repos\coco_dataset\val2017"  # Path to validation images
ANNOTATIONS_PATH = r"D:\Repos\coco_dataset\annotations\instances_val2017.json"  # Path to annotation file

# Load dataset
dataset = datasets.CocoDetection(root=IMAGES_PATH, annFile=ANNOTATIONS_PATH)

# Fetch a sample (index 0); each sample unpacks into an (image, target) pair
sample = dataset[0]
img, target = sample

# Print types and structure of target annotations.
# The `{expr = }` form echoes the expression text together with its value.
print(f"{type(img) = }")          # Should be <class 'PIL.Image.Image'>
print(f"{type(target) = }")       # Should be <class 'list'>
print(f"{type(target[0]) = }")    # Should be <class 'dict'>
print(f"{target[0].keys() = }")   # Should contain keys like 'segmentation', 'bbox', 'category_id', etc.

loading annotations into memory...
Done (t=0.67s)
creating index...
index created!
type(img) = <class 'PIL.Image.Image'>
type(target) = <class 'list'>
type(target[0]) = <class 'dict'>
target[0].keys() = dict_keys(['segmentation', 'area', 'iscrowd', 'image_id', 'bbox', 'category_id', 'id'])


In [3]:
# Wrap the dataset for the transforms v2 API, requesting only the
# "boxes", "labels" and "masks" target keys.
# NOTE: `dataset` is rebound to its wrapped version here, so re-running this cell
# feeds the already-wrapped dataset back into the wrapper. Re-run the cell that
# creates the CocoDetection dataset first.
dataset = datasets.wrap_dataset_for_transforms_v2(dataset, target_keys=("boxes", "labels", "masks"))

# Fetch the same sample (index 0) again; the target is now indexed by key
sample = dataset[0]
img, target = sample
# Show the container types, then the type of each requested target entry
print(f"{type(img) = }\n{type(target) = }\n{target.keys() = }")
print(f"{type(target['boxes']) = }\n{type(target['labels']) = }\n{type(target['masks']) = }")

type(img) = <class 'PIL.Image.Image'>
type(target) = <class 'dict'>
target.keys() = dict_keys(['boxes', 'masks', 'labels'])
type(target['boxes']) = <class 'torchvision.tv_tensors._bounding_boxes.BoundingBoxes'>
type(target['labels']) = <class 'torch.Tensor'>
type(target['masks']) = <class 'torchvision.tv_tensors._mask.Mask'>


# 3. Processing the COCO Dataset: Train, Validation, Test

In [4]:
# Path to COCO annotations file
ANNOTATIONS_PATH = r"D:\Repos\coco_dataset\annotations\instances_val2017.json"

# Load COCO annotations.
# JSON is UTF-8 encoded, so decode it explicitly instead of relying on the
# platform's default text encoding (which is not UTF-8 on every system).
with open(ANNOTATIONS_PATH, "r", encoding="utf-8") as f:
    coco_data = json.load(f)

# Extract category ID to name mapping from the "categories" entries
categories = {c["id"]: c["name"] for c in coco_data["categories"]}

# Display all categories as "<id>: <name>", one per line
for cat_id, cat_name in categories.items():
    print(f"{cat_id}: {cat_name}")

1: person
2: bicycle
3: car
4: motorcycle
5: airplane
6: bus
7: train
8: truck
9: boat
10: traffic light
11: fire hydrant
13: stop sign
14: parking meter
15: bench
16: bird
17: cat
18: dog
19: horse
20: sheep
21: cow
22: elephant
23: bear
24: zebra
25: giraffe
27: backpack
28: umbrella
31: handbag
32: tie
33: suitcase
34: frisbee
35: skis
36: snowboard
37: sports ball
38: kite
39: baseball bat
40: baseball glove
41: skateboard
42: surfboard
43: tennis racket
44: bottle
46: wine glass
47: cup
48: fork
49: knife
50: spoon
51: bowl
52: banana
53: apple
54: sandwich
55: orange
56: broccoli
57: carrot
58: hot dog
59: pizza
60: donut
61: cake
62: chair
63: couch
64: potted plant
65: bed
67: dining table
70: toilet
72: tv
73: laptop
74: mouse
75: remote
76: keyboard
77: cell phone
78: microwave
79: oven
80: toaster
81: sink
82: refrigerator
84: book
85: clock
86: vase
87: scissors
88: teddy bear
89: hair drier
90: toothbrush


In [None]:
# Training dataset: convert the train2017 images and instance annotations
# into the image and mask output folders
train_conversion = {
    "coco_images_path": r"D:\Repos\coco_dataset\train2017",
    "coco_annotations_path": r"D:\Repos\coco_dataset\annotations\instances_train2017.json",
    "output_images_dir": r"D:\Repos\coco_dataset\coco_pennfudan\PNGImages",
    "output_masks_dir": r"D:\Repos\coco_dataset\coco_pennfudan\PedMasks",
    "label": "train",
}
COCO_2_ImgMsk(**train_conversion)

loading annotations into memory...


In [10]:
# Validation and testing dataset: convert the val2017 images and instance
# annotations into the same image and mask output folders, labelled "val"
coco_root = r"D:\Repos\coco_dataset"
converted_root = rf"{coco_root}\coco_pennfudan"

COCO_2_ImgMsk(
    coco_images_path=rf"{coco_root}\val2017",
    coco_annotations_path=rf"{coco_root}\annotations\instances_val2017.json",
    output_images_dir=rf"{converted_root}\PNGImages",
    output_masks_dir=rf"{converted_root}\PedMasks",
    label="val",
)

loading annotations into memory...
Done (t=0.43s)
creating index...
index created!
Dataset conversion completed: 2693 images & masks saved.


In [18]:
# Subset of the converted dataset: select 15000 image-mask pairs (seed 42)
# and copy them into the "coco_pennfudan_sampled" folders
sampling_job = dict(
    input_images_dir=r"D:\Repos\coco_dataset\coco_pennfudan\PNGImages",
    input_masks_dir=r"D:\Repos\coco_dataset\coco_pennfudan\PedMasks",
    output_images_dir=r"D:\Repos\coco_dataset\coco_pennfudan_sampled\PNGImages",
    output_masks_dir=r"D:\Repos\coco_dataset\coco_pennfudan_sampled\PedMasks",
    num_samples=15000,
    seed=42,
)
select_and_copy_samples(**sampling_job)

Successfully copied 15000 image-mask pairs to new folders.


In [19]:
# Split the sampled COCO image/mask pairs into train / val / test folders
sampled_root = r"D:\Repos\coco_dataset\coco_pennfudan_sampled"
data_root = r"C:\Users\ssre_\Projects\torchsuite\data"

split_kwargs = {
    "src_images": rf"{sampled_root}\PNGImages",
    "src_masks": rf"{sampled_root}\PedMasks",
}
# One destination image folder and one mask folder per subset
for subset in ("train", "val", "test"):
    subset_root = rf"{data_root}\coco_pennfudan_sampled_{subset}"
    split_kwargs[f"dst_{subset}_images"] = rf"{subset_root}\PNGImages"
    split_kwargs[f"dst_{subset}_masks"] = rf"{subset_root}\PedMasks"
# 70% training, 15% validation, 15% testing, with a fixed seed
split_kwargs.update(train_pct=0.7, val_pct=0.15, test_pct=0.15, seed=42)

split_dataset(**split_kwargs)

In [22]:
# Split the PennFudanPed image/mask pairs into train / val / test folders.
# NOTE(review): the destinations are the same "coco_pennfudan_sampled_*" folders
# that the sampled COCO split writes to — presumably to merge both datasets; confirm.
torchsuite_data = r"C:\Users\ssre_\Projects\torchsuite\data"
pennfudan_root = rf"{torchsuite_data}\PennFudanPed"
train_root = rf"{torchsuite_data}\coco_pennfudan_sampled_train"
val_root = rf"{torchsuite_data}\coco_pennfudan_sampled_val"
test_root = rf"{torchsuite_data}\coco_pennfudan_sampled_test"

split_dataset(
    src_images=rf"{pennfudan_root}\PNGImages",
    src_masks=rf"{pennfudan_root}\PedMasks",
    dst_train_images=rf"{train_root}\PNGImages",
    dst_train_masks=rf"{train_root}\PedMasks",
    dst_val_images=rf"{val_root}\PNGImages",
    dst_val_masks=rf"{val_root}\PedMasks",
    dst_test_images=rf"{test_root}\PNGImages",
    dst_test_masks=rf"{test_root}\PedMasks",
    train_pct=0.7,   # 70% for training
    val_pct=0.15,    # 15% for validation
    test_pct=0.15,   # 15% for testing
    seed=42,
)

Dataset split successfully.
