# 1. Introduction
This notebook outlines the steps to convert the COCO dataset into a structured image/mask format for detection and segmentation. The process involves:

* Filtering the dataset to retain only the selected target categories (the driving-related classes listed in `target_categories`, such as car, bus, and truck).
* Storing original images and corresponding segmentation masks in separate folders.
* Splitting the dataset into three subsets: training, validation, and testing.

The output uses the same folder layout as the PennFudanPed pedestrian dataset (`PNGImages` / `PedMasks`), so the converted COCO data can be used alongside the PennFudanPed dataset for detection and segmentation tasks.

# 2. Importing Libraries

In [None]:
# Generic libraries
import os
import json
import urllib.request
import zipfile
from pathlib import Path

# Torchvision libraries
from torchvision import datasets

# Import custom libraries
from utils.classification_utils import set_seeds
from utils.coco_dataset_utils import COCO_2_ImgMsk, select_and_copy_samples, split_dataset

# Warnings
import warnings
# Environment switch and warning filters applied for the whole session
os.environ["TORCH_USE_CUDA_DSA"] = "1"
warnings.filterwarnings(
    "ignore",
    category=UserWarning,
    module="torch.autograd.graph",
)
warnings.filterwarnings(
    "ignore",
    category=FutureWarning,
    module="onnxscript.converter",
)

# Stage switches: the download/extract cells run only when DOWNLOAD_COCO is
# True, the conversion/split cells only when PROCESS_COCO is True.
DOWNLOAD_COCO = False
PROCESS_COCO = False

# Seed shared by set_seeds (project helper from utils.classification_utils)
# and by the split_dataset call further down.
SEED = 42
set_seeds(SEED)

# Output directory, created eagerly (no error if it already exists)
MODEL_DIR = Path("outputs")
os.makedirs(MODEL_DIR, exist_ok=True)

# 2. Downloading the COCO Dataset

In [None]:
if DOWNLOAD_COCO:
    # Archives to fetch; the dict keys are descriptive labels only
    coco_urls = {
        "val_images": "http://images.cocodataset.org/zips/val2017.zip",
        "test_images": "http://images.cocodataset.org/zips/test2017.zip",
        "train_images": "http://images.cocodataset.org/zips/train2017.zip",
        "annotations": "http://images.cocodataset.org/annotations/annotations_trainval2017.zip"
    }

    # Create a directory to store the dataset
    dataset_dir = "d:/Repos/coco_dataset"
    os.makedirs(dataset_dir, exist_ok=True)

    def download_coco(url, filename):
        """Download ``url`` to ``dataset_dir/filename`` unless that file already exists.

        The payload is first written to ``<filename>.part`` and renamed to its
        final name only after the transfer has finished. An interrupted or
        failed download therefore never leaves a truncated archive that a
        later run would mistake for a complete one.

        Args:
            url: Address of the archive to fetch.
            filename: Name of the file to create inside ``dataset_dir``.

        Raises:
            Whatever ``urllib.request.urlretrieve`` raises on failure; the
            partial file is removed before the exception propagates.
        """
        filepath = os.path.join(dataset_dir, filename)
        if os.path.exists(filepath):
            print(f"{filename} already exists.")
            return

        partial_path = filepath + ".part"
        print(f"Downloading {filename}...")
        try:
            urllib.request.urlretrieve(url, partial_path)
            # Atomic rename: the final name appears only for a complete file
            os.replace(partial_path, filepath)
        finally:
            # Still present only if the download or rename did not complete
            if os.path.exists(partial_path):
                os.remove(partial_path)
        print(f"Saved to {filepath}")

    # Download all files, naming each after the last URL path component
    for url in coco_urls.values():
        filename = url.split("/")[-1]
        download_coco(url, filename)

In [None]:
if DOWNLOAD_COCO:
    # Extract every downloaded archive into the dataset directory and delete
    # the archive afterwards. `dataset_dir` comes from the download cell.
    PATH = Path(dataset_dir)

    for archive_name in (
        "val2017.zip",
        "annotations_trainval2017.zip",
        "test2017.zip",
        "train2017.zip",
    ):
        zip_file = PATH / archive_name

        # The archive is removed after extraction, so on a re-run of this cell
        # it may legitimately be absent; skip instead of raising.
        if not zip_file.exists():
            print(f"{archive_name} not found, skipping extraction.")
            continue

        with zipfile.ZipFile(zip_file, 'r') as zip_ref:
            zip_ref.extractall(dataset_dir)

        # Reached only when extraction finished without raising
        os.remove(zip_file)

# 3. Processing the COCO Dataset for Driving Segmentation

In [None]:
if PROCESS_COCO:
    # Path to COCO annotations file
    ANNOTATIONS_PATH = r"D:\Repos\coco_dataset\annotations\instances_train2017.json"

    # Load COCO annotations. The encoding is given explicitly so the file is
    # decoded as UTF-8 rather than with the platform default (a legacy code
    # page on Windows).
    with open(ANNOTATIONS_PATH, "r", encoding="utf-8") as f:
        coco_data = json.load(f)

    # Extract category ID to name mapping
    categories = {c["id"]: c["name"] for c in coco_data["categories"]}

    # Display all categories as "<id>: <name>", one per line
    for cat_id, cat_name in categories.items():
        print(f"{cat_id}: {cat_name}")

In [None]:
# Category names passed as `selected_categories` to the COCO_2_ImgMsk calls below.
# NOTE(review): the official COCO category names use spaces ("traffic light",
# "stop sign"). Confirm that COCO_2_ImgMsk maps the underscore spellings used
# here onto them; the previous cell prints the exact names when PROCESS_COCO
# is enabled.
target_categories = [
    'car',
    'traffic_light',
    'stop_sign',
    'motorcycle',
    'bicycle',
    'bus',
    'truck',
]

In [None]:
if PROCESS_COCO:
    # Convert the COCO train2017 split: images and masks for the selected
    # categories are written to the shared "driving" output folders.
    coco_root = r"D:\Repos\coco_dataset"
    driving_root = coco_root + r"\driving"

    COCO_2_ImgMsk(
        coco_images_path=coco_root + r"\train2017",
        coco_annotations_path=coco_root + r"\annotations\instances_train2017.json",
        output_images_dir=driving_root + r"\PNGImages",
        output_masks_dir=driving_root + r"\PedMasks",
        selected_categories=target_categories,
        reset_category_ids=True,
        label="train",
    )

In [None]:
if PROCESS_COCO:
    # Convert the COCO val2017 split into the same output folders as the
    # train2017 conversion, tagged with label "val".
    coco_root = r"D:\Repos\coco_dataset"
    driving_root = coco_root + r"\driving"

    COCO_2_ImgMsk(
        coco_images_path=coco_root + r"\val2017",
        coco_annotations_path=coco_root + r"\annotations\instances_val2017.json",
        output_images_dir=driving_root + r"\PNGImages",
        output_masks_dir=driving_root + r"\PedMasks",
        selected_categories=target_categories,
        reset_category_ids=True,
        label="val",
    )

In [None]:
if PROCESS_COCO:
    # Distribute the converted image/mask pairs over train (80%),
    # validation (10%) and test (10%) folders, using the notebook-wide seed.
    src_root = r"D:\Repos\coco_dataset\driving"
    dst_root = r"C:\Users\ssre_\Projects\torchsuite\data\driving"

    split_dataset(
        src_images=src_root + r"\PNGImages",
        src_masks=src_root + r"\PedMasks",
        dst_train_images=dst_root + r"\train\PNGImages",
        dst_train_masks=dst_root + r"\train\PedMasks",
        dst_val_images=dst_root + r"\val\PNGImages",
        dst_val_masks=dst_root + r"\val\PedMasks",
        dst_test_images=dst_root + r"\test\PNGImages",
        dst_test_masks=dst_root + r"\test\PedMasks",
        train_pct=0.80,
        val_pct=0.10,
        test_pct=0.10,
        seed=SEED,
    )

What Information Do x1 and x2 Contain?
x1: The UpSampled Feature Map

* x1 is the output from the previous layer after upsampling using ConvTranspose2d.
* It comes from a deeper layer of the network (lower resolution, more abstract features).
* It contains high-level, semantic information about objects and structures in the image.
* Since it's upsampled, it lacks fine-grained spatial details (edges, textures).

x2: The Skip Connection from the Encoder

* x2 is taken from an earlier encoder layer before downsampling.
* It has a higher spatial resolution (more fine details).
* It contains low-level features such as edges, textures, and shapes.
* Since it comes from an early stage, it lacks deep semantic meaning.