### COCO Dataset Format

The COCO (Common Objects in Context) dataset format is widely used for object detection, segmentation, and keypoint detection tasks. The dataset is stored in a JSON file with the following key components:

1. **Images**:
    - Contains metadata for each image, such as:
      - `id`: A unique identifier for the image.
      - `file_name`: The name of the image file.
      - `height`: The height of the image in pixels.
      - `width`: The width of the image in pixels.

2. **Annotations**:
    - Contains annotations for objects within each image, such as:
      - `id`: A unique identifier for the annotation.
      - `image_id`: The ID of the image to which this annotation belongs.
      - `category_id`: The ID of the category this object belongs to.
      - `bbox`: (Optional) The bounding box of the object as a flat list `[x, y, width, height]`, where `(x, y)` is the top-left corner in pixels.
      - `segmentation`: (Optional) The segmentation mask for the object.
      - `area`: (Optional) The area of the object in pixels.
      - `iscrowd`: Indicates whether the annotation represents a crowd (1) or a single object (0).

3. **Categories**:
    - Contains a list of categories for classification, including:
      - `id`: A unique identifier for the category.
      - `name`: The name of the category (e.g., "cat", "dog").

This format is designed to facilitate various computer vision tasks by organizing images, annotations, and categories in a standardized way, making it easier to train and evaluate models on diverse datasets.


In [1]:
import json
import os
import shutil
import sqlite3
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed

import cv2
import numpy as np
from PIL import Image
from pycocotools import mask as maskUtils
from sklearn.model_selection import train_test_split

In [2]:
# Open the SQLite database and create the cursor used by the queries below
database_file = 'dataset.db'
conn = sqlite3.connect(database_file)
cursor = conn.cursor()

In [3]:
# Step 1: Collect the distinct class names used by segmentation annotations
cursor.execute("SELECT DISTINCT name FROM annotations WHERE type='segmentation'")
unique_class_names = cursor.fetchall()

# Step 2: Create the `categories` table (a no-op when it already exists)
cursor.execute("""
    CREATE TABLE IF NOT EXISTS categories (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        name TEXT UNIQUE NOT NULL
    )
""")

# Step 3: Insert the class names into the `categories` table.
# The UNIQUE constraint plus INSERT OR IGNORE makes re-running this cell
# harmless. Every fetched row is already a 1-tuple, which is the parameter
# shape executemany expects.
cursor.executemany("INSERT OR IGNORE INTO categories (name) VALUES (?)", unique_class_names)

# Persist the inserted rows: sqlite3 opens an implicit transaction for INSERT
# and rolls it back if the connection is closed without a commit.
conn.commit()


In [4]:
# Build the COCO `images` section: one dict per row of the `images` table
cursor.execute("SELECT id, filename, height, width FROM images")
images = [
    {
        "id": image_id,
        "file_name": file_name,
        "height": height,
        "width": width
    }
    for image_id, file_name, height, width in cursor.fetchall()
]

In [5]:
# Build the COCO `categories` section together with a name -> id lookup.
# The exported id is the database id minus one.
cursor.execute("SELECT id, name FROM categories")
category_id_map = {}  # category name -> exported category id
categories = []
for db_id, category_name in cursor.fetchall():
    exported_id = db_id - 1
    category_id_map[category_name] = exported_id
    categories.append({"id": exported_id, "name": category_name})

In [6]:
# Step 3: Build one COCO annotation per segmentation row.
# Each row's `value` column is the file name of a mask image under
# datasets/masks/; the mask yields the polygons, area and bounding box.
cursor.execute("SELECT id, name, value, imageID FROM annotations WHERE type='segmentation'")
annotation_rows = cursor.fetchall()

# Look up every image size once instead of running one query per annotation
cursor.execute("SELECT id, width, height FROM images")
image_sizes = {
    row_image_id: (width, height)
    for row_image_id, width, height in cursor.fetchall()
}

annotations = []
for annotation_id, class_name, mask_path, image_id in annotation_rows:
    # Get the corresponding category ID
    category_id = category_id_map[class_name]

    if image_id not in image_sizes:
        raise ValueError(
            f"Annotation {annotation_id} references image {image_id}, "
            "which is not present in the images table"
        )
    image_width, image_height = image_sizes[image_id]

    # Load the mask as grayscale and scale it to the size recorded for the
    # image. NEAREST resampling copies existing pixel values instead of
    # blending them, so no intermediate gray levels are introduced.
    with Image.open('datasets/masks/' + mask_path) as mask_file:
        mask = mask_file.convert("L").resize((image_width, image_height), Image.NEAREST)

    # Every non-zero pixel counts as part of the object
    binary_mask = np.array(mask) > 0

    # Find contours (external contours only)
    contours, _ = cv2.findContours(binary_mask.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    # Convert contours to COCO polygons (flat [x1, y1, x2, y2, ...] lists).
    # Each point contributes two values, so "> 4" keeps only polygons with
    # at least 3 points.
    segmentation = [
        polygon
        for polygon in (contour.flatten().tolist() for contour in contours)
        if len(polygon) > 4
    ]

    # Area is the number of foreground pixels in the mask
    area = np.sum(binary_mask)

    # Bounding box of all foreground pixels
    x, y, w, h = cv2.boundingRect(binary_mask.astype(np.uint8))

    # COCO expects `bbox` as a flat [x, y, width, height] list; it must not
    # be wrapped in another list.
    annotations.append({
        "id": annotation_id,
        "image_id": image_id,
        "category_id": category_id,
        "segmentation": segmentation,  # List of polygon points
        "area": int(area),
        "iscrowd": 0,
        "bbox": [int(x), int(y), int(w), int(h)],
    })


In [7]:
# Step 4: Bundle the three sections into the top-level COCO structure
coco_dataset = dict(
    images=images,
    annotations=annotations,
    categories=categories,
)

In [None]:
# Move the images into one folder and update the COCO JSON
def get_image_size(image_path):
    """Return the ``size`` attribute (width, height) of the image at *image_path*."""
    with Image.open(image_path) as opened_image:
        dimensions = opened_image.size
    return dimensions

def get_file_modification_time(file_path):
    """Return the last-modification timestamp of *file_path* in seconds since the epoch."""
    file_status = os.stat(file_path)
    return file_status.st_mtime

# One lock per file name: workers handling same-named files from different
# folders must not interleave their "already seen?" check with the copy,
# while files with different names can still be copied in parallel.
_FILE_NAME_LOCKS = {}
_FILE_NAME_LOCKS_GUARD = threading.Lock()


def _lock_for_file_name(file_name):
    """Return the lock that serialises work on *file_name*, creating it on first use."""
    with _FILE_NAME_LOCKS_GUARD:
        return _FILE_NAME_LOCKS.setdefault(file_name, threading.Lock())


def process_image_file(file, root, images, seen_files, output_folder):
    """Copy one image file into *output_folder* if it belongs to the dataset.

    The file is copied only when a COCO image entry has the same base name
    and the same pixel dimensions. When several source files share a name,
    the one with the most recent modification time ends up in the output.

    Args:
        file: Base name of the candidate file.
        root: Directory that contains *file*.
        images: COCO image dicts with ``file_name``, ``width`` and ``height``.
        seen_files: Shared dict mapping a file name to
            ``{'path': source_path, 'image_info': coco_image}`` for every
            file copied so far; updated in place.
        output_folder: Directory the file is copied into.
    """
    src_file_path = os.path.join(root, file)

    # Only the first COCO entry whose base name matches is considered
    image = next(
        (entry for entry in images if os.path.basename(entry['file_name']) == file),
        None,
    )
    if image is None:
        return

    src_width, src_height = get_image_size(src_file_path)
    if (src_width, src_height) != (image['width'], image['height']):
        print(f"Skipped {src_file_path} due to size mismatch")
        return

    dst_file_path = os.path.join(output_folder, file)

    with _lock_for_file_name(file):
        previous = seen_files.get(file)

        if previous is None:
            shutil.copy2(src_file_path, dst_file_path)
            seen_files[file] = {'path': src_file_path, 'image_info': image}
            print(f"Copied {src_file_path} to {dst_file_path}")
            return

        existing_file_path = previous['path']
        if get_file_modification_time(src_file_path) > get_file_modification_time(existing_file_path):
            # copy2 overwrites an existing destination. Deleting it first is
            # unnecessary and would destroy the source whenever source and
            # destination are the same file.
            shutil.copy2(src_file_path, dst_file_path)
            seen_files[file] = {'path': src_file_path, 'image_info': image}
            print(f"Replaced {existing_file_path} with more recent {src_file_path}")
        else:
            print(f"Skipped {src_file_path} as {existing_file_path} is more recent")

def combine_folders_and_update_coco(coco_dataset, input_folder, output_folder):
    """Gather the dataset's images into one folder and save the updated COCO JSON.

    Walks *input_folder* recursively, hands every image file to
    ``process_image_file`` on a thread pool, then rewrites ``file_name`` to
    ``images/<base name>`` for each image that was copied. The result is
    written to ``coco_dataset.json`` inside *input_folder*.

    Args:
        coco_dataset: In-memory COCO dict (not a path); its ``images``
            entries are modified in place.
        input_folder: Root directory searched for image files.
        output_folder: Directory that receives the copies; created if
            missing. It may be located inside *input_folder*.
    """
    images = coco_dataset['images']

    # Create the output directory if it doesn't exist
    os.makedirs(output_folder, exist_ok=True)

    image_extensions = ('.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff')
    output_key = os.path.normcase(os.path.abspath(output_folder))

    def is_output_folder(path):
        return os.path.normcase(os.path.abspath(path)) == output_key

    seen_files = {}

    # Using ThreadPoolExecutor for multithreading
    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = []
        for root, dirs, files in os.walk(input_folder):
            # Never descend into the output folder: its files are copies made
            # by this function (possibly while the walk is still running) and
            # must not be treated as sources.
            dirs[:] = [name for name in dirs if not is_output_folder(os.path.join(root, name))]
            if is_output_folder(root):
                continue

            for file in files:
                if file.lower().endswith(image_extensions):
                    futures.append(executor.submit(process_image_file, file, root, images, seen_files, output_folder))

        # Wait for all threads to complete; a failure on one file is reported
        # without aborting the others.
        for future in as_completed(futures):
            try:
                future.result()
            except Exception as exc:
                print(f"Generated an exception: {exc}")

    # Update the file_name field of every image that was copied
    for image in images:
        image_file_name = os.path.basename(image['file_name'])
        if image_file_name in seen_files:
            image['file_name'] = os.path.join('images', image_file_name)

    # Save the updated COCO JSON file
    coco_json_path = os.path.join(input_folder, 'coco_dataset.json')
    with open(coco_json_path, 'w') as f:
        json.dump(coco_dataset, f, indent=4)

    print(f"COCO dataset updated and saved to {coco_json_path}")

# Example usage (the first argument is the in-memory COCO dict, not a path to a JSON file):
# combine_folders_and_update_coco(coco_dataset, "path/to/input/folder", "path/to/output/folder")


             
# Consolidate the images of the local dataset folder into its `images` subfolder
input_dir = 'D:/UoL/Final Project/src/datasets'
output_dir = 'D:/UoL/Final Project/src/datasets/images'
combine_folders_and_update_coco(
    coco_dataset,
    input_folder=input_dir,
    output_folder=output_dir,
)

In [9]:
# Close the database connection now that all queries have run.
# NOTE: sqlite3 does not commit on close(); changes that were not committed
# before this point are discarded.
conn.close()