### COCO Dataset Format

The COCO (Common Objects in Context) dataset format is widely used for object detection, segmentation, and keypoint detection tasks. The dataset is stored in a JSON file with the following key components:

1. **Images**:
    - Contains metadata for each image, such as:
      - `id`: A unique identifier for the image.
      - `file_name`: The name of the image file.
      - `height`: The height of the image in pixels.
      - `width`: The width of the image in pixels.

2. **Annotations**:
    - Contains annotations for objects within each image, such as:
      - `id`: A unique identifier for the annotation.
      - `image_id`: The ID of the image to which this annotation belongs.
      - `category_id`: The ID of the category this object belongs to.
      - `bbox`: (Optional) The bounding box of the object as `[x, y, width, height]`, where `(x, y)` is the top-left corner, in pixels.
      - `segmentation`: (Optional) The segmentation mask for the object.
      - `area`: (Optional) The area of the object in pixels.
      - `iscrowd`: Indicates whether the annotation represents a crowd (1) or a single object (0).

3. **Categories**:
    - Contains a list of categories for classification, including:
      - `id`: A unique identifier for the category.
      - `name`: The name of the category (e.g., "cat", "dog").

This format is designed to facilitate various computer vision tasks by organizing images, annotations, and categories in a standardized way, making it easier to train and evaluate models on diverse datasets.


In [1]:
import json
import sqlite3
from PIL import Image
import numpy as np
from pycocotools import mask as maskUtils
import cv2
from sklearn.model_selection import train_test_split
import shutil
import os

In [2]:
# Open the SQLite database file 'dataset.db' and create the cursor that the
# following cells use to run their queries.
conn = sqlite3.connect('dataset.db')
cursor = conn.cursor()

In [3]:
# Step 1: Collect the distinct class names used by segmentation annotations.
# Each fetched row is a 1-tuple, which is exactly the parameter shape that
# executemany() consumes in step 3.
cursor.execute("SELECT DISTINCT name FROM annotations WHERE type='segmentation'")
unique_class_names = cursor.fetchall()

# Step 2: Create the `categories` table (no-op when it already exists)
cursor.execute("""
    CREATE TABLE IF NOT EXISTS categories (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        name TEXT UNIQUE NOT NULL
    )
""")

# Step 3: Insert the class names. OR IGNORE skips names that violate the
# UNIQUE constraint, so re-running this cell does not duplicate categories.
cursor.executemany("INSERT OR IGNORE INTO categories (name) VALUES (?)", unique_class_names)

# Persist the inserted rows; without an explicit commit they are discarded
# when the connection is closed.
conn.commit()


In [4]:
# Load every row of the `images` table and reshape it into a COCO "images" entry
cursor.execute("SELECT id, filename, height, width FROM images")
images = [
    {
        "id": image_id,
        "file_name": filename,
        "height": height,
        "width": width,
    }
    for image_id, filename, height, width in cursor.fetchall()
]

In [5]:
# Fetch categories; every exported id is the database id minus one
cursor.execute("SELECT id, name FROM categories")
categories = [
    {"id": db_id - 1, "name": name}
    for db_id, name in cursor.fetchall()
]

# Category name -> exported id, for fast lookup while building annotations
category_id_map = {entry["name"]: entry["id"] for entry in categories}

In [6]:
# Build the COCO annotations from the segmentation mask images.

# Look up every image's size once instead of issuing one query per annotation.
cursor.execute("SELECT id, width, height FROM images")
image_sizes = {img_id: (img_w, img_h) for img_id, img_w, img_h in cursor.fetchall()}

cursor.execute("SELECT id, name, value, imageID FROM annotations WHERE type='segmentation'")
annotations = []
for annotation_id, class_name, mask_path, image_id in cursor.fetchall():
    # Get the corresponding category ID
    category_id = category_id_map[class_name]

    # An annotation whose image row is missing cannot be sized; report and skip it.
    if image_id not in image_sizes:
        print(f"Warning: Annotation {annotation_id} references unknown image {image_id}. Skipping...")
        continue
    image_width, image_height = image_sizes[image_id]

    # Open the mask, force grayscale, and resize it to the image size.
    # NEAREST keeps the mask free of interpolated in-between values.
    with Image.open('datasets/masks/' + mask_path) as mask_image:
        resized_mask = mask_image.convert("L").resize((image_width, image_height), Image.NEAREST)
        binary_mask = np.array(resized_mask) > 0  # Any non-zero pixel belongs to the object

    mask_u8 = binary_mask.astype(np.uint8)

    # Area is the number of foreground pixels
    area = int(binary_mask.sum())

    # Bounding box [x, y, width, height] of the foreground pixels
    bbox = [int(coord) for coord in cv2.boundingRect(mask_u8)]

    # External contours only; each contour becomes one flat [x1, y1, x2, y2, ...] polygon
    contours, _ = cv2.findContours(mask_u8, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    segmentation = []
    for contour in contours:
        polygon = contour.flatten().tolist()
        if len(polygon) >= 6:  # A valid polygon needs at least 3 points (6 coordinates)
            segmentation.append(polygon)

    # Add the annotation to the list
    annotations.append({
        "id": annotation_id,
        "image_id": image_id,
        "category_id": category_id,
        "segmentation": segmentation,  # List of polygons
        "area": area,
        "iscrowd": 0,
        "bbox": bbox,  # Flat [x, y, width, height], as the COCO format requires
    })


In [7]:
# Step 4: Assemble the COCO dataset
coco_dataset = {
    "images": images,
    "annotations": annotations,
    "categories": categories
}

In [8]:
# Step 5: Save the COCO dataset to a JSON file
with open('datasets/coco/coco_dataset.json', 'w') as f:
    json.dump(coco_dataset, f, indent=4)

In [9]:
# Close the database connection
# Close the database connection; `conn` and `cursor` are not used again below
conn.close()

In [None]:
import os
import json
from datetime import datetime

def get_file_modification_time(file_path):
    """Return the last-modification time of ``file_path`` as a local ``datetime``."""
    mtime_seconds = os.stat(file_path).st_mtime
    return datetime.fromtimestamp(mtime_seconds)

def combine_folders_and_update_coco(coco_json_path, input_folder, output_folder):
    """Flatten an image tree into one folder and repoint the COCO file names.

    Every image file found under ``input_folder`` (searched recursively) is
    copied into ``output_folder``. When the same file name occurs more than
    once, the copy with the most recent modification time is kept. Afterwards
    every image entry in the COCO JSON whose base name was copied gets its
    ``file_name`` rewritten to ``images/<base name>``, and the result is
    written to ``updated_coco_dataset.json`` inside ``output_folder``.

    Args:
        coco_json_path: Path of the COCO JSON file to read.
        input_folder: Root folder that is walked for image files.
        output_folder: Folder that receives the copied images and the
            updated JSON; created if it does not exist.
    """
    image_extensions = ('.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff')

    # Load the COCO dataset
    with open(coco_json_path, 'r') as f:
        coco_data = json.load(f)

    # Create the output directory if it doesn't exist
    os.makedirs(output_folder, exist_ok=True)

    # File name -> source path of the copy currently stored in output_folder
    seen_files = {}

    for root, _, files in os.walk(input_folder):
        for file in files:
            if not file.lower().endswith(image_extensions):
                continue

            src_file_path = os.path.join(root, file)
            dst_file_path = os.path.join(output_folder, file)
            existing_file_path = seen_files.get(file)

            if existing_file_path is None:
                shutil.copy2(src_file_path, dst_file_path)
                seen_files[file] = src_file_path
                print(f"Copied {src_file_path} to {dst_file_path}")
            elif get_file_modification_time(src_file_path) > get_file_modification_time(existing_file_path):
                # copy2 overwrites the destination, so no explicit delete is needed
                shutil.copy2(src_file_path, dst_file_path)
                seen_files[file] = src_file_path
                print(f"Replaced {existing_file_path} with more recent {src_file_path}")
            else:
                print(f"Skipped {src_file_path} as {existing_file_path} is more recent")

    # Update the file_name field in the COCO JSON. A forward slash is used on
    # every platform so the JSON stays usable outside Windows.
    for image in coco_data['images']:
        image_file_name = os.path.basename(image['file_name'])
        if image_file_name in seen_files:
            image['file_name'] = 'images/' + image_file_name

    # Save the updated COCO JSON file
    updated_coco_json_path = os.path.join(output_folder, 'updated_coco_dataset.json')
    with open(updated_coco_json_path, 'w') as f:
        json.dump(coco_data, f, indent=4)

    print(f"COCO dataset updated and saved to {updated_coco_json_path}")

# Example usage
input_folder = "D:/UoL/Final Project/src/datasets/IOP's"
output_folder = 'D:/UoL/Final Project/src/datasets/images'
coco_json_path = 'D:/UoL/Final Project/src/datasets/coco/coco_dataset.json'

# NOTE(review): the call below writes updated_coco_dataset.json inside
# output_folder, while later cells load it from datasets/ and datasets/coco/ —
# confirm the file is copied there before running them.
combine_folders_and_update_coco(
    coco_json_path=coco_json_path,
    input_folder=input_folder,
    output_folder=output_folder,
)


## Create train, validation, and test splits for YOLO

In [27]:
# Location of the COCO JSON file to inspect
coco_json_path = 'D:/UoL/Final Project/src/datasets/updated_coco_dataset.json'

# Read the dataset
with open(coco_json_path, 'r') as f:
    coco_data = json.load(f)

# Collect the category names in the order they are listed
classes = []
for category in coco_data['categories']:
    classes.append(category['name'])

# Show each class name next to its zero-based position in the list
print("Classes found in the dataset:")
for position, label in enumerate(classes):
    print(f"{position}: {label}")


Classes found in the dataset:
0: Bone Loss
1: Restorations
2: Periapical Abnormality
3: Fracture
4: Implants
5: Prosthetics
6: Dental Caries
7: Absent Tooth
8: Impacted Tooth
9: other
10: Position
11: Root Stump
12: Altered Morphology
13: Attrition
14: Microdontia


In [28]:
import json
import os
from sklearn.model_selection import train_test_split

In [None]:

def coco_to_yolo_segmentation(coco_annotation, image_width, image_height):
    """Normalize COCO polygons to YOLO segmentation coordinates.

    ``coco_annotation`` is a list of flat ``[x1, y1, x2, y2, ...]`` polygons.
    Each x is divided by ``image_width`` and each y by ``image_height``; the
    result is a list of flat polygons in the same order.
    """
    return [
        [
            value
            for start in range(0, len(polygon), 2)
            for value in (polygon[start] / image_width, polygon[start + 1] / image_height)
        ]
        for polygon in coco_annotation
    ]

def create_folders_and_txt_files(coco_json_path, output_dir, train_size=0.7, val_size=0.2, test_size=0.1, random_state=42):
    """Split a COCO dataset into train/valid/test folders with YOLO segmentation labels.

    For each split, ``<output_dir>/<split>/images`` receives a copy of the
    image and ``<output_dir>/<split>/labels`` receives a ``.txt`` file with one
    line per polygon: ``<class index> x1 y1 x2 y2 ...`` in normalized
    coordinates. Source images are resolved relative to the folder that
    contains ``coco_json_path``; images that are missing on disk are reported
    and skipped.

    Args:
        coco_json_path: Path of the COCO JSON file to read.
        output_dir: Root folder for the ``train``, ``valid`` and ``test`` folders.
        train_size: Fraction of images assigned to the training split.
        val_size: Share of the remainder for validation, relative to ``test_size``.
        test_size: Share of the remainder for testing, relative to ``val_size``.
        random_state: Seed passed to both ``train_test_split`` calls.
    """
    # Load the COCO dataset
    with open(coco_json_path, 'r') as f:
        coco_data = json.load(f)

    images = coco_data['images']
    annotations = coco_data['annotations']
    categories = {category['id']: category['name'] for category in coco_data['categories']}

    # YOLO class index = position of the category id in the category list.
    # Built once here rather than searching the list of names per annotation,
    # which also keeps categories that share a name on separate indices.
    class_index_by_id = {category_id: index for index, category_id in enumerate(categories)}

    # Split the dataset into train, val, and test sets
    train_images, remaining_images = train_test_split(images, train_size=train_size, random_state=random_state)
    val_images, test_images = train_test_split(
        remaining_images, train_size=val_size / (val_size + test_size), random_state=random_state
    )

    # Mapping from image_id to its annotations
    image_to_annotations = {}
    for annotation in annotations:
        image_to_annotations.setdefault(annotation['image_id'], []).append(annotation)

    source_root = os.path.dirname(coco_json_path)

    # Create directories and process images/labels
    for split_name, split_images in [('train', train_images), ('valid', val_images), ('test', test_images)]:
        image_dir = os.path.join(output_dir, split_name, 'images')
        label_dir = os.path.join(output_dir, split_name, 'labels')
        os.makedirs(image_dir, exist_ok=True)
        os.makedirs(label_dir, exist_ok=True)

        for image in split_images:
            file_name = image['file_name']
            base_name = os.path.basename(file_name)
            src_image_path = os.path.join(source_root, file_name)

            if not os.path.exists(src_image_path):
                print(f"Warning: Image {src_image_path} not found. Skipping...")
                continue

            # Copy the image to the relevant directory
            shutil.copy(src_image_path, os.path.join(image_dir, base_name))

            # Create the corresponding label file (left empty when the image has no annotations)
            label_file_path = os.path.join(label_dir, os.path.splitext(base_name)[0] + '.txt')
            with open(label_file_path, 'w') as label_file:
                for annotation in image_to_annotations.get(image['id'], []):
                    if 'segmentation' not in annotation:
                        continue
                    segments = coco_to_yolo_segmentation(annotation['segmentation'], image['width'], image['height'])
                    class_index = class_index_by_id[annotation['category_id']]

                    for segment in segments:
                        if not segment:
                            # A line holding only a class index is not a valid label
                            continue
                        label_file.write(f"{class_index} {' '.join(map(str, segment))}\n")

    print("Dataset has been split, folders created, and YOLO-format segmentation labels generated.")

# Example usage: source images are looked up relative to the folder that
# contains the COCO JSON file.
coco_json_path = 'D:/UoL/Final Project/src/datasets/updated_coco_dataset.json'
output_dir = 'D:/UoL/Final Project/src/datasets/images'
create_folders_and_txt_files(coco_json_path=coco_json_path, output_dir=output_dir)

## Create train, validation, and test splits for COCO

In [14]:
import os
import json
import random
import shutil
from sklearn.model_selection import train_test_split

def add_bounding_boxes_to_annotations(coco_annotations):
    """Fill in a missing ``bbox`` from the annotation's polygon segmentation.

    Annotations that already have a ``bbox`` are left untouched. For the
    others, the box ``[x, y, width, height]`` is the tight bounds over the
    points of *all* polygons in ``segmentation``. Annotations whose
    segmentation is absent, empty, or not a list of polygons (for example an
    RLE dict) are left without a ``bbox``.

    Args:
        coco_annotations: List of COCO annotation dicts; modified in place.

    Returns:
        The same list object that was passed in.
    """
    for annotation in coco_annotations:
        if 'bbox' in annotation:
            continue

        segmentation = annotation.get('segmentation', [])
        if not isinstance(segmentation, list):
            continue

        # Polygons are flat [x1, y1, x2, y2, ...] lists: even slots are x, odd slots are y
        xs = [x for polygon in segmentation for x in polygon[::2]]
        ys = [y for polygon in segmentation for y in polygon[1::2]]
        if not xs or not ys:
            continue

        min_x, min_y = min(xs), min(ys)
        annotation['bbox'] = [min_x, min_y, max(xs) - min_x, max(ys) - min_y]
    return coco_annotations

def split_dataset(coco_data, train_ratio=0.7, val_ratio=0.2, random_state=None):
    """Split a COCO dataset's images (and their annotations) into train/val/test.

    Args:
        coco_data: Loaded COCO dict with ``images`` and ``annotations`` lists.
        train_ratio: Fraction of all images assigned to the training split.
        val_ratio: Fraction of all images assigned to the validation split;
            whatever remains after train and val becomes the test split.
        random_state: Optional seed passed to both ``train_test_split`` calls
            so the split can be reproduced. The default ``None`` keeps the
            original unseeded behavior.

    Returns:
        A dict with keys ``train``, ``val`` and ``test``, each mapping to
        ``{'images': [...], 'annotations': [...]}``.
    """
    images = coco_data['images']
    annotations = coco_data['annotations']

    train_images, remaining_images = train_test_split(
        images, train_size=train_ratio, random_state=random_state
    )
    # val_ratio is a share of the whole dataset, so rescale it to the remainder
    val_images, test_images = train_test_split(
        remaining_images, train_size=val_ratio / (1 - train_ratio), random_state=random_state
    )

    def filter_annotations(image_set):
        """Return the annotations that belong to the images in ``image_set``."""
        image_ids = {image['id'] for image in image_set}
        return [annotation for annotation in annotations if annotation['image_id'] in image_ids]

    return {
        split_name: {'images': split_images, 'annotations': filter_annotations(split_images)}
        for split_name, split_images in (
            ('train', train_images),
            ('val', val_images),
            ('test', test_images),
        )
    }

def save_coco_split(coco_split, image_dir, output_dir, categories=None):
    """Write each split's COCO JSON and copy its images.

    For every split name in ``coco_split`` this creates
    ``<output_dir>/<split>/<split>_coco.json`` and copies the split's images
    into ``<output_dir>/<split>/images``. Images missing on disk are reported
    and skipped.

    Args:
        coco_split: Mapping of split name to ``{'images': [...], 'annotations': [...]}``.
        image_dir: Folder that each image's ``file_name`` is relative to.
        output_dir: Root folder that receives one sub-folder per split.
        categories: COCO category list to store in every split file. When
            omitted, the module-level ``coco_data['categories']`` is used, as
            earlier versions of this function did implicitly.
    """
    for split_name, data in coco_split.items():
        output_path = os.path.join(output_dir, split_name)
        split_image_dir = os.path.join(output_path, 'images')
        # Creating the images folder also creates output_path itself
        os.makedirs(split_image_dir, exist_ok=True)

        # Backward-compatible fallback to the notebook-level `coco_data`
        split_categories = coco_data['categories'] if categories is None else categories

        with open(os.path.join(output_path, f'{split_name}_coco.json'), 'w') as f:
            json.dump({
                'images': data['images'],
                'annotations': data['annotations'],
                'categories': split_categories,
            }, f, indent=4)

        for image in data['images']:
            src_image_path = os.path.join(image_dir, image['file_name'])

            # Handle missing files gracefully
            if not os.path.exists(src_image_path):
                print(f"Warning: File {src_image_path} not found. Skipping...")
                continue

            shutil.copy(src_image_path, os.path.join(split_image_dir, os.path.basename(image['file_name'])))

# Example usage
# NOTE(review): this JSON path differs from the one used by the YOLO cells
# (datasets/updated_coco_dataset.json) — confirm which copy is current.
input_json_path = 'D:/UoL/Final Project/src/datasets/coco/updated_coco_dataset.json'
image_dir = 'D:/UoL/Final Project/src/datasets'
output_dir = 'D:/UoL/Final Project/src/datasets/coco'

# Read the COCO annotations from disk
with open(input_json_path, 'r') as json_file:
    coco_data = json.load(json_file)

# Fill in bounding boxes for annotations that lack one
coco_data['annotations'] = add_bounding_boxes_to_annotations(coco_data['annotations'])

# Partition into train/val/test, then write each split's JSON and copy its
# images; files missing on disk are reported and skipped
coco_split = split_dataset(coco_data)
save_coco_split(coco_split, image_dir, output_dir)


