In [23]:
import os
import json
import shutil
from sklearn.model_selection import train_test_split

In [3]:
coco_json = r"D:\UoL\Final Project\src\datasets\updated_coco_dataset.json"

In [8]:
import json
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from collections import Counter

# Load the COCO dataset as a JSON object
with open(coco_json, 'r') as f:
    coco_data = json.load(f)

# Extract images and annotations
images = coco_data['images']
annotations = coco_data['annotations']
categories = coco_data['categories']

# Create a dictionary to store the categories for each image
image_categories = {}

for ann in annotations:
    img_id = ann['image_id']
    cat_id = ann['category_id']
    if img_id not in image_categories:
        image_categories[img_id] = set()
    image_categories[img_id].add(cat_id)

# Count the occurrences of each class
class_counts = Counter([ann['category_id'] for ann in annotations])

# Set a minimum threshold for the number of samples per class
min_samples = 2

# Filter out all classes that do not meet the minimum threshold
common_classes = {cat_id for cat_id, count in class_counts.items() if count >= min_samples}

# Filter images that contain only common classes
filtered_image_categories = {img_id: cats.intersection(common_classes) for img_id, cats in image_categories.items()}
filtered_image_categories = {img_id: cats for img_id, cats in filtered_image_categories.items() if cats}

# Remove images with classes that have fewer than 2 occurrences
valid_image_ids = []
for img_id, cats in filtered_image_categories.items():
    if all(class_counts[cat_id] >= min_samples for cat_id in cats):
        valid_image_ids.append(img_id)

# Convert the filtered image categories to a binary matrix
image_labels = mlb.fit_transform([list(filtered_image_categories[img_id]) for img_id in valid_image_ids])

# Split the dataset into train and temp (test + val)
train_ids, temp_ids, train_labels, temp_labels = train_test_split(
    valid_image_ids, image_labels, test_size=0.3, stratify=image_labels, random_state=42
)

# Further split the temp set into validation and test sets
val_ids, test_ids, val_labels, test_labels = train_test_split(
    temp_ids, temp_labels, test_size=0.5, stratify=temp_labels, random_state=42
)

def save_coco_annotations(coco_data, img_ids, output_file):
    # Filter images
    filtered_images = [img for img in coco_data['images'] if img['id'] in img_ids]
    
    # Filter annotations
    filtered_annotations = [ann for ann in coco_data['annotations'] if ann['image_id'] in img_ids]
    
    # Create a new COCO formatted dictionary
    coco_split = {
        'images': filtered_images,
        'annotations': filtered_annotations,
        'categories': [cat for cat in coco_data['categories'] if cat['id'] in common_classes]
    }
    
    # Save to file
    with open(output_file, 'w') as f:
        json.dump(coco_split, f)

# Save the splits
save_coco_annotations(coco_data, train_ids, 'coco_train.json')
save_coco_annotations(coco_data, val_ids, 'coco_val.json')
save_coco_annotations(coco_data, test_ids, 'coco_test.json')


ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.

In [7]:
# Example usage:
split_coco_dataset(r"D:\UoL\Final Project\src\datasets\updated_coco_dataset.json", r"D:\UoL\Final Project\src\datasets")

Dataset split complete. Files saved in: D:\UoL\Final Project\src\datasets


## Create COCO folder structure 

In [30]:
def generate_coco_structure_with_renamed_images_and_filtered_annotations(coco_json_path, images_dir, output_dir):
    # Load the COCO dataset
    with open(coco_json_path, 'r') as f:
        coco_data = json.load(f)
    
    # Create the required COCO directories
    images_output_dir = os.path.join(output_dir, 'images')
    
    os.makedirs(images_output_dir, exist_ok=True)
    
    # Lists to store the updated image info and valid annotations
    updated_images = []
    valid_annotations = []
    
    # Copy images to the output images directory and rename them using their ID
    for image_info in coco_data['images']:
        image_id = image_info['id']
        image_filename = image_info['file_name']
        image_ext = os.path.splitext(image_filename)[1]  # Get the file extension
        
        # Construct the new filename using the image ID
        new_image_filename = f"{image_id}{image_ext}"
        image_path = os.path.join(images_dir, image_filename)
        new_image_path = os.path.join(images_output_dir, new_image_filename)
        
        # Check if the image exists
        if os.path.exists(image_path):
            shutil.copy(image_path, new_image_path)
            image_info['file_name'] = new_image_filename  # Update the file name in the COCO data
            updated_images.append(image_info)
            
            # Include annotations that belong to this image
            image_annotations = [anno for anno in coco_data['annotations'] if anno['image_id'] == image_id]
            valid_annotations.extend(image_annotations)
        else:
            print(f"Image {image_filename} not found. Skipping...")
    
    # Update the images and annotations fields in the COCO data with valid entries
    coco_data['images'] = updated_images
    coco_data['annotations'] = valid_annotations
    
    # Save the updated COCO annotations file
    annotations_filename = os.path.splitext(os.path.basename(coco_json_path))[0] + '.json'
    annotations_output_path = os.path.join(output_dir, annotations_filename)
    
    with open(annotations_output_path, 'w') as f:
        json.dump(coco_data, f)
    
    print(f"COCO file structure with renamed images and filtered annotations generated:")
    print(f"Images saved in: {images_output_dir}")
    print(f"Updated annotations saved in: {annotations_output_path}")

In [33]:
# Example usage:
generate_coco_structure_with_renamed_images('datasets/train_coco.json', 'datasets', 'datasets/coco/train/')
generate_coco_structure_with_renamed_images('datasets/val_coco.json', 'datasets', 'datasets/coco/val/')
generate_coco_structure_with_renamed_images('datasets/test_coco.json', 'datasets', 'datasets/coco/test/')


Image images\IOPA (22673).jpg not found. Skipping...
Image images\IOPA (3265).jpg not found. Skipping...
Image images\IOPA (22788).jpg not found. Skipping...
Image IOP's/lopa_1_jpg_quality/IOPA (22038).jpg not found. Skipping...
Image images\IOPA (25830).jpg not found. Skipping...
Image images\year (2142).jpg not found. Skipping...
Image images\IOPA (7815).jpg not found. Skipping...
Image images\IOPA (27255).jpg not found. Skipping...
Image images\IOPA (20879).jpg not found. Skipping...
Image images\IOPA (21151).jpg not found. Skipping...
COCO file structure with renamed images generated:
Images saved in: datasets/coco/train/images
Updated annotations saved in: datasets/coco/train/annotations\train_coco.json
Image images\IOPA (24024).jpg not found. Skipping...
COCO file structure with renamed images generated:
Images saved in: datasets/coco/val/images
Updated annotations saved in: datasets/coco/val/annotations\val_coco.json
Image images\IOPA (9113).jpg not found. Skipping...
COCO file 

## Create Yolo set

In [27]:
def coco_segmentation_to_yolo(coco_json_path, images_dir, output_dir):
    # Load the COCO dataset
    with open(coco_json_path, 'r') as f:
        coco_data = json.load(f)
    
    # Create output directories if they don't exist
    labels_dir = os.path.join(output_dir, 'labels')
    images_output_dir = os.path.join(output_dir, 'images')
    os.makedirs(labels_dir, exist_ok=True)
    os.makedirs(images_output_dir, exist_ok=True)
    
    # Create a category ID to index map
    category_id_to_index = {category['id']: idx for idx, category in enumerate(coco_data['categories'])}
    
    # Lists to keep track of valid images and annotations
    valid_images = []
    valid_annotations = []
    
    # Process each image
    for image_info in coco_data['images']:
        image_id = image_info['id']
        image_width = image_info['width']
        image_height = image_info['height']
        image_filename = image_info['file_name']
        
        # Check if the image exists
        image_path = os.path.join(images_dir, image_filename)
        if not os.path.exists(image_path):
            print(f"Image {image_filename} not found. Skipping...")
            continue
        
        # Copy image to the output images directory with the new ID-based filename
        new_filename = f"{image_id}"
        new_image_path = os.path.join(images_output_dir, f"{new_filename}{os.path.splitext(image_filename)[1]}")
        shutil.copy(image_path, new_image_path)
        
        # Filter annotations for this image
        annotations = [anno for anno in coco_data['annotations'] if anno['image_id'] == image_id]
        
        # YOLO format annotations
        yolo_annotations = []
        
        for anno in annotations:
            category_id = anno['category_id']
            category_index = category_id_to_index[category_id]
            
            # Convert segmentation points to YOLO format
            if 'segmentation' in anno:
                segmentation = anno['segmentation']
                
                if type(segmentation) == list:  # Polygon segmentation
                    for segment in segmentation:
                        segment_str = ' '.join([f"{(x / image_width):.6f} {(y / image_height):.6f}" for x, y in zip(segment[::2], segment[1::2])])
                        yolo_annotations.append(f"{category_index} {segment_str}")
        
        # Write the YOLO annotation file using the image ID as the filename
        if yolo_annotations:
            yolo_filename = os.path.join(labels_dir, f"{new_filename}.txt")
            with open(yolo_filename, 'w') as yolo_file:
                yolo_file.write("\n".join(yolo_annotations))
            
            # Add valid images and annotations
            valid_images.append(image_info)
            valid_annotations.extend(annotations)
    
    # Save the filtered COCO dataset (optional)
    filtered_coco_data = {
        "images": valid_images,
        "annotations": valid_annotations,
        "categories": coco_data['categories']
    }
    
    filtered_coco_path = os.path.join(output_dir, 'filtered_coco_dataset.json')
    with open(filtered_coco_path, 'w') as f:
        json.dump(filtered_coco_data, f)
    
    print(f"YOLO segmentation format annotations saved in: {labels_dir}")
    print(f"Images copied to: {images_output_dir}")
    print(f"Filtered COCO dataset saved as: {filtered_coco_path}")

In [28]:
# Example usage:
coco_segmentation_to_yolo('datasets/train_coco.json', 'datasets', 'datasets/yolo/train/')
coco_segmentation_to_yolo('datasets/val_coco.json', 'datasets', 'datasets/yolo/val/')
coco_segmentation_to_yolo('datasets/test_coco.json', 'datasets', 'datasets/yolo/test/')

Image images\IOPA (22673).jpg not found. Skipping...
Image images\IOPA (3265).jpg not found. Skipping...
Image images\IOPA (22788).jpg not found. Skipping...
Image IOP's/lopa_1_jpg_quality/IOPA (22038).jpg not found. Skipping...
Image images\IOPA (25830).jpg not found. Skipping...
Image images\year (2142).jpg not found. Skipping...
Image images\IOPA (7815).jpg not found. Skipping...
Image images\IOPA (27255).jpg not found. Skipping...
Image images\IOPA (20879).jpg not found. Skipping...
Image images\IOPA (21151).jpg not found. Skipping...
YOLO segmentation format annotations saved in: datasets/yolo/train/labels
Images copied to: datasets/yolo/train/images
Filtered COCO dataset saved as: datasets/yolo/train/filtered_coco_dataset.json
Image images\IOPA (24024).jpg not found. Skipping...
YOLO segmentation format annotations saved in: datasets/yolo/val/labels
Images copied to: datasets/yolo/val/images
Filtered COCO dataset saved as: datasets/yolo/val/filtered_coco_dataset.json
Image images