## 0. Connect Google Drive

In [15]:
import os
from google.colab import drive
# Mount Google Drive at /gdrive; a later cell creates the output dataset folder under /gdrive/MyDrive.
drive.mount('/gdrive')

# Colab shell escape: list the mount point to confirm the mount worked.
!ls /gdrive

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).
MyDrive


## 1. Download the Dataset
Here we will use the `LaboroTomato` dataset from [laboro.ai](https://laboro.ai). For more details, check out their [GitHub repository](https://github.com/laboroai/LaboroTomato?tab=readme-ov-file).
```
name: tomato_mixed
images: 643 train, 161 test
cls_num: 6
cls_names: b_fully_ripened, b_half_ripened, b_green, l_fully_ripened, l_half_ripened, l_green
total_bboxes: train[7781], test[1996]
bboxes_per_class:
    *Train: b_fully_ripened[348], b_half_ripened[520], b_green[1467],
            l_fully_ripened[982], l_half_ripened[797], l_green[3667]
    *Test:  b_fully_ripened[72], b_half_ripened[116], b_green[387],
            l_fully_ripened[269], l_half_ripened[223], l_green[929]
image_resolutions: 3024x4032, 3120x4160
```

In [None]:
# Download & extract dataset in Colab
# (the `!` lines are shell escapes; paths are relative to the notebook's working directory)

# If dataset is from URL
# Fetch the archive, unpack it into ./laboro_tomato, then delete the zip.
!wget -q -O laboro_tomato.zip "http://assets.laboro.ai.s3.amazonaws.com/laborotomato/laboro_tomato.zip"
!unzip -q laboro_tomato.zip -d laboro_tomato
!rm laboro_tomato.zip

print("Dataset extracted to ./laboro_tomato")
# If from Google Drive
# (disabled alternative; NOTE(review): it mounts at /content/drive while the first cell
#  mounts at /gdrive, and `dataset_dir` is not defined anywhere in the visible cells)
# from google.colab import drive; drive.mount('/content/drive')
# !unzip -q "/content/drive/MyDrive/path_to_dataset.zip" -d {dataset_dir}

print(f"Dataset ready at laboro_tomato")


Dataset extracted to ./laboro_tomato
Dataset ready at laboro_tomato


## 2. Create a new directory in your Google Drive

In [16]:
# Create new directory for reformatted dataset
# (on the Drive mount, so the converted dataset is written to Google Drive)
# NOTE(review): the conversion cell further down hard-codes this same path again
# instead of reusing `output_dir`; keep the two in sync if the location changes.
output_dir = '/gdrive/MyDrive/Datasets/laboro_tomato_yolov8-seg-format'
os.makedirs(output_dir, exist_ok=True)  # exist_ok: re-running the cell is not an error

print(f"Created directory: {output_dir}")

Created directory: /gdrive/MyDrive/Datasets/laboro_tomato_yolov8-seg-format


In [12]:
# Colab shell escape: list the Drive mount point as a quick sanity check.
!ls /gdrive

MyDrive


## 3. Reformat according to the YOLOv8-seg format

In [17]:
import json
import shutil
from pathlib import Path

def coco_to_yolo_seg(coco_json_path, images_dir, output_images_dir, output_labels_dir):
    """
    Convert COCO-format instance segmentation to YOLOv8-seg format and copy images.

    For every image that has at least one annotation, the image file is copied
    into ``output_images_dir`` and a label file with the same base name and a
    ``.txt`` suffix is written into ``output_labels_dir``. Each polygon becomes
    one line ``<class_index> x1 y1 x2 y2 ...`` where x is divided by the image
    width and y by the image height recorded in the COCO JSON (6 decimals).
    Images without annotations are not copied.

    Args:
        coco_json_path: Path to the COCO annotation JSON file.
        images_dir: Directory that the ``file_name`` entries are relative to.
        output_images_dir: Destination directory for copied images (created if missing).
        output_labels_dir: Destination directory for label files (created if missing).

    Returns:
        The ``categories`` list from the COCO JSON. The class index written to
        the label files is each category's position in this list, so a names
        list built from it in list order lines up with the labels.
    """
    # Load COCO JSON
    with open(coco_json_path, 'r', encoding='utf-8') as f:
        coco_data = json.load(f)

    categories = coco_data["categories"]

    # COCO category ids are arbitrary integers (not guaranteed to be 1..N or in
    # order), so map each id to its position in the categories list instead of
    # assuming ``category_id - 1``.
    class_index_by_id = {cat["id"]: idx for idx, cat in enumerate(categories)}

    # Build ID → image record lookup
    image_lookup = {img["id"]: img for img in coco_data["images"]}

    # Create output dirs
    output_images_dir = Path(output_images_dir)
    output_labels_dir = Path(output_labels_dir)
    output_images_dir.mkdir(parents=True, exist_ok=True)
    output_labels_dir.mkdir(parents=True, exist_ok=True)

    # Group annotations by image
    annotations_by_image = {}
    for ann in coco_data["annotations"]:
        annotations_by_image.setdefault(ann["image_id"], []).append(ann)

    skipped = 0  # annotations/polygons that could not be converted

    for img_id, anns in annotations_by_image.items():
        img_info = image_lookup[img_id]
        src_img_path = Path(images_dir) / img_info["file_name"]

        if not src_img_path.exists():
            print(f"Warning: Image not found: {src_img_path}")
            continue

        # Copy image to YOLO dataset folder (flattened: only the base name is kept)
        dst_img_path = output_images_dir / src_img_path.name
        shutil.copy2(src_img_path, dst_img_path)

        width = img_info["width"]
        height = img_info["height"]

        # Derive the label name from the same flattened base name as the copied
        # image, so a ``file_name`` containing sub-directories still produces a
        # label next to its image instead of pointing into a missing folder.
        label_path = output_labels_dir / Path(src_img_path.name).with_suffix(".txt")

        with open(label_path, 'w') as lf:
            for ann in anns:
                class_id = class_index_by_id.get(ann["category_id"])
                segmentation = ann.get("segmentation")

                # Only polygon segmentations (a list of flat coordinate lists)
                # can be converted; RLE masks are stored as a dict.
                if class_id is None or not isinstance(segmentation, list):
                    skipped += 1
                    continue

                for seg in segmentation:
                    # A polygon needs at least three (x, y) pairs and an even
                    # number of values.
                    if len(seg) < 6 or len(seg) % 2 != 0:
                        skipped += 1
                        continue

                    norm_coords = []
                    for x, y in zip(seg[0::2], seg[1::2]):
                        norm_coords.extend((x / width, y / height))

                    lf.write(f"{class_id} " + " ".join(f"{v:.6f}" for v in norm_coords) + "\n")

    if skipped:
        print(
            f"Warning: skipped {skipped} annotation(s)/polygon(s) with an unknown "
            "category or a non-polygon/malformed segmentation"
        )

    return categories  # Return categories for data.yaml creation


def create_data_yaml(categories, dataset_path):
    """
    Write a YOLOv8 ``data.yaml`` into ``dataset_path``.

    The file points ``train`` at ``<dataset_path>/train/images`` and ``val`` at
    ``<dataset_path>/test/images``, and lists the class count and the class
    names taken from ``categories`` in list order.
    """
    class_names = []
    for category in categories:
        class_names.append(category["name"])

    yaml_path = Path(dataset_path) / "data.yaml"

    # One entry per output line; the empty entry is the blank separator line.
    yaml_lines = [
        f"train: {dataset_path}/train/images",
        f"val: {dataset_path}/test/images",
        "",
        f"nc: {len(class_names)}",
        f"names: {class_names}",
    ]

    with open(yaml_path, "w") as f:
        f.write("\n".join(yaml_lines) + "\n")

    print(f"data.yaml created at {yaml_path}")


if __name__ == "__main__":
    # Where the downloaded COCO-style dataset was extracted
    dataset_root = Path("laboro_tomato/laboro_tomato")

    # Where the YOLOv8-formatted copy is written
    yolo_dataset_root = Path("/gdrive/MyDrive/Datasets/laboro_tomato_yolov8-seg-format")

    # Convert both splits with the same layout; train runs first, then test.
    categories_per_split = {
        split: coco_to_yolo_seg(
            coco_json_path=dataset_root / "annotations" / f"{split}.json",
            images_dir=dataset_root / split,
            output_images_dir=yolo_dataset_root / split / "images",
            output_labels_dir=yolo_dataset_root / split / "labels",
        )
        for split in ("train", "test")
    }

    # data.yaml is built from the categories returned for the train split
    categories = categories_per_split["train"]
    create_data_yaml(categories, yolo_dataset_root)

    print(f"YOLOv8 dataset prepared at: {yolo_dataset_root.resolve()}")


data.yaml created at /gdrive/MyDrive/Datasets/laboro_tomato_yolov8-seg-format/data.yaml
YOLOv8 dataset prepared at: /gdrive/MyDrive/Datasets/laboro_tomato_yolov8-seg-format
