In [None]:
# install Dependencies
!pip install ultralytics omegaconf pydantic

In [None]:
# mount GCS Bucket
# Authenticates the Colab session, installs gcsfuse via apt, then mounts the
# `catnip-data` bucket at /content/catnip-data.
from google.colab import auth
auth.authenticate_user()

# install gcsfuse
# NOTE(review): the apt suite is pinned to `gcsfuse-bionic` and the repository is
# fetched over plain http — confirm this still matches the runtime's Ubuntu release.
!echo "deb http://packages.cloud.google.com/apt gcsfuse-bionic main" > /etc/apt/sources.list.d/gcsfuse.list
# NOTE(review): `apt-key` is deprecated on recent Debian/Ubuntu releases — confirm it
# is still available on the runtime image.
!curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key add -
!apt -qq update
!apt -qq install gcsfuse

# mount bucket to /content/catnip-data
# `--implicit-dirs` is the gcsfuse option for treating object-name prefixes as
# directories even when no explicit directory object exists (see the gcsfuse docs).
# NOTE(review): re-running this cell while the bucket is already mounted presumably
# makes the gcsfuse call fail — verify before relying on "Run All" twice.
!mkdir -p /content/catnip-data
!gcsfuse --implicit-dirs catnip-data /content/catnip-data

In [None]:
# setup Environment & Config
import sys
import os
from pathlib import Path
import yaml

# add repository root to path
# This has to happen before the `src.config` import below, which is resolved
# relative to /content/catnip.
# NOTE(review): assumes the repository is already present at /content/catnip — no
# clone/upload step is visible in this notebook; confirm.
if "/content/catnip" not in sys.path:
    sys.path.append("/content/catnip")

from src.config import load_settings

# override data path to point to mounted bucket
# we use the 'data' folder inside the bucket as the root for our paths
# NOTE(review): the variable is set after `src.config` is imported but before
# `load_settings()` is called; presumably it is read at call time — verify against
# the config module.
os.environ["CATNIP_PATHS_DATA"] = "/content/catnip-data/data"

settings = load_settings()
# Echo the resolved paths so a wrong override is visible before any data is touched.
print(f"Data Root: {settings.paths.data}")
print(f"Manga Dir: {settings.paths.manga_dir}")
print(f"Annotations Dir: {settings.paths.annotations_dir}")

In [None]:
# prepare Dataset for YOLO
# my dataset uses 'manga' for images and 'annotations' for labels
# YOLO expects 'images' for images and 'labels' for labels
# we'll create symlinks to satisfy YOLO's expectations:

def ensure_symlink(link, target):
    """Create ``link`` as a symlink pointing at ``target`` if nothing is there yet.

    Args:
        link: ``pathlib.Path`` where the symlink should live.
        target: path the symlink should point to.

    Raises:
        OSError: if the symlink cannot be created. The success message is only
            printed after the link has actually been made.
    """
    # `exists()` follows symlinks and is False for a dangling link (e.g. the
    # labels link created before the annotations folder exists), so also check
    # `is_symlink()` to keep re-runs of this cell from failing with "File exists".
    if link.is_symlink() or link.exists():
        return
    link.symlink_to(target)
    print(f"Created symlink: {link} -> {target}")


data_root = settings.paths.data
images_link = data_root / "images"
labels_link = data_root / "labels"

# symlink 'manga' to 'images'
ensure_symlink(images_link, settings.paths.manga_dir)

# symlink 'annotations' to 'labels'
ensure_symlink(labels_link, settings.paths.annotations_dir)


In [None]:
# Optional: Convert Label Studio JSON to YOLO format
# If you uploaded the JSON file to Colab, you can run this cell to generate the 'annotations' folder.
# Otherwise, skip this if you already have the 'annotations' folder in GCS.

import json

def _rectangle_to_yolo_line(result, class_map):
    """Return the YOLO label line for one Label Studio result, or None to skip it.

    A result is skipped when it is not of type ``rectanglelabels``, carries no
    label, or its first label is not a key of ``class_map``.
    """
    if result.get('type') != 'rectanglelabels':
        return None

    value = result['value']
    labels = value.get('rectanglelabels', [])
    if not labels:
        return None

    # Only the first label of a rectangle is considered.
    label_name = labels[0]
    if label_name not in class_map:
        return None

    class_id = class_map[label_name]
    x, y, w, h = value['x'], value['y'], value['width'], value['height']

    # x/y are the top-left corner and all four values are treated as percentages
    # (0-100) of the image size; YOLO wants the box centre and size as fractions.
    x_center = (x + w / 2) / 100.0
    y_center = (y + h / 2) / 100.0
    w_norm = w / 100.0
    h_norm = h / 100.0

    return f"{class_id} {x_center:.6f} {y_center:.6f} {w_norm:.6f} {h_norm:.6f}"


def convert_label_studio_to_yolo(json_path, output_dir, class_map):
    """Converts Label Studio JSON export to YOLO format txt files.

    For every task whose ``data.url`` contains ``"manga/"``, one ``.txt`` file is
    written under ``output_dir`` at the path that follows the last ``"manga/"`` in
    the URL (image suffix replaced by ``.txt``), so the label tree mirrors the
    image tree. Tasks without any usable rectangle get an empty label file.

    Args:
        json_path: path to the Label Studio JSON export (a list of tasks).
        output_dir: root directory for the generated label files; created if missing.
        class_map: mapping from label name to integer YOLO class id. Rectangles
            whose label is not in the mapping are ignored.

    Raises:
        KeyError: if a task has no ``data.url`` or a rectangle lacks
            ``value``/``x``/``y``/``width``/``height``.
    """
    # Read as UTF-8 explicitly so the result does not depend on the platform default.
    with open(json_path, 'r', encoding='utf-8') as f:
        tasks = json.load(f)

    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    count = 0
    for task in tasks:
        url = task['data']['url']
        if "manga/" not in url:
            continue

        # e.g. ".../manga/vol1/page1.jpg" -> "<output_dir>/vol1/page1.txt"
        label_full_path = output_dir / Path(url.split("manga/")[-1]).with_suffix('.txt')
        label_full_path.parent.mkdir(parents=True, exist_ok=True)

        yolo_lines = []
        # `or []` also covers keys that are present but null in the export.
        for annotation in task.get('annotations') or []:
            for result in annotation.get('result') or []:
                line = _rectangle_to_yolo_line(result, class_map)
                if line is not None:
                    yolo_lines.append(line)

        with open(label_full_path, 'w', encoding='utf-8') as f:
            f.write('\n'.join(yolo_lines))
        count += 1
    print(f"Processed {count} tasks. Labels saved to {output_dir}")


# Location where the Label Studio export is expected after uploading it to Colab.
json_file = "/content/json_export.json"
# Nothing happens when the export is absent; the cell is optional.
if Path(json_file).exists():
    convert_label_studio_to_yolo(
        json_path=json_file,
        output_dir=settings.paths.annotations_dir,
        class_map={"izutsumi": 0, "izutsumi_face": 1},
    )

In [None]:
# verify dataset structure & recursion
# since 'manga' (images) has subdirectories (volumes), 'annotations' (labels) must mirror this structure.

# Suffixes are compared in lower case so that e.g. "page1.JPG" is not silently
# left out of the check.
image_suffixes = {".jpg", ".jpeg", ".png"}

# One recursive walk of the mounted bucket instead of one walk per extension.
image_files = sorted(
    path for path in images_link.rglob("*") if path.suffix.lower() in image_suffixes
)
print(f"Found {len(image_files)} images recursively.")

missing_labels = []
for img_path in image_files:
    # Expected label path: same relative location as the image, with a .txt suffix
    # (e.g. images/vol1/page1.jpg -> labels/vol1/page1.txt)
    label_rel_path = img_path.relative_to(images_link).with_suffix(".txt")

    if not (labels_link / label_rel_path).exists():
        missing_labels.append(str(label_rel_path))

if missing_labels:
    print(f"WARNING: {len(missing_labels)} images are missing labels")
    print(f"EG: missing label {missing_labels[0]} expected at: {labels_link / missing_labels[0]}")
else:
    print("All images have corresponding labels in the correct subdirectory structure.")

In [None]:
# create dataset.yaml
# define the dataset configuration for YOLO

dataset_yaml = {
    'path': str(data_root),
    'train': 'images',  # relative to 'path'
    'val': 'images',    # using same set for val for now (or split if available)
    
    # Class names
    'names': {
        0: 'izutsumi',
        1: 'izutsumi_face'
    }
}

yaml_path = Path("dataset.yaml")
with open(yaml_path, 'w') as f:
    yaml.dump(dataset_yaml, f)

print(f"Created {yaml_path}")
!cat {yaml_path}

In [None]:
from ultralytics import YOLO

# Load the starting weights from the "yolo11n.pt" checkpoint.
model = YOLO("yolo11n.pt")

# Runs are written below this directory on the mounted bucket, one sub-folder
# per run name.
project_dir = Path("/content/catnip-data/runs")

# Train on the dataset described by dataset.yaml; `exist_ok=True` lets a re-run
# reuse the "izutsumi_v1" run folder.
results = model.train(
    project=str(project_dir),
    name="izutsumi_v1",
    data=str(yaml_path),
    imgsz=640,
    epochs=100,
    exist_ok=True,
)

In [None]:
# save Model to Bucket Models Directory
import shutil

# Source: the best checkpoint of the run, at project_dir/name/weights/best.pt.
best_model_path = project_dir.joinpath("izutsumi_v1", "weights", "best.pt")
# Destination: the models folder on the mounted bucket.
target_model_dir = Path("/content/catnip-data/models")
target_model_path = target_model_dir.joinpath("yolo11_izutsumi_trained.pt")

if not best_model_path.exists():
    # No checkpoint to copy; report it instead of raising.
    print("Training might have failed, best.pt not found.")
else:
    target_model_dir.mkdir(parents=True, exist_ok=True)
    shutil.copy(best_model_path, target_model_path)
    print(f"Model saved to {target_model_path}")