In [None]:
# --- Scripts for Colab ---
# Clone repo
!git clone -b yolo-bbo --recurse-submodules https://github.com/rifusaki/catnip.git
%cd /content/catnip

# Authenticate with Google
from google.colab import auth
auth.authenticate_user()

# Install gcsfuse
!echo "deb http://packages.cloud.google.com/apt gcsfuse-bionic main" > /etc/apt/sources.list.d/gcsfuse.list
!curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key add -
!apt -qq update
!apt -qq install gcsfuse

# Mount bucket
!mkdir -p /content/gcs
!gcsfuse --implicit-dirs catnip-data /content/gcs

# Link bucket to expected data path in pipeline.yaml
# pipeline.yaml expects 'catnip-data/' in the repo root
!ln -s /content/gcs /content/catnip/catnip-data

# Install packages not included in Colab
%pip install ultralytics pydantic pydantic-settings omegaconf

In [None]:
# Environment & configuration bootstrap
import sys
import os
from pathlib import Path
import yaml

# Make the repository root importable so that `src.*` modules resolve
repo_root = Path("/content/catnip").resolve()
repo_root_str = str(repo_root)
if repo_root_str not in sys.path:
    sys.path.append(repo_root_str)

# Run from the repo root so config files are found correctly
os.chdir(repo_root)

from src.config import load_settings

settings = load_settings()

# Echo the resolved paths, one per line, as a quick sanity check
for label, attr in (
    ("Data Root", "data"),
    ("Manga Dir", "manga_dir"),
    ("Annotations Dir", "annotations_dir"),
    ("Models Dir", "model_dir"),
    ("Runs Dir", "runs_dir"),
    ("Output Dir", "output_dir"),
):
    print(f"{label}: {getattr(settings.paths, attr)}")

In [None]:
# Build the images/labels directory layout used for YOLO training
from src.training.preparation import safe_symlink

# The layout lives under the repo root (a local path) rather than in the bucket,
# to avoid writing to GCS and to ensure symlinks work.
dataset_root = repo_root / "yolo_dataset"
dataset_root.mkdir(exist_ok=True)

images_link, labels_link = (dataset_root / sub for sub in ("images", "labels"))

# Point the local layout at the real data locations taken from settings
safe_symlink(settings.paths.manga_dir, images_link)
safe_symlink(settings.paths.annotations_dir, labels_link)

In [None]:
# Convert the Label Studio JSON export into YOLO-format labels
from src.convert_labels import convert_label_studio_to_yolo

# Export location, relative to the configured data root
# (assumes ls-exports sits in the data folder in GCS)
json_file = settings.paths.data / "data/ls-exports/251226v1.json"

# Label name -> class id mapping handed to the converter
class_ids = {"izutsumi": 0, "izutsumi_face": 1}

if not json_file.exists():
    print(f"Label Studio export not found at {json_file}")
else:
    convert_label_studio_to_yolo(json_file, settings.paths.annotations_dir, class_ids)

In [None]:
# Generate (or load) the training list from the labels that are available
from src.training.preparation import generate_training_list

# Config directory under the data root; maps to gs://catnip-data/yoloConfig/
config_dir = settings.paths.data / "yoloConfig"
config_dir.mkdir(parents=True, exist_ok=True)

# Name the list after the Label Studio export when that cell has run and the file exists
have_export = 'json_file' in locals() and json_file.exists()
if not have_export:
    train_list_name = "train.txt"
    print("json_file not found or defined, defaulting to 'train.txt'")
else:
    train_list_name = f"{json_file.stem}_train.txt"

train_list_path = config_dir / train_list_name

# True forces the list to be regenerated (useful if paths changed)
force_regenerate = True

# The symlinked 'images' directory is passed so the paths written to the txt file
# match what YOLO expects.
train_list_path = generate_training_list(
    images_link,
    labels_link,
    train_list_path,
    force_regenerate=force_regenerate,
)

In [None]:
# create dataset.yaml
from src.training.preparation import create_dataset_yaml

dataset_yaml_path = create_dataset_yaml(
    path=dataset_root, # Point to the local dataset root containing images/ and labels/
    train_path=train_list_path,
    val_path=train_list_path,
    names={0: 'izutsumi', 1: 'izutsumi_face'}
)

!cat {dataset_yaml_path}

In [None]:
from ultralytics import YOLO

# Start from the yolo11s checkpoint
model = YOLO("yolo11s.pt")

# Training runs are written under this project directory
project_dir = settings.paths.runs_dir

# Training options, collected in one place so they are easy to review and tweak
train_options = dict(
    data=str(dataset_yaml_path),
    epochs=100,
    imgsz=settings.params.img_size,
    project=str(project_dir),
    name="izutsumi_v1",
    device="cuda",  # "mps", "cuda", "cpu"
    cache=True,  # True for Colab
    exist_ok=True,
)

results = model.train(**train_options)

In [None]:
# Copy the best checkpoint of the run into the models directory
from src.training.preparation import save_best_model

project_dir = settings.paths.runs_dir
target_model_dir = settings.paths.model_dir

save_options = {
    "project_dir": project_dir,
    "run_name": "izutsumi_v1",
    "target_dir": target_model_dir,
    "target_name": "yolo11_izutsumi_trained.pt",
}
save_best_model(**save_options)

In [None]:
# Run inference over the whole manga directory with the trained weights
from ultralytics import YOLO
from src.output.output import save_inference_results

# Best weights produced by the "izutsumi_v1" training run
model_path = settings.paths.runs_dir / "izutsumi_v1" / "weights" / "best.pt"

if model_path.exists():
    model = YOLO(model_path)

    # Base output directory
    results_dir = settings.paths.output_dir
    results_dir.mkdir(parents=True, exist_ok=True)

    # Predict straight from the actual manga directory
    inference_source = settings.paths.manga_dir

    # Dedicated directory for the flattened results
    output_dir = results_dir / "inference"
    output_dir.mkdir(parents=True, exist_ok=True)

    print(f"running inference on {inference_source}...")

    # Recursive glob pattern covering every file below the source directory
    glob_pattern = str(inference_source) + "/**/*.*"

    predict_options = dict(
        source=glob_pattern,
        project=str(results_dir),
        name="inference",
        save=False,  # auto-save disabled; results are saved manually below
        save_txt=False,  # auto-txt disabled; handled manually below
        conf=0.25,  # confidence threshold
        stream=True,  # generator output, to handle large datasets
    )
    results = model.predict(**predict_options)

    save_inference_results(results, output_dir, inference_source)
else:
    print(f"Model not found at {model_path}")