In [1]:
# --- Local Setup ---
import os
import sys
from pathlib import Path

# Locate the repository root without depending on how often this cell has run.
# The notebook normally starts in notebooks/, one level below the repo root, but
# this cell also changes the working directory to the repo root. Resolving ".."
# unconditionally would therefore climb one directory higher on every re-run
# (or whenever the kernel is started with the repo root as its cwd).
# The repo root is recognised by its top-level `src` package, which later cells
# import from; if the current directory already contains it we are at the root,
# otherwise fall back to the parent directory (the original behaviour).
_start_dir = Path.cwd().resolve()
repo_root = _start_dir if (_start_dir / "src").is_dir() else _start_dir.parent

# Make the `src` package importable from the notebook.
if str(repo_root) not in sys.path:
    sys.path.append(str(repo_root))

# Change working directory to repo root so config files are found correctly
os.chdir(repo_root)

In [2]:
# Environment & configuration
import yaml
from src.config import load_settings

# Local data layout: <repo_root>/catnip-data/ holds data, models, results,
# runs and yoloConfig. The folder is assumed to sit in the repository root.
catnip_data_root = repo_root.joinpath("catnip-data")
data_dir = catnip_data_root.joinpath("data")

# Override the data path so it points at the locally synced folder.
# Set before load_settings() is called — presumably the loader reads this
# variable; verify against src.config.
os.environ["CATNIP_PATHS_DATA"] = str(data_dir)

settings = load_settings()

# Echo the resolved paths, one per line, as a quick sanity check.
print(
    f"Data Root: {settings.paths.data}",
    f"Manga Dir: {settings.paths.manga_dir}",
    f"Annotations Dir: {settings.paths.annotations_dir}",
    sep="\n",
)



Data Root: catnip-data\data
Manga Dir: catnip-data\data\manga
Annotations Dir: catnip-data\data\annotations


In [3]:
# Prepare the dataset layout for YOLO
from src.training.preparation import safe_symlink

# Expose the manga pages and the annotations as sibling `images/` and
# `labels/` entries under the data root (the usual YOLO dataset convention).
data_root = settings.paths.data
images_link, labels_link = data_root / "images", data_root / "labels"

if not data_root.exists():
    # Nothing to link against yet; tell the user instead of failing.
    print(f"Warning: Data root {data_root} does not exist. Please ensure catnip-data is synced.")
else:
    safe_symlink(settings.paths.manga_dir, images_link)
    safe_symlink(settings.paths.annotations_dir, labels_link)

Created symlink: catnip-data\data\images -> C:\Users\rifusaki\Desktop\catnip\catnip-data\data\manga
Created symlink: catnip-data\data\labels -> C:\Users\rifusaki\Desktop\catnip\catnip-data\data\annotations


In [None]:
# Convert a Label Studio JSON export to YOLO label files
from src.convert_labels import convert_label_studio_to_yolo

# Location of the local export; exports are assumed to live under the
# repository's catnip-data/data/ls-exports folder.
json_file = repo_root.joinpath("catnip-data", "data", "ls-exports", "251227v1.json")

if not json_file.exists():
    print(f"Label Studio export not found at {json_file}")
else:
    # Third argument maps each Label Studio label name to its YOLO class id.
    convert_label_studio_to_yolo(
        json_file,
        settings.paths.annotations_dir,
        {"izutsumi": 0, "izutsumi_face": 1},
    )

Processed 50 tasks. Labels saved to catnip-data\data\annotations


In [4]:
# Generate (or load) the training list from the labels that are available
from src.training.preparation import generate_training_list

# YOLO config files live in ./catnip-data/yoloConfig/
config_dir = catnip_data_root / "yoloConfig"
config_dir.mkdir(parents=True, exist_ok=True)

# Name the list after the Label Studio export when the conversion cell has
# been run in this session and the export exists; otherwise use a generic name.
# NOTE: this depends on kernel state — `json_file` is only defined if the
# optional conversion cell was executed.
has_export = "json_file" in globals() and json_file.exists()
train_list_name = f"{json_file.stem}_train.txt" if has_export else "train.txt"
if not has_export:
    print("json_file not found or defined, defaulting to 'train.txt'")

train_list_path = config_dir / train_list_name

# Flip to True to force the list to be rebuilt (useful if paths changed).
force_regenerate = False

# The helper's return value replaces the requested path.
train_list_path = generate_training_list(
    images_link,
    labels_link,
    train_list_path,
    force_regenerate=force_regenerate,
)

json_file not found or defined, defaulting to 'train.txt'
generating new training list: C:\Users\rifusaki\Desktop\catnip\catnip-data\yoloConfig\train.txt
found 2752 total images in 'images' directory.
generated C:\Users\rifusaki\Desktop\catnip\catnip-data\yoloConfig\train.txt
   - labeled images (subset): 66
   - unlabeled images (skipped): 2686


In [5]:
# Write dataset.yaml
from src.training.preparation import create_dataset_yaml

# NOTE(review): the validation list is the same file as the training list, so
# validation metrics are computed on training images — confirm this is intended.
dataset_yaml_path = create_dataset_yaml(
    path=data_root,
    train_path=train_list_path,
    val_path=train_list_path,
    # Class id -> class name: {0: 'izutsumi', 1: 'izutsumi_face'}
    names=dict(enumerate(("izutsumi", "izutsumi_face"))),
)

created dataset.yaml


In [None]:
from ultralytics import YOLO

# Start from the yolo11s checkpoint.
model = YOLO("yolo11s.pt")

# Training runs are written under ./catnip-data/runs/<name>/
project_dir = catnip_data_root.joinpath("runs")

# Train on the dataset described by dataset.yaml.
results = model.train(
    data=str(dataset_yaml_path),
    epochs=100,
    imgsz=640,
    project=str(project_dir),
    name="izutsumi_v2_local_251227",
    device="cpu",  # alternatives: "mps", "cuda"
    cache=False,
    exist_ok=True,  # reuse the run directory instead of creating a new one
)

Ultralytics 8.3.241  Python-3.13.11 torch-2.6.0 CPU (Intel Core i5-8250U 1.60GHz)
[34m[1mengine\trainer: [0magnostic_nms=False, amp=True, augment=False, auto_augment=randaugment, batch=16, bgr=0.0, box=7.5, cache=False, cfg=None, classes=None, close_mosaic=10, cls=0.5, compile=False, conf=None, copy_paste=0.0, copy_paste_mode=flip, cos_lr=False, cutmix=0.0, data=dataset.yaml, degrees=0.0, deterministic=True, device=cpu, dfl=1.5, dnn=False, dropout=0.0, dynamic=False, embed=None, epochs=100, erasing=0.4, exist_ok=True, fliplr=0.5, flipud=0.0, format=torchscript, fraction=1.0, freeze=None, half=False, hsv_h=0.015, hsv_s=0.7, hsv_v=0.4, imgsz=640, int8=False, iou=0.7, keras=False, kobj=1.0, line_width=None, lr0=0.01, lrf=0.01, mask_ratio=4, max_det=300, mixup=0.0, mode=train, model=yolo11s.pt, momentum=0.937, mosaic=1.0, multi_scale=False, name=izutsumi_v2_local_251227, nbs=64, nms=False, opset=None, optimize=False, optimizer=auto, overlap_mask=True, patience=100, perspective=0.0, plot

In [None]:
# Save the best weights of the run into the local models directory
from src.training.preparation import save_best_model

# Source: ./catnip-data/runs/<run_name>/  ->  target: ./catnip-data/models/
project_dir = catnip_data_root.joinpath("runs")
target_model_dir = catnip_data_root.joinpath("models")

# run_name must match the `name` used when training.
save_best_model(
    project_dir=project_dir,
    run_name="izutsumi_v2_local_251227",
    target_dir=target_model_dir,
    target_name="yolo11_izutsumi_trained.pt",
)

In [None]:
# Evaluate the model and print the resulting metrics object.
# NOTE: `model` is not defined in this cell; it comes from an earlier cell, so
# on a fresh kernel the training cell must be run first.
# val() is called without arguments, so the dataset/split is not chosen here —
# presumably Ultralytics reuses the data the model was trained with; TODO confirm.
metrics = model.val()
print(metrics) 

In [None]:
# Run inference on the entire dataset
from ultralytics import YOLO
from src.output.output import save_inference_results

# Load the best weights produced by the training run.
project_dir = catnip_data_root.joinpath("runs")
model_path = project_dir.joinpath("izutsumi_v2_local_251227", "weights", "best.pt")
model = YOLO(model_path)

# Outputs go to ./catnip-data/results/, with flattened results in its
# inference/ subfolder. parents=True creates results/ as well if it is missing.
results_dir = catnip_data_root.joinpath("results")
output_dir = results_dir.joinpath("inference")
output_dir.mkdir(parents=True, exist_ok=True)

# Read straight from the real manga directory.
inference_source = settings.paths.manga_dir

print(f"running inference on {inference_source}...")

# Predict over every file below the source directory (recursive glob pattern).
results = model.predict(
    source=f"{inference_source}/**/*.*",
    project=str(results_dir),
    name="inference",
    save=False,  # saving is handled manually below
    save_txt=False,  # label files are handled manually below
    conf=0.25,  # confidence threshold
    stream=True,  # generator, to cope with large datasets
)

save_inference_results(results, output_dir, inference_source)