## 1. Colab Setup
Run these cells ONLY if you are in Google Colab.

In [None]:
# --- Colab: Initial Setup ---
import sys
import os

# 1. Clone repo (if not already present)
if not os.path.exists("/content/catnip"):
    !git clone -b yolo-bbo --recurse-submodules https://github.com/rifusaki/catnip.git
    %cd /content/catnip
else:
    %cd /content/catnip
    !git pull

# 2. Authenticate with Google
from google.colab import auth
auth.authenticate_user()

# 3. Install gcsfuse
!echo "deb http://packages.cloud.google.com/apt gcsfuse-bionic main" > /etc/apt/sources.list.d/gcsfuse.list
!curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key add -
!apt -qq update
!apt -qq install gcsfuse

# 4. Mount bucket
!mkdir -p /content/gcs
!gcsfuse --implicit-dirs catnip-data /content/gcs

# 5. Link bucket to expected data path in pipeline.yaml
# pipeline.yaml expects 'catnip-data/' in the repo root
!ln -s /content/gcs /content/catnip/catnip-data

# 6. Install packages
%pip install ultralytics pydantic pydantic-settings omegaconf

In [None]:
# --- Colab: Path Configuration ---
# Everything this cell uses is imported here, so it does not depend on names
# imported by another cell (mirrors the Local path-configuration cell).
import os
import sys
from pathlib import Path

# Define repo root (the location the setup cell clones into)
repo_root = Path("/content/catnip").resolve()

# Add to sys.path so the `src.*` imports used further down resolve
if str(repo_root) not in sys.path:
    sys.path.append(str(repo_root))

# Change working directory
os.chdir(repo_root)
print(f"Working directory set to: {os.getcwd()}")

## 2. Local Setup
Run these cells ONLY if you are running locally.

In [None]:
# --- Local: Path Configuration ---
import os
import sys
from pathlib import Path


def find_repo_root(start: Path) -> Path:
    """Return the repository root for a session whose working directory is `start`.

    If `start` already contains a `src` directory it is taken to be the
    repository root and returned unchanged. Otherwise the parent of `start`
    is returned, on the assumption that the notebook lives in notebooks/,
    one level below the root.
    """
    start = Path(start).resolve()
    return start if (start / "src").is_dir() else start.parent


# Add repository root to path
# The working directory is changed to the root below, so a plain ".." lookup
# would climb one level further on every re-run of this cell; find_repo_root
# stays put once the working directory is already the root.
repo_root = find_repo_root(Path.cwd())

if str(repo_root) not in sys.path:
    sys.path.append(str(repo_root))

# Change working directory to repo root so config files are found correctly
os.chdir(repo_root)
print(f"Working directory set to: {os.getcwd()}")

## 3. Shared Logic
The following cells are common to both environments.

In [None]:
# Load Configuration
from src.config import load_settings

settings = load_settings()

# Echo each configured path, one per line, so a wrong location is visible
# before any later cell touches the filesystem.
for label, attr_name in (
    ("Data Root", "data"),
    ("Manga Dir", "manga_dir"),
    ("Annotations Dir", "annotations_dir"),
    ("Models Dir", "model_dir"),
    ("Runs Dir", "runs_dir"),
    ("Output Dir", "output_dir"),
):
    print(f"{label}: {getattr(settings.paths, attr_name)}")

In [None]:
# Prepare Dataset Structure for YOLO
# A local 'yolo_dataset' folder is created under the repo root, holding
# 'images' and 'labels' links that point at the configured data directories.
# This gives a fixed images/labels layout regardless of the source layout.
from src.training.preparation import safe_symlink

dataset_root = repo_root / "yolo_dataset"
images_link, labels_link = dataset_root / "images", dataset_root / "labels"

dataset_root.mkdir(exist_ok=True)

# Link each configured source directory into the local structure:
# manga pages become images/, annotations become labels/.
for source_attr, link_path in (
    ("manga_dir", images_link),
    ("annotations_dir", labels_link),
):
    safe_symlink(getattr(settings.paths, source_attr), link_path)

In [None]:
# Convert Label Studio Export to YOLO Format
from src.convert_labels import convert_label_studio_to_yolo

# Label Studio exports are looked up under the data root
# (in Colab/GCS this is usually catnip-data/data/ls-exports/).
ls_export_dir = settings.paths.data / "data/ls-exports"
json_files = list(ls_export_dir.glob("*.json"))

if not json_files:
    print(f"No Label Studio JSON export found in {ls_export_dir}")
else:
    # The export whose path sorts last is used; replace this line to pin a
    # specific file instead.
    json_file = max(json_files)
    print(f"Found Label Studio export: {json_file}")

    # Label name -> numeric class id handed to the converter.
    label_to_class_id = {"izutsumi": 0, "izutsumi_face": 1, "thistle": 2, "kabru": 3}
    convert_label_studio_to_yolo(
        json_file,
        settings.paths.annotations_dir,
        label_to_class_id,
    )

In [None]:
# Generate Training List
from src.training.preparation import generate_training_list

# YOLO artifacts (training lists) are kept in a 'yoloConfig' folder under the data root
config_dir = settings.paths.data / "yoloConfig"
config_dir.mkdir(parents=True, exist_ok=True)

# Name the list after the Label Studio export when an earlier cell defined
# `json_file` and that file exists; otherwise use the generic name.
train_list_name = (
    f"{json_file.stem}_train.txt"
    if 'json_file' in locals() and json_file.exists()
    else "train.txt"
)
train_list_path = config_dir / train_list_name

# The symlinked 'images' / 'labels' directories are passed in (not the
# original data locations) so the paths written to the txt file match what
# YOLO sees under 'yolo_dataset'.
train_list_path = generate_training_list(
    images_link,
    labels_link,
    train_list_path,
    force_regenerate=True,
)

In [None]:
# Create dataset.yaml
from src.training.preparation import create_dataset_yaml

# Class id -> class name, ids assigned in list order starting at 0
class_names = dict(enumerate(['izutsumi', 'izutsumi_face', 'thistle', 'kabru']))

# `path` is the local dataset root containing images/ and labels/.
# NOTE: the same list is passed for training and validation.
dataset_yaml_path = create_dataset_yaml(
    path=dataset_root,
    train_path=train_list_path,
    val_path=train_list_path,
    names=class_names,
)

# Show the generated file for a quick visual check
with open(dataset_yaml_path) as yaml_file:
    print(yaml_file.read())

In [None]:
# Train Model
import torch
from ultralytics import YOLO

# Load model
model = YOLO("yolo11s.pt")

# Pick the device: use the GPU only when the config asks for "cuda" AND CUDA
# is actually usable in this runtime; otherwise train on the CPU instead of
# passing an unavailable device to the trainer.
wants_cuda = settings.params.device == "cuda"
train_device = "cuda" if wants_cuda and torch.cuda.is_available() else "cpu"
if wants_cuda and train_device == "cpu":
    print("CUDA requested in config but not available; training on CPU.")

# Train
# exist_ok=True lets a re-run reuse the same run name under the project dir.
results = model.train(
    data=str(dataset_yaml_path),
    epochs=100,
    imgsz=settings.params.img_size,
    project=str(settings.paths.runs_dir),
    name="izutsumi_v1",
    device=train_device,
    cache=True,
    exist_ok=True
)

In [None]:
# Save Best Model
from src.training.preparation import save_best_model

# File name the trained weights get inside the configured models directory
trained_weights_name = "yolo11_izutsumi_trained.pt"

# The project dir and run name are the same values passed to model.train()
# in the training cell.
save_best_model(
    project_dir=settings.paths.runs_dir,
    run_name="izutsumi_v1",
    target_dir=settings.paths.model_dir,
    target_name=trained_weights_name,
)

In [None]:
# Run Inference
from ultralytics import YOLO
from src.output.output import save_inference_results

# Best weights of the 'izutsumi_v1' run under the configured runs directory
model_path = settings.paths.runs_dir / "izutsumi_v1" / "weights" / "best.pt"

if model_path.exists():
    model = YOLO(model_path)

    # Make sure the output root and its 'inference' subfolder both exist
    results_dir = settings.paths.output_dir
    output_dir = results_dir / "inference"
    for directory in (results_dir, output_dir):
        directory.mkdir(parents=True, exist_ok=True)

    inference_source = settings.paths.manga_dir
    print(f"Running inference on {inference_source}...")

    # Prediction options; saving is turned off here because the results are
    # handed to save_inference_results below.
    predict_options = dict(
        project=str(results_dir),
        name="inference",
        save=False,
        save_txt=False,
        conf=0.25,
        stream=True,
    )

    # Glob pattern covering files at any depth below the manga directory
    results = model.predict(
        source=str(inference_source) + "/**/*.*",
        **predict_options,
    )

    save_inference_results(results, output_dir, inference_source)
else:
    print(f"Model not found at {model_path}")