# Two-Stage OSR Runner (Colab / VS Code Colab Kernel)

This notebook runs the current repo pipelines directly:
- Stage 1: `src.pipelines.two_stage.train_patchcore`
- Stage 2 prep: `train_classifier -> extract_embeddings -> run_osr`
- Cascade: `run_cascade`


In [None]:
import os, sys, subprocess
# Report which interpreter and working directory this kernel is using.
print('python:', sys.executable)
print('cwd:', os.getcwd())
# Show GPU status when an NVIDIA driver is present. check=False only
# tolerates a non-zero exit status; a missing nvidia-smi executable
# (CPU-only runtime) raises FileNotFoundError, so handle that explicitly
# instead of aborting the cell.
try:
    subprocess.run(['nvidia-smi'], check=False)
except FileNotFoundError:
    print('nvidia-smi not found; no NVIDIA GPU driver visible on this runtime')

In [None]:
# Optional: mount Google Drive when running on Colab
# Best-effort Drive mount: any failure (module unavailable, mount error)
# is reported and the notebook carries on without Drive.
try:
    from google.colab import drive
    drive.mount('/content/drive')
except Exception as e:
    print('Drive mount skipped:', e)
else:
    print('Drive mounted at /content/drive')

In [None]:
from pathlib import Path
import subprocess

# If repo is not already present, set this URL and run this cell again.
# REPO_URL is the git remote cloned into REPO_DIR when no existing checkout
# is found; while it is empty, this cell raises instead of cloning.
REPO_URL = ''  # e.g. https://github.com/<user>/<repo>.git
# REPO_DIR is the clone destination and also one of the locations that
# find_repo_root() probes for an existing checkout.
REPO_DIR = Path('/content/FYP-code')

def find_repo_root():
    """Locate an existing repository checkout.

    A directory qualifies when it contains both ``configs/default.yaml``
    and a ``src`` entry. The current working directory, ``REPO_DIR`` and
    the Drive copy are tried first (in that order), followed by the
    working directory and each of its ancestors.

    Returns:
        The first qualifying ``Path``, or ``None`` when nothing matches.
    """
    def looks_like_repo(folder):
        has_default_cfg = (folder / 'configs' / 'default.yaml').exists()
        return has_default_cfg and (folder / 'src').exists()

    preferred = (
        Path.cwd(),
        REPO_DIR,
        Path('/content/drive/MyDrive/FYP-code'),
    )
    here = Path.cwd()
    search_order = (*preferred, here, *here.parents)
    return next((folder for folder in search_order if looks_like_repo(folder)), None)

import os

# Resolve the checkout to work in: reuse an existing one, otherwise clone
# REPO_URL into REPO_DIR.
repo_root = find_repo_root()
if repo_root is None:
    if not REPO_URL:
        raise RuntimeError('Repo not found. Set REPO_URL in this cell and rerun.')
    if not REPO_DIR.exists():
        subprocess.check_call(['git', 'clone', REPO_URL, str(REPO_DIR)])
    # Re-run discovery rather than trusting REPO_DIR blindly: a directory
    # that already existed (or a clone of the wrong repository) may lack
    # configs/default.yaml or src/, which would otherwise only surface as
    # confusing failures in later cells.
    repo_root = find_repo_root()
    if repo_root is None:
        raise RuntimeError(
            f'{REPO_DIR} exists but does not contain configs/default.yaml and src/. '
            'Remove it or fix REPO_URL, then rerun this cell.'
        )

# Plain os.chdir (rather than the %cd magic) keeps this cell valid Python,
# so it also runs when the notebook is exported as a script.
os.chdir(repo_root)
print('repo root:', Path.cwd())

In [None]:
import importlib, subprocess, sys
from pathlib import Path

# Put the current working directory at the front of sys.path (once) so
# that `src.*` imports in later cells resolve against it.
_repo_path = str(Path.cwd())
if _repo_path not in sys.path:
    sys.path.insert(0, _repo_path)

def has(mod):
    """Report whether the module named ``mod`` can be imported.

    The module is actually imported as part of the check. Any ``Exception``
    raised by the import (not only ``ImportError``) counts as "not available".

    Args:
        mod: Dotted module name, as accepted by ``importlib.import_module``.

    Returns:
        ``True`` if the import succeeded, ``False`` otherwise.
    """
    try:
        importlib.import_module(mod)
    except Exception:
        return False
    else:
        return True

# (import name, pip requirement) pairs. The import name is what has() probes;
# the requirement string is what gets passed to pip when the probe fails.
# NOTE(review): the version specifiers only take effect on install. A package
# that is already importable is never version-checked, so e.g. an installed
# numpy 2.x is reported as fine despite 'numpy<2' — confirm that is intended.
core_pkgs = [
    ('numpy', 'numpy<2'),
    ('scipy', 'scipy>=1.10'),
    ('PIL', 'pillow>=9.5'),
    ('sklearn', 'scikit-learn>=1.2'),
    ('matplotlib', 'matplotlib>=3.7'),
    ('tqdm', 'tqdm>=4.65'),
    ('yaml', 'pyyaml>=6.0'),
]
missing = [req for mod, req in core_pkgs if not has(mod)]
if missing:
    print('Installing missing core packages:', missing)
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', *missing])
    # Packages installed while the interpreter is running may be hidden by
    # the import system's finder caches; reset them so later cells can
    # import the new modules without restarting the kernel.
    importlib.invalidate_caches()
else:
    print('Core packages already installed')

# torch and torchvision are handled as a pair: if either is missing, both
# requirements are handed to pip together.
if not has('torch') or not has('torchvision'):
    print('Installing torch/torchvision')
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'torch>=2.0', 'torchvision>=0.15'])
    importlib.invalidate_caches()
else:
    import torch, torchvision
    print('torch:', torch.__version__, 'torchvision:', torchvision.__version__)

In [None]:
import os
# Both datasets are looked up relative to the current working directory.
# Report the status of each first, then stop at the first one that is absent.
have_severstal = os.path.exists('data/severstal')
have_neu = os.path.exists('data/neu')
print('severstal exists:', have_severstal)
print('neu exists:', have_neu)
if not have_severstal:
    raise RuntimeError('Missing data/severstal')
if not have_neu:
    raise RuntimeError('Missing data/neu')

In [None]:
# Force GPU in a Colab-safe copied config
import yaml
from pathlib import Path

# Source config and the derived copy; the original file is never modified.
base_cfg = Path('configs/default.yaml')
colab_cfg = Path('configs/default.colab.yaml')

# Load the defaults and override only the device, keeping the existing key
# order (sort_keys=False) so the copy stays easy to diff against the source.
cfg = {**yaml.safe_load(base_cfg.read_text()), 'device': 'cuda'}
_colab_yaml = yaml.safe_dump(cfg, sort_keys=False)
colab_cfg.write_text(_colab_yaml)
print('wrote', colab_cfg)

In [None]:
# Stage 1: call the repo's notebook entry point with the Colab config written
# by the previous cell (configs/default.colab.yaml, device set to 'cuda').
# NOTE(review): per the notebook header this presumably drives
# src.pipelines.two_stage.train_patchcore — confirm in notebook_entrypoints.
from src.pipelines.notebook_entrypoints import run_two_stage_stage1
run_two_stage_stage1('configs/default.colab.yaml')

In [None]:
# Split A: run the split pipeline with configs/neu_split_a.yaml.
# NOTE(review): this config is passed as-is; the device='cuda' override written
# to configs/default.colab.yaml applies here only if the split config inherits
# from it — confirm. Presumably results land under outputs/split_a (the
# summary cell reads outputs/split_a/cascade/metrics.json).
from src.pipelines.notebook_entrypoints import run_split_pipeline
run_split_pipeline('configs/neu_split_a.yaml')

In [None]:
# Split B: run the split pipeline with configs/neu_split_b.yaml.
# NOTE(review): this config is passed as-is; the device='cuda' override written
# to configs/default.colab.yaml applies here only if the split config inherits
# from it — confirm. Presumably results land under outputs/split_b (the
# summary cell reads outputs/split_b/cascade/metrics.json).
from src.pipelines.notebook_entrypoints import run_split_pipeline
run_split_pipeline('configs/neu_split_b.yaml')

In [None]:
# Split C: run the split pipeline with configs/neu_split_c.yaml.
# NOTE(review): this config is passed as-is; the device='cuda' override written
# to configs/default.colab.yaml applies here only if the split config inherits
# from it — confirm. Presumably results land under outputs/split_c (the
# summary cell reads outputs/split_c/cascade/metrics.json).
from src.pipelines.notebook_entrypoints import run_split_pipeline
run_split_pipeline('configs/neu_split_c.yaml')

In [None]:
import json
from pathlib import Path

# Metrics pulled from each split's cascade report; a key that is absent from
# the JSON is shown as None.
_SUMMARY_KEYS = (
    'tpr_unknown_system',
    'fpr_known_system',
    'stage1_pass_rate_known',
    'stage1_pass_rate_unknown',
)

for split in ('split_a', 'split_b', 'split_c'):
    metrics_path = Path('outputs') / split / 'cascade' / 'metrics.json'
    if metrics_path.exists():
        metrics = json.loads(metrics_path.read_text())
        print(split, {key: metrics.get(key) for key in _SUMMARY_KEYS})
    else:
        # The split has not been run, or it did not produce a report.
        print(split, 'missing metrics')

In [None]:
# Optional: persist outputs to Drive
from pathlib import Path
import shutil

# Destination folder on Drive. The presence of its parent (MyDrive) is used
# as the signal that Drive is mounted.
dst = Path('/content/drive/MyDrive/fyp_outputs')
if not dst.parent.exists():
    print('Drive not mounted; skipping copy')
else:
    dst.mkdir(parents=True, exist_ok=True)
    _latest = dst / 'outputs_latest'
    # dirs_exist_ok=True lets a re-run copy into the existing folder.
    shutil.copytree('outputs', _latest, dirs_exist_ok=True)
    print('saved outputs to', _latest)