# Colab Two-Stage Runner (Severstal -> NEU)
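Run all cells top to bottom once. After a disconnect, rerun only the unfinished cells (re-run the setup cells first if the runtime was reset); stage 1 reuses existing artifacts and the split cells skip work that is already complete.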


In [None]:
# Optional environment repair (currently disabled). The whole cell body is
# wrapped in a triple-quoted string, so running the cell installs nothing;
# remove the surrounding quotes to enable it. When enabled, it force-reinstalls
# the pinned package versions listed below with pip and then prints a reminder
# to restart the runtime.
"""import sys, subprocess

# Reinstall a compatible scientific stack
subprocess.check_call([
    sys.executable, "-m", "pip", "install", "--upgrade", "--force-reinstall",
    "numpy==1.26.4",
    "scipy==1.11.4",
    "scikit-learn==1.4.2",
    "pandas==2.2.2",
    "matplotlib==3.8.4",
    "pillow==10.3.0",
    "pyyaml==6.0.1",
    "tqdm==4.66.4",
])

print("Done. Now restart runtime: Runtime > Restart runtime")"""


In [None]:
from google.colab import drive

# Mount Google Drive at the path the later cells read datasets from and write
# outputs to; force_remount=True is passed so an existing mount is redone.
MOUNT_POINT = '/content/drive'
drive.mount(MOUNT_POINT, force_remount=True)


In [None]:
import os
import subprocess
from pathlib import Path

# Local checkout location on the Colab VM.
repo = Path('/content/FYP-code')
repo_dir = str(repo)

# First run: clone the project. Later runs: fast-forward the existing checkout.
if not repo.exists():
    print('Cloning repo...')
    git_cmd = ['git', 'clone', 'https://github.com/spinelessknave8/FYP_code.git', repo_dir]
else:
    print('Repo exists, pulling latest...')
    git_cmd = ['git', '-C', repo_dir, 'pull', '--ff-only']
subprocess.check_call(git_cmd)

# Work from the repo root so the relative `configs/...` paths used by later
# cells resolve, then show which commit is checked out.
os.chdir(repo)
print('cwd:', os.getcwd())
subprocess.check_call(['git', '-C', repo_dir, 'log', '-1', '--oneline'])


In [None]:
import torch

# Report the installed PyTorch version and whether a CUDA device is visible;
# the device name is printed only when CUDA is available.
has_cuda = torch.cuda.is_available()
print('torch:', torch.__version__)
print('cuda available:', has_cuda)
if has_cuda:
    print('gpu:', torch.cuda.get_device_name(0))


In [None]:
import copy
import yaml
from pathlib import Path


def _deep_merge(target, overrides):
    """Overlay ``overrides`` onto ``target`` in place and return ``target``.

    When both sides hold a dict under the same key, the two dicts are merged
    key by key; any other value from ``overrides`` replaces the one in
    ``target``.
    """
    for key, value in overrides.items():
        if isinstance(value, dict) and isinstance(target.get(key), dict):
            _deep_merge(target[key], value)
        else:
            target[key] = value
    return target


# Prefer explicit paths to avoid discovery failures.
sev = Path('/content/drive/MyDrive/datasets/severstal')
neu = Path('/content/drive/MyDrive/datasets/neu')

print('severstal:', sev, sev.exists())
print('neu:', neu, neu.exists())

if not sev.exists() or not neu.exists():
    raise RuntimeError('Dataset paths not found under /content/drive/MyDrive/datasets')

# Start from the repo defaults and point them at the Drive datasets/outputs.
base = yaml.safe_load(Path('configs/default.yaml').read_text())
base['device'] = 'cuda'
base['severstal']['data_root'] = str(sev)
base['severstal']['train_csv'] = 'train.csv'
base['severstal']['images_dir'] = 'train_images'
base['neu']['data_root'] = str(neu)
base['output_dir'] = '/content/drive/MyDrive/fyp_outputs'

Path('configs/default.colab.yaml').write_text(yaml.safe_dump(base, sort_keys=False))
print('wrote configs/default.colab.yaml')

for s in ['a', 'b', 'c']:
    # safe_load returns None for an empty YAML file; treat that as "no overrides".
    split_cfg = yaml.safe_load(Path(f'configs/neu_split_{s}.yaml').read_text()) or {}
    # Merge recursively: a plain dict.update() would replace a whole nested
    # section (e.g. 'neu') with the split's version and silently drop the
    # Colab overrides such as data_root that were set on `base` above.
    merged = _deep_merge(copy.deepcopy(base), split_cfg)
    out = Path(f'configs/neu_split_{s}.colab.yaml')
    out.write_text(yaml.safe_dump(merged, sort_keys=False))
    print('wrote', out)


In [None]:
from pathlib import Path
import yaml

# Verify that every dataset path named in the generated Colab config exists
# before any long-running stage starts.
cfg = yaml.safe_load(Path('configs/default.colab.yaml').read_text())

sev_cfg = cfg['severstal']
sev_root = Path(sev_cfg['data_root'])
neu_root = Path(cfg['neu']['data_root'])

# File and directory names come from the config being validated, falling back
# to the names this notebook writes when the keys are absent.
required_paths = {
    'severstal data_root': sev_root,
    'neu data_root': neu_root,
    'severstal train_csv': sev_root / sev_cfg.get('train_csv', 'train.csv'),
    'severstal images_dir': sev_root / sev_cfg.get('images_dir', 'train_images'),
}

# Raise explicitly instead of using assert (asserts are stripped under -O),
# and report every missing path at once rather than only the first one.
missing = [f'{label}: {path}' for label, path in required_paths.items() if not path.exists()]
if missing:
    raise FileNotFoundError('Sanity check failed; missing paths:\n  ' + '\n  '.join(missing))
print('sanity checks passed')


In [None]:
import time
from src.pipelines.notebook_entrypoints import run_two_stage_stage1

# Stage 1 runs once; it reuses existing memory/val artifacts if already present.
# perf_counter() is monotonic, so the reported duration stays correct even if
# the VM's wall clock is adjusted during a long run.
started = time.perf_counter()
run_two_stage_stage1('configs/default.colab.yaml')
print(f'stage 1 done in {time.perf_counter() - started:.1f}s')


In [None]:
import time
from src.pipelines.notebook_entrypoints import run_split_pipeline

# Run the split A pipeline with skip_if_complete=True and report how long it
# took. perf_counter() is monotonic, so the duration is unaffected by
# wall-clock adjustments during a long run.
started = time.perf_counter()
run_split_pipeline('configs/neu_split_a.colab.yaml', skip_if_complete=True)
print(f'split A done in {time.perf_counter() - started:.1f}s')


In [None]:
import time
from src.pipelines.notebook_entrypoints import run_split_pipeline

# Run the split B pipeline with skip_if_complete=True and report how long it
# took. perf_counter() is monotonic, so the duration is unaffected by
# wall-clock adjustments during a long run.
started = time.perf_counter()
run_split_pipeline('configs/neu_split_b.colab.yaml', skip_if_complete=True)
print(f'split B done in {time.perf_counter() - started:.1f}s')


In [None]:
import time
from src.pipelines.notebook_entrypoints import run_split_pipeline

# Run the split C pipeline with skip_if_complete=True and report how long it
# took. perf_counter() is monotonic, so the duration is unaffected by
# wall-clock adjustments during a long run.
started = time.perf_counter()
run_split_pipeline('configs/neu_split_c.colab.yaml', skip_if_complete=True)
print(f'split C done in {time.perf_counter() - started:.1f}s')


In [None]:
from pathlib import Path

# For each split, print whether its OSR and cascade metrics.json files exist
# under the Drive output directory.
base = Path('/content/drive/MyDrive/fyp_outputs')
for split_name in ('split_a', 'split_b', 'split_c'):
    split_dir = base / split_name
    has_osr = (split_dir / 'osr' / 'metrics.json').exists()
    has_cascade = (split_dir / 'cascade' / 'metrics.json').exists()
    print(split_name, 'osr:', has_osr, 'cascade:', has_cascade)


In [None]:
import subprocess
import sys

base = '/content/drive/MyDrive/fyp_outputs'
python = sys.executable

# Run the two project modules in order with the current interpreter:
# aggregate_osr first, then plot_combined_osr writing into <base>/combined.
# check_call raises if either command exits with a non-zero status.
aggregate_cmd = [python, '-m', 'src.pipelines.aggregate_osr', '--output_dir', base]
plot_cmd = [
    python,
    '-m',
    'src.pipelines.plot_combined_osr',
    '--output_dir', base,
    '--out_dir', f'{base}/combined',
]
for cmd in (aggregate_cmd, plot_cmd):
    subprocess.check_call(cmd)
print('aggregate + combined plots done')
