In [None]:
# Inference on test split (from processed training data)
# Import Path here so this cell does not depend on another cell having imported it.
from pathlib import Path

from src.predict import predict

# `best_ckpt` is produced by the "Locate best checkpoint for inference" cell.
# Fail with an actionable message instead of a NameError when it has not been run.
if globals().get('best_ckpt') is None:
    raise RuntimeError(
        "best_ckpt is not set; run the 'Locate best checkpoint for inference' cell first."
    )

pred_out = 'notebooks/outputs/preds_smoke.csv'
# Create the output directory from the output path itself so the two cannot drift apart.
Path(pred_out).parent.mkdir(parents=True, exist_ok=True)
predict(
    config_path='configs/model.yaml',
    checkpoint_path=best_ckpt,
    output_csv=pred_out,
    batch_size_override=4,
    num_workers_override=0,
)
print('Saved predictions to', pred_out)

In [None]:
# Locate best checkpoint for inference
from pathlib import Path

ckpt_dir = Path('checkpoints')
best_ckpt = None
if ckpt_dir.exists():
    # "Best" here simply means the most recently modified *.ckpt file directly
    # inside checkpoints/ (no metric-based or name-based preference is applied).
    newest = max(ckpt_dir.glob('*.ckpt'), key=lambda ckpt: ckpt.stat().st_mtime, default=None)
    if newest is not None:
        best_ckpt = str(newest)
print('Best checkpoint:', best_ckpt)
assert best_ckpt is not None, 'No checkpoints found after training.'

In [1]:
# Smoke training: one epoch, capped at 2 training batches and 1 validation batch.
# Requires the project root on sys.path (see the "Setup & Imports" cell),
# otherwise the `src` package cannot be imported.
from src.train import train

trainer, model, datamodule = train(
    config_path='configs/model.yaml',
    # Weights & Biases run identity
    wandb_project='hms-graphs',
    wandb_name='smoke-notebook',
    # Smoke-run switches (semantics are defined by src.train — confirm there)
    smoke=True,
    offline=True,
    # Keep the run tiny
    limit_train_batches=2,
    limit_val_batches=1,
    max_epochs_override=1,
    batch_size_override=2,
    num_workers_override=0,
)
print('Smoke training done.')

ModuleNotFoundError: No module named 'src'

In [None]:
# Setup & Imports
import os, sys
from pathlib import Path

# Every relative path used by the smoke cells ('configs/...', 'data/processed',
# 'notebooks/outputs/...') is relative to the repository root, so make the root
# the working directory even when the kernel was started inside notebooks/.
project_root = Path.cwd()
if project_root.name == 'notebooks':
    project_root = project_root.parent
    os.chdir(project_root)

# Make the local `src` package importable; guard against stacking duplicate
# entries when this cell is re-run.
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))
print('Project root:', project_root)

# WANDB offline for smoke runs (setdefault keeps any value the user already exported)
os.environ.setdefault('WANDB_MODE', 'offline')
print('WANDB_MODE =', os.environ.get('WANDB_MODE'))

# Quick data check: fail early if preprocessing has not produced any patient files
proc_dir = Path('data/processed')
assert proc_dir.exists(), f'Missing processed data at {proc_dir}. Run preprocessing first.'
patient_files = list(proc_dir.glob('patient_*.pt'))
print('Processed patients:', len(patient_files))
assert len(patient_files) > 0, 'No processed patient files found.'

# HMS Graphs: Smoke Train + Inference
#
# This notebook runs a fast smoke test training and a quick inference pass
# using the processed graph data in `data/processed/`.
# - Training: 1 epoch, 2 train batches, 1 val batch, WANDB offline
# - Inference: runs on the test split from processed training data
#
# After verifying this works, run full training from terminal (see notes below).

In [20]:
# User Settings: pick mode, model type, and config
MODE = "INFERENCE"  # "TRAIN" or "INFERENCE"
MODEL_TYPE = "MLP"  # "MLP" or "GRAPH"

# Training settings (if MODE="TRAIN")
TRAIN_CONFIG = "/kaggle/input/repo-whl-dataset/project/configs/training_mlp.yaml"  # MLP config
# TRAIN_CONFIG = "configs/training_rnn.yaml"  # or Graph config
TRAIN_FOLDS = [0, 1, 2, 3, 4]  # which folds to train; set to [0] for quick test
TRAIN_N_SPLITS = 5  # number of folds for cross-validation

# Inference settings (if MODE="INFERENCE")
INFERENCE_CONFIG = "/kaggle/input/repo-whl-dataset/project/configs/training_mlp.yaml"  # adjust based on checkpoint
# INFERENCE_CONFIG = "configs/inference_merged_small.yaml"  # for graph models
CHECKPOINT_PATH = "/kaggle/input/baseline-model/mlp_baseline_best_fold.ckpt"  # path to .ckpt file
BATCH_SIZE = 32
OUTPUT_CSV = "submission.csv"
WRITE_TO_FILE = True  # False = keep DataFrame in notebook only, True = write CSV file

# Fail fast on typos: without this check any value other than "TRAIN" would be
# summarised below as if it were an inference run, and the mistake would only
# surface when the run cell is executed.
if MODE not in ("TRAIN", "INFERENCE"):
    raise ValueError(f"Unknown MODE: {MODE}")
if MODEL_TYPE not in ("MLP", "GRAPH"):
    raise ValueError(f"Unknown MODEL_TYPE: {MODEL_TYPE}")

# Echo the effective settings so the recorded output documents the run
print(f"Mode: {MODE}")
print(f"Model Type: {MODEL_TYPE}")
if MODE == "TRAIN":
    print(f"  Config: {TRAIN_CONFIG}")
    print(f"  Folds: {TRAIN_FOLDS}")
    print(f"  N Splits: {TRAIN_N_SPLITS}")
else:
    print(f"  Config: {INFERENCE_CONFIG}")
    print(f"  Checkpoint: {CHECKPOINT_PATH}")
    print(f"  Write to file: {WRITE_TO_FILE}")
    if WRITE_TO_FILE:
        print(f"  Output: {OUTPUT_CSV}")

Mode: INFERENCE
Model Type: MLP
  Config: /kaggle/input/repo-whl-dataset/project/configs/training_mlp.yaml
  Checkpoint: /kaggle/input/baseline-model/mlp_baseline_best_fold.ckpt
  Write to file: True
  Output: submission.csv


In [21]:
# Environment setup: local vs Kaggle
import os, sys, subprocess, shutil, glob
from pathlib import Path

IS_KAGGLE = os.path.exists('/kaggle')
IS_LOCAL = not IS_KAGGLE

if IS_KAGGLE:
    print("Running on Kaggle")
    # /kaggle/input is read-only, so the repo always lives in a writable copy
    # under /kaggle/working; attached datasets are only sources to copy from.
    REPO_DATASET = Path('/kaggle/input/repo-whl-dataset/project')
    REPO_ROOT = Path('/kaggle/working/project')
    WHEELS_DIR = Path('/kaggle/input/complete-deps-dataset/wheels')  # adjust to your wheels dataset

    def _find_repo_dataset():
        """Return an attached dataset directory that looks like this repo, or None.

        A directory qualifies when it contains both ``setup.py`` and ``configs/``,
        either directly under ``/kaggle/input/<dataset>`` or nested in
        ``<dataset>/project``. Candidates whose path looks like this project are
        preferred; entries are scanned in sorted order so the choice is stable.
        """
        base = Path('/kaggle/input')
        candidates = []
        try:
            for entry in sorted(base.iterdir()):
                # direct repo layout, then nested under 'project/'
                for root in (entry, entry / 'project'):
                    if (root / 'setup.py').exists() and (root / 'configs').exists():
                        candidates.append(root)
        except OSError:
            # Best effort: an unreadable input directory just means fewer candidates
            pass
        # Prefer names that look like ours if multiple
        keys = ('hms-kaggle', 'hms_kaggle', 'project')
        preferred = [c for c in candidates if any(k in str(c).lower() for k in keys)]
        if preferred:
            return preferred[0]
        return candidates[0] if candidates else None

    # Populate the working copy once: default attached dataset first; otherwise
    # clone (needs Internet ON); otherwise any attached dataset that looks like the repo.
    if not REPO_ROOT.exists():
        REPO_ROOT.parent.mkdir(parents=True, exist_ok=True)
        repo_source = REPO_DATASET if REPO_DATASET.exists() else None
        if repo_source is None:
            try:
                print(f"Cloning repo to {REPO_ROOT} ...")
                subprocess.run(['git', 'clone', '--depth', '1', '--branch', 'baselines',
                                'https://github.com/denskrlv/HMS-Kaggle.git', str(REPO_ROOT)], check=True)
            except (subprocess.CalledProcessError, OSError):
                # Clone failed (e.g. Internet OFF or git unavailable): remove any
                # partial checkout so the copy below starts from a clean target.
                shutil.rmtree(REPO_ROOT, ignore_errors=True)
                repo_source = _find_repo_dataset() or Path('/kaggle/input/hms-kaggle-rnn-klloss')
                if not repo_source.exists():
                    raise FileNotFoundError("Repo not available; attach repo dataset or enable Internet.")
        if repo_source is not None:
            print(f"Copying repo from attached dataset: {repo_source} -> {REPO_ROOT}")
            shutil.copytree(repo_source, REPO_ROOT, dirs_exist_ok=True)

    # Install wheels if not already installed
    try:
        import torch_geometric, torcheeg
        print("Dependencies already installed")
    except Exception:  # broad on purpose: a broken install should also trigger a reinstall
        def pipi(*args):
            """Run ``pip`` for the current kernel's interpreter; raises on failure."""
            subprocess.run([sys.executable, '-m', 'pip', *args], check=True)

        # PYG stack
        pipi('install', '--no-index', '--no-deps', '--find-links', str(WHEELS_DIR),
             'pyg_lib', 'torch_scatter', 'torch_sparse', 'torch_cluster', 'torch_spline_conv', 'torch_geometric')
        # torcheeg deps + torcheeg
        pipi('install', '--no-index', '--find-links', str(WHEELS_DIR),
             'spectrum==0.9.0', 'lmdb>=1.3.0', 'pywavelets', 'einops')
        te_whl = sorted(glob.glob(str(WHEELS_DIR / 'torcheeg-1.1.2-*.whl')))
        if te_whl:
            pipi('install', '--no-index', '--no-deps', te_whl[0])
        else:
            # Make the gap visible now instead of as an ImportError much later
            print(f"WARNING: no torcheeg-1.1.2 wheel found in {WHEELS_DIR}; torcheeg was not installed")

    # SciPy compat shim: newer SciPy releases expose the window functions only via
    # scipy.signal.windows (scipy.signal.hann was removed in SciPy 1.13), so
    # re-attach `hann` for code that still calls scipy.signal.hann.
    import scipy.signal as spsig
    if not hasattr(spsig, 'hann'):
        from scipy.signal import windows as _win
        spsig.hann = _win.hann

    # Link competition data into repo layout (under /kaggle/working only)
    K_IN = Path('/kaggle/input/hms-harmful-brain-activity-classification')
    DATA_RAW = REPO_ROOT / 'data' / 'raw'
    DATA_RAW.mkdir(parents=True, exist_ok=True)
    for data_name in ('train.csv', 'test.csv', 'train_eegs', 'train_spectrograms',
                      'test_eegs', 'test_spectrograms'):
        # Named link_src/link_dst so the loop does not bind a global called `src`,
        # which is also the name of the project package.
        link_src, link_dst = K_IN / data_name, DATA_RAW / data_name
        if link_src.exists() and not link_dst.exists():
            try:
                os.symlink(link_src, link_dst)
            except OSError:
                # Symlinks unavailable: fall back to a real copy
                (shutil.copytree if link_src.is_dir() else shutil.copy2)(link_src, link_dst)

else:
    print("Running locally")
    # Find repo root (go up from notebooks/ to HMS-Kaggle/)
    current = Path(__file__).resolve().parent if '__file__' in globals() else Path.cwd()
    if current.name == 'notebooks':
        REPO_ROOT = current.parent
    else:
        REPO_ROOT = current
    # Assume local workspace already has dependencies and data

# Put repo on sys.path
if str(REPO_ROOT) not in sys.path:
    sys.path.insert(0, str(REPO_ROOT))

# Relative paths used by the project (e.g. data/raw/...) resolve against the repo root
os.chdir(REPO_ROOT)
print(f"REPO_ROOT = {REPO_ROOT}")
print(f"CWD = {os.getcwd()}")

Running on Kaggle
Dependencies already installed
REPO_ROOT = /kaggle/working/project
CWD = /kaggle/working/project


In [22]:
# Run Training or Inference
import sys
from pathlib import Path
import importlib


def _run_cli(entry_point, argv):
    """Call a script-style ``main()`` with ``argv`` temporarily installed as ``sys.argv``.

    The project entry points are invoked without arguments and are expected to
    read their options from ``sys.argv``. The kernel's own ``sys.argv`` is
    restored afterwards, even if the entry point raises, so later cells and
    re-runs do not see leftover script arguments.

    Returns whatever ``entry_point()`` returns.
    """
    saved_argv = sys.argv
    sys.argv = list(argv)
    try:
        return entry_point()
    finally:
        sys.argv = saved_argv


if MODE == "TRAIN":
    print(f"Starting training ({MODEL_TYPE})...")

    if MODEL_TYPE == "MLP":
        # Call train_mlp with config and n_splits
        import src.train_mlp as tm
        importlib.reload(tm)
        # Optional debug flag, set by the user before running this cell: either
        # define FAST_DEV_RUN = True or add "--fast_dev_run" to sys.argv. It must
        # be read BEFORE the argument list is built; checking the freshly built
        # list (which never contains the flag) would make the option unreachable.
        fast_dev_run = bool(globals().get("FAST_DEV_RUN", False)) or "--fast_dev_run" in sys.argv
        train_argv = [
            "train_mlp.py",
            "--config", str(TRAIN_CONFIG),
            "--n_splits", str(TRAIN_N_SPLITS),
        ]
        if fast_dev_run:
            train_argv.append("--fast_dev_run")
        _run_cli(tm.main, train_argv)

    elif MODEL_TYPE == "GRAPH":
        # Call train_model with config and folds
        import src.train_model as tm
        importlib.reload(tm)
        _run_cli(tm.main, [
            "train_model.py",
            "--config", str(TRAIN_CONFIG),
            "--folds",
            *[str(f) for f in TRAIN_FOLDS],
        ])

    else:
        raise ValueError(f"Unknown MODEL_TYPE: {MODEL_TYPE}")

elif MODE == "INFERENCE":
    print(f"Starting inference ({MODEL_TYPE})...")

    # Decide a submission path OUTSIDE the project folder
    # - On Kaggle: /kaggle/working/<OUTPUT_CSV>
    # - Locally: parent of REPO_ROOT, keeping the same filename
    base_out_dir = Path('/kaggle/working') if ('IS_KAGGLE' in globals() and IS_KAGGLE) else Path(REPO_ROOT).parent
    SUBMISSION_PATH = (base_out_dir / Path(OUTPUT_CSV).name).resolve()
    if WRITE_TO_FILE or MODEL_TYPE == "GRAPH":
        print(f"Submission will be written to: {SUBMISSION_PATH}")

    if MODEL_TYPE == "MLP":
        # Call predict_mlp with config and checkpoint
        import src.predict_mlp as pm
        importlib.reload(pm)
        predict_argv = [
            "predict_mlp.py",
            "--config", str(INFERENCE_CONFIG),
            "--checkpoint", str(CHECKPOINT_PATH),
            "--batch_size", str(BATCH_SIZE),
        ]
        if WRITE_TO_FILE:
            predict_argv += ["--output", str(SUBMISSION_PATH)]
        submission_df = _run_cli(pm.main, predict_argv)

    elif MODEL_TYPE == "GRAPH":
        # Call predict_model with config and checkpoint (this script always writes a CSV)
        import src.predict_model as pm
        importlib.reload(pm)
        submission_df = _run_cli(pm.main, [
            "predict_model.py",
            "--config", str(INFERENCE_CONFIG),
            "--checkpoint", str(CHECKPOINT_PATH),
            "--output", str(SUBMISSION_PATH),
            "--batch_size", str(BATCH_SIZE),
        ])
        # When results are to be kept in the notebook, load the CSV the script
        # wrote so submission_df is populated whatever main() returned.
        if not WRITE_TO_FILE:
            import pandas as pd
            try:
                submission_df = pd.read_csv(SUBMISSION_PATH)
            except Exception as e:
                # Chain the original error so the underlying cause stays in the traceback
                raise RuntimeError(f"Prediction file '{SUBMISSION_PATH}' was not created: {e}") from e

    else:
        raise ValueError(f"Unknown MODEL_TYPE: {MODEL_TYPE}")

    if WRITE_TO_FILE:
        print(f"Wrote {SUBMISSION_PATH}")
    else:
        print("Submission DataFrame kept in notebook (variable: submission_df)")
        if submission_df is not None:
            print(f"Shape: {submission_df.shape}")
            print(submission_df.head())
        else:
            print("submission_df is None")

else:
    raise ValueError(f"Unknown MODE: {MODE}")

Starting inference (MLP)...
Submission will be written to: /kaggle/working/submission.csv
Loading checkpoint from /kaggle/input/baseline-model/mlp_baseline_best_fold.ckpt
Using device: cpu
Building test inputs from data/raw/test.csv and data/raw/test_eegs ...
Loaded 1 / 1 EEG files
Generated predictions for 1 available samples; 0 missing

Submission preview:
       eeg_id  seizure_vote  lpd_vote  gpd_vote  lrda_vote  grda_vote  \
0  3911565283      0.357359  0.153703  0.017598   0.097441   0.055846   

   other_vote  
0    0.318053  

Shape: (1, 7)
Columns: ['eeg_id', 'seizure_vote', 'lpd_vote', 'gpd_vote', 'lrda_vote', 'grda_vote', 'other_vote']

Saved submission to /kaggle/working/submission.csv
Wrote /kaggle/working/submission.csv


In [23]:
# Validate submission (only runs if MODE="INFERENCE")
if MODE == "INFERENCE":
    import pandas as pd

    VOTE_KEYS = ["seizure_vote", "lpd_vote", "gpd_vote", "lrda_vote", "grda_vote", "other_vote"]

    # Load submission from file or use the returned DataFrame
    if WRITE_TO_FILE:
        sub = pd.read_csv(SUBMISSION_PATH)
    else:
        sub = submission_df
    # The inference cell can leave submission_df as None; report that directly
    # rather than failing below with an unrelated TypeError from len(None).
    if sub is None:
        raise RuntimeError(
            "No submission to validate: submission_df is None. "
            "Set WRITE_TO_FILE = True or re-run the inference cell."
        )

    test = pd.read_csv(REPO_ROOT / "data/raw/test.csv")

    print("Submission rows:", len(sub), "| Test rows:", len(test))
    print("Columns:", list(sub.columns))
    print(sub.head(3))

    # Basic checks: required columns
    assert "eeg_id" in sub.columns, "eeg_id column missing"
    for k in VOTE_KEYS:
        assert k in sub.columns, f"Missing column: {k}"

    # Predictions must be actual numbers in every vote column
    nan_counts = sub[VOTE_KEYS].isna().sum()
    assert not nan_counts.any(), f"NaN predictions in: {nan_counts[nan_counts > 0].to_dict()}"

    # Duplicated ids are reported (not fatal here) so they can be inspected
    n_duplicated = int(sub["eeg_id"].duplicated().sum())
    if n_duplicated:
        print("WARNING: duplicated eeg_ids in submission:", n_duplicated)

    # Coverage: every test eeg_id should have a prediction row
    missing = set(test["eeg_id"]) - set(sub["eeg_id"])
    print("Missing eeg_ids in submission:", len(missing))
    if missing:
        print("  e.g.:", sorted(missing)[:5])

    if WRITE_TO_FILE:
        print("Submission ready for download at:", SUBMISSION_PATH)
    else:
        print("Submission DataFrame available as 'submission_df' in notebook.")

Submission rows: 1 | Test rows: 1
Columns: ['eeg_id', 'seizure_vote', 'lpd_vote', 'gpd_vote', 'lrda_vote', 'grda_vote', 'other_vote']
       eeg_id  seizure_vote  lpd_vote  gpd_vote  lrda_vote  grda_vote  \
0  3911565283      0.357359  0.153703  0.017598   0.097441   0.055846   

   other_vote  
0    0.318053  
Missing eeg_ids in submission: 0
Submission ready for download at: /kaggle/working/submission.csv
