# 01 — Training notebook (MLOps template)\n\nThis notebook keeps *logic in the package* and only orchestrates:\n- dataset download via **kagglehub**\n- manifest build + split\n- model training\n- saving artifacts (checkpoint + labels)\n

In [None]:
# If running locally, install the project in editable mode first:\n# !pip install -e '.[dev]'\n\nimport os\nfrom pathlib import Path\n

In [None]:
import kagglehub

# Fetch the latest published version of the Kaggle dataset.
# The value returned by kagglehub is wrapped in a Path so the following
# cells can walk it with `rglob` and hand it to the manifest builder.
download_location = kagglehub.dataset_download(
    'ninadmehendale/multimodal-iris-fingerprint-biometric-data'
)
dataset_dir = Path(download_location)
print('Path to dataset files:', dataset_dir)

In [None]:
# Optional: peek at the downloaded directory layout.
from itertools import islice

# `rglob` is lazy, so capping it with islice avoids walking the whole tree.
# 50 entries are kept in `paths` for ad-hoc inspection; only the first 30
# are echoed to keep the cell output short.
tree_walk = dataset_dir.rglob('*')
paths = list(islice(tree_walk, 50))

for entry in paths[:30]:
    print(entry)

In [None]:
from mmbiometric.data.manifest import build_manifest
from mmbiometric.data.split import split_manifest

# Everything produced by this notebook run lands under one directory.
run_dir = Path('runs/notebook_run')
run_dir.mkdir(parents=True, exist_ok=True)

manifest_file = run_dir / 'manifest.parquet'
splits_dir = run_dir / 'splits'

# subject_regex may need tweaking based on the dataset's filename/folder naming.
# NOTE(review): presumably the capture group is what becomes the subject id --
# confirm against build_manifest.
manifest_path = build_manifest(
    dataset_dir,
    manifest_file,
    subject_regex=r'(\d+)',
)

# Hold out 20% for validation; the fixed seed is passed so the split can be
# reproduced across runs.
splits = split_manifest(
    manifest_path,
    out_dir=splits_dir,
    val_fraction=0.2,
    seed=42,
)

print('train manifest:', splits.train_manifest)
print('val manifest:', splits.val_manifest)

In [None]:
import pandas as pd

train_df = pd.read_parquet(splits.train_manifest)

# Subject ids are compared as strings, so the ordering below is lexicographic
# ('10' sorts before '2').
subject_ids = train_df['subject_id'].astype(str)
labels = sorted(subject_ids.unique())

# The label space is derived from the training split only.
# NOTE(review): a subject that appears only in the validation split has no
# entry in label_to_idx -- confirm split_manifest rules that out.
label_to_idx = dict(zip(labels, range(len(labels))))
idx_to_label = dict(enumerate(labels))

# Displayed as the cell result: number of classes and a sample of label names.
(len(labels), labels[:10])

In [None]:
import torch
from torch.utils.data import DataLoader

from mmbiometric.data.dataset import MultimodalBiometricDataset
from mmbiometric.data.transforms import default_image_transform
from mmbiometric.models.multimodal_net import MultimodalNet
from mmbiometric.training.loops import fit
from mmbiometric.utils.seed import seed_everything

# Seed first: everything below (datasets, loaders, model init, training) runs
# after this call, and the statement order is kept deliberately.
seed_everything(42)

# --- data -------------------------------------------------------------------
image_size = 224
tfm = default_image_transform(image_size)

# The same transform object is passed in both transform slots.
# NOTE(review): presumably one slot per modality (iris / fingerprint) --
# confirm against MultimodalBiometricDataset.
train_ds = MultimodalBiometricDataset(splits.train_manifest, tfm, tfm, label_to_idx)
val_ds = MultimodalBiometricDataset(splits.val_manifest, tfm, tfm, label_to_idx)

# Settings shared by both loaders; only the training loader shuffles.
loader_kwargs = {'batch_size': 32, 'num_workers': 2}
train_loader = DataLoader(train_ds, shuffle=True, **loader_kwargs)
val_loader = DataLoader(val_ds, shuffle=False, **loader_kwargs)

# --- model ------------------------------------------------------------------
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)

# One output class per label found in the training manifest.
model = MultimodalNet(
    backbone='resnet18',
    embedding_dim=256,
    num_classes=len(labels),
    dropout=0.1,
)
model.to(device)

# --- training ---------------------------------------------------------------
fit_config = {
    'epochs': 3,
    'lr': 3e-4,
    'weight_decay': 1e-4,
    'log_every': 20,
}
res = fit(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    device=device,
    out_dir=run_dir,
    **fit_config,
)

print('best val acc:', res.best_val_acc)
print('checkpoint:', res.best_ckpt_path)

In [None]:
import json

# Persist the index -> label mapping in the run directory.
# JSON object keys are always strings, so the integer indices are written as
# "0", "1", ...; a reader has to convert them back to int.
labels_path = run_dir / 'labels.json'
labels_json = json.dumps(idx_to_label, indent=2)
labels_path.write_text(labels_json)
print('wrote', labels_path)