# 00 · Colab Setup

Configure a persistent, Google Drive-backed workspace for temporal fine-tuning and membership inference attack (MIA) experiments.

## Guardrails
- Use credentialed MIMIC data only on encrypted Drive folders.
- Never sync PHI or credentials back to GitHub.
- Enable config flags that disable raw-text exports before sharing artifacts.

In [1]:
# Mount Google Drive (when available) and define the persistent workspace paths
import os
import sys
from pathlib import Path

try:
    # google.colab is imported lazily so the cell still runs outside Colab
    from google.colab import drive  # type: ignore
    drive.mount('/content/drive')
    DRIVE_ROOT = Path('/content/drive/MyDrive').resolve()
except Exception as err:  # pragma: no cover
    # Any exception raised while importing or mounting falls back to the home directory
    print(f'Drive mount skipped or not on Colab: {err}')
    DRIVE_ROOT = Path.home().resolve()

# The repository checkout and the BHC data folder sit side by side under the drive root
PROJECT_ROOT = DRIVE_ROOT.joinpath('secure-llm-mia')
BHC_DATA_DIR = DRIVE_ROOT.joinpath('mimic-iv-bhc')
BHC_DATA_DIR.mkdir(parents=True, exist_ok=True)

# Make the repository importable; skip the append when the entry is already present
if sys.path.count(str(PROJECT_ROOT)) == 0:
    sys.path.append(str(PROJECT_ROOT))

# Export the project root through the environment as well
os.environ['SECURE_LLM_MIA_ROOT'] = os.fspath(PROJECT_ROOT)
print(f'Drive root: {DRIVE_ROOT}')
print(f'Project root: {PROJECT_ROOT}')
print(f'BHC data directory: {BHC_DATA_DIR}')


Mounted at /content/drive
Drive root: /content/drive/MyDrive
Project root: /content/drive/MyDrive/secure-llm-mia
BHC data directory: /content/drive/MyDrive/mimic-iv-bhc


In [2]:
# Clone the GitHub repo onto Drive, or fast-forward an existing checkout.
# Relies on PROJECT_ROOT and `os` from the first cell.
import subprocess

REPO_URL = 'https://github.com/sehajbath/secure-llm-mia.git'
PROJECT_PARENT = PROJECT_ROOT.parent
PROJECT_PARENT.mkdir(parents=True, exist_ok=True)

git_dir = PROJECT_ROOT / '.git'
if not PROJECT_ROOT.exists():
    # No checkout yet: clone straight into the persistent location
    subprocess.run(['git', 'clone', REPO_URL, str(PROJECT_ROOT)], check=True)
elif not git_dir.exists():
    # A directory without .git is not ours to overwrite; stop and let the user decide
    raise RuntimeError(f'{PROJECT_ROOT} exists but is not a git repo. Clean up or move it before rerunning setup.')
else:
    # --ff-only keeps this unattended update deterministic: it never creates a merge
    # commit, and fails loudly (check=True) if the local branch has diverged.
    subprocess.run(['git', '-C', str(PROJECT_ROOT), 'pull', '--ff-only'], check=True)

# Later cells import `src.*`, so work from inside the checkout
os.chdir(PROJECT_ROOT)
print('Synced repository at', PROJECT_ROOT)


Synced repository at /content/drive/MyDrive/secure-llm-mia


In [3]:
# Ensure persistent directories exist (project subfolders first, then the BHC data folder)
persistent_dirs = [PROJECT_ROOT / name for name in ('data', 'artifacts', 'checkpoints')]
persistent_dirs.append(BHC_DATA_DIR)

for directory in persistent_dirs:
    # exist_ok=True makes the cell safe to re-run
    directory.mkdir(parents=True, exist_ok=True)
    print('✓', directory)


✓ /content/drive/MyDrive/secure-llm-mia/data
✓ /content/drive/MyDrive/secure-llm-mia/artifacts
✓ /content/drive/MyDrive/secure-llm-mia/checkpoints
✓ /content/drive/MyDrive/mimic-iv-bhc


In [4]:
# Display active run mode (subset vs full)
# NOTE(review): current_run_mode() is a repo helper not shown here; this cell only
# relies on the returned object exposing `.name` and `.description`.
from src.utils.runtime import current_run_mode

RUN_MODE = current_run_mode()
print(
    f'SECURE_LLM_MIA_RUN_MODE = {RUN_MODE.name!s}',
    str(RUN_MODE.description),
    sep='\n',
)


SECURE_LLM_MIA_RUN_MODE = subset
Quick debugging subset (<=2k rows) for lightweight Colab smoke tests.


In [None]:
# Install Python dependencies from the repo's requirements file.
# Relies on PROJECT_ROOT and `sys` from the first cell.
import subprocess

requirements = PROJECT_ROOT / 'env' / 'requirements.txt'
if requirements.exists():
    # sys.executable targets the kernel's own interpreter rather than whatever `pip` is on PATH.
    # check=False keeps the notebook running on failure, so inspect the exit code explicitly.
    pip_result = subprocess.run(
        [sys.executable, '-m', 'pip', 'install', '-q', '-U', '-r', str(requirements)],
        check=False,
    )
    if pip_result.returncode != 0:
        # Surface the failure instead of silently continuing with a partial environment
        print(
            f'pip install exited with code {pip_result.returncode}; '
            'some dependencies may be missing. Check the pip output above.'
        )
else:
    print('requirements.txt missing; verify repo sync.')


In [5]:
# Authenticate with Hugging Face Hub if needed
from getpass import getpass
try:
    from huggingface_hub import login
except Exception as exc:  # pragma: no cover
    print(f'huggingface_hub unavailable: {exc}')
else:
    # getpass keeps the token out of the notebook source and output.
    # Strip surrounding whitespace: pasted tokens often carry a stray space or newline,
    # and whitespace-only input should count as "skip" rather than be sent as a token.
    token = getpass('Enter Hugging Face token (press ENTER to skip): ').strip()
    if token:
        # add_to_git_credential=True asks huggingface_hub to also store the token
        # as a git credential
        login(token=token, add_to_git_credential=True)
    else:
        print('Skipping HF login; gated models may be unavailable.')


Enter Hugging Face token (press ENTER to skip): ··········


In [6]:
# Optional: Weights & Biases logging
try:
    import wandb
    # Opt-in switch; tracking stays off until this is flipped to True
    enable_wandb = False
    if not enable_wandb:
        print('Weights & Biases disabled. Set enable_wandb=True after configuring secrets.')
    else:
        wandb.login()
        wandb.init(project='secure-llm-mia', config={'notebook': '00_colab_setup'})
except Exception as err:
    # Covers a missing package as well as any error raised by login/init
    print(f'wandb not available: {err}')


Weights & Biases disabled. Set enable_wandb=True after configuring secrets.


In [7]:
# Inspect GPU resources
import torch

if not torch.cuda.is_available():
    print('CUDA not available. Request a GPU runtime (A100/T4) for fine-tuning runs.')
else:
    # Report on device index 0 only
    device_name = torch.cuda.get_device_name(0)
    capability = torch.cuda.get_device_capability(0)
    print('CUDA device: {} (cc {})'.format(device_name, capability))
    print(f'BF16 support: {torch.cuda.is_bf16_supported()}')


CUDA not available. Request a GPU runtime (A100/T4) for fine-tuning runs.


In [8]:
# Initialize deterministic seeds
# NOTE(review): set_global_seed and ensure_directories are repo helpers not shown here;
# their effects are assumed from their names — confirm in src/.
from src.utils.seed import set_global_seed
from src.constants import (
    ARTIFACT_ROOT,
    DATA_CACHE_DIR,
    ensure_directories,
)

set_global_seed(17)
ensure_directories()
print(f'Data cache: {DATA_CACHE_DIR!s}')
print(f'Artifact root: {ARTIFACT_ROOT!s}')


Data cache: /content/drive/MyDrive/secure-llm-mia/data_cache
Artifact root: /content/drive/MyDrive/secure-llm-mia/artifacts
