# Lilly IPO Shard Runner (Colab)

This notebook runs a single shard of `build_ipo_pairs.py` and saves output to Google Drive.

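**Before running:**
- Set `SHARD_ID` (00..07).
- Decide how you want to transfer the repo + artifacts to Colab (Git clone or Drive).
- Run the dependency-install cell (transformers, accelerate, bitsandbytes, peft) before the shard-run cells; in top-to-bottom order it appears after them.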

In [None]:
from google.colab import drive
drive.mount('/content/drive')

# Change this to your preferred Drive folder
OUT_DIR = '/content/drive/MyDrive/lilly_ipo'
!mkdir -p "{OUT_DIR}"

In [None]:
# Download repo (and shards) from GitHub if not already present
import os
REPO_URL = 'https://github.com/rmulligan/lilly-ipo-colab.git'
REPO_DIR = '/content/lilly-ipo-colab'
if not os.path.isdir(REPO_DIR):
    !git clone "{REPO_URL}" "{REPO_DIR}"

print('Repo dir:', REPO_DIR)


In [None]:
# Verify adapter path exists
import os
ADAPTER_PATH = '/content/drive/MyDrive/lilly_repo/experiments/lilly_subjective_32b/final_lilly_32b'
print('Adapter exists:', os.path.isdir(ADAPTER_PATH))
print('Adapter contents:')
!ls -la "{ADAPTER_PATH}" | head -n 20


In [None]:
# Sync adapter from Drive into repo (if missing)
import os, shutil


def sync_adapter(src_dir, dst_dir):
    """Copy the adapter directory ``src_dir`` to ``dst_dir`` unless it is already there.

    The copy goes into a ``<dst_dir>.partial`` staging directory first and is
    renamed to ``dst_dir`` only after it finishes. An interrupted copy therefore
    never leaves a half-filled ``dst_dir`` that a later run would mistake for a
    complete adapter and skip.

    Args:
        src_dir: Directory to copy from.
        dst_dir: Directory to create; missing parent directories are created.

    Returns:
        True if a copy was performed, False if the source is missing or the
        destination already exists.
    """
    if not os.path.isdir(src_dir):
        print('Drive adapter not found; please check path:', src_dir)
        return False
    if os.path.isdir(dst_dir):
        print('Repo adapter already exists, skipping copy.')
        return False

    parent_dir = os.path.dirname(dst_dir)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    staging_dir = dst_dir + '.partial'
    # Discard leftovers from a previously interrupted copy before starting over.
    shutil.rmtree(staging_dir, ignore_errors=True)
    print('Copying adapter into repo...')
    shutil.copytree(src_dir, staging_dir)
    # Staging and final directories share a parent, so this is a plain rename.
    os.rename(staging_dir, dst_dir)
    return True


DRIVE_ADAPTER = '/content/drive/MyDrive/lilly_repo/experiments/lilly_subjective_32b/final_lilly_32b'
REPO_DIR = '/content/lilly-ipo-colab'
REPO_ADAPTER = REPO_DIR + '/experiments/lilly_subjective_32b/final_lilly_32b'
# Assigned (rather than left as a bare expression) so the cell shows no Out[] value.
ADAPTER_COPIED = sync_adapter(DRIVE_ADAPTER, REPO_ADAPTER)


In [None]:
# Verify selected shard exists
import os
SHARD_ID = '00'  # set before running
SHARD_PATH = f'/content/lilly-ipo-colab/experiments/affect_ipo/shards/prompts_shard_{SHARD_ID}.json'
print('Shard path:', SHARD_PATH)
print('Shard exists:', os.path.isfile(SHARD_PATH))
!ls -la "{SHARD_PATH}"


In [None]:
# Run shard and save to Drive with timestamp
import time
ts = time.strftime('%Y%m%d_%H%M%S')
OUT_PATH = f'/content/drive/MyDrive/lilly_ipo/pairs_shard_{SHARD_ID}_{ts}.jsonl'
print('Writing to:', OUT_PATH)
!python "{REPO_DIR}/scripts/build_ipo_pairs.py" \
  --input "{REPO_DIR}/experiments/affect_ipo/shards/prompts_shard_{SHARD_ID}.json" \
  --output "{OUT_PATH}" \
  --adapter-path "{REPO_DIR}/experiments/lilly_subjective_32b/final_lilly_32b" \
  --probe-path "{REPO_DIR}/experiments/egist_32b/unified_lilly_probe.pt" \
  --dimensions-path "{REPO_DIR}/experiments/egist_32b/unified_dimensions.json" \
  --classifier-path "{REPO_DIR}/experiments/lilly_subjective_32b/affect_classifier.pt" \
  --num-candidates 4 \
  --max-new-tokens 160 \
  --max-samples 100

In [None]:
# Optional: run all shards sequentially in one session
import time
ALL_SHARDS = [f'{i:02d}' for i in range(8)]
for sid in ALL_SHARDS:
    ts = time.strftime('%Y%m%d_%H%M%S')
    out_path = f'/content/drive/MyDrive/lilly_ipo/pairs_shard_{sid}_{ts}.jsonl'
    print('Running shard', sid, '->', out_path)
    !python "{REPO_DIR}/scripts/build_ipo_pairs.py" \
      --input "{REPO_DIR}/experiments/affect_ipo/shards/prompts_shard_{sid}.json" \
      --output "{out_path}" \
      --adapter-path "{REPO_DIR}/experiments/lilly_subjective_32b/final_lilly_32b" \
      --probe-path "{REPO_DIR}/experiments/egist_32b/unified_lilly_probe.pt" \
      --dimensions-path "{REPO_DIR}/experiments/egist_32b/unified_dimensions.json" \
      --classifier-path "{REPO_DIR}/experiments/lilly_subjective_32b/affect_classifier.pt" \
      --num-candidates 4 \
      --max-new-tokens 160 \
      --max-samples 100

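## Option A: Git clone repo
If your repo is accessible via git (public or with credentials), clone it below.
Otherwise use Option B to upload a zip or use Drive.
The clone cell near the top of this notebook already fetches the default repo; use the cell below only for a different repo. Running it replaces `REPO_URL` and `REPO_DIR`, which the shard-run cells read.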

In [None]:
# Alternative checkout: put your own repo URL in place of the placeholder.
# Running this cell rebinds REPO_URL and REPO_DIR, which the clone and
# shard-run cells also read.
REPO_URL = "YOUR_REPO_URL_HERE"
REPO_DIR = "/content/lilly"
# To clone with git, uncomment the line below
# !git clone "{REPO_URL}" "{REPO_DIR}"

## Option B: Use Drive to provide artifacts
Place your repo or just the needed files in Drive and set `REPO_DIR` accordingly.
Required files:
- scripts/build_ipo_pairs.py
- experiments/affect_ipo/shards/prompts_shard_XX.json
- experiments/egist_32b/unified_lilly_probe.pt
- experiments/egist_32b/unified_dimensions.json
- experiments/lilly_subjective_32b/affect_classifier.pt
- experiments/lilly_subjective_32b/final_lilly_32b (adapter dir)

Note: the adapter directory has already been uploaded to Drive at: `/content/drive/MyDrive/lilly_repo/experiments/lilly_subjective_32b/final_lilly_32b`


In [None]:
# Drive-based setup: REPO_DIR must name the folder that holds the repo/artifacts.
# Running this cell rebinds REPO_DIR for every cell executed afterwards.
REPO_DIR = "/content/drive/MyDrive/lilly_repo"  # change as needed

In [None]:
!pip -q install transformers accelerate bitsandbytes peft

## (Optional) Hugging Face login
Only needed if you use gated/private models.

In [None]:
# from huggingface_hub import login
# login()

In [None]:
# Merge all shard outputs on Drive into a single JSONL
import glob, os
from pathlib import Path


def merge_shard_outputs(out_dir, merged_name='pairs_merged.jsonl'):
    """Concatenate every ``pairs_shard_*.jsonl`` file in ``out_dir`` into one file.

    Files are merged in sorted filename order and blank lines are dropped.
    Every matching file is included, so repeated timestamped runs of the same
    shard all contribute to the merged output.

    Input is streamed line by line rather than split with ``str.splitlines()``,
    which also breaks on characters such as U+2028 that may legally appear
    inside a JSON string and would cut a record in two.

    Args:
        out_dir: Directory holding the shard files; the merged file is written here.
        merged_name: Filename of the merged output inside ``out_dir``.

    Returns:
        ``(shard_files, merged_path)``: the sorted list of merged input paths and
        the output path. When no shard files are found nothing is written, so an
        existing merged file is left untouched.
    """
    out_dir = Path(out_dir)
    shard_files = sorted(out_dir.glob('pairs_shard_*.jsonl'))
    print('Found', len(shard_files), 'shard files')
    merged_path = out_dir / merged_name
    if not shard_files:
        # Avoid truncating an earlier merge when Drive is unmounted or empty.
        print('No shard files found; nothing merged.')
        return shard_files, merged_path

    with merged_path.open('w', encoding='utf-8') as out:
        for shard_file in shard_files:
            with shard_file.open('r', encoding='utf-8') as src:
                for line in src:
                    if line.strip():
                        # Normalise the terminator so a file lacking a final
                        # newline does not fuse with the next file's first record.
                        out.write(line.rstrip('\n') + '\n')
    print('Merged ->', merged_path)
    return shard_files, merged_path


OUT_DIR = Path('/content/drive/MyDrive/lilly_ipo')
files, merged_path = merge_shard_outputs(OUT_DIR)


## Download merged output
In Colab, you can download the merged file directly:
```python
from google.colab import files
files.download('/content/drive/MyDrive/lilly_ipo/pairs_merged.jsonl')
```
Or sync it from Drive to your local machine.


## Done
Each shard's output is saved to Google Drive with a timestamp suffix:
`/content/drive/MyDrive/lilly_ipo/pairs_shard_XX_YYYYMMDD_HHMMSS.jsonl`