# 03 · Tokenization & Packing

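Tokenize the canonical BHC records with the tokenizer bundled with Unsloth's 4-bit Llama 3.1 checkpoint, then pack the resulting token sequences into longer training sequences (target length: 4,096 tokens).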

In [6]:
# Persistent Drive + run mode setup
import os
import sys
from pathlib import Path

# Best-effort Google Drive mount. Any failure -- including the ImportError raised
# when `google.colab` is unavailable -- is reported and otherwise ignored, so the
# cell keeps running.
try:
    from google.colab import drive  # type: ignore
    DRIVE_MOUNT = Path('/content/drive')
    # Only mount when the mount point is not already present.
    if not DRIVE_MOUNT.exists():
        drive.mount('/content/drive')
except Exception as exc:  # pragma: no cover
    print(f'Colab drive mount skipped: {exc}')

# Use the Drive folder when the mount point exists; otherwise fall back to the
# current user's home directory.
if Path('/content/drive').exists():
    DRIVE_ROOT = Path('/content/drive/MyDrive').resolve()
else:
    DRIVE_ROOT = Path.home().resolve()

# The repository checkout must already exist under DRIVE_ROOT; this notebook does
# not create it.
PROJECT_ROOT = DRIVE_ROOT / 'secure-llm-mia'
if not PROJECT_ROOT.exists():
    raise FileNotFoundError('Run 00_colab_setup.ipynb first to clone the repo on Drive.')

# Make the project's `src` package importable (needed by the import below).
if str(PROJECT_ROOT) not in sys.path:
    sys.path.append(str(PROJECT_ROOT))

# Publish the project root through the environment and make it the working directory.
# NOTE(review): SECURE_LLM_MIA_ROOT is presumably read by project helpers -- confirm.
os.environ['SECURE_LLM_MIA_ROOT'] = str(PROJECT_ROOT)
os.chdir(PROJECT_ROOT)

# Deliberately imported here rather than at the top of the cell: the module only
# resolves once PROJECT_ROOT has been added to sys.path.
from src.utils.runtime import current_run_mode

RUN_MODE = current_run_mode()
print('PROJECT_ROOT:', PROJECT_ROOT)
print('Active run mode:', RUN_MODE.name, '-', RUN_MODE.description)

# Standard project folders; created on demand so later cells can write into them.
DATA_ROOT = PROJECT_ROOT / 'data'
ARTIFACTS_DIR = PROJECT_ROOT / 'artifacts'
CHECKPOINT_ROOT = PROJECT_ROOT / 'checkpoints'
for path in (DATA_ROOT, ARTIFACTS_DIR, CHECKPOINT_ROOT):
    path.mkdir(parents=True, exist_ok=True)

# Location of the raw BHC CSV, kept next to (not inside) the repository checkout.
# The folder is created if missing; the CSV path is only built and printed here.
BHC_DATA_DIR = DRIVE_ROOT / 'mimic-iv-bhc'
BHC_DATA_DIR.mkdir(parents=True, exist_ok=True)
BHC_CSV_PATH = BHC_DATA_DIR / 'mimic-iv-bhc.csv'
print('BHC CSV path:', BHC_CSV_PATH)


PROJECT_ROOT: /content/drive/MyDrive/secure-llm-mia
Active run mode: subset - Quick debugging subset (<=2k rows) for lightweight Colab smoke tests.
BHC CSV path: /content/drive/MyDrive/mimic-iv-bhc/mimic-iv-bhc.csv


In [7]:
# Install unsloth if not already installed
!pip install unsloth

import numpy as np

from unsloth import FastLanguageModel

from src.data.loaders import load_canonical

CANONICAL_PATH = ARTIFACTS_DIR / f'canonical_bhc_{RUN_MODE.name}.parquet'
df = load_canonical(CANONICAL_PATH)
if df.empty:
    raise FileNotFoundError('Canonical data missing. Run notebook 01 first.')



In [8]:
# Context length handed to the loader; the packing step further down reuses it
# as its per-sequence token limit.
MAX_SEQ_LENGTH = 4096
# Checkpoint to load; can be overridden through the UNSLOTH_MODEL_NAME variable.
MODEL_NAME = os.environ.get('UNSLOTH_MODEL_NAME', 'unsloth/Meta-Llama-3.1-8B-bnb-4bit')

# The loader returns the model together with its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    load_in_4bit=True,
    max_seq_length=MAX_SEQ_LENGTH,
    model_name=MODEL_NAME,
)
# Switch the loaded model to Unsloth's inference mode.
FastLanguageModel.for_inference(model)
print('Tokenizer eos_token:', tokenizer.eos_token)


==((====))==  Unsloth 2025.11.3: Fast Llama patching. Transformers: 4.57.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu128. CUDA: 7.5. CUDA Toolkit: 12.8. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Tokenizer eos_token: <|end_of_text|>


In [9]:
def pack_sequences(token_lists, max_length=4096):
    """Greedily concatenate token sequences into packs of at most ``max_length`` tokens.

    Sequences are consumed in order and appended to the current pack until the
    next one would overflow it; the pack is then closed and a new one started.
    A sequence that is itself longer than ``max_length`` is split into
    consecutive ``max_length``-sized pieces, so no returned pack ever exceeds
    the limit. Empty sequences contribute nothing.

    Args:
        token_lists: Iterable of sliceable token-id sequences (e.g. lists of ints).
        max_length: Maximum number of tokens per packed sequence; must be positive.

    Returns:
        A list of packed sequences (lists of token ids), each non-empty and no
        longer than ``max_length``. The input sequences are not modified.

    Raises:
        ValueError: If ``max_length`` is not positive.
    """
    if max_length <= 0:
        raise ValueError(f'max_length must be positive, got {max_length}')

    packed_batches = []
    current = []
    for tokens in token_lists:
        # Walk the record in max_length-sized pieces. A record that already fits
        # yields exactly one piece (itself), so the common case is unchanged.
        for start in range(0, len(tokens), max_length):
            chunk = tokens[start:start + max_length]
            # Close the current pack when this piece would push it over the limit.
            if current and len(current) + len(chunk) > max_length:
                packed_batches.append(current)
                current = []
            current.extend(chunk)
    # Flush the trailing, partially filled pack.
    if current:
        packed_batches.append(current)
    return packed_batches

# Tokenize every record's `text` in a single batched call and keep only the ids.
# NOTE(review): no truncation or explicit end-of-text separator is requested here --
# confirm that packing records back-to-back is intended.
encoded = tokenizer(
    df['text'].tolist(),
    return_attention_mask=False,
)['input_ids']
packed_sequences = pack_sequences(encoded, max_length=MAX_SEQ_LENGTH)
# Packed sequences carry no padding, so each mask is all ones at matching length.
attention_masks = [[1 for _ in packed] for packed in packed_sequences]
print(f'Created {len(packed_sequences)} packed sequences for run mode {RUN_MODE.name}.')

Created 1738 packed sequences for run mode subset.


Export the packed sequences and a per-sequence summary to Drive as Parquet files

In [10]:
from pathlib import Path
import pandas as pd
from datasets import Dataset, Features, Sequence, Value

# Output folder for the active run mode, plus the two files written into it.
PACKED_DIR = ARTIFACTS_DIR / 'packed' / RUN_MODE.name
PACKED_DIR.mkdir(parents=True, exist_ok=True)
packed_parquet_path = PACKED_DIR / 'packed_sequences.parquet'
summary_path = PACKED_DIR / 'packed_summary.parquet'

# Explicit schema: 32-bit token ids and 8-bit attention-mask values.
features = Features({
    'input_ids': Sequence(Value('int32')),
    'attention_mask': Sequence(Value('int8')),
})
packed_dataset = Dataset.from_dict(
    dict(input_ids=packed_sequences, attention_mask=attention_masks),
    features=features,
)
packed_dataset.to_parquet(str(packed_parquet_path))

# One summary row per packed sequence: its position, token count and run mode.
summary_df = pd.DataFrame({
    'seq_id': range(len(packed_sequences)),
    'token_count': list(map(len, packed_sequences)),
    'run_mode': RUN_MODE.name,
})
summary_df.to_parquet(summary_path, index=False)

print(f'Saved packed dataset to {packed_parquet_path}')
print(f'Saved summary metadata to {summary_path}')

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Saved packed dataset to /content/drive/MyDrive/secure-llm-mia/artifacts/packed/subset/packed_sequences.parquet
Saved summary metadata to /content/drive/MyDrive/secure-llm-mia/artifacts/packed/subset/packed_summary.parquet
