# 03 · Tokenization & Packing

Tokenize documents with the Llama tokenizer and pack them into fixed-length sequences to meet the 128k tokens/step budget.

In [None]:
import os
import sys
from pathlib import Path

# Use the parent of the current working directory as the project root and
# put it on sys.path (once) so that `src.*` modules can be imported.
PROJECT_ROOT = Path.cwd().resolve().parent
if str(PROJECT_ROOT) not in sys.path:
    sys.path.append(str(PROJECT_ROOT))

print(f"Project root: {PROJECT_ROOT}")


In [None]:
from pathlib import Path

import numpy as np
import pandas as pd

from src.data.loaders import load_canonical
from transformers import AutoTokenizer

# Location of the canonical demo dataset, relative to the project root.
CANONICAL_PATH = PROJECT_ROOT.joinpath('artifacts', 'canonical_demo.parquet')

# An empty frame is treated as "data not available" and aborts the notebook.
df = load_canonical(CANONICAL_PATH)
if df.empty:
    raise FileNotFoundError('Canonical data missing. Run notebook 01 first.')

In [None]:
# Try to load the real Llama tokenizer; on any failure fall back to a tiny
# whitespace-based stand-in so the rest of the notebook can still run.
try:
    tokenizer = AutoTokenizer.from_pretrained(
        'meta-llama/Llama-3.1-8B',
        token=os.getenv('HF_TOKEN'),
        use_fast=True,
    )
    print('Loaded Llama tokenizer.')
except Exception as exc:
    print(f'Falling back to whitespace tokenizer: {exc}')

    class WhitespaceTokenizer:
        """Minimal tokenizer stand-in that splits text on whitespace.

        Each word is mapped to the pseudo token id ``len(word) % 100 + 1``,
        so ids are always in the range 1..100.
        """

        model_max_length = 4096

        def __call__(self, texts):
            """Tokenize one string or a list of strings.

            Returns a dict with a single key, ``'input_ids'``, holding one
            list of pseudo token ids per input text.
            """
            batch = [texts] if isinstance(texts, str) else texts
            encoded = []
            for text in batch:
                encoded.append([len(word) % 100 + 1 for word in text.split()])
            return {'input_ids': encoded}

    tokenizer = WhitespaceTokenizer()

In [None]:
def pack_sequences(token_lists, max_length=4096, pad_token_id=0):
    """Greedily pack tokenized documents into sequences of at most ``max_length``.

    Documents are concatenated in order. When the next piece would push the
    current sequence past ``max_length``, the current sequence is emitted and
    a new one is started. A document longer than ``max_length`` is split into
    consecutive chunks of at most ``max_length`` tokens, so no returned
    sequence ever exceeds the limit and no empty sequence is emitted.

    Args:
        token_lists: Iterable of token-id sequences, one per document.
        max_length: Maximum number of tokens per packed sequence; must be > 0.
        pad_token_id: Currently unused. Kept for interface compatibility;
            sequences are returned unpadded.

    Returns:
        A list of packed sequences (lists of token ids), each non-empty and
        no longer than ``max_length``.

    Raises:
        ValueError: If ``max_length`` is not positive.
    """
    if max_length <= 0:
        raise ValueError('max_length must be a positive integer')

    packed_batches = []
    current = []
    for tokens in token_lists:
        tokens = list(tokens)
        # Walk the document in max_length-sized pieces so that an overlong
        # document is split instead of producing an oversized sequence.
        for start in range(0, len(tokens), max_length):
            piece = tokens[start:start + max_length]
            # Only flush a non-empty buffer; flushing an empty one would
            # emit a spurious empty sequence.
            if current and len(current) + len(piece) > max_length:
                packed_batches.append(current)
                current = []
            current.extend(piece)
    if current:
        packed_batches.append(current)
    return packed_batches

# Tokenize every document's text in one batch call, then pack the resulting
# token-id lists into sequences bounded by 4096 tokens.
documents = df['text'].tolist()
tokenized = tokenizer(documents)['input_ids']
packed = pack_sequences(tokenized, max_length=4096)
print(f'Created {len(packed)} packed sequences.')

In [None]:
# One row per packed sequence: its positional id and how many tokens it holds.
packed_summary = pd.DataFrame(
    {
        'sequence_id': np.arange(len(packed)),
        'token_count': list(map(len, packed)),
    }
)
packed_summary.head()

💾 **TODO:** export packed data to Arrow/Parquet shards under Drive once dataset access is configured.