# 03 · Tokenization & Packing

Tokenize documents using Unsloth's 4-bit Llama models and pack sequences to respect the 128k tokens/step budget.

In [None]:
# Persistent project setup on Drive
import os
import sys
from pathlib import Path

# Location where Google Drive is mounted when running inside Colab.
DRIVE_ROOT = Path('/content/drive')
try:
    # The import only succeeds inside Colab; elsewhere it raises and we fall
    # through to the except branch below.
    from google.colab import drive  # type: ignore
    # Mount only when the mount point is not already present.
    if not DRIVE_ROOT.exists():
        drive.mount('/content/drive')
except Exception as exc:  # pragma: no cover
    # Best effort: any failure (missing package, mount error) is reported and
    # the notebook continues with the local fallback below.
    print(f'Colab drive mount skipped: {exc}')

# Prefer the user's Drive folder when the mount point exists; otherwise fall
# back to the current user's home directory.
if DRIVE_ROOT.exists():
    BASE_ROOT = (DRIVE_ROOT / 'MyDrive').resolve()
else:
    BASE_ROOT = Path.home().resolve()

# The repository checkout must already exist under BASE_ROOT; this notebook
# does not create it.
PROJECT_ROOT = BASE_ROOT / 'secure-llm-mia'
if not PROJECT_ROOT.exists():
    raise FileNotFoundError('Clone the repo via 00_colab_setup.ipynb before running this notebook.')

# Make the checkout importable (later cells import from the `src` package).
if str(PROJECT_ROOT) not in sys.path:
    sys.path.append(str(PROJECT_ROOT))

# Export the project root through the environment.
# NOTE(review): presumably read by project modules — confirm the consumer.
os.environ['SECURE_LLM_MIA_ROOT'] = str(PROJECT_ROOT)

# Working directories under the project root; created if missing.
DATA_ROOT = PROJECT_ROOT / 'data'
ARTIFACTS_DIR = PROJECT_ROOT / 'artifacts'
CHECKPOINT_ROOT = PROJECT_ROOT / 'checkpoints'
for path in (DATA_ROOT, ARTIFACTS_DIR, CHECKPOINT_ROOT):
    path.mkdir(parents=True, exist_ok=True)

# Switch the process working directory to the project root so relative paths
# resolve against the checkout.
os.chdir(PROJECT_ROOT)
print('PROJECT_ROOT:', PROJECT_ROOT)


In [None]:
import numpy as np
import pandas as pd

from unsloth import FastLanguageModel

from src.data.loaders import load_canonical

# Canonical dataset file expected under the artifacts directory.
CANONICAL_PATH = ARTIFACTS_DIR / 'canonical_demo.parquet'
# load_canonical is a project helper; the `.empty` check below suggests it
# returns a pandas DataFrame — TODO confirm against src.data.loaders.
df = load_canonical(CANONICAL_PATH)
# An empty result is treated the same as missing data and stops the notebook.
if df.empty:
    raise FileNotFoundError('Canonical data missing. Run notebook 01 first.')


In [None]:
# Model identifier; overridable through the UNSLOTH_MODEL_NAME environment
# variable, defaulting to the 4-bit Llama 3.1 8B checkpoint.
MODEL_NAME = os.getenv('UNSLOTH_MODEL_NAME', 'unsloth/Meta-Llama-3.1-8B-bnb-4bit')
# Maximum sequence length passed to the model loader; also used as the packing
# limit further down in this notebook.
MAX_SEQ_LENGTH = 4096
# NOTE(review): None presumably lets Unsloth pick the dtype automatically —
# confirm against the Unsloth documentation.
DTYPE = None
LOAD_IN_4BIT = True

# Load the model together with its tokenizer.
# NOTE(review): only `tokenizer` is used by the visible cells of this
# notebook; verify whether loading the full model weights is needed here.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=MODEL_NAME,
    max_seq_length=MAX_SEQ_LENGTH,
    dtype=DTYPE,
    load_in_4bit=LOAD_IN_4BIT,
)
FastLanguageModel.for_inference(model)  # enables efficient generation
# Show the end-of-sequence token the tokenizer is configured with.
print('Tokenizer eos_token:', tokenizer.eos_token)


In [None]:
def pack_sequences(token_lists, max_length=4096):
    """Greedily pack tokenized documents into sequences of bounded length.

    Documents are concatenated in input order. A new packed sequence is
    started whenever appending the next piece would exceed ``max_length``.
    Documents longer than ``max_length`` are split into consecutive chunks
    of at most ``max_length`` tokens, so no token is dropped and no packed
    sequence exceeds the limit.

    Args:
        token_lists: Iterable of token-id sequences (each must support
            ``len`` and slicing, e.g. a list of ints).
        max_length: Maximum number of tokens per packed sequence. Must be
            a positive integer.

    Returns:
        A list of packed sequences (lists of token ids), each non-empty and
        at most ``max_length`` long. Flattening the result yields the same
        token stream as flattening ``token_lists``.

    Raises:
        ValueError: If ``max_length`` is not positive.
    """
    if max_length <= 0:
        raise ValueError('max_length must be a positive integer')

    packed_batches = []
    current = []
    for tokens in token_lists:
        # Walk the document in max_length-sized windows; a document that
        # already fits yields exactly one piece (itself).
        for start in range(0, len(tokens), max_length):
            piece = tokens[start:start + max_length]
            # Flush only a non-empty buffer, so an over-long first document
            # never produces an empty packed sequence.
            if current and len(current) + len(piece) > max_length:
                packed_batches.append(current)
                current = []
            current.extend(piece)
    if current:
        packed_batches.append(current)
    return packed_batches

# Tokenize every document's text in one batched call; attention masks are not
# requested because only the token ids are packed.
# NOTE(review): no separator token is inserted between packed documents here;
# confirm whether an EOS boundary is expected downstream.
_texts = df['text'].tolist()
_encoded = tokenizer(_texts, return_attention_mask=False)
input_ids = _encoded['input_ids']

# Pack the token-id lists up to the model's maximum sequence length.
packed = pack_sequences(input_ids, max_length=MAX_SEQ_LENGTH)
print(f'Created {len(packed)} packed sequences.')


Export packed shards to Drive (Arrow/Parquet) once dataset access is configured.