# 04 · Fine-tune Slice *t* (Unsloth QLoRA)

Load 4-bit Llama weights, configure adapters, and respect the 25M token budget per slice.

In [1]:
!pip install unsloth



In [2]:
# Persistent Drive + run mode setup
import os
import sys
from pathlib import Path

# Best-effort Google Drive mount: any failure (including not running on Colab)
# is reported and the notebook continues with a local fallback.
try:
    from google.colab import drive  # type: ignore

    DRIVE_MOUNT = Path('/content/drive')
    mount_point_present = DRIVE_MOUNT.exists()
    if not mount_point_present:
        drive.mount('/content/drive')
except Exception as exc:  # pragma: no cover
    print(f'Colab drive mount skipped: {exc}')

# Use Drive when its mount point exists, otherwise the user's home directory.
drive_base = Path('/content/drive/MyDrive') if Path('/content/drive').exists() else Path.home()
DRIVE_ROOT = drive_base.resolve()

PROJECT_ROOT = DRIVE_ROOT / 'secure-llm-mia'
if not PROJECT_ROOT.exists():
    raise FileNotFoundError('Run 00_colab_setup.ipynb first to clone the repo on Drive.')

# Make the repository importable, export its location, and work from inside it.
project_root_str = str(PROJECT_ROOT)
if project_root_str not in sys.path:
    sys.path.append(project_root_str)
os.environ['SECURE_LLM_MIA_ROOT'] = project_root_str
os.chdir(PROJECT_ROOT)

# Project import; only resolvable once PROJECT_ROOT is on sys.path.
# NOTE(review): the saved output of the next cell shows Unsloth's
# "import unsloth at the top of your file" warning, which suggests
# transformers/trl/peft were already loaded by then - possibly through this
# import. Confirm.
from src.utils.runtime import current_run_mode

RUN_MODE = current_run_mode()
print('PROJECT_ROOT:', PROJECT_ROOT)
print('Active run mode:', RUN_MODE.name, '-', RUN_MODE.description)

# Working directories inside the repository; created on first use.
DATA_ROOT, ARTIFACTS_DIR, CHECKPOINT_ROOT = (
    PROJECT_ROOT / sub for sub in ('data', 'artifacts', 'checkpoints')
)
for path in (DATA_ROOT, ARTIFACTS_DIR, CHECKPOINT_ROOT):
    path.mkdir(parents=True, exist_ok=True)

# The BHC CSV lives next to the repository on the drive root, not inside it.
BHC_DATA_DIR = DRIVE_ROOT / 'mimic-iv-bhc'
BHC_DATA_DIR.mkdir(parents=True, exist_ok=True)
BHC_CSV_PATH = BHC_DATA_DIR / 'mimic-iv-bhc.csv'
print('BHC CSV path:', BHC_CSV_PATH)


PROJECT_ROOT: /content/drive/MyDrive/secure-llm-mia
Active run mode: subset - Quick debugging subset (<=2k rows) for lightweight Colab smoke tests.
BHC CSV path: /content/drive/MyDrive/mimic-iv-bhc/mimic-iv-bhc.csv


In [3]:
import math
from pathlib import Path

from unsloth import FastLanguageModel
import torch
from datasets import Dataset
from transformers import DataCollatorForLanguageModeling, Trainer, TrainingArguments
from trl import SFTTrainer, SFTConfig

from src.modeling.lora import LoRAHyperParams, compute_gradient_accumulation
from src.modeling.train import TokenBudgetTracker

# Slice and track being fine-tuned; both become components of the checkpoint path.
SLICE_ID = 1
TRACK = 'noreplay'
# Base checkpoint; the UNSLOTH_MODEL_NAME environment variable overrides the default.
MODEL_NAME = os.getenv('UNSLOTH_MODEL_NAME', 'unsloth/Meta-Llama-3.1-8B-bnb-4bit')
# Passed to the model loader and used by clip() as the truncation length.
MAX_SEQ_LENGTH = 4096
# Token budget for this slice; drives steps_per_slice and the TokenBudgetTracker.
# NOTE(review): the notebook intro mentions a 25M budget per slice while this
# value is 3M - confirm which one applies to the active run mode.
TOKENS_PER_SLICE = 3_000_000
# Target number of tokens per optimizer step (feeds gradient accumulation and step count).
TOKENS_PER_STEP = 256_000
# Per-device train batch size.
MICRO_BATCH = 4
# Hard-coded estimate handed to compute_gradient_accumulation.
# NOTE(review): rows are clipped at MAX_SEQ_LENGTH (4096); if packed rows are
# mostly full, real tokens per step will exceed TOKENS_PER_STEP - verify
# against the data.
AVG_TOKENS_PER_SAMPLE = 3_000

# Locate the packed shard for the active run mode; fail fast if notebook 03 has not produced it.
packed_path = ARTIFACTS_DIR.joinpath('packed', RUN_MODE.name, 'packed_sequences.parquet')
if packed_path.exists():
    raw_dataset = Dataset.from_parquet(str(packed_path))
else:
    raise FileNotFoundError('Packed shards missing. Run notebook 03 to generate them.')
print('Loaded packed sequences:', len(raw_dataset))

def clip(record):
    """Truncate one record to at most MAX_SEQ_LENGTH tokens.

    Args:
        record: mapping with an 'input_ids' sequence and, optionally, an
            'attention_mask'; a missing mask defaults to all ones of the
            same length as 'input_ids'.

    Returns:
        dict with only 'input_ids' and 'attention_mask'. No 'labels' key is
        produced here.
    """
    tokens = record['input_ids']
    mask = record.get('attention_mask', [1] * len(tokens))
    # Guard clause: records that already fit are passed through untouched.
    if len(tokens) <= MAX_SEQ_LENGTH:
        return {'input_ids': tokens, 'attention_mask': mask}
    return {
        'input_ids': tokens[:MAX_SEQ_LENGTH],
        'attention_mask': mask[:MAX_SEQ_LENGTH],
    }

# Apply clip() to every row, dropping all original columns so only its outputs remain.
train_dataset = raw_dataset.map(clip, remove_columns=raw_dataset.column_names)
longest_len = max(map(len, train_dataset['input_ids']))
print('Longest seq len after clip:', longest_len)

# Load the 4-bit base model and its tokenizer through Unsloth.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=MODEL_NAME,
    max_seq_length=MAX_SEQ_LENGTH,
    load_in_4bit=True,
)

tokenizer.padding_side = 'right'
# Keep the tokenizer's own pad token when it has one and only fall back to EOS
# otherwise. DataCollatorForLanguageModeling sets labels to -100 wherever
# input_ids == pad_token_id, so unconditionally aliasing pad to EOS would drop
# every genuine EOS token in the training rows from the loss.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# LoRA hyper-parameters: rank-32 adapters on the attention and MLP projections.
lora_cfg = LoRAHyperParams(
    r=32,
    alpha=32,
    dropout=0.0,
    target_modules=('q_proj','k_proj','v_proj','o_proj','gate_proj','up_proj','down_proj'),
)
# Wrap the base model with the adapters; Unsloth's gradient checkpointing is enabled.
model = FastLanguageModel.get_peft_model(
    model,
    r=lora_cfg.r,
    target_modules=list(lora_cfg.target_modules),
    lora_alpha=lora_cfg.alpha,
    lora_dropout=lora_cfg.dropout,
    bias='none',
    use_gradient_checkpointing='unsloth',
)

# Causal-LM collator (mlm=False); batches are padded to a multiple of 8 tokens.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
    pad_to_multiple_of=8,
)

# Gradient accumulation derived from the per-step token target, the micro batch
# size and the estimated average sample length.
accum_steps = compute_gradient_accumulation(TOKENS_PER_STEP, MICRO_BATCH, AVG_TOKENS_PER_SAMPLE)
print('Gradient accumulation:', accum_steps)

# Number of optimizer steps needed to cover the slice budget (rounded up).
steps_per_slice = math.ceil(TOKENS_PER_SLICE / TOKENS_PER_STEP)
output_dir = CHECKPOINT_ROOT / f'slice_{SLICE_ID}' / TRACK / RUN_MODE.name

# Mixed precision: bf16 on compute capability >= 8 GPUs that report bf16
# support, fp16 on any other CUDA device, neither without CUDA.
cuda_ok = torch.cuda.is_available()
is_ampere_plus = cuda_ok and torch.cuda.get_device_capability(0)[0] >= 8
use_bf16 = bool(is_ampere_plus and torch.cuda.is_bf16_supported())
use_fp16 = cuda_ok and not use_bf16

# An SFTTrainer/SFTConfig variant of this setup was sketched here earlier;
# the plain Trainer below is the active path.
trainer_settings = dict(
    output_dir=str(output_dir),
    per_device_train_batch_size=MICRO_BATCH,
    gradient_accumulation_steps=accum_steps,
    learning_rate=1e-4,
    warmup_steps=1,
    max_steps=steps_per_slice,
    logging_steps=1,
    save_steps=50,
    bf16=use_bf16,
    fp16=use_fp16,
)
training_args = TrainingArguments(**trainer_settings)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator,
)


Please restructure your imports with 'import unsloth' at the top of your file.
  from unsloth import FastLanguageModel


ðŸ¦¥ Unsloth: Will patch your computer to enable 2x faster free finetuning.
ðŸ¦¥ Unsloth Zoo will now patch everything to make training faster!
Loaded packed sequences: 1738


Map:   0%|          | 0/1738 [00:00<?, ? examples/s]

Longest seq len after clip: 4096
==((====))==  Unsloth 2025.11.3: Fast Llama patching. Transformers: 4.57.1.
   \\   /|    NVIDIA A100-SXM4-40GB. Num GPUs = 1. Max memory: 39.557 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu128. CUDA: 8.0. CUDA Toolkit: 12.8. Triton: 3.5.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth 2025.11.3 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


Gradient accumulation: 21


In [4]:
# Run fine-tuning for the max_steps configured in training_args; the returned
# TrainOutput is shown as the cell result.
# NOTE(review): the saved output below reports "Total steps = 5", whereas the
# current configuration gives ceil(3_000_000 / 256_000) = 12 steps - the
# output looks like it comes from an earlier configuration; re-run to refresh.
trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 1,738 | Num Epochs = 1 | Total steps = 5
O^O/ \_/ \    Batch size per device = 4 | Gradient accumulation steps = 21
\        /    Data Parallel GPUs = 1 | Total batch size (4 x 21 x 1) = 84
 "-____-"     Trainable parameters = 83,886,080 of 8,114,147,328 (1.03% trained)
[34m[1mwandb[0m: Currently logged in as: [33msehajbath[0m ([33msehajbath-toronto-metropolitan-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: Detected [huggingface_hub.inference, openai] in use.
[34m[1mwandb[0m: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
[34m[1mwandb[0m: For more information, check out the docs at: https://weave-docs.wandb.ai/


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,1.8236
2,1.8949
3,1.8772
4,1.8562
5,1.8156


TrainOutput(global_step=5, training_loss=1.8535318851470948, metrics={'train_runtime': 483.5108, 'train_samples_per_second': 0.869, 'train_steps_per_second': 0.01, 'total_flos': 7.436208874350182e+16, 'train_loss': 1.8535318851470948, 'epoch': 0.2413793103448276})

In [5]:
# Register sequence lengths in dataset order until the tracker signals a stop.
# NOTE(review): this walks the dataset itself, not the batches the trainer
# actually consumed, so the figure is an approximation of the training budget.
tracker = TokenBudgetTracker(tokens_per_slice=TOKENS_PER_SLICE)
for seq_len in map(len, train_dataset['input_ids']):
    budget_hit = tracker.update(seq_len)
    if budget_hit:
        break
print(f'Approximate tokens registered: {tracker.consumed_tokens:,}')
print(f'Remaining tokens: {tracker.remaining:,}')

Approximate tokens registered: 3,000,887
Remaining tokens: 0


In [7]:
# Persist the model and the tokenizer side by side in the slice/track/run-mode
# checkpoint folder.
checkpoint_dir = CHECKPOINT_ROOT.joinpath(f'slice_{SLICE_ID}', TRACK, RUN_MODE.name)
checkpoint_dir.mkdir(parents=True, exist_ok=True)

for artifact in (model, tokenizer):
    artifact.save_pretrained(checkpoint_dir)
print('Saved adapters + tokenizer to', checkpoint_dir)

Saved adapters + tokenizer to /content/drive/MyDrive/secure-llm-mia/checkpoints/slice_1/noreplay/subset
