# 04 Â· Fine-tune Slice *t* (Unsloth QLoRA)

Load 4-bit Llama weights, configure adapters, and respect the 25M token budget per slice.

In [1]:
!pip install unsloth



In [2]:
# Persistent Drive + run mode setup
import os
import sys
from pathlib import Path

try:
    from google.colab import drive  # type: ignore
    DRIVE_MOUNT = Path('/content/drive')
    if not DRIVE_MOUNT.exists():
        drive.mount('/content/drive')
except Exception as exc:  # pragma: no cover
    print(f'Colab drive mount skipped: {exc}')

if Path('/content/drive').exists():
    DRIVE_ROOT = Path('/content/drive/MyDrive').resolve()
else:
    DRIVE_ROOT = Path.home().resolve()

PROJECT_ROOT = DRIVE_ROOT / 'secure-llm-mia'
if not PROJECT_ROOT.exists():
    raise FileNotFoundError('Run 00_colab_setup.ipynb first to clone the repo on Drive.')

if str(PROJECT_ROOT) not in sys.path:
    sys.path.append(str(PROJECT_ROOT))

os.environ['SECURE_LLM_MIA_ROOT'] = str(PROJECT_ROOT)
os.chdir(PROJECT_ROOT)

from src.utils.runtime import current_run_mode

RUN_MODE = current_run_mode()
print('PROJECT_ROOT:', PROJECT_ROOT)
print('Active run mode:', RUN_MODE.name, '-', RUN_MODE.description)

DATA_ROOT = PROJECT_ROOT / 'data'
ARTIFACTS_DIR = PROJECT_ROOT / 'artifacts'
CHECKPOINT_ROOT = PROJECT_ROOT / 'checkpoints'
for path in (DATA_ROOT, ARTIFACTS_DIR, CHECKPOINT_ROOT):
    path.mkdir(parents=True, exist_ok=True)

BHC_DATA_DIR = DRIVE_ROOT / 'mimic-iv-bhc'
BHC_DATA_DIR.mkdir(parents=True, exist_ok=True)
BHC_CSV_PATH = BHC_DATA_DIR / 'mimic-iv-bhc.csv'
print('BHC CSV path:', BHC_CSV_PATH)


PROJECT_ROOT: /content/drive/MyDrive/secure-llm-mia
Active run mode: subset - 30k-example subset powering the 4-slice, 3M-token continual fine-tuning regime.
BHC CSV path: /content/drive/MyDrive/mimic-iv-bhc/mimic-iv-bhc.csv


In [3]:
import math
import os
from pathlib import Path
from typing import Dict

import torch
import pandas as pd
from datasets import Dataset, concatenate_datasets

from unsloth import FastLanguageModel, is_bfloat16_supported
from trl import SFTTrainer, SFTConfig

from src.modeling.lora import LoRAHyperParams, compute_gradient_accumulation
from src.modeling.train import TokenBudgetTracker

# ---------------------------------------------------------------
# 1. Load sliced TEXT dataset produced in notebook 02
# ---------------------------------------------------------------

SLICED_PATH = ARTIFACTS_DIR / f"sliced_dataset_{RUN_MODE.name}.parquet"
if not SLICED_PATH.exists():
    raise FileNotFoundError(
        f"Missing sliced text dataset {SLICED_PATH}. Run notebook 02 first."
    )

df_sliced = pd.read_parquet(SLICED_PATH)
print("Loaded sliced rows:", len(df_sliced))

# Build slice -> dataset mapping (train only)
slice_datasets: Dict[int, Dataset] = {}
for sid in sorted(df_sliced["slice_id"].unique()):
    slice_df = df_sliced[(df_sliced["slice_id"] == sid) & (df_sliced["split_tag"] == "train")].copy()
    ds = Dataset.from_pandas(slice_df, preserve_index=False)
    if "text" not in ds.column_names:
        raise ValueError("Dataset must contain a `text` column.")
    slice_datasets[int(sid)] = ds
    print(f"Slice {sid}: {len(ds)} training rows loaded (text).")

SLICES = sorted(slice_datasets.keys())
TRACKS = ["noreplay", "replay10"]
REPLAY_FRACTION = 0.10

# ---------------------------------------------------------------
# 2. Compute token-budget-driven step counts
# ---------------------------------------------------------------

MODEL_NAME = os.getenv("UNSLOTH_MODEL_NAME", "unsloth/Llama-3.2-3B-bnb-4bit")
MAX_SEQ_LENGTH = 4096
TOKENS_PER_SLICE = 3_000_000
TOKENS_PER_STEP = 128_000
MICRO_BATCH = 1
AVG_TOKENS_PER_SAMPLE = 3000

accum_steps = compute_gradient_accumulation(
    TOKENS_PER_STEP, MICRO_BATCH, AVG_TOKENS_PER_SAMPLE
)
print("Gradient accumulation:", accum_steps)

max_steps = math.ceil(TOKENS_PER_SLICE / TOKENS_PER_STEP)
print("Max steps per slice:", max_steps)

is_ampere_plus = torch.cuda.is_available() and torch.cuda.get_device_capability(0)[0] >= 8
use_bf16 = bool(is_ampere_plus and torch.cuda.is_bf16_supported())
use_fp16 = torch.cuda.is_available() and not use_bf16


# ---------------------------------------------------------------
# 3. Unsloth model initializer
# ---------------------------------------------------------------

def init_model():
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=MODEL_NAME,
        max_seq_length=MAX_SEQ_LENGTH,
        load_in_4bit=True,
    )

    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"
    tokenizer.model_max_length = MAX_SEQ_LENGTH

    lora_cfg = LoRAHyperParams(
        r=32,
        alpha=32,
        dropout=0.0,
        target_modules=(
            "q_proj","k_proj","v_proj","o_proj",
            "gate_proj","up_proj","down_proj"
        ),
    )
    model = FastLanguageModel.get_peft_model(
        model,
        r=lora_cfg.r,
        target_modules=list(lora_cfg.target_modules),
        lora_alpha=lora_cfg.alpha,
        lora_dropout=lora_cfg.dropout,
        bias="none",
        use_gradient_checkpointing="unsloth",
    )

    FastLanguageModel.for_training(model)
    return model, tokenizer


# ---------------------------------------------------------------
# 4. Training loop: continual finetuning across slices per track
# ---------------------------------------------------------------

for track in TRACKS:
    print(f"=== Track: {track} ===")

    # Initialise base model ONCE per track
    model, tokenizer = init_model()

    for slice_id in SLICES:
        # Base slice data
        base_ds = slice_datasets[slice_id]
        train_ds = base_ds

        # Optional replay: add a fraction of previous slices' data
        if track == "replay10":
            prior_slice_ids = [sid for sid in SLICES if sid < slice_id]
            if prior_slice_ids:
                combined = concatenate_datasets([slice_datasets[p] for p in prior_slice_ids])
                replay_n = min(int(len(base_ds) * REPLAY_FRACTION), len(combined))
                if replay_n > 0:
                    replay_subset = combined.shuffle(seed=17).select(range(replay_n))
                    train_ds = concatenate_datasets([base_ds, replay_subset])
                    print(
                        f"Slice {slice_id}: added {replay_n} replay samples from previous slices."
                    )

        # Directory for this slice's checkpoint
        output_dir = CHECKPOINT_ROOT / f"slice_{slice_id}" / track / RUN_MODE.name
        output_dir.mkdir(parents=True, exist_ok=True)

        # SFT config (one run per slice, continuing from current model weights)
        sft_config = SFTConfig(
            output_dir=str(output_dir),
            per_device_train_batch_size=MICRO_BATCH,
            gradient_accumulation_steps=accum_steps,
            max_seq_length=MAX_SEQ_LENGTH,
            warmup_steps=max(1, int(0.1 * max_steps)),
            max_steps=max_steps,
            learning_rate=5e-5,
            logging_steps=1,
            save_steps=50,
            bf16=use_bf16,
            fp16=use_fp16,
            optim="adamw_8bit",
            lr_scheduler_type="linear",
            weight_decay=0.01,
        )

        trainer = SFTTrainer(
            model=model,              # reuse the SAME model object
            tokenizer=tokenizer,
            train_dataset=train_ds,
            args=sft_config,
            dataset_text_field="text",
            packing=False,
            dataset_num_proc=2,
        )

        print(f"--- Training slice {slice_id} ({track}) ---")
        trainer.train()

        # Keep updated weights in `model` for the next slice
        model = trainer.model

        # Approx token accounting (for logging)
        approx_tokens = len(train_ds) * AVG_TOKENS_PER_SAMPLE
        print(f"Slice {slice_id}: approx tokens consumed {approx_tokens:,}")

        # Save LoRA checkpoint for this slice
        model.save_pretrained(output_dir)
        tokenizer.save_pretrained(output_dir)
        print(f"Saved adapters + tokenizer to {output_dir}")

        del trainer
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    # (optional) after all slices for this track:
    del model, tokenizer
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

ðŸ¦¥ Unsloth: Will patch your computer to enable 2x faster free finetuning.
ðŸ¦¥ Unsloth Zoo will now patch everything to make training faster!
Loaded sliced rows: 4225
Slice 1: 1020 training rows loaded (text).
Slice 2: 1062 training rows loaded (text).
Slice 3: 1094 training rows loaded (text).
Slice 4: 1049 training rows loaded (text).
Gradient accumulation: 43
Max steps per slice: 24
=== Track: noreplay ===
==((====))==  Unsloth 2025.11.3: Fast Llama patching. Transformers: 4.57.1.
   \\   /|    NVIDIA A100-SXM4-40GB. Num GPUs = 1. Max memory: 39.557 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu128. CUDA: 8.0. CUDA Toolkit: 12.8. Triton: 3.5.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth 2025.11.3 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


Unsloth: Tokenizing ["text"] (num_proc=16):   0%|          | 0/1020 [00:00<?, ? examples/s]

The model is already on multiple devices. Skipping the move to device specified in `args`.


--- Training slice 1 (noreplay) ---


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 1,020 | Num Epochs = 1 | Total steps = 24
O^O/ \_/ \    Batch size per device = 1 | Gradient accumulation steps = 43
\        /    Data Parallel GPUs = 1 | Total batch size (1 x 43 x 1) = 43
 "-____-"     Trainable parameters = 48,627,712 of 3,261,377,536 (1.49% trained)
[34m[1mwandb[0m: Currently logged in as: [33msehajbath[0m ([33msehajbath-toronto-metropolitan-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: Detected [huggingface_hub.inference, openai] in use.
[34m[1mwandb[0m: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
[34m[1mwandb[0m: For more information, check out the docs at: https://weave-docs.wandb.ai/


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,2.1286
2,2.0792
3,2.1614
4,2.0104
5,2.1252
6,2.1162
7,2.0669
8,2.0848
9,2.0664
10,2.0165


Slice 1: approx tokens consumed 3,060,000
Saved adapters + tokenizer to /content/drive/MyDrive/secure-llm-mia/checkpoints/slice_1/noreplay/subset


Unsloth: Tokenizing ["text"] (num_proc=16):   0%|          | 0/1062 [00:00<?, ? examples/s]

The model is already on multiple devices. Skipping the move to device specified in `args`.


--- Training slice 2 (noreplay) ---


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 1,062 | Num Epochs = 1 | Total steps = 24
O^O/ \_/ \    Batch size per device = 1 | Gradient accumulation steps = 43
\        /    Data Parallel GPUs = 1 | Total batch size (1 x 43 x 1) = 43
 "-____-"     Trainable parameters = 48,627,712 of 3,261,377,536 (1.49% trained)


Step,Training Loss
1,1.9735
2,1.9737
3,1.9555
4,1.9548
5,1.9037
6,1.8836
7,1.8706
8,1.8354
9,1.8784
10,1.8665


Slice 2: approx tokens consumed 3,186,000
Saved adapters + tokenizer to /content/drive/MyDrive/secure-llm-mia/checkpoints/slice_2/noreplay/subset


Unsloth: Tokenizing ["text"] (num_proc=16):   0%|          | 0/1094 [00:00<?, ? examples/s]

The model is already on multiple devices. Skipping the move to device specified in `args`.


--- Training slice 3 (noreplay) ---


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 1,094 | Num Epochs = 1 | Total steps = 24
O^O/ \_/ \    Batch size per device = 1 | Gradient accumulation steps = 43
\        /    Data Parallel GPUs = 1 | Total batch size (1 x 43 x 1) = 43
 "-____-"     Trainable parameters = 48,627,712 of 3,261,377,536 (1.49% trained)


Step,Training Loss
1,1.7836
2,1.7756
3,1.8381
4,1.8144
5,1.7649
6,1.7707
7,1.7376
8,1.8279
9,1.8022
10,1.7033


Slice 3: approx tokens consumed 3,282,000
Saved adapters + tokenizer to /content/drive/MyDrive/secure-llm-mia/checkpoints/slice_3/noreplay/subset


Unsloth: Tokenizing ["text"] (num_proc=16):   0%|          | 0/1049 [00:00<?, ? examples/s]

The model is already on multiple devices. Skipping the move to device specified in `args`.


--- Training slice 4 (noreplay) ---


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 1,049 | Num Epochs = 1 | Total steps = 24
O^O/ \_/ \    Batch size per device = 1 | Gradient accumulation steps = 43
\        /    Data Parallel GPUs = 1 | Total batch size (1 x 43 x 1) = 43
 "-____-"     Trainable parameters = 48,627,712 of 3,261,377,536 (1.49% trained)


Step,Training Loss
1,1.7443
2,1.7006
3,1.6661
4,1.7439
5,1.7354
6,1.738
7,1.6929
8,1.6935
9,1.7029
10,1.6191


Slice 4: approx tokens consumed 3,147,000
Saved adapters + tokenizer to /content/drive/MyDrive/secure-llm-mia/checkpoints/slice_4/noreplay/subset
=== Track: replay10 ===
==((====))==  Unsloth 2025.11.3: Fast Llama patching. Transformers: 4.57.1.
   \\   /|    NVIDIA A100-SXM4-40GB. Num GPUs = 1. Max memory: 39.557 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu128. CUDA: 8.0. CUDA Toolkit: 12.8. Triton: 3.5.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth: Tokenizing ["text"] (num_proc=16):   0%|          | 0/1020 [00:00<?, ? examples/s]

The model is already on multiple devices. Skipping the move to device specified in `args`.


--- Training slice 1 (replay10) ---


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 1,020 | Num Epochs = 1 | Total steps = 24
O^O/ \_/ \    Batch size per device = 1 | Gradient accumulation steps = 43
\        /    Data Parallel GPUs = 1 | Total batch size (1 x 43 x 1) = 43
 "-____-"     Trainable parameters = 48,627,712 of 3,261,377,536 (1.49% trained)


Step,Training Loss
1,2.1286
2,2.0792
3,2.1615
4,2.0104
5,2.1252
6,2.1164
7,2.0669
8,2.085
9,2.0664
10,2.0164


Slice 1: approx tokens consumed 3,060,000
Saved adapters + tokenizer to /content/drive/MyDrive/secure-llm-mia/checkpoints/slice_1/replay10/subset
Slice 2: added 106 replay samples from previous slices.


Unsloth: Tokenizing ["text"] (num_proc=16):   0%|          | 0/1168 [00:00<?, ? examples/s]

The model is already on multiple devices. Skipping the move to device specified in `args`.


--- Training slice 2 (replay10) ---


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 1,168 | Num Epochs = 1 | Total steps = 24
O^O/ \_/ \    Batch size per device = 1 | Gradient accumulation steps = 43
\        /    Data Parallel GPUs = 1 | Total batch size (1 x 43 x 1) = 43
 "-____-"     Trainable parameters = 48,627,712 of 3,261,377,536 (1.49% trained)


Step,Training Loss
1,2.003
2,1.9473
3,1.9872
4,1.973
5,1.9065
6,1.8768
7,1.8683
8,1.813
9,1.8419
10,1.8815


Slice 2: approx tokens consumed 3,504,000
Saved adapters + tokenizer to /content/drive/MyDrive/secure-llm-mia/checkpoints/slice_2/replay10/subset
Slice 3: added 109 replay samples from previous slices.


Unsloth: Tokenizing ["text"] (num_proc=16):   0%|          | 0/1203 [00:00<?, ? examples/s]

The model is already on multiple devices. Skipping the move to device specified in `args`.


--- Training slice 3 (replay10) ---


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 1,203 | Num Epochs = 1 | Total steps = 24
O^O/ \_/ \    Batch size per device = 1 | Gradient accumulation steps = 43
\        /    Data Parallel GPUs = 1 | Total batch size (1 x 43 x 1) = 43
 "-____-"     Trainable parameters = 48,627,712 of 3,261,377,536 (1.49% trained)


Step,Training Loss
1,1.761
2,1.8477
3,1.7973
4,1.7833
5,1.8049
6,1.7821
7,1.7104
8,1.7376
9,1.7424
10,1.7193


Slice 3: approx tokens consumed 3,609,000
Saved adapters + tokenizer to /content/drive/MyDrive/secure-llm-mia/checkpoints/slice_3/replay10/subset
Slice 4: added 104 replay samples from previous slices.


Unsloth: Tokenizing ["text"] (num_proc=16):   0%|          | 0/1153 [00:00<?, ? examples/s]

The model is already on multiple devices. Skipping the move to device specified in `args`.


--- Training slice 4 (replay10) ---


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 1,153 | Num Epochs = 1 | Total steps = 24
O^O/ \_/ \    Batch size per device = 1 | Gradient accumulation steps = 43
\        /    Data Parallel GPUs = 1 | Total batch size (1 x 43 x 1) = 43
 "-____-"     Trainable parameters = 48,627,712 of 3,261,377,536 (1.49% trained)


Step,Training Loss
1,1.7066
2,1.761
3,1.7058
4,1.7139
5,1.687
6,1.6683
7,1.6482
8,1.7129
9,1.6989
10,1.6857


Slice 4: approx tokens consumed 3,459,000
Saved adapters + tokenizer to /content/drive/MyDrive/secure-llm-mia/checkpoints/slice_4/replay10/subset
