In [1]:
"""
Cell A_fix — Repair environment to Torch 2.6.0 + CUDA 12.4 and prevent Unsloth from upgrading it.

What this cell does:
1) Uninstalls Unsloth and Torch family to remove the 2.8.0/0.23.0/… mix.
2) Reinstalls the *paired* CUDA 12.4 wheels:
   - torch==2.6.0+cu124
   - torchvision==0.21.0+cu124
   - torchaudio==2.6.0+cu124
3) Reinstalls the training stack known to work with Unsloth CPT:
   transformers==4.56.1, trl==0.23.0, datasets==4.3.0, accelerate>=1.0.1, peft>=0.13.2, bitsandbytes, sentencepiece, protobuf, huggingface_hub, hf_transfer
4) Installs `unsloth` and `unsloth_zoo` **with --no-deps** so they won’t force-upgrade Torch again.
5) Prints versions + CUDA/GPU sanity.

Why this fix:
- Unsloth’s pip installer can pull its preferred Torch/CUDA build unless you pin and block deps.
- Using the matched 2.6.0/cu124 triplet avoids the `torchvision::nms` / `torchaudio` symbol errors.
References: Unsloth pip matrix & notes about version coupling; reports of pip installing different Torch/CUDA via Unsloth.
"""

import subprocess, sys, importlib, platform

def pip(*args):
    print(">", sys.executable, "-m", "pip", *args)
    subprocess.check_call([sys.executable, "-m", "pip", *args])

print("Python:", platform.python_version())

# 1) Remove conflicting bits
for pkg in ["unsloth", "unsloth_zoo", "torch", "torchvision", "torchaudio", "triton"]:
    try:
        pip("uninstall", "-y", pkg)
    except subprocess.CalledProcessError:
        pass

# 2) Torch CUDA 12.4 trio (paired wheels)
pip("install", "-U", "--no-cache-dir",
    "torch==2.6.0+cu124", "torchvision==0.21.0+cu124", "torchaudio==2.6.0+cu124",
    "--index-url", "https://download.pytorch.org/whl/cu124")

# 3) Core training stack (pins aligned for CPT)
pip("install", "-U", "--no-cache-dir",
    "transformers==4.56.1",
    "trl==0.23.0",
    "datasets==4.3.0",            # Unsloth CPT examples recommend this vintage
    "accelerate>=1.0.1",
    "peft>=0.13.2",
    "bitsandbytes",
    "sentencepiece",
    "protobuf>=5.28.3",
    "huggingface_hub>=0.24.6",
    "hf_transfer",
)

# 4) Unsloth without deps (prevents Torch override)
pip("install", "-U", "--no-cache-dir", "--no-deps", "unsloth", "unsloth_zoo")

# 5) Version sanity (import Unsloth FIRST)
import unsloth
import torch, torchvision, torchaudio, transformers, datasets, trl

print("\nVersions:")
print("  torch       :", torch.__version__)
print("  torchvision :", torchvision.__version__)
print("  torchaudio  :", torchaudio.__version__)
print("  transformers:", transformers.__version__)
print("  datasets    :", datasets.__version__)
print("  trl         :", trl.__version__)
print("  unsloth     :", getattr(unsloth, '__version__', 'git'))

print("\nCUDA available:", torch.cuda.is_available(), "| GPU:", (torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU"))
print("\n✅ Repair complete. Now do: Runtime → Restart runtime… then I’ll give you Cell B.")


Python: 3.12.12
> /usr/bin/python3 -m pip uninstall -y unsloth
> /usr/bin/python3 -m pip uninstall -y unsloth_zoo
> /usr/bin/python3 -m pip uninstall -y torch
> /usr/bin/python3 -m pip uninstall -y torchvision
> /usr/bin/python3 -m pip uninstall -y torchaudio
> /usr/bin/python3 -m pip uninstall -y triton
> /usr/bin/python3 -m pip install -U --no-cache-dir torch==2.6.0+cu124 torchvision==0.21.0+cu124 torchaudio==2.6.0+cu124 --index-url https://download.pytorch.org/whl/cu124
> /usr/bin/python3 -m pip install -U --no-cache-dir transformers==4.56.1 trl==0.23.0 datasets==4.3.0 accelerate>=1.0.1 peft>=0.13.2 bitsandbytes sentencepiece protobuf>=5.28.3 huggingface_hub>=0.24.6 hf_transfer
> /usr/bin/python3 -m pip install -U --no-cache-dir --no-deps unsloth unsloth_zoo
🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!

Versions:
  torch       : 2.6.0+cu124
  torchvision : 0.21.0+cu124
  torchaudio  : 2.6.0+

In [1]:
"""
Cell B — Imports & sanity for Continued Pretraining (CPT)

What this cell does:
1) Imports `unsloth` FIRST so its optimizations/patches apply correctly.
2) Imports Torch, Transformers, Datasets, and verifies CUDA/GPU availability.
3) Verifies Unsloth CPT knobs exist:
   - `UnslothTrainer` and `UnslothTrainingArguments`
   - `embedding_learning_rate` attribute (for smaller LR on `lm_head`/`embed_tokens`)
4) Prints a concise environment summary.

Why this matters:
- Unsloth CPT recommends LoRA targets that include `lm_head` and `embed_tokens`, with a 2–10× smaller
  `embedding_learning_rate` than the main `learning_rate`. We’ll rely on these in later cells.
  (See Unsloth CPT page & blog on decoupled LRs.)
"""

import os, platform
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# 1) Import Unsloth FIRST
import unsloth
from unsloth import FastLanguageModel, UnslothTrainer, UnslothTrainingArguments

# 2) Core libs
import torch, transformers, datasets

# 3) Basic env readout
print(f"Python       : {platform.python_version()}")
print(f"Torch        : {torch.__version__} | CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU          : {torch.cuda.get_device_name(0)}")
print(f"Transformers : {transformers.__version__}")
print(f"Datasets     : {datasets.__version__}")
print(f"Unsloth      : {getattr(unsloth, '__version__', 'git')}")

# 4) Verify CPT bits exist
has_emb_lr = hasattr(UnslothTrainingArguments, "embedding_learning_rate")
print("Has UnslothTrainer            :", callable(UnslothTrainer))
print("Has UnslothTrainingArguments  :", hasattr(unsloth, "UnslothTrainingArguments"))
print("Has embedding_learning_rate   :", has_emb_lr)

# 5) Guidance note (for the walkthrough):
print("\nNote: For CPT we’ll LoRA-target attention/MLP plus `lm_head` and `embed_tokens`,")
print("and set `embedding_learning_rate` smaller than `learning_rate` as per Unsloth CPT docs.")

print("\n✅ Imports & sanity checks passed. Next: load a raw-text corpus (`text` column), then tokenize & pack.")


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
Python       : 3.12.12
Torch        : 2.6.0+cu124 | CUDA available: True
GPU          : Tesla T4
Transformers : 4.56.1
Datasets     : 4.3.0
Unsloth      : 2025.11.2
Has UnslothTrainer            : True
Has UnslothTrainingArguments  : True
Has embedding_learning_rate   : False

Note: For CPT we’ll LoRA-target attention/MLP plus `lm_head` and `embed_tokens`,
and set `embedding_learning_rate` smaller than `learning_rate` as per Unsloth CPT docs.

✅ Imports & sanity checks passed. Next: load a raw-text corpus (`text` column), then tokenize & pack.


In [2]:
"""
Cell C_fix — Load & sanity-check a Parquet Wikipedia slice for CPT

What this cell does:
1) Uses the modern Parquet Wikipedia dataset `wikimedia/wikipedia` (per-language configs like "20231101.is").
2) Falls back to English ("20231101.en") if the target language config isn't available.
3) Shuffles and selects small train/test slices for Colab.
4) Light-cleans whitespace and drops very short rows; asserts we have a `text` column.

Why this fix:
- HF Datasets removed script-based loaders; legacy `wikipedia` now errors.
- `wikimedia/wikipedia` is the Parquet mirror with per-language configs and a ready `text` column.
Refs: HF dataset card + discussions about deprecating script loaders.
"""
import re
from datasets import load_dataset, DatasetDict

# === Choose your target language here (2–3 letter wiki code) ===
LANG_CODE = "is"            # e.g., 'is' for Icelandic; change if you want a different language
WIKI_SNAPSHOT = "20231101"  # matches available Parquet configs on HF
N_TRAIN = 10_000            # keep small for Colab
N_TEST  = 1_000

cfg = f"{WIKI_SNAPSHOT}.{LANG_CODE}"
print(f"Target Parquet Wikipedia config: {cfg}")

def load_wikimedia(cfg: str):
    # Parquet dataset with a single 'train' split per language/date config
    return load_dataset("wikimedia/wikipedia", cfg, split="train")

try:
    ds = load_wikimedia(cfg)
except Exception as e:
    print(f"[Fallback] Could not load '{cfg}' due to:\n  {e}\n")
    fallback_cfg = f"{WIKI_SNAPSHOT}.en"
    print(f"[Fallback] Using '{fallback_cfg}' so the demo proceeds.")
    ds = load_wikimedia(fallback_cfg)

# Shuffle & slice reproducibly for train/test
ds = ds.shuffle(seed=42)
train = ds.select(range(min(N_TRAIN, len(ds))))
test  = ds.select(range(min(N_TEST,  len(ds))))

# Ensure a 'text' column exists; if not, try to adapt gracefully
if "text" not in train.column_names:
    # Some variants could have 'source' instead—rename if present
    if "source" in train.column_names:
        train = train.rename_column("source", "text")
        test  = test.rename_column("source", "text")
    else:
        raise ValueError(f"No 'text' column found. Columns = {train.column_names}")

# Light cleaning for CPT
_ws = re.compile(r"\s+")
def clean(row):
    t = row["text"]
    if not isinstance(t, str):
        t = str(t) if t is not None else ""
    t = _ws.sub(" ", t).strip()
    return {"text": t}

def nontrivial(row):
    # Keep only reasonably sized paragraphs
    return isinstance(row["text"], str) and len(row["text"]) >= 200

train = train.map(clean, desc="Clean train")
test  = test.map(clean,  desc="Clean test")
train = train.filter(nontrivial, desc="Filter train")
test  = test.filter(nontrivial,  desc="Filter test")

print("Columns:", train.column_names)
print("Rows   :", len(train), "(train) |", len(test), "(test)")
print("\nExample 1 (first 400 chars):\n", train[0]["text"][:400])
print("\nExample 2 (first 400 chars):\n", train[1]["text"][:400])
print("\n✅ Raw-text corpus ready for tokenization & packing.")


Target Parquet Wikipedia config: 20231101.is


README.md: 0.00B [00:00, ?B/s]

20231101.is/train-00000-of-00001.parquet:   0%|          | 0.00/52.3M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/57453 [00:00<?, ? examples/s]

Clean train:   0%|          | 0/10000 [00:00<?, ? examples/s]

Clean test:   0%|          | 0/1000 [00:00<?, ? examples/s]

Filter train:   0%|          | 0/10000 [00:00<?, ? examples/s]

Filter test:   0%|          | 0/1000 [00:00<?, ? examples/s]

Columns: ['id', 'url', 'title', 'text']
Rows   : 7787 (train) | 787 (test)

Example 1 (first 400 chars):
 Eiríkur Hauksson, kallaður Eric Hawk alþjóðlega, (f. 4. júlí 1959) er íslenskur tónlistarmaður búsettur í Noregi frá 1988. Hann hefur m.a. verið meðlimur í hljómsveitunum Start, Drýsill og Artch en starfar nú sjálfstætt. Árið 1985 urðu tvö lög sem Eiríkur söng, Gaggó Vest og Gull, eftir Gunnar Þórðarson mjög vinsæl og er hann helst þekktur á Íslandi fyrir þau lög ásamt þátttöku sinni í Söngvakeppn

Example 2 (first 400 chars):
 Setningarathöfn sumarólympíuleikanna 2012, sem fékk nafnið The Isles of Wonder, hófst kl. 21:00 á breskum tíma þann 27. júlí 2012 á Ólympíuleikvanginum í London. Óskarverðlaunahafinn Danny Boyle leikstýrði athöfninni en raftónlistarhljómsveitin Underworld leikstýrði tónlistinni. Elísabet 2. Bretadrottning opnaði athöfnina opinberlega. Sumarólympíuleikarnir 2012

✅ Raw-text corpus ready for tokenization & packing.


In [3]:
"""
Cell D_rescue — One-shot rebuild after runtime disconnect:
- Recreate `raw` from wikimedia/wikipedia (Parquet) if missing
- Clean + filter raw text
- Append EOS, tokenize, concatenate, and chunk (CLM packing)
- Produce lm_train/lm_eval and print stats

Why this works:
- HF removed legacy script loaders; we use the Parquet dataset `wikimedia/wikipedia`.
- CLM packing follows the official run_clm.py approach: concatenate tokens, then chunk to fixed length.
Refs:
- Unsloth CPT (continued pretraining on raw `text`, LoRA for lm_head/embed_tokens)
- HF wikimedia/wikipedia dataset card
- HF run_clm.py packing pattern
"""

import re
from datasets import load_dataset, DatasetDict
from transformers import AutoTokenizer

# ---- Config (edit if you want a different language/snapshot) ----
LANG_CODE      = "is"         # target language (e.g., 'is' = Icelandic)
WIKI_SNAPSHOT  = "20231101"   # Parquet snapshot that exists on HF
N_TRAIN        = 10_000       # small slices for Colab
N_TEST         = 1_000
BLOCK_SIZE     = 2048         # if OOM later, lower to 1024

# ---- Step 1: Ensure `raw` exists (load if missing) ----
def load_or_build_raw():
    cfg = f"{WIKI_SNAPSHOT}.{LANG_CODE}"
    print(f"Target Parquet Wikipedia config: {cfg}")
    try:
        ds = load_dataset("wikimedia/wikipedia", cfg, split="train")
    except Exception as e:
        print(f"[Fallback] Could not load '{cfg}' due to:\n  {e}\nUsing '{WIKI_SNAPSHOT}.en' instead.")
        ds = load_dataset("wikimedia/wikipedia", f"{WIKI_SNAPSHOT}.en", split="train")

    ds = ds.shuffle(seed=42)
    train = ds.select(range(min(N_TRAIN, len(ds))))
    test  = ds.select(range(min(N_TEST,  len(ds))))

    # Light clean: collapse whitespace, drop very short rows
    _ws = re.compile(r"\s+")
    def clean(row):
        t = row.get("text", "")
        if not isinstance(t, str):
            t = str(t) if t is not None else ""
        return {"text": _ws.sub(" ", t).strip()}

    def nontrivial(row): return isinstance(row["text"], str) and len(row["text"]) >= 200

    train = train.map(clean, desc="Clean train").filter(nontrivial, desc="Filter train")
    test  = test.map(clean,  desc="Clean test").filter(nontrivial,  desc="Filter test")

    if "text" not in train.column_names:
        raise ValueError(f"No 'text' column found. Columns: {train.column_names}")
    print("Columns:", train.column_names)
    print("Rows   :", len(train), "(train) |", len(test), "(test)")
    return DatasetDict({"train": train, "test": test})

if "raw" not in globals():
    raw = load_or_build_raw()
else:
    # If `raw` already exists, keep it (idempotent).
    print("`raw` already present; skipping reload.")

# ---- Step 2: Tokenizer (match Qwen2.5 we’ll train) + EOS ----
TOKENIZER_NAME = "unsloth/Qwen2.5-1.5B-Instruct-bnb-4bit"  # tokenizer is same family as the model we’ll load
tok = AutoTokenizer.from_pretrained(TOKENIZER_NAME, use_fast=True)
if tok.eos_token is None:
    if tok.pad_token_id is not None:
        tok.eos_token = tok.pad_token
    else:
        tok.add_special_tokens({"eos_token": "</s>"})
print("eos_token_id:", tok.eos_token_id, "| pad_token_id:", tok.pad_token_id)

def add_eos(row):
    t = row["text"]
    if not isinstance(t, str):
        t = str(t) if t is not None else ""
    return {"text": (t.strip() + tok.eos_token)}

raw = DatasetDict({
    "train": raw["train"].map(add_eos, desc="Add EOS (train)"),
    "test" : raw["test"].map(add_eos,  desc="Add EOS (test)"),
})

# ---- Step 3: Tokenize with proper column removal ----
def tok_fn(batch): return tok(batch["text"], add_special_tokens=False)

tok_train = raw["train"].map(
    tok_fn, batched=True, remove_columns=raw["train"].column_names, desc="Tokenize train"
)
tok_test = raw["test"].map(
    tok_fn, batched=True, remove_columns=raw["test"].column_names,  desc="Tokenize test"
)

# ---- Step 4: Concatenate + chunk into fixed-length blocks (CLM packing) ----
def group_texts(examples):
    result = {}
    for key, val in examples.items():
        if not isinstance(val, list) or not val or not isinstance(val[0], list):
            continue
        concatenated = sum(val, [])
        total_len = (len(concatenated) // BLOCK_SIZE) * BLOCK_SIZE
        if total_len == 0:
            continue
        result[key] = [concatenated[i:i+BLOCK_SIZE] for i in range(0, total_len, BLOCK_SIZE)]
    if "input_ids" in result:
        result["labels"] = result["input_ids"].copy()
    return result

lm_train = tok_train.map(group_texts, batched=True, desc="Pack train")
lm_eval  = tok_test.map(group_texts,  batched=True, desc="Pack eval")

# ---- Step 5: Stats ----
def tokens_and_blocks(ds):
    n_blocks = len(ds)
    return n_blocks * BLOCK_SIZE, n_blocks

train_tokens, train_blocks = tokens_and_blocks(lm_train)
eval_tokens,  eval_blocks  = tokens_and_blocks(lm_eval)
print("\nStats:")
print(f"  train: ~{train_tokens:,} tokens | {train_blocks} blocks")
print(f"  eval : ~{eval_tokens:,} tokens | {eval_blocks} blocks")

if len(lm_train):
    print("\nExample train block 0 (first 40 ids):", lm_train[0]["input_ids"][:40])
if len(lm_eval):
    print("Example eval  block 0 (first 40 ids):", lm_eval[0]["input_ids"][:40])

print("\n✅ D_rescue complete. You can now run Cell E (model+LoRA) and Cell F_fix (trainer).")


Target Parquet Wikipedia config: 20231101.is


Clean train:   0%|          | 0/10000 [00:00<?, ? examples/s]

Filter train:   0%|          | 0/10000 [00:00<?, ? examples/s]

Clean test:   0%|          | 0/1000 [00:00<?, ? examples/s]

Filter test:   0%|          | 0/1000 [00:00<?, ? examples/s]

Columns: ['id', 'url', 'title', 'text']
Rows   : 7787 (train) | 787 (test)


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/605 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

eos_token_id: 151645 | pad_token_id: 151654


Add EOS (train):   0%|          | 0/7787 [00:00<?, ? examples/s]

Add EOS (test):   0%|          | 0/787 [00:00<?, ? examples/s]

Tokenize train:   0%|          | 0/7787 [00:00<?, ? examples/s]

Tokenize test:   0%|          | 0/787 [00:00<?, ? examples/s]

Pack train:   0%|          | 0/7787 [00:00<?, ? examples/s]

Pack eval:   0%|          | 0/787 [00:00<?, ? examples/s]


Stats:
  train: ~5,662,720 tokens | 2765 blocks
  eval : ~622,592 tokens | 304 blocks

Example train block 0 (first 40 ids): [36, 404, 2426, 68317, 472, 2863, 74, 29931, 11, 595, 20905, 67700, 324, 16247, 55323, 452, 134629, 73, 1794, 67700, 1937, 64, 11, 320, 69, 13, 220, 19, 13, 502, 6654, 128910, 220, 16, 24, 20, 24, 8, 2714, 40136]
Example eval  block 0 (first 40 ids): [36, 404, 2426, 68317, 472, 2863, 74, 29931, 11, 595, 20905, 67700, 324, 16247, 55323, 452, 134629, 73, 1794, 67700, 1937, 64, 11, 320, 69, 13, 220, 19, 13, 502, 6654, 128910, 220, 16, 24, 20, 24, 8, 2714, 40136]

✅ D_rescue complete. You can now run Cell E (model+LoRA) and Cell F_fix (trainer).


In [4]:
"""
Cell E — Load a 4-bit Qwen2.5 model and apply LoRA for Continued Pretraining (CPT)

What this cell does:
1) Loads a memory-efficient 4-bit model with Unsloth’s FastLanguageModel.
2) Applies LoRA to attention/MLP **plus** `lm_head` and `embed_tokens`.
   (Per Unsloth CPT, these heads/embeddings must be trainable with a *smaller*
   LR; we’ll set that in the next cell via `embedding_learning_rate`.)
3) Enables Unsloth gradient checkpointing for T4 VRAM savings.

Why this is correct:
- Unsloth’s CPT docs & blog show that including `lm_head`/`embed_tokens`
  and using a 2–10× smaller LR for embeddings materially improves language
  adaptation; disabling them performs worse. (We’ll wire the LR in Cell F.)
"""

from unsloth import FastLanguageModel

# Keep model family consistent with the tokenizer you used in Cell D.
MODEL_NAME     = "unsloth/Qwen2.5-1.5B-Instruct-bnb-4bit"   # switch to base: "unsloth/Qwen2.5-1.5B-bnb-4bit" if you prefer
MAX_SEQ_LENGTH = 2048
LORA_RANK      = 16
LORA_ALPHA     = 16
LORA_DROPOUT   = 0.05

print(f"Loading 4-bit model: {MODEL_NAME}")
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name       = MODEL_NAME,
    max_seq_length   = MAX_SEQ_LENGTH,
    load_in_4bit     = True,
    dtype            = None,        # let Unsloth pick the best (fp16 on T4)
    trust_remote_code= True,
)

LORA_TARGETS = [
    "q_proj","k_proj","v_proj","o_proj",
    "gate_proj","up_proj","down_proj",
    "lm_head","embed_tokens",        # CPT-critical for new-language learning
]

print("Applying LoRA (including lm_head & embed_tokens)…")
model = FastLanguageModel.get_peft_model(
    model,
    r                         = LORA_RANK,
    lora_alpha                = LORA_ALPHA,
    lora_dropout              = LORA_DROPOUT,
    target_modules            = LORA_TARGETS,
    bias                      = "none",
    use_gradient_checkpointing= "unsloth",
    random_state              = 3407,
    use_rslora                = False,
    loftq_config              = None,
)

# Tiny smoke check: print trainable param count
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
total     = sum(p.numel() for p in model.parameters())
print(f"Trainable params: {trainable:,} / Total: {total:,} (~{trainable/total:.2%} trainable)")

print("\n✅ Model & LoRA ready. Next: build UnslothTrainer with a smaller `embedding_learning_rate` and start a short CPT run.")


Loading 4-bit model: unsloth/Qwen2.5-1.5B-Instruct-bnb-4bit
Are you certain you want to do remote code execution?
==((====))==  Unsloth 2025.11.2: Fast Qwen2 patching. Transformers: 4.56.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/1.14G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/270 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/605 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = 0.05.
Unsloth will patch all other layers, except LoRA matrices, causing a performance hit.


Applying LoRA (including lm_head & embed_tokens)…
Unsloth: Offloading input_embeddings to disk to save VRAM
Unsloth: Offloading output_embeddings to disk to save VRAM


Unsloth 2025.11.2 patched 28 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


Unsloth: Training embed_tokens in mixed precision to save VRAM
Unsloth: Training lm_head in mixed precision to save VRAM
Trainable params: 485,212,160 / Total: 1,607,202,304 (~30.19% trainable)

✅ Model & LoRA ready. Next: build UnslothTrainer with a smaller `embedding_learning_rate` and start a short CPT run.


In [5]:
"""
Cell F_hardened — CPT trainer with 8-bit paged optimizer (prevents Colab disconnects)

What this does
- Keeps your CPT setup (training `embed_tokens` + `lm_head` with a smaller LR)
- Switches optimizer to bitsandbytes **paged AdamW 8-bit** to slash RAM use
- Adds small stability tweaks for Colab T4

Why this helps
- Training embeddings/head makes ~hundreds of millions of params trainable.
- 32-bit AdamW allocates multi-GB state -> Colab kills the runtime late in training.
- 8-bit paged optimizer reduces optimizer-state memory & pages it when needed.
Refs:
- Unsloth CPT recommends updating embeddings/head with smaller LR.
- bitsandbytes 8-bit/paged optimizers drastically reduce memory.
"""
import time
from unsloth import UnslothTrainer, UnslothTrainingArguments
from transformers import default_data_collator

assert "model" in globals() and "tokenizer" in globals(), "Run Cell E first."
assert "lm_train" in globals() and "lm_eval" in globals(), "Run the tokenize/pack cell first."

output_dir = "cpt_qwen25_is_demo"

LR_MAIN  = 5e-5
LR_EMBED = 5e-6  # 10x smaller for embeddings/lm_head per Unsloth CPT

args = UnslothTrainingArguments(
    output_dir                   = output_dir,
    per_device_train_batch_size  = 1,
    per_device_eval_batch_size   = 1,
    gradient_accumulation_steps  = 8,               # effective batch 8
    learning_rate                = LR_MAIN,
    embedding_learning_rate      = LR_EMBED,        # smaller LR for embeddings/head
    optim                        = "paged_adamw_8bit",  # <<< key fix (RAM/CPU safe)
    weight_decay                 = 0.1,
    num_train_epochs             = 1,
    max_steps                    = 100,             # your short demo
    lr_scheduler_type            = "cosine",
    warmup_ratio                 = 0.03,
    logging_steps                = 10,
    eval_strategy                = "steps",
    eval_steps                   = 50,
    save_strategy                = "no",
    report_to                    = "none",
    fp16                         = True,
    bf16                         = False,
    gradient_checkpointing       = True,
    gradient_checkpointing_kwargs= {"use_reentrant": False},
    dataloader_pin_memory        = True,
    dataloader_num_workers       = 0,               # avoid CPU thrash on Colab
    remove_unused_columns        = False,
)

trainer = UnslothTrainer(
    model            = model,
    args             = args,
    train_dataset    = lm_train,
    eval_dataset     = lm_eval.select(range(min(256, len(lm_eval)))),
    data_collator    = default_data_collator,
    processing_class = tokenizer,                   # TRL ≥0.12 rename
)

print("Trainer ready (8-bit paged optimizer). Starting CPT…")
t0 = time.time()
out = trainer.train()
t1 = time.time()
print("Training complete.")
print(out)
print(f"Wall clock (s): {t1 - t0:.1f}")


Trainer ready (8-bit paged optimizer). Starting CPT…


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 2,765 | Num Epochs = 1 | Total steps = 100
O^O/ \_/ \    Batch size per device = 1 | Gradient accumulation steps = 8
\        /    Data Parallel GPUs = 1 | Total batch size (1 x 8 x 1) = 8
 "-____-"     Trainable parameters = 485,212,160 of 2,262,300,160 (21.45% trained)


Step,Training Loss,Validation Loss
50,3.0677,3.023387
100,2.9766,2.95995


Unsloth: Not an error, but Qwen2ForCausalLM does not accept `num_items_in_batch`.
Using gradient accumulation will be very slightly less accurate.
Read more on gradient accumulation issues here: https://unsloth.ai/blog/gradient


Training complete.
TrainOutput(global_step=100, training_loss=3.1015860557556154, metrics={'train_runtime': 2790.1246, 'train_samples_per_second': 0.287, 'train_steps_per_second': 0.036, 'total_flos': 1.76510019305472e+16, 'train_loss': 3.1015860557556154, 'epoch': 0.28933092224231466})
Wall clock (s): 2792.6


In [6]:
"""
Cell G — Evaluate perplexity (PPL) on the eval blocks

What this cell does:
1) Uses `trainer.evaluate()` on our packed eval dataset (lm_eval subset trainer already holds).
2) Converts `eval_loss` -> perplexity = exp(loss).
3) Prints a compact summary.

Why this is correct:
- For causal LM, HF’s `run_clm.py` uses the same pattern: evaluate to get loss, then PPL = exp(loss).
- We evaluate on packed (concatenate→chunked) blocks, matching the CLM prep you used.
"""
import math, time

assert "trainer" in globals(), "Trainer not found. Re-run Cell F_hardened to rebuild the trainer."

t0 = time.time()
metrics = trainer.evaluate()  # uses the eval_dataset provided when building the trainer
t1 = time.time()

eval_loss = float(metrics.get("eval_loss", float("nan")))
ppl = math.exp(eval_loss) if eval_loss < 20 else float("inf")  # guard against overflow if loss is huge

print("=== Eval summary ===")
print(f"eval_loss        : {eval_loss:.6f}")
print(f"perplexity (PPL) : {ppl:.3f}")
print(f"wall clock (s)   : {t1 - t0:.1f}")

print("\n✅ Perplexity computed. Next cell will: (a) sample target-language generations, and (b) save the LoRA adapters + tokenizer for reuse.")


=== Eval summary ===
eval_loss        : 2.959950
perplexity (PPL) : 19.297
wall clock (s)   : 233.8

✅ Perplexity computed. Next cell will: (a) sample target-language generations, and (b) save the LoRA adapters + tokenizer for reuse.


In [7]:
"""
Cell H — Inference (Icelandic) + Save LoRA adapters & tokenizer

What this cell does:
1) Defines a `chat()` helper that:
   - Uses Qwen2.5’s chat template if available for nicer responses, else falls back to raw prompt.
   - Runs generation with AMP (`torch.amp.autocast('cuda', dtype=torch.float16)`) for speed/memory.
2) Samples two Icelandic prompts to sanity-check the CPT effect.
3) Saves your LoRA adapter weights and tokenizer via `save_pretrained()`.

Why this setup:
- Unsloth CPT recommends updating `lm_head` + `embed_tokens` (with a smaller embedding LR) to adapt to a new language.
- PEFT/Transformers recommend `model.save_pretrained(adapter_dir)` for LoRA adapters; reload later with a base model + PEFT.
Refs:
- Unsloth continued pretraining guidance (include `lm_head`/`embed_tokens`)
- Unsloth inference/saving docs
- PyTorch AMP autocast (modern API)
"""

import os, torch
from pathlib import Path

assert "model" in globals() and "tokenizer" in globals(), "Run Cells E and F first."

model.eval()

# ---- Generation helper (uses chat template if present) ----
def chat_icelandic(prompt_text, max_new_tokens=160, temperature=0.7, top_p=0.9):
    # If tokenizer has a chat template (Qwen2.5 Instruct does), use it; else fall back to plain text.
    if getattr(tokenizer, "chat_template", None):
        messages = [
            {"role": "system", "content": "Þú ert gagnlegur íslenskur aðstoðarmaður."},
            {"role": "user", "content": prompt_text},
        ]
        text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        inputs = tokenizer([text], return_tensors="pt").to(model.device)
    else:
        inputs = tokenizer([prompt_text], return_tensors="pt").to(model.device)

    with torch.no_grad(), torch.amp.autocast("cuda", dtype=torch.float16):
        out = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=temperature,
            top_p=top_p,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.pad_token_id or tokenizer.eos_token_id,
        )
    decoded = tokenizer.decode(out[0], skip_special_tokens=True)
    # If we used a chat template, strip the prompt part for cleaner display
    if getattr(tokenizer, "chat_template", None):
        decoded = decoded.split(prompt_text, 1)[-1].strip()
    return decoded

# ---- Quick Icelandic samples ----
print("=== Dæmi A (almenn spurning á íslensku) ===")
print(chat_icelandic("Geturðu skrifað stutta samantekt um sögu Reykjavíkur?"))
print("\n=== Dæmi B (texta-framhald á íslensku) ===")
print(chat_icelandic("Ljóðlínur á íslensku um vetur og norðurljós:"))

# ---- Save LoRA adapters + tokenizer ----
save_dir = Path("cpt_qwen25_is_lora")
save_dir.mkdir(parents=True, exist_ok=True)

# Save PEFT adapter (this writes adapter_config.json + adapter_model.safetensors)
model.save_pretrained(save_dir.as_posix())

# Save tokenizer (needed for reload)
tokenizer.save_pretrained(save_dir.as_posix())

print(f"\n✅ Saved LoRA adapter + tokenizer to: {save_dir.resolve()}")
print("Files:", [p.name for p in save_dir.iterdir()])

# Reload tip (commented):
# from peft import PeftModel
# from unsloth import FastLanguageModel
# base_name = "unsloth/Qwen2.5-1.5B-Instruct-bnb-4bit"
# base_model, tok = FastLanguageModel.from_pretrained(model_name=base_name, max_seq_length=2048, load_in_4bit=True, dtype=None)
# peft_model = PeftModel.from_pretrained(base_model, save_dir.as_posix())
# peft_model.eval()


=== Dæmi A (almenn spurning á íslensku) ===
assistant
Sóttum á þessari tímabil, sem þeirra var sérstakiljandi eftir meðal og hann var lífsmannin. Hann gerðist fram vegna fjölskyldum við að staðsetja þar að það sé nokku til að vera einnig ekki útfrædd af sjálfum. Því getið hann hafði verið átti til daga, en hann hafi verið óskyldingur fyrir nýri tíma. Hann hefur sett upp að að koma til einungis einkennsins við þremur milljar

=== Dæmi B (texta-framhald á íslensku) ===
assistant
Fyrsta stærðfræði, stofnu fyrir íslenska leidda sérstaklega þar sem viðeita eftir nýja vegmanna árið 1980. Þessi ríkisins var skráði út meira en 45 milljónir lína frá haglistum til borgsins. Í þessu tímum hefði verið einnig nafnin á sínum að sjálfsættinni íslenskum landi, en ekki vikin fyrir fjölda íslenskum bókar. Árið

✅ Saved LoRA adapter + tokenizer to: /content/cpt_qwen25_is_lora
Files: ['README.md', 'merges.txt', 'tokenizer.json', 'tokenizer_config.json', 'chat_template.jinja', 'special_tokens_map.json', 'a