In [1]:
import sys, torch
print("Python:", sys.version.split()[0])
print("CUDA available:", torch.cuda.is_available())
!nvidia-smi || echo "No GPU detected — set Runtime → Change runtime type → GPU"
try:
    print("Torch:", torch.__version__, "| Has nn.RMSNorm:", hasattr(torch.nn, "RMSNorm"))
except Exception as e:
    print("Torch import issue:", e)

Python: 3.11.13
CUDA available: True
Mon Aug  4 04:12:53 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off |   00000000:00:04.0 Off |                    0 |
| N/A   33C    P0             43W /  400W |       5MiB /  40960MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
           

In [2]:
import os, subprocess, sys

def run(cmd):
    print(">", cmd)
    return subprocess.run(cmd.split(), check=False, text=True, capture_output=True)

need_restart = False

# 1) Ensure a recent CUDA 12.1 PyTorch with nn.RMSNorm available
try:
    import torch
    has_rms = hasattr(torch.nn, "RMSNorm")
    print("Existing Torch:", torch.__version__, "| Has nn.RMSNorm:", has_rms)
except Exception:
    has_rms = False
    print("No working torch yet.")

if not has_rms:
    # Install CUDA 12.1 wheels
    print("Installing/upgrading PyTorch (CUDA 12.1)…")
    print(run("pip install -U --quiet --index-url https://download.pytorch.org/whl/cu121 torch torchvision").stdout)
    need_restart = True

# 2) Core stack (safe to (re)install without restart)
print(run("pip install -U --quiet transformers peft datasets accelerate safetensors sentencepiece").stdout)

# 3) Restart ONLY if PyTorch was (re)installed so new binaries load
if need_restart:
    print("\n🔁 Restarting runtime to load new PyTorch (this is expected). After it reconnects, re-run Cell 1, then continue.")
    import IPython
    IPython.get_ipython().kernel.do_shutdown(True)  # clean restart


Existing Torch: 2.5.1+cu121 | Has nn.RMSNorm: True
> pip install -U --quiet transformers peft datasets accelerate safetensors sentencepiece



In [3]:
import torch, transformers, sys
print("Python:", sys.version.split()[0])
print("Torch:", torch.__version__, "| Has nn.RMSNorm:", hasattr(torch.nn, "RMSNorm"))
print("Transformers:", transformers.__version__)
print("CUDA device:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "None")
assert hasattr(torch.nn, "RMSNorm"), "Torch is too old; go back to Cell 2."

Python: 3.11.13
Torch: 2.5.1+cu121 | Has nn.RMSNorm: True
Transformers: 4.54.1
CUDA device: NVIDIA A100-SXM4-40GB


In [4]:
import os
# Only needed if your BASE model is gated (e.g., meta-llama/*). Otherwise skip.
os.environ["HUGGINGFACE_HUB_TOKEN"] = "hf_xxx_your_token_here"
print("HF token set?", bool(os.environ.get("HUGGINGFACE_HUB_TOKEN")))

HF token set? True


In [5]:
%%writefile /content/train_lora.py
#!/usr/bin/env python3
import os, argparse, math, hashlib, random
os.environ["TRANSFORMERS_NO_TF"] = "1"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    LlamaForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
)
from peft import LoraConfig, get_peft_model

def normalize_text(s: str) -> str:
    return " ".join(s.lower().strip().split())

def smart_subsample(jsonl_path, tokenizer, target_minutes, step_time_guess, bsz, grad_accum, epochs, seed=0):
    """
    Returns a tokenized Dataset limited to a size that should train in ~target_minutes.
    Strategy:
      1) Build a clean 'text' column from JSONL chat messages.
      2) Deduplicate by normalized user+assistant text hash (keeps first).
      3) Compute token lengths; bin into quartiles; sample evenly from bins
         (preserves short/medium/long distribution).
      4) Tokenize only the selected subset; drop 'text' column.
    """
    rng = random.Random(seed)
    # Load raw jsonl
    raw = load_dataset("json", data_files=str(jsonl_path), split="train")

    # Build a plain 'text' column using the tokenizer's chat template if available
    def _to_text(example):
        msgs = example["messages"]
        if hasattr(tokenizer, "apply_chat_template"):
            txt = tokenizer.apply_chat_template(msgs, tokenize=False, add_generation_prompt=False)
        else:
            user = next((m["content"] for m in msgs if m["role"] == "user"), "")
            assistant = next((m["content"] for m in msgs if m["role"] == "assistant"), "")
            txt = f"<s>[INST]{user}[/INST]\n{assistant}</s>"
        return {"text": txt}
    raw = raw.map(_to_text, remove_columns=raw.column_names)

    # Deduplicate near-duplicates
    seen = set()
    def _dedup(example):
        key = hashlib.sha1(normalize_text(example["text"]).encode("utf-8")).hexdigest()
        if key in seen:
            return {"keep": 0}
        seen.add(key)
        return {"keep": 1}
    raw = raw.map(_dedup)
    raw = raw.filter(lambda e: e["keep"] == 1).remove_columns(["keep"])

    # Fast length pass (no padding, truncation to cap work)
    def _len_only(example):
        ids = tokenizer(example["text"], add_special_tokens=False, truncation=True, max_length=1024)["input_ids"]
        return {"_len": len(ids)}
    raw = raw.map(_len_only)

    n_total = len(raw)
    # Compute effective batch and steps per epoch we can afford for target_minutes
    eff_batch = bsz * grad_accum
    steps_total_allowed = int(target_minutes * 60.0 / max(step_time_guess, 1e-6))
    steps_per_epoch = max(1, steps_total_allowed // max(epochs, 1))
    n_target = min(n_total, max(eff_batch, steps_per_epoch * eff_batch))

    # Bin into quartiles by length
    # Sort by _len and split into 4 roughly equal bins
    raw_sorted = raw.sort("_len")
    qsize = max(1, len(raw_sorted) // 4)
    bins = [
        raw_sorted.select(range(0, qsize)),
        raw_sorted.select(range(qsize, 2*qsize)),
        raw_sorted.select(range(2*qsize, 3*qsize)),
        raw_sorted.select(range(3*qsize, len(raw_sorted))),
    ]
    # Sample evenly from bins (as even as possible)
    per_bin = [n_target // 4] * 4
    rem = n_target - sum(per_bin)
    for i in range(rem): per_bin[i % 4] += 1

    # Random indices from each bin
    selected = []
    for b, k in zip(bins, per_bin):
        idxs = list(range(len(b)))
        rng.shuffle(idxs)
        idxs = idxs[:k]
        # Map local bin indices back to global raw_sorted indices
        base = b._indices if hasattr(b, "_indices") else None
        # b is a SelectDataset; compute global indices:
        # We'll use the original positions from raw_sorted via .select
        selected.extend([b[i] for i in idxs])  # materialize examples (ok for ~10k)

    # Create a new Dataset from selected examples
    from datasets import Dataset
    sampled = Dataset.from_list(selected)

    # Final tokenize with padding to multiple of 16, drop 'text'
    tokenized = sampled.map(
        lambda ex: tokenizer(ex["text"], truncation=True, max_length=1024),
        batched=True,
        remove_columns=["text"],
    )
    # Report planning numbers
    est_steps = math.ceil(len(tokenized) / eff_batch) * max(epochs, 1)
    est_minutes = est_steps * step_time_guess / 60.0
    print(f"[Sampler] total_raw={n_total}  kept_dedup={len(raw)}  target={len(tokenized)}")
    print(f"[Sampler] eff_batch={eff_batch}  epochs={epochs}  est_steps={est_steps}  est_time≈{est_minutes:.1f} min")
    return tokenized

def main():
    ap = argparse.ArgumentParser()
    ap.add_argument("--base", default="meta-llama/Llama-3.2-3B-Instruct")
    ap.add_argument("--data", required=True)
    ap.add_argument("--out", required=True)
    ap.add_argument("--epochs", type=int, default=1)          # 1 epoch for a 10-min pass
    ap.add_argument("--lr", type=float, default=2e-4)
    ap.add_argument("--bsz", type=int, default=4)             # good on A100
    ap.add_argument("--grad_accum", type=int, default=4)      # eff batch 16
    ap.add_argument("--target_minutes", type=float, default=10.0)
    ap.add_argument("--step_time_guess", type=float, default=1.1)  # s/step on A100 with LoRA
    ap.add_argument("--seed", type=int, default=0)
    args = ap.parse_args()

    if not torch.cuda.is_available():
        raise RuntimeError("CUDA not available. Set Runtime → Change runtime type → GPU.")

    # A100 fast-path
    device_name = torch.cuda.get_device_name(0)
    major_cc = torch.cuda.get_device_capability()[0]
    use_bf16 = major_cc >= 8
    print(f"[Device] {device_name}  bf16={use_bf16}")

    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True

    tok = AutoTokenizer.from_pretrained(args.base, use_fast=False, token=os.environ.get("HUGGINGFACE_HUB_TOKEN"))
    if tok.pad_token is None:
        tok.pad_token = tok.eos_token

    # Smart subsample BEFORE tokenizing everything
    ds = smart_subsample(
        jsonl_path=args.data,
        tokenizer=tok,
        target_minutes=args.target_minutes,
        step_time_guess=args.step_time_guess,
        bsz=args.bsz,
        grad_accum=args.grad_accum,
        epochs=args.epochs,
        seed=args.seed,
    )

    # Build model on single GPU (no sharding), SDPA attention
    model = LlamaForCausalLM.from_pretrained(
        args.base,
        token=os.environ.get("HUGGINGFACE_HUB_TOKEN"),
        torch_dtype=(torch.bfloat16 if use_bf16 else torch.float16),
        low_cpu_mem_usage=True,
        attn_implementation="sdpa",
        device_map=None,
    ).to("cuda")

    # LoRA config
    lcfg = LoraConfig(
        r=16, lora_alpha=32, lora_dropout=0.05,
        target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"]
    )
    model = get_peft_model(model, lcfg)

    # No gradient checkpointing on A100 (keeps it fast)
    # model.gradient_checkpointing_enable()  # <- leave OFF on A100
    # model.enable_input_require_grads()     # <- not needed without checkpointing
    # model.config.use_cache = False         # <- default ok

    collator = DataCollatorForLanguageModeling(tokenizer=tok, mlm=False, pad_to_multiple_of=16)

    targs = TrainingArguments(
        output_dir=args.out,
        num_train_epochs=args.epochs,
        per_device_train_batch_size=args.bsz,
        gradient_accumulation_steps=args.grad_accum,
        learning_rate=args.lr,
        lr_scheduler_type="cosine",
        warmup_ratio=0.03,
        weight_decay=0.0,
        logging_steps=25,
        save_strategy="no",   # skip epoch saves to save time; you can switch to 'epoch'
        report_to=[],
        bf16=use_bf16,
        fp16=not use_bf16,
        optim="adamw_torch",
        group_by_length=True,
        dataloader_num_workers=8,
        dataloader_pin_memory=True,
        gradient_checkpointing=False,
        remove_unused_columns=False,
    )

    trainer = Trainer(model=model, args=targs, train_dataset=ds, data_collator=collator)
    trainer.train()
    model.save_pretrained(args.out)
    print("Saved adapter to", args.out)

if __name__ == "__main__":
    main()

Overwriting /content/train_lora.py


In [1]:
from google.colab import files
up = files.upload()  # pick training_data_XXXX.jsonl
JSONL_PATH = list(up.keys())[0]
JSONL_PATH

ModuleNotFoundError: No module named 'google.colab'

In [None]:
BASE_MODEL = "meta-llama/Llama-3.2-3B-Instruct"
OUT_DIR   = "/content/adapters/texttwin-XXXXXXXXXX"
EPOCHS    = 1                  # 1 epoch for a ~10-min pass
LR        = 2e-4
BSZ       = 4                  # try 4; if VRAM is huge you can try 8
GRAD_ACC  = 4                  # eff batch = 16
TARGET_MIN= 10
STEP_GUESS= 1.1                # seconds/step (adjust to your observed speed)

print("Base:", BASE_MODEL)
print("Data:", JSONL_PATH)
print("Out:", OUT_DIR)

!python3 /content/train_lora.py \
  --base "$BASE_MODEL" \
  --data "$JSONL_PATH" \
  --out  "$OUT_DIR" \
  --epochs $EPOCHS --lr $LR --bsz $BSZ --grad_accum $GRAD_ACC \
  --target_minutes $TARGET_MIN --step_time_guess $STEP_GUESS --seed 0

In [8]:
import os, zipfile
zip_path = OUT_DIR + ".zip"
def zip_dir(src_dir, zip_path):
    with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zf:
        for root, _, files in os.walk(src_dir):
            for f in files:
                full = os.path.join(root, f)
                rel = os.path.relpath(full, os.path.dirname(src_dir))
                zf.write(full, rel)
zip_dir(OUT_DIR, zip_path)
!ls -lh "$zip_path"

from google.colab import files
files.download(zip_path)