In [None]:
# !pip install peft
!pip install bitsandbytes
# !pip install transformers
!pip install trl
# !pip install datasets
# !pip install nltk
# !pip install rouge_score
# !pip install bert_score



In [None]:
!pip install rouge_score
!pip install bert_score



In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, BitsAndBytesConfig, TrainerCallback
from peft import LoraConfig, get_peft_model
from trl import SFTTrainer
from datasets import load_dataset
from sklearn.model_selection import KFold
import torch
import numpy as np
import os
import math
import matplotlib.pyplot as plt
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer
from bert_score import score as bert_score

In [None]:
# Hugging Face Hub repository id of the base checkpoint to fine-tune.
# Larger alternative, currently disabled:
# model_name = "meta-llama/Meta-Llama-3-8B"
model_name = "meta-llama/Llama-3.2-1B"

In [None]:
# 0) fetch the complete "train" split of MentalChat16k
dataset = load_dataset("ShenLab/MentalChat16k", split="train")

# 1) hold out 20 randomly chosen examples for evaluation (fixed seed, so the
#    same examples are selected on every run)
split = dataset.train_test_split(test_size=20, shuffle=True, seed=42)
train_dataset, eval_dataset = split["train"], split["test"]

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [None]:
# Echo the selected device as the cell's output.
device

device(type='cuda')

In [None]:
!pip install bitsandbytes




In [None]:
import os

# Read the Hugging Face access token from the environment (None when unset).
HF_TOKEN = os.getenv("HF_TOKEN")

# Report only whether a token was found. Printing the value itself would store
# the secret in the notebook's saved output.
print("HF_TOKEN found" if HF_TOKEN else "HF_TOKEN is not set")


In [None]:
from huggingface_hub import login

# Authenticate this session against the Hugging Face Hub with the token
# read from the environment.
login(token=HF_TOKEN)



In [None]:
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    EvalPrediction,
    TrainerCallback,
)
from trl import SFTConfig, SFTTrainer
from peft import LoraConfig
from huggingface_hub import login
import torch, math, numpy as np, os
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer
import bert_score

# ── 0) Authenticate & environment tweaks ─────────────────────────────
# The token must be allowed to read gated repositories (Settings → Tokens);
# otherwise the Hub answers requests for the model files with 403 Forbidden.
login(token=HF_TOKEN)

# Process-wide settings, applied in a single call.
os.environ.update(
    PYTORCH_CUDA_ALLOC_CONF="expandable_segments:True",
    HF_HUB_ENABLE_GATED_REPO_DOWNLOAD="1",
)

# ── 1) Load and split ────────────────────────────────────────────────
# Full "train" split, then a seeded shuffle that sets 20 examples aside
# for evaluation.
dataset = load_dataset("ShenLab/MentalChat16k", split="train")
split = dataset.train_test_split(test_size=20, shuffle=True, seed=42)
train_dataset = split["train"]
eval_dataset = split["test"]

# ── 2) Tokenizer & bitsandbytes quantization config ─────────────────
model_name = "meta-llama/Llama-3.2-1B"

tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    token=HF_TOKEN,
    use_fast=True,
    trust_remote_code=True,
)
# Pad with the end-of-sequence token.
tokenizer.pad_token = tokenizer.eos_token

# 4-bit NF4 quantization with double quantization; compute runs in float16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype="float16",
    llm_int8_enable_fp32_cpu_offload=True,
)

# ── 3) SFT training arguments ────────────────────────────────────────
sft_config = SFTConfig(
    output_dir="qlora-llama3-step1",
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=1,
    # Training length is set by max_steps alone. The previous
    # num_train_epochs=2 had no effect because a positive max_steps
    # overrides it, so it is dropped to avoid a misleading setting.
    max_steps=100,
    # Evaluate periodically during training. Without eval_strategy="steps"
    # the eval_steps value is ignored, compute_metrics only runs on the
    # explicit trainer.evaluate() call, and SaveBestModelCallback never gets
    # a chance to compare checkpoints while training.
    eval_strategy="steps",
    eval_steps=20,
    # Checkpoint on the evaluation cadence and keep only the newest two;
    # save_steps=1 wrote a full checkpoint after every optimisation step.
    save_strategy="steps",
    save_steps=20,
    save_total_limit=2,
    logging_dir="logs",
    logging_steps=1,
    learning_rate=2e-4,
    fp16=True,
    report_to="none",
    load_best_model_at_end=False,
    max_length=1024,
    packing=True,
    # Only the "output" column is used as training text.
    dataset_text_field="output",
    remove_unused_columns=False,
)

# ── 4) Custom callback to save best model + tokenizer ───────────────
class SaveBestModelCallback(TrainerCallback):
    """Keep a copy of the model with the lowest evaluation perplexity.

    After each evaluation the callback reads ``eval_perplexity`` from the
    reported metrics. When it is lower than every value seen so far, the model
    and the tokenizer are written to ``output_dir``, overwriting the previous
    best copy.
    """

    def __init__(self, tokenizer, output_dir="best-qlora-llama3_1"):
        """Store the tokenizer and target directory; no best score exists yet."""
        super().__init__()
        self.tokenizer = tokenizer
        self.output_dir = output_dir
        # Start at +inf so the first reported perplexity always counts as best.
        self.best_perplexity = float("inf")

    def on_evaluate(self, args, state, control, metrics=None, **kwargs):
        """Save model and tokenizer when the reported perplexity improves.

        Does nothing when ``metrics`` is missing or has no ``eval_perplexity``
        entry. The model is taken from ``kwargs["model"]``. Always returns
        ``control`` unchanged.
        """
        has_perplexity = metrics is not None and "eval_perplexity" in metrics
        if has_perplexity and metrics["eval_perplexity"] < self.best_perplexity:
            self.best_perplexity = metrics["eval_perplexity"]
            kwargs["model"].save_pretrained(self.output_dir)
            self.tokenizer.save_pretrained(self.output_dir)
        return control

# ── 5) Load & prepare the quantized model ───────────────────────────
# Load the base checkpoint with the quantization settings from bnb_config and
# let device_map="auto" decide where the weights are placed.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    token=HF_TOKEN,
    quantization_config=bnb_config,
    device_map="auto",
    low_cpu_mem_usage=True,
    trust_remote_code=True,
)

# Turn on gradient checkpointing, and make the inputs require gradients.
model.gradient_checkpointing_enable()
model.enable_input_require_grads()

# Device the model reports; compute_metrics moves its tensors there.
device = model.device

# ── 6) Metrics function ─────────────────────────────────────────────
def compute_metrics(eval_pred):
    """Compute perplexity, BLEU, ROUGE-L and BERTScore F1 for an evaluation pass.

    Args:
        eval_pred: an ``EvalPrediction`` or a ``(logits, labels)`` pair.
            ``logits`` is indexed ``[batch, position, vocab]`` and ``labels``
            ``[batch, position]``; a label of ``-100`` marks a position that
            must not be scored.

    Returns:
        dict with the keys ``"perplexity"``, ``"bleu"``, ``"rougeL"`` and
        ``"bertscore_f1"`` (plain Python floats).

    Notes:
        The text metrics are computed on the per-position argmax of the
        evaluation logits, not on text produced by ``generate``.
    """
    if isinstance(eval_pred, EvalPrediction):
        logits, labels = eval_pred.predictions, eval_pred.label_ids
    else:
        logits, labels = eval_pred
    logits = np.asarray(logits)
    labels = np.asarray(labels)

    # Causal-LM alignment: the logits at position t score the token at t + 1.
    # The same shifted views feed both the perplexity and the decoded text, so
    # predictions and references are compared token-for-token.
    next_token_logits = logits[:, :-1, :]
    targets = labels[:, 1:]

    # 1) Perplexity (CrossEntropyLoss skips -100 targets by default)
    logits_t = torch.tensor(next_token_logits).to(device)
    targets_t = torch.tensor(targets).to(device)
    loss = torch.nn.CrossEntropyLoss()(
        logits_t.reshape(-1, logits_t.size(-1)),
        targets_t.reshape(-1),
    )
    try:
        ppl = math.exp(loss.item())
    except OverflowError:
        # A very large loss would overflow exp(); report it as infinite.
        ppl = float("inf")

    # 2) Decode preds & refs
    # Positions labelled -100 are replaced by the pad token on BOTH sides so
    # that predictions made at ignored positions do not leak into the text
    # metrics; skip_special_tokens then removes them while decoding.
    scored = targets != -100
    pad_id = tokenizer.pad_token_id
    pred_ids = np.where(scored, np.argmax(next_token_logits, axis=-1), pad_id)
    ref_ids = np.where(scored, targets, pad_id)
    dec_preds = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    dec_labels = tokenizer.batch_decode(ref_ids, skip_special_tokens=True)

    # 3) BLEU (whitespace-tokenised, one reference per prediction)
    bleu = float(np.mean([
        sentence_bleu([ref.split()], pred.split())
        for ref, pred in zip(dec_labels, dec_preds)
    ]))

    # 4) ROUGE-L (F-measure, with stemming)
    rouge = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True)
    rougeL = float(np.mean([
        rouge.score(ref, pred)["rougeL"].fmeasure
        for ref, pred in zip(dec_labels, dec_preds)
    ]))

    # 5) BERTScore F1 (mean over examples)
    _, _, F1 = bert_score.score(
        dec_preds, dec_labels, lang="en", verbose=False
    )
    bertscore_f1 = float(F1.mean().item())

    return {
        "perplexity":   ppl,
        "bleu":         bleu,
        "rougeL":       rougeL,
        "bertscore_f1": bertscore_f1,
    }

# ── 7) Initialize trainer ────────────────────────────────────────────
# Callback that stores the lowest-perplexity model under "best-qlora-llama3".
best_cb = SaveBestModelCallback(
    output_dir="best-qlora-llama3",
    tokenizer=tokenizer,
)

# LoRA adapters (r=64, alpha=16, dropout 0.05) on the q_proj and v_proj
# modules; bias terms are left untouched.
trainer = SFTTrainer(
    model=model,
    processing_class=tokenizer,
    args=sft_config,
    peft_config=LoraConfig(
        task_type="CAUSAL_LM",
        target_modules=["q_proj","v_proj"],
        r=64,
        lora_alpha=16,
        lora_dropout=0.05,
        bias="none",
    ),
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
    callbacks=[best_cb],
)

# ── 8) Train & evaluate ─────────────────────────────────────────────
# Run fine-tuning with the settings in sft_config.
trainer.train()
# Final evaluation on eval_dataset; the returned dict holds the loss plus the
# compute_metrics values under an "eval_" prefix (see the printed output).
metrics = trainer.evaluate()
print(metrics)

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/185 [00:00<?, ?B/s]

Converting train dataset to ChatML:   0%|          | 0/16064 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/16064 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/16064 [00:00<?, ? examples/s]

Packing train dataset:   0%|          | 0/16064 [00:00<?, ? examples/s]

Converting eval dataset to ChatML:   0%|          | 0/20 [00:00<?, ? examples/s]

Applying chat template to eval dataset:   0%|          | 0/20 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/20 [00:00<?, ? examples/s]

Packing eval dataset:   0%|          | 0/20 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss
1,1.665
2,1.7615
3,1.8747
4,1.7927
5,1.5833
6,1.5824
7,1.4329
8,1.6441
9,1.697
10,1.7066



403 Forbidden: Please enable access to public gated repositories in your fine-grained token settings to view this repository..
Cannot access content at: https://huggingface.co/meta-llama/Llama-3.2-1B/resolve/main/config.json.
Make sure your token has the correct permissions. - silently ignoring the lookup for the file config.json in meta-llama/Llama-3.2-1B.

403 Forbidden: Please enable access to public gated repositories in your fine-grained token settings to view this repository..
Cannot access content at: https://huggingface.co/meta-llama/Llama-3.2-1B/resolve/main/config.json.
Make sure your token has the correct permissions. - silently ignoring the lookup for the file config.json in meta-llama/Llama-3.2-1B.

403 Forbidden: Please enable access to public gated repositories in your fine-grained token settings to view this repository..
Cannot access content at: https://huggingface.co/meta-llama/Llama-3.2-1B/resolve/main/config.json.
Make sure your token has the correct permissions. -

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.

403 Forbidden: Please enable access to public gated repositories in your fine-grained token settings to view this repository..
Cannot access content at: https://huggingface.co/meta-llama/Llama-3.2-1B/resolve/main/config.json.
Make sure your token has the correct permissions. - silently ignoring the lookup for the file config.json in meta-llama/Llama-3.2-1B.


{'eval_loss': 1.584697961807251, 'eval_perplexity': 4.88760411600726, 'eval_bleu': 0.2165336532454404, 'eval_rougeL': 0.5613020030045427, 'eval_bertscore_f1': 0.8693522810935974, 'eval_runtime': 15.1272, 'eval_samples_per_second': 0.463, 'eval_steps_per_second': 0.463}


In [None]:
# !rm -rf best-qlora-llama3 qlora-llama3-fold1 qlora-llama3-step1

In [None]:
# Shell commands (the leading `!` hands the line to the system shell).
# Archive the directory written by SaveBestModelCallback into
# best-qlora-llama3.zip. The checkpoint directory can be archived the same way:
# !zip -r qlora-llama3-step1.zip qlora-llama3-step1
!zip -r best-qlora-llama3.zip best-qlora-llama3

  adding: best-qlora-llama3/ (stored 0%)
  adding: best-qlora-llama3/tokenizer_config.json (deflated 96%)
  adding: best-qlora-llama3/special_tokens_map.json (deflated 64%)
  adding: best-qlora-llama3/adapter_config.json (deflated 53%)
  adding: best-qlora-llama3/README.md (deflated 66%)
  adding: best-qlora-llama3/adapter_model.safetensors (deflated 8%)
  adding: best-qlora-llama3/tokenizer.json (deflated 85%)


In [None]:
# List the dataset's column names to see which fields are available
# (sft_config trains on the "output" column).
print(dataset.column_names)
# e.g. ['instruction','input','output']


['instruction', 'input', 'output']
