In [1]:
!pip install -q transformers trl peft bitsandbytes datasets
!pip install -q rouge_score bert_score
!pip install -q evaluate nltk

In [2]:
import torch
import os
from datasets import load_dataset, DatasetDict
from peft import LoraConfig, prepare_model_for_kbit_training, PeftModel
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling,
    EarlyStoppingCallback
)

from trl import SFTTrainer, SFTConfig
from tqdm import tqdm
import evaluate
import numpy as np

In [3]:
import warnings
warnings.filterwarnings("ignore", message=".*use_reentrant.*")

In [4]:
class Config:
    base_model_id = "Qwen/Qwen2.5-3B"
    sft_model_id = "./qwen-2.5-3b-sft-truthfulqa/sft"
    dpo_model_id = "./qwen-2.5-3b-dpo-truthfulqa/dpo"

    dataset_id = "truthfulqa/truthful_qa"

config = Config()

In [5]:
def format_sample(sample):
    question = sample.get("question", "").strip()
    answer = sample.get("best_answer", "").strip()

    formatted_text = f"Question: {question}\nAnswer: {answer}"
    
    return {"text": formatted_text}


In [6]:
full_dataset = load_dataset(config.dataset_id, "generation")["validation"]

In [7]:
tokenizer = AutoTokenizer.from_pretrained(config.base_model_id, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

In [8]:
def tokenize_dataset(sample):
    formatted = format_sample(sample)
    full_text = formatted["text"]

    outputs = tokenizer(
        full_text,
        truncation=True,
        max_length=1024,
        padding=False,
        return_tensors=None,
        return_offsets_mapping=True
    )

    input_ids = outputs["input_ids"]
    attention_mask = outputs["attention_mask"]
    offsets = outputs["offset_mapping"]


    answer_start_char = full_text.find("Answer:")
    if answer_start_char == -1:
        labels = input_ids.copy()
    else:
        labels = []
        for idx, (token_id, offset) in enumerate(zip(input_ids, offsets)):
            char_start, _ = offset
            if char_start >= answer_start_char:
                labels.append(token_id)
            else:
                labels.append(-100)

    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels
    }

In [9]:
tokenized_dataset = full_dataset.map(
    tokenize_dataset,
    batched=False,
    remove_columns=full_dataset.column_names
)
split_dataset = tokenized_dataset.train_test_split(test_size=0.1, seed=42)
temp_split = split_dataset["test"].train_test_split(test_size=0.5, seed=42)


dataset_split = {
    "train": split_dataset["train"],
    "validation": temp_split["train"],
    "test": temp_split["test"]
}

Map:   0%|          | 0/817 [00:00<?, ? examples/s]

In [10]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)

lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "k_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

In [11]:
model = AutoModelForCausalLM.from_pretrained(
    config.base_model_id,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True
)
model = prepare_model_for_kbit_training(model)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [12]:
sft_training_args = SFTConfig(
    per_device_train_batch_size=32,
    gradient_accumulation_steps=2,
    warmup_steps=100,
    num_train_epochs=50,
    learning_rate=4e-5,
    fp16=True,
    bf16=False,
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="cosine",
    seed=42,
    output_dir=config.sft_model_id,
    report_to="tensorboard",
    logging_steps=1,
    eval_strategy="epoch",
    save_total_limit=1,
    save_strategy="best",
    metric_for_best_model="eval_loss",
    load_best_model_at_end=True,
    max_seq_length=2048,
    dataset_num_proc=4,
    packing=False,
)

In [13]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [14]:
bleu = evaluate.load("bleu")
rouge = evaluate.load("rouge")
bertscore = evaluate.load("bertscore")
exact_match_metric = evaluate.load("exact_match")


def compute_metrics(eval_preds):
    predictions, labels = eval_preds

    # Handle tuple for predictions
    predictions = predictions[0] if isinstance(predictions, tuple) else predictions

    # Convert logits to predicted token IDs if needed
    if predictions.ndim == 3:
        predictions = predictions.argmax(-1)

    # Convert tensors to lists
    if hasattr(predictions, "tolist"):
        predictions = predictions.tolist()
    if hasattr(labels, "tolist"):
        labels = labels.tolist()

    # Replace -100 in labels with tokenizer.pad_token_id for decoding
    labels = [
        [token if token != -100 else tokenizer.pad_token_id for token in label_seq]
        for label_seq in labels
    ]

    # Decode predictions and labels
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Strip extra whitespace
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [label.strip() for label in decoded_labels]

    bleu_result = bleu.compute(predictions=decoded_preds, references=decoded_labels)
    rouge_result = rouge.compute(predictions=decoded_preds, references=decoded_labels)
    bertscore_result = bertscore.compute(predictions=decoded_preds, references=decoded_labels, lang="en")

    return {
        "bleu": bleu_result["bleu"],
        "rouge1": rouge_result["rouge1"],
        "rougeL": rouge_result["rougeL"],
        "bertscore_f1": np.mean(bertscore_result["f1"]),
    }


In [15]:
sft_trainer = SFTTrainer(
    model=model,
    args=sft_training_args,
    train_dataset=dataset_split["train"],
    eval_dataset=dataset_split["validation"],
    peft_config=lora_config,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)


Truncating train dataset (num_proc=4):   0%|          | 0/735 [00:00<?, ? examples/s]

Truncating eval dataset (num_proc=4):   0%|          | 0/41 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [16]:
train_history = sft_trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Epoch,Training Loss,Validation Loss,Bleu,Rouge1,Rougel,Bertscore F1
1,2.4414,2.385911,0.118612,0.501948,0.454086,0.864638
2,2.3956,2.359903,0.114494,0.504731,0.45833,0.866696
3,2.1507,2.314378,0.117668,0.510264,0.46332,0.867438
4,2.443,2.245841,0.141375,0.521919,0.471874,0.862413
5,2.269,2.159682,0.158197,0.526713,0.477122,0.863422
6,2.2409,2.069691,0.16847,0.543209,0.488836,0.867555
7,2.1929,1.978427,0.183986,0.572892,0.521685,0.871015
8,2.0683,1.878253,0.187286,0.570884,0.522619,0.871778
9,1.683,1.783689,0.193743,0.573189,0.526655,0.873448
10,1.7537,1.730028,0.192218,0.574093,0.526623,0.873321


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
train_history

TrainOutput(global_step=324, training_loss=1.835309867505674, metrics={'train_runtime': 343.2695, 'train_samples_per_second': 107.059, 'train_steps_per_second': 1.748, 'total_flos': 1.6464668845522944e+16, 'train_loss': 1.835309867505674})

In [18]:
eval_history = sft_trainer.evaluate(dataset_split["test"])

In [19]:
eval_history

{'eval_loss': 1.5612815618515015,
 'eval_bleu': 0.285689956313672,
 'eval_rouge1': 0.6070493544972422,
 'eval_rougeL': 0.5751630982614317,
 'eval_bertscore_f1': 0.8836286445943321,
 'eval_runtime': 1.53,
 'eval_samples_per_second': 26.797,
 'eval_steps_per_second': 3.921}

In [21]:
%pwd

'/teamspace/studios/this_studio'

In [22]:
import zipfile
import os

def zip_all_files(output_filename='stage-1.zip', directory='qwen-2.5-3b-sft-truthfulqa'):
    with zipfile.ZipFile(output_filename, 'w', zipfile.ZIP_DEFLATED) as zipf:
        for foldername, subfolders, filenames in os.walk(directory):
            for filename in filenames:
                file_path = os.path.join(foldername, filename)
                # Skip hidden files and system files if desired
                if not filename.startswith('.') and '__pycache__' not in file_path:
                    zipf.write(file_path, os.path.relpath(file_path, directory))

zip_all_files()
