In [1]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig
from trl import SFTTrainer

from transformers import TrainingArguments, DataCollatorForSeq2Seq

  from .autonotebook import tqdm as notebook_tqdm


In [None]:

def construct_verification_prompt(prediction: str, correct_answer: str) -> str:
    return f"""You are an expert math evaluator.

1. Extract the final numerical answer from the model's prediction.
2. Compare it to the correct answer, allowing for equivalent expressions (e.g., 0.5 and 1/2 are the same).
3. Respond **concisely**, with your final verdict in a box:

If the answer is correct, reply with \\boxed{{Yes}}

If the answer is incorrect, reply with \\boxed{{No}}

Correct Answer:  
{ground_truth}

Model Prediction:  
{prediction}

Is the model's answer mathematically correct? ONLY include a brief one-line explanation and the verdict."""


In [4]:
# Model from Hugging Face hub
base_model = "NousResearch/Llama-3.2-1b"

# sample dataset
guanaco_dataset = "mlabonne/guanaco-llama2-1k"

# train datasets
gsm8k_dataset = "openai/gsm8k"
mbpppro_dataset = "CodeEval-Pro/mbpp-pro"
arc_dataset = "allenai/ai2_arc"
textvqa_dataset = "facebook/textvqa"

# test datasets
math_dataset = "EleutherAI/hendrycks_math"
humaneval_dataset = "openai/openai_humaneval"
okvqa_dataset = "lmms-lab/OK-VQA"
# Fine-tuned model
new_model = "llama-3.2-1b-guanaco"

In [None]:
# guanaco_dataset = load_dataset(guanaco_dataset, split="train")
# gsm8k_dataset = load_dataset(gsm8k_dataset, "main", split="train")
# mbpppro_dataset = load_dataset(mbpppro_dataset, split="train")
# arc_dataset = load_dataset(arc_dataset, "ARC-Easy", split="train")
# textvqa_dataset = load_dataset(textvqa_dataset, split="test", trust_remote_code=True) # 7 Gb
# math_dataset = load_dataset(math_dataset, "algebra", split="test") # pick one among the available configs: ['algebra', 'counting_and_probability', 'geometry', 'intermediate_algebra', 'number_theory', 'prealgebra', 'precalculus']
# humaneval_dataset = load_dataset(humaneval_dataset, split="test")
# arc_challenge_dataset = load_dataset(arc_dataset, "ARC-Challenge", split="test")
# okvqa_dataset = load_dataset(okvqa_dataset, split="val2014") # validation?

In [4]:
compute_dtype = getattr(torch, "float16")

quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=False,
)

In [7]:
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=quant_config,
    device_map={"": 0}
)
model.config.use_cache = False
model.config.pretraining_tp = 1

config.json:   0%|          | 0.00/843 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/186 [00:00<?, ?B/s]

In [8]:
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

tokenizer_config.json:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/301 [00:00<?, ?B/s]

In [9]:
peft_params = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
)

In [10]:
training_params = TrainingArguments(
    output_dir="./results",
    num_train_epochs=1,
    evaluation_strategy='no',
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    optim="paged_adamw_32bit",
    save_steps=25,
    logging_steps=25,
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=False,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="constant",
    report_to="tensorboard"
)



In [11]:
def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=512)

# Tokenize dataset
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Define data collator for padding
data_collator = DataCollatorForSeq2Seq(tokenizer, padding=True)

trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_params,
    # dataset_text_field="text",
    # max_seq_length=None,
    # tokenizer=tokenizer,
    data_collator=data_collator,
    args=training_params,
    # packing=False,
)



Map:   0%|          | 0/1000 [00:00<?, ? examples/s]



Applying chat template to train dataset:   0%|          | 0/1000 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/1000 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/1000 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [12]:
trainer.train()

ValueError: The model did not return a loss from the inputs, only the following keys: logits. For reference, the inputs it received are input_ids,attention_mask,labels.