In [None]:
import os
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
from datasets import load_dataset
from peft import get_peft_model, LoraConfig, TaskType
from trl import SFTTrainer
import evaluate

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

## Dataset

We load the *CARDBiomedBench* dataset from Hugging Face.

You can find more information about the dataset at **[Hugging Face](https://huggingface.co/datasets/NIH-CARD/CARDBiomedBench)**, or by reading the [paper](https://www.biorxiv.org/content/10.1101/2025.01.15.633272v2.full.pdf). 

In [None]:
dataset = load_dataset("NIH-CARD/CARDBiomedBench")
dataset

The dataset contains question-answer pairs:

In [None]:
question = dataset["train"][0]["question"]
answer = dataset["train"][0]["answer"]
print("Question: ", question)
print("Answer  : ", answer)

Let's only use data samples related to category "Pharmacology"

In [None]:
dataset = dataset.filter(lambda x: x["bio_category"] == "Pharmacology")

train_dataset = dataset["train"]
test_dataset = dataset["test"]

# for computational resons, select a smaller subset
train_dataset = train_dataset.shuffle(seed=42).select(range(1000))
test_dataset = test_dataset.shuffle(seed=42).select(range(200))

print("Num samples in train set: ", len(train_dataset))
print("Num samples in test set : ", len(test_dataset))

In [None]:
question = train_dataset[0]["question"]
answer = train_dataset[0]["answer"]
print("Question: ", question)
print("Answer  : ", answer)

## Model

In this example, we use the [SmolLM2](https://huggingface.co./HuggingFaceTB/SmolLM2-135M) decoder model, developed by Hugging Face. The SmolLM2 models come in three sizes (135M, 360M, and 1.7B parameters) and are developed to solve a wide range of tasks while being lightweight enough to run on-device.
Here, we choose the 135M parameter model for computational reasons.

Let's load the model and tokenizer through Hugging Face:

In [12]:
model = "HuggingFaceTB/SmolLM2-135M"

tokenizer = AutoTokenizer.from_pretrained(model)
model = AutoModelForCausalLM.from_pretrained(model).to(device)

tokenizer.pad_token = tokenizer.eos_token  # set end-of-sequence token as padding token
model.config.pad_token_id = model.config.eos_token_id  # tell model which token to use for padding

In [None]:
model.num_parameters() # 134,515,008 parameters

Let's generate an example output of our base LLM.

Note that the model is not instruction-tuned (unlike ChatGPT). It is only trained to predict the next token in a sequence and is less useful for interactive tasks.

In [None]:
inputs = tokenizer("The capital of Sweden is ", return_tensors="pt", padding=True).to(device)
outputs = model.generate(inputs["input_ids"], 
                         attention_mask=inputs["attention_mask"],
                         pad_token_id=tokenizer.eos_token_id)

print(tokenizer.decode(outputs[0]))

Even though 135M parameters is relatively "small" for an LLM, we want to further reduce the number of trainable parameters through LoRA. 

This becomes even more necessary when we choose to fine-tune larger models.

In [None]:
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.1,
    task_type=TaskType.CAUSAL_LM,
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

By training only 0.34 % of the total parameters, we update only a small fraction of the total parameters!

Let's test the SmolLM2 base model on some questions from our dataset...

In [16]:
def test_model(index, dataset, model, tokenizer):
    data = dataset[index]
    question = data["question"]

    instruction = "You are a knowledgeable assistant. Answer this question truthfully!"

    # format the input into instruction format
    prompt = (
        "### Instruction:\n"
        f"{instruction}\n\n"
        "### Input:\n"
        f"{question}\n\n"
        "### Response:\n"
    )

    # tokenize prompt
    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    # generate response
    model.eval()
    with torch.no_grad():
        outputs = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_new_tokens=100,    
            pad_token_id=tokenizer.eos_token_id,
            repetition_penalty=1.2
        )

    # decode the response & remove special tokens
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # remove prompt from response
    response = response[len(prompt):]

    expected_response = data["answer"]

    return question, response, expected_response

In [None]:
index = 0

question, response, expected_response = test_model(index, train_dataset, model, tokenizer)

print("question: \n", question, "\n")
print("model output: \n", response)
print("expected output: \n", expected_response)

## Centralized Fine-tuning

We now want to fine-tune the model on the train dataset. For this, we convert the training data to instruction format. This is the correct format for generative question-answering tasks

In [18]:
def generate_instruction_format(example):
    question = example["question"]
    answer = example["answer"]

    instruction = "You are a knowledgeable assistant. Answer this question truthfully!"

    prompt = (
        "### Instruction:\n"
        f"{instruction.strip()}\n\n"
        "### Input:\n"
        f"{question.strip()}\n\n"
        "### Response:\n"
        f"{answer.strip()}" + tokenizer.eos_token
    )
    return {"text": prompt}

In [None]:
mapped_train_dataset = train_dataset.map(
    generate_instruction_format,
    remove_columns=train_dataset.column_names,
    batched=False,
)
mapped_train_dataset[0]

In [None]:
use_cuda = torch.cuda.is_available()
print("cuda ", use_cuda)

model.train()

training_args = TrainingArguments(
    per_device_train_batch_size=4,
    gradient_accumulation_steps=8,
    num_train_epochs=3,
    learning_rate=5e-4,
    logging_steps=20,
    save_total_limit=2,
    use_cpu=not(use_cuda)
)

trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=mapped_train_dataset,
)

trainer.train()

Now we can check some outputs of our fine-tuned model

In [None]:
index = 1

question, response, expected_response = test_model(index, train_dataset, model, tokenizer)

print("\n")
print("question: ", question, "\n")
print("model output: ", response, "\n")
print("expected output: ", expected_response)

In [None]:
index = 2

question, response, expected_response = test_model(index, test_dataset, model, tokenizer)

print("\n")
print("question: ", question, "\n")
print("model output: ", response, "\n")
print("expected output: ", expected_response)

## Evaluation

Of course, we cannot check every output individually. 


Instead, we use **ROUGE-L** as a metric to evaluate the fine-tuned model on the test dataset.

The **ROUGE-L** score is based on the longest common subsequence (LCS) between the generated and the reference text. 
The LCS is the longest sequence of words that appear in order in both generated and reference text. 
The words do **NOT** need to be contiguous.

**Example:**

- *Reference text:* "The **kid** is **playing** with **the cat**"

- *Generated text:* "kid playing the cat"

In that case, the *LCS* is "kid playing the cat". 

To calculate the ROUGE-L score, we need the following information:

- length(LCS) = 4
- length(reference text) = 7
- length(generated text) = 4

Now, one can calculate Recall, Precision, and F1 Score

- *ROUGE-L Recall* = $\frac{\text{length(LCS)}}{\text{length(reference text)}} = \frac{4}{7} \approx 0.57$

- *ROUGE-L Precision* = $\frac{\text{length(LCS)}}{\text{length(generated text)}} = \frac{4}{4} = 1.0 $

- *ROUGE-L F1 Score* = $\frac{2 \cdot \text{Precision} \cdot \text{Recall}}{\text{Precision} + \text{Recall}} = \approx 0.73 $

In [None]:
rouge = evaluate.load("rouge")

reference_text = ["The kid is playing with the cat"]
generated_text = ["kid playing the cat"]

results = rouge.compute(predictions=generated_text, references=reference_text)

rouge_l = float(round(results["rougeL"], 2))
print("rouge_l score: ", rouge_l)

Let's define a function that generates model predictions on the test dataset and returns them together with the expected output.

In [38]:
def get_predictions(example):
    instruction = "You are a knowledgeable assistant. Answer this question truthfully!"

    question = example["question"]

    # format the input the same way as during training
    prompt = (
        "### Instruction:\n"
        f"{instruction}\n\n"
        "### Input:\n"
        f"{question}\n\n"
        "### Response:\n"
    )

    # tokenize
    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    # generate
    with torch.no_grad():
        outputs = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            pad_token_id=tokenizer.eos_token_id,
            repetition_penalty=1.2,
        )

    # decode and clean up the response
    predicted_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # remove the prompt from the response
    predicted_output = predicted_output[len(prompt):]

    return {
        "question": example["question"],
        "predicted_output": predicted_output,
        "correct_output": example["answer"],
    }

For computational reasons, we only evaluate the model on a small subset of the test data.

In [39]:
test_ds = test_dataset.select(range(30))

In [None]:
predictions_dataset = test_ds.map(get_predictions, batched=False, remove_columns=test_ds.column_names)
predictions_dataset[2]

In [None]:
predictions = predictions_dataset["predicted_output"]
references = predictions_dataset["correct_output"]

results = rouge.compute(predictions=predictions, references=references)

print(f"ROUGE-L (F1): {results['rougeL']:.2%}")

### Comparison to SmolLM2 base model (not fine-tuned)

In [42]:
# reload SmolLM2 base model
model_name = "HuggingFaceTB/SmolLM2-135M"
untrained_model = AutoModelForCausalLM.from_pretrained(model_name).to(device)

In [43]:
def get_untrained_predictions(example):
    instruction = "You are a knowledgeable assistant. Answer this question truthfully!"

    question = example["question"]

    # Format the input the same way as during training
    prompt = (
        "### Instruction:\n"
        f"{instruction}\n\n"
        "### Input:\n"
        f"{question}\n\n"
        "### Response:\n"
    )

    # Tokenize
    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    # Generate
    model.eval()
    with torch.no_grad():
        outputs = untrained_model.generate(
                    input_ids=inputs["input_ids"],
                    attention_mask=inputs["attention_mask"],
                    pad_token_id=tokenizer.eos_token_id,
                    repetition_penalty=1.2) 
    
    predicted_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
    predicted_output = predicted_output[len(prompt):]

    return {
        "question": example["question"],
        "predicted_output": predicted_output,
        "correct_output": example["answer"],
    }

In [None]:
untrained_predictions_dataset = test_ds.map(get_untrained_predictions, batched=False, remove_columns=test_ds.column_names)
untrained_predictions_dataset[2]

In [None]:
predictions = untrained_predictions_dataset["predicted_output"]
references = untrained_predictions_dataset["correct_output"]

results = rouge.compute(predictions=predictions, references=references)

print(f"ROUGE-L (F1): {results['rougeL']:.2%}")

ROUGE-L (F1) of fine-tuned model: ~ 70% :)