In [1]:
# !pip install trl evaluate rouge_score transformers peft

In [2]:
import os
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
from datasets import load_dataset
from peft import get_peft_model, LoraConfig, TaskType
from trl import SFTTrainer
import evaluate

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


## Dataset

We load the *CARDBiomedBench* dataset from Hugging Face.

You can find more information about the dataset at **[Hugging Face](https://huggingface.co/datasets/NIH-CARD/CARDBiomedBench)**, or by reading the [paper](https://www.biorxiv.org/content/10.1101/2025.01.15.633272v2.full.pdf). 

In [4]:
dataset = load_dataset("NIH-CARD/CARDBiomedBench")
dataset

DatasetDict({
    train: Dataset({
        features: ['uuid', 'template_uuid', 'question', 'answer', 'bio_category', 'reasoning_category'],
        num_rows: 58079
    })
    test: Dataset({
        features: ['uuid', 'template_uuid', 'question', 'answer', 'bio_category', 'reasoning_category'],
        num_rows: 10148
    })
})

The dataset contains question-answer pairs:

In [5]:
question = dataset["train"][0]["question"]
answer = dataset["train"][0]["answer"]
print("Question: ", question)
print("Answer  : ", answer)

Question:  What genes does Olaparib target when it's used to treat cancer?
Answer  :  When used to treat cancer, the drug Olaparib targets the genes PARP2, PARP1, and PARP3.


Let's only use data samples related to category "Pharmacology"

In [6]:
dataset = dataset.filter(lambda x: x["bio_category"] == "Pharmacology")

For computational reasons, we select a smaller subset of the dataset

In [7]:
train_dataset = dataset["train"]
test_dataset = dataset["test"]

# select a smaller subset
train_dataset = train_dataset.shuffle(seed=42).select(range(1000))
test_dataset = test_dataset.shuffle(seed=42).select(range(200))

print(len(train_dataset))
print(len(test_dataset))

1000
200


In [8]:
question = train_dataset[0]["question"]
answer = train_dataset[0]["answer"]
print("Question: ", question)
print("Answer  : ", answer)

Question:  What type of molecule is Leuprolide Mesylate, and what is its action type?
Answer  :  Leuprolide Mesylate is a protein drug that acts as an agonist.


## Model

In this example, we use the [SmolLM2](https://huggingface.co./HuggingFaceTB/SmolLM2-135M) decoder model, developed by Hugging Face. The SmolLM2 models come in three sizes (135M, 360M, and 1.7B parameters) and are developed to solve a wide range of tasks while being lightweight enough to run on-device.
Here, we choose the 135M parameter model for computational reasons.

Let's load the model and tokenizer through Hugging Face:

In [9]:
model = "HuggingFaceTB/SmolLM2-135M"

tokenizer = AutoTokenizer.from_pretrained(model)
model = AutoModelForCausalLM.from_pretrained(model).to(device)

tokenizer.pad_token = tokenizer.eos_token  # set end-of-sequence token as padding token
model.config.pad_token_id = model.config.eos_token_id  # tell model which token to use for padding

In [10]:
model.num_parameters() # 134,515,008 parameters

134515008

Let's generate an example output of our base LLM.

Note that the model is not instruction-tuned (unlike ChatGPT). It is only trained to predict the next token in a sequence and is less useful for interactive tasks.

In [11]:
inputs = tokenizer("The capital of Sweden is ", return_tensors="pt", padding=True).to(device)

outputs = model.generate(inputs["input_ids"], 
                         attention_mask=inputs["attention_mask"],
                         pad_token_id=tokenizer.eos_token_id)

print(tokenizer.decode(outputs[0]))

The capital of Sweden is  the city of Stockholm. The city is located in the middle of the country, and is the


Even though 135M parameters is relatively "small" for an LLM, we want to further reduce the number of trainable parameters through LoRA. 

This becomes even more necessary when we choose to train larger models.

In [12]:
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.1,
    task_type=TaskType.CAUSAL_LM,
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 460,800 || all params: 134,975,808 || trainable%: 0.3414


Let's test the SmolLM2 base model on some questions from our dataset...

In [13]:
def test_model(index, dataset, model, tokenizer):
    data = dataset[index]
    question = data["question"]

    instruction = "You are a knowledgeable assistant. Answer this question truthfully!"

    # Format the input into instruction format
    prompt = (
        "### Instruction:\n"
        f"{instruction}\n\n"
        "### Input:\n"
        f"{question}\n\n"
        "### Response:\n"
    )

    # Tokenize
    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    # Generate response
    model.eval()
    with torch.no_grad():
        outputs = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_new_tokens=100,    
            pad_token_id=tokenizer.eos_token_id,
            repetition_penalty=1.2
        )

    # decode the response & remove special tokens
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # remove prompt from response
    response = response[len(prompt):]

    expected_response = data["answer"]

    return question, response, expected_response

In [14]:
index = 0

question, response, expected_response = test_model(index, train_dataset, model, tokenizer)

print("question: \n", question, "\n")
print("model output: \n", response)
print("expected output: \n", expected_response)

question: 
 What type of molecule is Leuprolide Mesylate, and what is its action type? 

model output: 
 Leptolipin (LPL) has an active site that binds to the hydrophobic side chains in phospholipids such as cholesterol or triglycerides. This binding allows for lipophilic molecules like leprous bacteria to be transported into cells where they can multiply rapidly without being destroyed by phagocytes. The LPO-binding sites on these bacterial membranes also allow them to bind with other proteins which then act upon their own membrane lipids causing cell death when released from host tissues through endocytosis processes.

expected output: 
 Leuprolide Mesylate is a protein drug that acts as an agonist.


## Centralized Fine-tuning

We now want to fine-tune the model on the train dataset. For this, we convert the training data to instruction format. This is the correct format for generative question-answering tasks

In [15]:
def generate_instruction_format(example):
    question = example["question"]
    answer = example["answer"]

    instruction = "You are a knowledgeable assistant. Answer this question truthfully!"

    prompt = (
        "### Instruction:\n"
        f"{instruction.strip()}\n\n"
        "### Input:\n"
        f"{question.strip()}\n\n"
        "### Response:\n"
        f"{answer.strip()}" + tokenizer.eos_token
    )
    return {"text": prompt}

In [16]:
mapped_train_dataset = train_dataset.map(
    generate_instruction_format,
    remove_columns=train_dataset.column_names,
    batched=False,
)
mapped_train_dataset[0]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map: 100%|██████████| 1000/1000 [00:00<00:00, 11386.74 examples/s]


{'text': '### Instruction:\nYou are a knowledgeable assistant. Answer this question truthfully!\n\n### Input:\nWhat type of molecule is Leuprolide Mesylate, and what is its action type?\n\n### Response:\nLeuprolide Mesylate is a protein drug that acts as an agonist.<|endoftext|>'}

In [17]:
use_cuda = torch.cuda.is_available()
print("cuda ", use_cuda)

model.train()

training_args = TrainingArguments(
    # output_dir="qa-finetuned",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=8,
    num_train_epochs=3,
    learning_rate=5e-4,
    logging_steps=20,
    save_total_limit=2,
    use_cpu=not(use_cuda)
)

trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=mapped_train_dataset
)

trainer.train()

cuda  True


Converting train dataset to ChatML: 100%|██████████| 1000/1000 [00:00<00:00, 56834.16 examples/s]
Adding EOS to train dataset: 100%|██████████| 1000/1000 [00:00<00:00, 40950.80 examples/s]
Tokenizing train dataset: 100%|██████████| 1000/1000 [00:00<00:00, 5216.56 examples/s]
Truncating train dataset: 100%|██████████| 1000/1000 [00:00<00:00, 304840.76 examples/s]
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss
20,2.525
40,1.5491
60,0.9427
80,0.7794


TrainOutput(global_step=96, training_loss=1.3288022379080455, metrics={'train_runtime': 65.8502, 'train_samples_per_second': 45.558, 'train_steps_per_second': 1.458, 'total_flos': 148906714632192.0, 'train_loss': 1.3288022379080455})

Now we can check some outputs of our fine-tuned model

In [18]:
index = 1

question, response, expected_response = test_model(index, train_dataset, model, tokenizer)

print("\n")
print("question: ", question, "\n")
print("model output: ", response, "\n")
print("expected output: ", expected_response)



question:  What mechanism type does the drug Avutometinib use? 

model output:  The drug Avutmetinib is an inhibitor. 

expected output:  The drug Avutometinib is an inhibitor.


## Evaluation

Of course, we cannot check every output individually. 


Instead, we use **ROUGE-L** as a metric to evaluate the fine-tuned model on the test dataset.

The **ROUGE-L** score is based on the longest common subsequence (LCS) between the generated and the reference text. 
The LCS is the longest sequence of words that appear in order in both generated and reference text. 
The words do **NOT** need to be contiguous.

**Example:**

- *Reference text:* "The **kid** is **playing** with **the cat**"

- *Generated text:* "kid playing the cat"

In that case, the *LCS* is "kid playing the cat". 

To calculate the ROUGE-L score, we need the following information:

- length(LCS) = 4
- length(reference text) = 7
- length(generated text) = 4

Now, one can calculate Recall, Precision, and F1 Score

- *ROUGE-L Recall* = $\frac{\text{length(LCS)}}{\text{length(reference text)}} = \frac{4}{7} \approx 0.57$

- *ROUGE-L Precision* = $\frac{\text{length(LCS)}}{\text{length(generated text)}} = \frac{4}{4} = 1.0 $

- *ROUGE-L F1 Score* = $\frac{2 \cdot \text{Precision} \cdot \text{Recall}}{\text{Precision} + \text{Recall}} = \approx 0.727 $

In [19]:
rouge = evaluate.load("rouge")

reference_text = ["The kid is playing with the cat"]
generated_text = ["kid playing the cat"]

results = rouge.compute(predictions=generated_text, references=reference_text)
results["rougeL"]

np.float64(0.7272727272727273)

Let's define a function that transforms the model predictions and the expected output into the correct format

In [25]:
def get_predictions(example):
    instruction = "You are a knowledgeable assistant. Answer this question truthfully!"

    question = example["question"]

    # Format the input the same way as during training
    prompt = (
        "### Instruction:\n"
        f"{instruction}\n\n"
        "### Input:\n"
        f"{question}\n\n"
        "### Response:\n"
    )

    # Tokenize
    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    # Generate
    with torch.no_grad():
        outputs = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            # max_new_tokens=100,
            # temperature=0.1,
            # top_p=0.9,
            # do_sample=True,
            # pad_token_id=tokenizer.eos_token_id,
            # repetition_penalty=1.2,
        )

    # Decode and clean up the response
    predicted_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # Remove the prompt from the response
    predicted_output = predicted_output[len(prompt):]

    return {
        "question": example["question"],
        "predicted_output": predicted_output,
        "correct_output": example["answer"],
    }

For computational reasons, we only evaluate the model on a small subset of the test data.

In [26]:
test_ds = test_dataset.select(range(30))

In [27]:
predictions_dataset = test_ds.map(get_predictions, batched=False, remove_columns=test_ds.column_names)
predictions_dataset[2]

Map:   0%|          | 0/30 [00:00<?, ? examples/s]Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Map:   3%|▎         | 1/30 [00:00<00:14,  1.95 examples/s]Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Map:   7%|▋         | 2/30 [00:00<00:10,  2.67 examples/s]Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Map:  10%|█         | 3/30 [00:01<00:11,  2.31 examples/s]Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Map:  13%|█▎        | 4/30 [00:01<00:10,  2.44 examples/s]Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Map:  17%|█▋        | 5/30 [00:02<00:11,  2.26 examples/s]Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Map:  20%|██        | 6/30 [00:02<00:09,  2.59 examples/s]Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Map:  23%|██▎       | 7/30 [00:02<00:09,  2.46 examples/s]Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


{'question': 'How many mechanisms of action does the drug Bazedoxifene have?',
 'predicted_output': 'The drug Bazedoxifene has 1 mechanism of action, Bredin-type inhibitor',
 'correct_output': 'The drug Bazedoxifene has 1 mechanism of action, Estrogen receptor modulator.'}

In [28]:
predictions = predictions_dataset["predicted_output"]
references = predictions_dataset["correct_output"]

results = rouge.compute(predictions=predictions, references=references)

print(f"ROUGE-L (F1): {results['rougeL']:.2%}")

ROUGE-L (F1): 75.46%


### Comparison to SmolLM2 base model (not fine-tuned)

In [30]:
# reload SmolLM2 base model
model_name = "HuggingFaceTB/SmolLM2-135M"
untrained_model = AutoModelForCausalLM.from_pretrained(model_name).to(device)

In [31]:
def get_untrained_predictions(example):
    instruction = "You are a knowledgeable assistant. Answer this question truthfully!"

    question = example["question"]

    # Format the input the same way as during training
    prompt = (
        "### Instruction:\n"
        f"{instruction}\n\n"
        "### Input:\n"
        f"{question}\n\n"
        "### Response:\n"
    )

    # Tokenize
    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    # Generate
    model.eval()
    with torch.no_grad():
        outputs = untrained_model.generate(
                    input_ids=inputs["input_ids"],
                    attention_mask=inputs["attention_mask"],
                    pad_token_id=tokenizer.eos_token_id,
                    repetition_penalty=1.2) 
    
    predicted_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
    predicted_output = predicted_output[len(prompt):]

    return {
        "question": example["question"],
        "predicted_output": predicted_output,
        "correct_output": example["answer"],
    }

In [32]:
untrained_predictions_dataset = test_ds.map(get_untrained_predictions, batched=False, remove_columns=test_ds.column_names)
untrained_predictions_dataset[2]

Map: 100%|██████████| 30/30 [00:52<00:00,  1.74s/ examples]


{'question': 'How many mechanisms of action does the drug Bazedoxifene have?',
 'predicted_output': 'The answer is 10, because there were two actions in the experiment and one was not observed (the other being an unknown). The number that you get from your calculator should be equal to or greater than 256.\n\n47893',
 'correct_output': 'The drug Bazedoxifene has 1 mechanism of action, Estrogen receptor modulator.'}

In [33]:
predictions = untrained_predictions_dataset["predicted_output"]
references = untrained_predictions_dataset["correct_output"]

results = rouge.compute(predictions=predictions, references=references)

print(f"ROUGE-L (F1): {results['rougeL']:.2%}")

ROUGE-L (F1): 6.94%
