# ECE 284 Digital Health System
## Supervised Fine-tune Model

## Part 1 Install and import libraries

In [None]:
!pip install -U peft
!pip install -U datasets
!pip install -U bitsandbytes
!pip install -U transformers
!pip install -U trl
!pip install tqdm

In [21]:
# Python built-in libraries
import os

# Hugging face libraries
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from datasets import load_dataset
from peft import LoraConfig, get_peft_model
from trl import SFTTrainer

# Pytorch libraries
import torch

# Other libraries
from tqdm import tqdm
import gc

## Part 2 Set global parameters

In [3]:
# Baseline model
model_name = "Qwen/Qwen2.5-1.5B"

# Datasets
dataset_name = "FreedomIntelligence/medical-o1-reasoning-SFT"

# Output dir
output_path = "output_models"
if not os.path.exists(output_path):
    os.mkdir(output_path)

# Device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
# Define training prompt style
train_prompt_style = """
Below is an instruction that describes a task, paired with an input that provides further context.
Write a response that appropriately completes the request.
Before answering, think carefully about the question and create a step-by-step chain of thoughts to ensure a logical and accurate response.

### Instruction:
You are a medical expert with advanced knowledge in clinical reasoning, diagnostics, and treatment planning.
Please answer the following medical question.

### Question:
{}

### Response:
<think>
{}
</think>
{}"""

In [5]:
# Define inference prompt style
inference_prompt_style = """
Below is an instruction that describes a task, paired with an input that provides further context.
Write a response that appropriately completes the request.
Before answering, think carefully about the question and create a step-by-step chain of thoughts to ensure a logical and accurate response.

### Instruction:
You are a medical expert with advanced knowledge in clinical reasoning, diagnostics, and treatment planning.
{}

### Question:
{}

### Response:
"""

## Part 3 Load model

In [6]:
# Load tokenizer and base model

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Config bits and bytes quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Load model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map = "auto",
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
)

model.config.use_cache = False
# model.config.pretraining_tp = 1

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/7.23k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/138 [00:00<?, ?B/s]

## Part 4 Load dataset

In [7]:
# Define dataset preprocess function
EOS_TOKEN = tokenizer.eos_token

def formatting_prompts(data):
    inputs = data["Question"]
    complex_cots = data["Complex_CoT"]
    outputs = data["Response"]
    texts = []
    for question, cot, response in zip(inputs, complex_cots, outputs):
        # Append the EOS token to the response if it's not already there
        if not response.endswith(EOS_TOKEN):
            response += EOS_TOKEN
        text = train_prompt_style.format(question, cot, response)
        texts.append(text)
    return {"text": texts}

In [8]:
# Load dataset
dataset = load_dataset(
    dataset_name,
    "en",
    split="train[0:3000]",
    trust_remote_code=True
)

# Preprocess dataset
dataset = dataset.map(formatting_prompts, batched=True)

# Convert to data collator
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

README.md:   0%|          | 0.00/1.97k [00:00<?, ?B/s]

medical_o1_sft.json:   0%|          | 0.00/58.2M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/19704 [00:00<?, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

## Part 5 Test base model

In [9]:
# Test base model
question = dataset[10]['Question']
print(inference_prompt_style.format("Please answers the following medical quesion.",question))
inputs = tokenizer(
    [inference_prompt_style.format("Please answers the following medical quesion.",
                                   question) + "<think>"],
    return_tensors="pt"
).to("cuda")

outputs = model.generate(
    input_ids=inputs.input_ids,
    attention_mask=inputs.attention_mask,
    pad_token_id=tokenizer.pad_token_id,
    eos_token_id=tokenizer.eos_token_id,
    max_new_tokens=1024,
    use_cache=True,
)
response = tokenizer.batch_decode(outputs, skip_special_tokens=True)
print(response[0].split("### Response:")[1])


Below is an instruction that describes a task, paired with an input that provides further context.
Write a response that appropriately completes the request.
Before answering, think carefully about the question and create a step-by-step chain of thoughts to ensure a logical and accurate response.

### Instruction:
You are a medical expert with advanced knowledge in clinical reasoning, diagnostics, and treatment planning.
Please answers the following medical quesion.

### Question:
In a patient with dermatomyositis as indicated by fatigue, muscle weakness, a scaly rash, elevated creatine kinase-MB, anti-Jo-1 antibodies, and perimysial inflammation, which type of cancer is most often associated with this condition?

### Response:


<think>1. Dermatomyositis is a rare autoimmune disease characterized by muscle weakness and inflammation.
2. The patient has elevated creatine kinase-MB, which suggests muscle damage.
3. Anti-Jo-1 antibodies are specific to a particular type of cancer, namely

## Part 6 Config train parameters

In [10]:
# LoRA config
peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.05,
    r=32,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
)

model = get_peft_model(model, peft_config)

In [11]:
# Training Arguments
training_arguments = TrainingArguments(
    output_dir="output",
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=2,
    optim="paged_adamw_32bit",
    num_train_epochs=1,
    logging_steps=50,
    warmup_steps=10,
    logging_strategy="steps",
    learning_rate=2e-4,
    fp16=False,
    bf16=False,
    group_by_length=True,
    report_to="none"
)

# Initialize the Trainer
trainer = SFTTrainer(
    model=model,
    args=training_arguments,
    train_dataset=dataset,
    peft_config=peft_config,
    data_collator=data_collator,
)

Converting train dataset to ChatML:   0%|          | 0/3000 [00:00<?, ? examples/s]

Adding EOS to train dataset:   0%|          | 0/3000 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/3000 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/3000 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


## Part 7 Train

In [12]:
# Train model
gc.collect()
torch.cuda.empty_cache()

model.config.use_cache = False
trainer.train()

Step,Training Loss
50,1.5605
100,1.4534
150,1.4284
200,1.4085
250,1.4018
300,1.4111
350,1.3981
400,1.3882
450,1.4004
500,1.3706


TrainOutput(global_step=1500, training_loss=1.3937197875976564, metrics={'train_runtime': 7120.4322, 'train_samples_per_second': 0.421, 'train_steps_per_second': 0.211, 'total_flos': 1.7102387324030976e+16, 'train_loss': 1.3937197875976564})

In [13]:
# Save model to google drive
from google.colab import drive
drive.mount('/content/drive')

# Save model
model.save_pretrained(f"/content/drive/MyDrive/final_model")
tokenizer.save_pretrained(f"/content/drive/MyDrive/final_model")

Mounted at /content/drive


('/content/drive/MyDrive/final_model/tokenizer_config.json',
 '/content/drive/MyDrive/final_model/special_tokens_map.json',
 '/content/drive/MyDrive/final_model/chat_template.jinja',
 '/content/drive/MyDrive/final_model/vocab.json',
 '/content/drive/MyDrive/final_model/merges.txt',
 '/content/drive/MyDrive/final_model/added_tokens.json',
 '/content/drive/MyDrive/final_model/tokenizer.json')

## Part 8 Test fine-tuned model

In [17]:
# Test base model
gc.collect()
torch.cuda.empty_cache()

question = dataset[10]['Question']
print(inference_prompt_style.format("Please answers the following medical quesion.",question))
inputs = tokenizer(
    [inference_prompt_style.format("Please answers the following medical quesion.",
                                   question) + "<think>"],
    return_tensors="pt"
).to("cuda")

outputs = model.generate(
    input_ids=inputs.input_ids,
    attention_mask=inputs.attention_mask,
    pad_token_id=tokenizer.pad_token_id,
    eos_token_id=tokenizer.eos_token_id,
    max_new_tokens=1024,
    use_cache=True,
)
response = tokenizer.batch_decode(outputs, skip_special_tokens=True)
print(response[0].split("### Response:")[1])


Below is an instruction that describes a task, paired with an input that provides further context.
Write a response that appropriately completes the request.
Before answering, think carefully about the question and create a step-by-step chain of thoughts to ensure a logical and accurate response.

### Instruction:
You are a medical expert with advanced knowledge in clinical reasoning, diagnostics, and treatment planning.
Please answers the following medical quesion.

### Question:
In a patient with dermatomyositis as indicated by fatigue, muscle weakness, a scaly rash, elevated creatine kinase-MB, anti-Jo-1 antibodies, and perimysial inflammation, which type of cancer is most often associated with this condition?

### Response:


<think>Okay, let's think about this. We've got a patient with dermatomyositis, which is a condition that affects muscles and skin. It's known for causing fatigue, muscle weakness, and a scaly rash. Plus, there's this elevated creatine kinase-MB level, which i

## Part Alpha test

In [18]:
# Load dataset
benchmark_name = "GBaker/MedQA-USMLE-4-options"

benchmark = load_dataset(
    benchmark_name,
    split="test[0:100]",
    trust_remote_code=True
)

README.md:   0%|          | 0.00/654 [00:00<?, ?B/s]

phrases_no_exclude_train.jsonl:   0%|          | 0.00/16.2M [00:00<?, ?B/s]

phrases_no_exclude_test.jsonl:   0%|          | 0.00/2.08M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/10178 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1273 [00:00<?, ? examples/s]

In [22]:
gc.collect()
torch.cuda.empty_cache()

real_ans = []
pred_ans = []
pred_detail = []
cnt = 0

for data in tqdm(benchmark):
    # Get info
    question = data['question'] + '\n'
    option = "\n".join([key + ':' + val for key, val in data['options'].items()])
    real_answer = data['answer_idx']



    inputs = tokenizer(
    [inference_prompt_style.format("Please answers the following medical quesion only with the option letter.",
                                   question + option)],
        return_tensors="pt"
    ).to("cuda")

    outputs = model.generate(
        input_ids=inputs.input_ids,
        attention_mask=inputs.attention_mask,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
        max_new_tokens=1024,
        use_cache=True,
    )

    response = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    predicted_answer = response[0].split("### Response:")[1]

    if "D" in predicted_answer:
        predicted_answer = "D"
    elif "C" in predicted_answer:
        predicted_answer = "C"
    elif "B" in predicted_answer:
        predicted_answer = "B"
    elif "A" in predicted_answer:
        predicted_answer = "A"
    else:
        predicted_answer = "Z"


    real_ans.append(real_answer)
    pred_ans.append(predicted_answer)
    pred_detail.append(response)

100%|██████████| 100/100 [1:52:41<00:00, 67.62s/it]


In [23]:
cnt = 0
for i in range(len(real_ans)):
    if real_ans[i] == pred_ans[i]:
        cnt += 1
print(cnt/len(real_ans))

0.34


In [24]:
print(real_ans[0:10])
print(pred_ans[0:10])

['B', 'D', 'B', 'D', 'B', 'D', 'C', 'C', 'B', 'A']
['D', 'D', 'C', 'D', 'D', 'D', 'D', 'C', 'D', 'D']


In [25]:
print(pred_detail[0:10])

['D', 'D', 'C', 'D', 'D', 'D', 'D', 'C', 'D', 'D']
