# Evaluate Model

## Part 1 Install and import libraries

In [None]:
!pip install -U datasets
!pip install -U bitsandbytes
!pip install tqdm

In [2]:
# Huggingface libraries
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel, PeftConfig
from datasets import load_dataset

# Pytorch
import torch

# Other libraries
import gc
from tqdm import tqdm

## Part 2 Set global parameters

In [3]:
# Model path in drive
baseline_model_path = "Qwen/Qwen2.5-1.5B"
finetune_model_path = "/content/drive/MyDrive/final_model"

# Datasets
dataset_name = "GBaker/MedQA-USMLE-4-options"

# Device
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Current device: {device}")

Current device: cuda


In [4]:
# Define inference prompt style
inference_prompt_style = """
Below is an instruction that describes a task, paired with an input that provides further context.
Write a response that appropriately completes the request.
Before answering, think carefully about the question and create a step-by-step chain of thoughts to ensure a logical and accurate response.

### Instruction:
You are a medical expert with advanced knowledge in clinical reasoning, diagnostics, and treatment planning.
{}

### Question:
{}

### Response:
"""

## Part 3 Load original model

In [None]:
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(baseline_model_path)

# Config bits and bytes quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Load model
model = AutoModelForCausalLM.from_pretrained(
    baseline_model_path,
    device_map = "auto",
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
)

## Part 4 Evaluate original model

In [None]:
# Load dataset
benchmark_name = "GBaker/MedQA-USMLE-4-options"

benchmark = load_dataset(
    benchmark_name,
    split="test[0:100]",
    trust_remote_code=True
)

In [7]:
gc.collect()
torch.cuda.empty_cache()

real_ans = []
pred_ans = []
ques_detail = []
pred_detail = []
cnt = 0

for data in tqdm(benchmark):
    # Get info
    question = data["question"] + "\n"
    option = "\n".join([key + ":" + val for key, val in data["options"].items()])
    real_answer = data["answer_idx"]



    inputs = tokenizer(
    [inference_prompt_style.format("Please answers the following medical quesion only with the option letter.",
                                   question + option)],
        return_tensors="pt"
    ).to("cuda")

    outputs = model.generate(
        input_ids=inputs.input_ids,
        attention_mask=inputs.attention_mask,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
        max_new_tokens=1024,
        use_cache=True,
    )

    response = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    predicted_answer = response[0].split("### Response:")[1]

    if "D" in predicted_answer:
        predicted_answer = "D"
    elif "C" in predicted_answer:
        predicted_answer = "C"
    elif "B" in predicted_answer:
        predicted_answer = "B"
    elif "A" in predicted_answer:
        predicted_answer = "A"
    else:
        predicted_answer = "Z"


    real_ans.append(real_answer)
    pred_ans.append(predicted_answer)
    ques_detail.append(question + option)
    pred_detail.append(response)

100%|██████████| 100/100 [19:41<00:00, 11.82s/it]


In [11]:
cnt = 0
for i in range(len(real_ans)):
    if real_ans[i] == pred_ans[i]:
        cnt += 1
print(cnt/len(real_ans))

0.31


In [12]:
for i in range(100):
    print(f"No. {i}")
    print(real_ans[i], pred_ans[i], pred_detail[i][0].split("### Response:")[1])

No. 0
B D 
The correct next action for the resident to take is A: Disclose the error to the patient and put it in the operative report. This is because the resident has inadvertently cut a flexor tendon during the carpal tunnel repair, and the attending physician has already acknowledged that the patient will do fine and that there is no need to worry unnecessarily. However, the resident should still disclose the error to the patient and put it in the operative report to ensure that the patient is aware of the complication and can take appropriate action if necessary.
No. 1
D D 
To determine the most likely action of the drug causing the patient's symptoms, we need to consider the known mechanisms of action for each option:

A: Inhibition of proteasome - This is a common mechanism of action for many chemotherapeutic agents, including those used in neoadjuvant chemotherapy. Proteasome inhibitors can lead to DNA damage and cell cycle arrest, which can contribute to hearing loss.

B: Hype

## Part 5 Load fine-tuned model

In [8]:
# Mount google drive
from google.colab import drive
drive.mount("/content/drive")

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(finetune_model_path)

# Config bits and bytes quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Load model
model = AutoModelForCausalLM.from_pretrained(
    finetune_model_path,
    device_map = "auto",
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
)

## Part 6 Evaluate fine-tuned model

In [9]:
# Load dataset
benchmark_name = "GBaker/MedQA-USMLE-4-options"

benchmark = load_dataset(
    benchmark_name,
    split="test[0:100]",
    trust_remote_code=True
)

In [10]:
gc.collect()
torch.cuda.empty_cache()

real_ans = []
pred_ans = []
ques_detail = []
pred_detail = []
cnt = 0

for data in tqdm(benchmark):
    # Get info
    question = data["question"] + "\n"
    option = "\n".join([key + ":" + val for key, val in data["options"].items()])
    real_answer = data["answer_idx"]



    inputs = tokenizer(
    [inference_prompt_style.format("Please answers the following medical quesion only with the option letter.",
                                   question + option)],
        return_tensors="pt"
    ).to("cuda")

    outputs = model.generate(
        input_ids=inputs.input_ids,
        attention_mask=inputs.attention_mask,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
        max_new_tokens=1024,
        use_cache=True,
    )

    response = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    predicted_answer = response[0].split("### Response:")[1]

    if "D" in predicted_answer:
        predicted_answer = "D"
    elif "C" in predicted_answer:
        predicted_answer = "C"
    elif "B" in predicted_answer:
        predicted_answer = "B"
    elif "A" in predicted_answer:
        predicted_answer = "A"
    else:
        predicted_answer = "Z"


    real_ans.append(real_answer)
    pred_ans.append(predicted_answer)
    ques_detail.append(question + option)
    pred_detail.append(response)

100%|██████████| 100/100 [1:41:46<00:00, 61.07s/it]


In [11]:
cnt = 0
for i in range(len(real_ans)):
    if real_ans[i] == pred_ans[i]:
        cnt += 1
print(cnt/len(real_ans))

0.35


In [12]:
for i in range(100):
    print(f"No. {i}")
    print(real_ans[i], pred_ans[i], pred_detail[i][0].split("### Response:")[1])

No. 0
B C 
<think>
Okay, so we have a junior orthopaedic surgery resident who accidentally cuts a flexor tendon during a carpal tunnel repair. The tendon is repaired, but the attending physician, who is the department chairman, doesn't want to make the patient worry about it. The attending says the patient will be fine, and there's no need to worry about this minor complication. He tells the resident to leave it out of the operative report. 

Now, let's think about what's going on here. The resident is the one who made the mistake, and the attending is the one who knows about it. So, the resident should definitely be the one to report the error. It's his responsibility to ensure that the patient is informed about any complications that might arise from the surgery. 

But wait, the attending is the department chairman, and he's the one who's supposed to be overseeing the resident's work. If the resident is the one who made the mistake, it's his responsibility to report it. The attending