# Evaluate Model

## Part 1 Install and import libraries

In [None]:
!pip install -U datasets

In [None]:
# Huggingface libraries
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel, PeftConfig
from datasets import load_dataset

# Pytorch
import torch

## Part 2 Set global parameters

In [None]:
# Model path in drive
model_path = f"/content/drive/MyDrive/ECE284_SP25/output_models/model_05012025"

# Datasets
dataset_name = "GBaker/MedQA-USMLE-4-options"

# Device
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Current device: {device}")

Current device: cuda


## Part 3 Load fine-tuned model

In [None]:
# Model path
from google.colab import drive
drive.mount('/content/drive')

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Load model
config = PeftConfig.from_pretrained(model_path)
base_model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    device_map="auto",
    torch_dtype=torch.float16,  # Use half precision for inference
)
model = PeftModel.from_pretrained(base_model, model_path)

Mounted at /content/drive


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/660 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

## Part 4 Load dataset

In [None]:
# Define dataset preprocess function
def preprocess_evaluation(data):
    '''
        Preprocess dataset for training and validation.
        Each data is a dict.
    '''

    # Pick question, options and answers from data
    question = data["question"]
    answer = data["answer_idx"]

    options = [key + ". " + val for key, val in data["options"].items()]
    options = "\n".join(options)

    # Concatenate information
    instruction = "Please answering the following question "        \
                    "by selecting the correct answer.\n\n"          \
                    f"Question:\n {question}\n\n"                   \
                    f"Options: {options}\n\n"                       \
                    "Provide only the letter of the correct answer."

    # Add prompt format
    instruction_formatted = "<|im_start|>user\n"                    \
                            f"{instruction} <|im_end|>\n"           \
                            "<|im_start|>assistant\n"

    return instruction_formatted, answer

In [None]:
# Load dataset
test_dataset = load_dataset(dataset_name, split="test")

README.md:   0%|          | 0.00/654 [00:00<?, ?B/s]

phrases_no_exclude_train.jsonl:   0%|          | 0.00/16.2M [00:00<?, ?B/s]

phrases_no_exclude_test.jsonl:   0%|          | 0.00/2.08M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/10178 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1273 [00:00<?, ? examples/s]

## Part 5 Evaluate

In [None]:
# Evaluate
y_pred = []
y_true = []

for test_data in test_dataset:
    # generate question and answer
    question, real_answer = preprocess_evaluation(test_data)

    # Tokenize input question
    inputs = tokenizer(question, return_tensors="pt").to(device)

    # Get model's output
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=64, temperature=0.1)

    outputs = tokenizer.decode(outputs[0])
    outputs = outputs[len(question):]

    # Extract anwer from outputs:
    if "A" in outputs:
        predicted_answer = "A"
    elif "B" in outputs:
        predicted_answer = "B"
    elif "C" in outputs:
        predicted_answer = "C"
    elif "D" in outputs:
        predicted_answer = "D"
    else:
        predicted_answer = "Z"

    # print(outputs, real_answer)

    # Store answer to list
    y_pred.append(predicted_answer)
    y_true.append(real_answer)

In [None]:
cnt = 0
for i in range(len(y_pred)):
    if y_pred[i] == y_true[i]:
        cnt += 1

print(f"Accuracy: {cnt / len(y_pred)}")

Accuracy: 0.38177533385703066


## Part 6 Vanilla Qwen

In [None]:
# Load vanilla model
model_name = "Qwen/Qwen2.5-1.5B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.float16,
)

tokenizer_config.json:   0%|          | 0.00/7.30k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

In [None]:
# Evaluate
y_pred = []
y_true = []

for test_data in test_dataset:
    # generate question and answer
    question, real_answer = preprocess_evaluation(test_data)

    # Tokenize input question
    inputs = tokenizer(question, return_tensors="pt").to(device)

    # Get model's output
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=64, temperature=0.1)

    outputs = tokenizer.decode(outputs[0])
    outputs = outputs[len(question):]

    # Extract anwer from outputs:
    if "A" in outputs:
        predicted_answer = "A"
    elif "B" in outputs:
        predicted_answer = "B"
    elif "C" in outputs:
        predicted_answer = "C"
    elif "D" in outputs:
        predicted_answer = "D"
    else:
        predicted_answer = "Z"

    # print(outputs, real_answer)

    # Store answer to list
    y_pred.append(predicted_answer)
    y_true.append(real_answer)

In [None]:
cnt = 0
for i in range(len(y_pred)):
    if y_pred[i] == y_true[i]:
        cnt += 1

print(f"Accuracy: {cnt / len(y_pred)}")

Accuracy: 0.46190102120974075
