In [1]:
!pip install torch datasets transformers peft optimum auto-gptq tqdm

import torch
import requests
import json
import re
from peft import PeftModel
from transformers import AutoTokenizer, AutoModelForCausalLM
from huggingface_hub import login
from datasets import load_dataset, Dataset
from tqdm import tqdm
import os

Collecting optimum
  Downloading optimum-1.24.0-py3-none-any.whl.metadata (21 kB)
Collecting auto-gptq
  Downloading auto_gptq-0.7.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Collecting rouge (from auto-gptq)
  Downloading rouge-1.0.1-py3-none-any.whl.metadata (4.1 kB)
Collecting gekko (from auto-gptq)
  Downloading gekko-1.3.0-py3-none-any.whl.metadata (3.0 kB)
Downloading optimum-1.24.0-py3-none-any.whl (433 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m433.6/433.6 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0mta [36m0:00:01[0m
[?25hDownloading auto_gptq-0.7.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (23.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.5/23.5 MB[0m [31m20.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0mm
[?25hDownloading gekko-1.3.0-py3-none-any.whl (13.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.2/13.2 MB[0m [31m89.9 MB/s[0m eta [36m0:00

In [2]:
# Authenticate with the Hugging Face Hub without embedding a credential in the
# notebook. The token is read from the HF_TOKEN environment variable; when the
# variable is unset, `login` receives None and falls back to its own prompt.
# NOTE(review): the access token that used to be hardcoded on this line was
# stored in the notebook in plain text and should be revoked.
login(token=os.environ.get("HF_TOKEN"))

In [3]:
# Device setup: every model and input tensor in this notebook is moved to the
# CUDA device (no CPU fallback is configured here).
device_kind = "cuda"
device = torch.device(device_kind)

In [6]:
# Load the quantized Qwen base checkpoint and wrap it with the fine-tuned
# adapter weights stored under the Kaggle input directory.
model_name = "Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int4"
adapter_path = "/kaggle/input/qwen_finetuned_lora_quantized/pytorch/default/1/qwen_finetuned_lora_quantized/final"

tokenizer = AutoTokenizer.from_pretrained(model_name)
# Fall back to the EOS token for padding when the tokenizer defines no pad
# token, and pad on the left side of the sequence.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
)
base_model = base_model.to(device)

fine_tuned_model = PeftModel.from_pretrained(base_model, adapter_path)
fine_tuned_model = fine_tuned_model.to(device)

`low_cpu_mem_usage` was None, now default to True since model is quantized.


In [7]:
# Load the evaluator (judge) model and its tokenizer; this model scores the
# answers produced by the fine-tuned model.
evaluator_name = "Qwen/Qwen2.5-3B-Instruct"

eval_tokenizer = AutoTokenizer.from_pretrained(evaluator_name)
# Fall back to the EOS token for padding when the tokenizer defines no pad
# token, and pad on the left side of the sequence.
if eval_tokenizer.pad_token is None:
    eval_tokenizer.pad_token = eval_tokenizer.eos_token
eval_tokenizer.padding_side = "left"

eval_model = AutoModelForCausalLM.from_pretrained(
    evaluator_name,
    torch_dtype=torch.bfloat16,
)
eval_model = eval_model.to(device)

tokenizer_config.json:   0%|          | 0.00/7.30k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/661 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/35.6k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/3.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

In [8]:
# Load full dataset
def load_medical_dataset():
    """Return the "train" split of the TsinghuaC3I/UltraMedical-Preference dataset."""
    return load_dataset("TsinghuaC3I/UltraMedical-Preference", split="train")

In [9]:
# Extract question and correct answer
def extract_question_answer(row):
    """Pull the question and the reference answer out of one dataset row.

    Args:
        row: Mapping with a "prompt" entry (the question) and a "chosen"
            entry, an iterable of message dicts that carry "role" and
            "content" keys.

    Returns:
        Tuple ``(question, correct_answer)`` where ``correct_answer`` is the
        content of the first message in ``row["chosen"]`` whose role is
        "assistant".

    Raises:
        ValueError: If ``row["chosen"]`` holds no assistant message. (A bare
            ``next()`` would raise ``StopIteration`` here, which can silently
            terminate an enclosing iteration such as ``map``.)
    """
    question = row["prompt"]
    for entry in row["chosen"]:
        if entry["role"] == "assistant":
            return question, entry["content"]
    raise ValueError("row['chosen'] does not contain a message with role 'assistant'")

In [10]:
# Generate answers from fine-tuned model
def generate_answers_finetuned(question, num_answers=2):
    """Sample answers to ``question`` from the fine-tuned model.

    The question is wrapped in the tokenizer's chat template (system + user
    turn, with the generation prompt appended) and the model is sampled
    ``num_answers`` times with nucleus sampling.

    Args:
        question: The user question as plain text.
        num_answers: How many independent samples to draw. Defaults to 2.

    Returns:
        List of ``num_answers`` strings, each holding only the text the model
        generated for the assistant turn (whitespace-stripped).
    """
    prompt_messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": question}
    ]
    prompt_text = tokenizer.apply_chat_template(prompt_messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer(prompt_text, return_tensors="pt", padding=True).to(device)
    # Number of prompt tokens; the generated sequence starts right after them.
    prompt_length = inputs.input_ids.shape[1]

    answers = []
    for _ in range(num_answers):
        with torch.no_grad():
            output = fine_tuned_model.generate(
                inputs.input_ids,
                attention_mask=inputs.attention_mask,
                max_new_tokens=512,
                do_sample=True,
                top_p=0.95,
                temperature=0.7
            )
        # Decode only the newly generated tokens. Splitting the full decoded
        # text on the word "assistant" would cut off any answer that itself
        # contains that word (e.g. "physician assistant").
        new_tokens = output[0][prompt_length:]
        answers.append(tokenizer.decode(new_tokens, skip_special_tokens=True).strip())
    return answers

In [11]:
def rank_answers(question, correct_answer, generated_answers):
    """Ask the evaluator model to score generated answers against a reference.

    Args:
        question: The original question text.
        correct_answer: The reference answer the candidates are compared to.
        generated_answers: Sequence of candidate answer strings; they are
            numbered from 0 in the prompt.

    Returns:
        The text generated by the evaluator model (prompt excluded), which is
        expected to contain lines of the form
        ``Generated Answer <i> score: <value>``.
    """
    answer_section = "".join(
        f"Generated Answer {index}: {answer}\n\n"
        for index, answer in enumerate(generated_answers)
    )
    score_template = "".join(
        f"Generated Answer {index} score: \n"
        for index in range(len(generated_answers))
    )
    # Each instruction sentence ends with a space so that adjacent literals do
    # not run together ("correct)based", "points.Even").
    prompt = (
        f"Question: {question}\n"
        f"Correct Answer: {correct_answer}\n\n"
        "For each of the following generated answers, assign a score between 0 and 1 (where 1 is perfectly correct) "
        "based on how well it matches the correct answer. Consider factual accuracy, completeness, and key points. "
        "Even partially correct answers should receive a score above 0. Use decimals (e.g., 0.3, 0.7) as needed.\n\n"
        f"{answer_section}"
        "Output only the scores for each generated answer in this exact format:\n"
        f"{score_template}"
    )
    inputs = eval_tokenizer(prompt, return_tensors="pt", padding=True).to(device)
    prompt_length = inputs.input_ids.shape[1]
    with torch.no_grad():
        outputs = eval_model.generate(
            inputs.input_ids,
            attention_mask=inputs.attention_mask,
            max_new_tokens=1000,
            do_sample=True,
            top_p=0.95,
            temperature=0.8
        )
    # Return only the evaluator's continuation: the prompt itself ends with an
    # empty "Generated Answer <i> score:" template, and echoing it back would
    # let the score parser attribute a leading number to the wrong answer.
    return eval_tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

In [12]:
# Parse evaluator output safely
def safe_parse_evaluator_output(eval_output):
    """Extract per-answer scores from the evaluator's text output.

    Every occurrence of ``Generated Answer <index> score: <number>`` is
    collected; when an index appears more than once the last occurrence wins.

    Args:
        eval_output: Text produced by the evaluator model.

    Returns:
        Dict mapping the integer answer index to its score as a float.
        Entries whose numbers cannot be converted are skipped.
    """
    pattern = r"Generated Answer (\d+) score:\s*([0-9]*\.?[0-9]+)"
    parsed = {}
    for found in re.finditer(pattern, eval_output):
        answer_index, raw_score = found.groups()
        try:
            parsed[int(answer_index)] = float(raw_score)
        except ValueError:
            # Skip anything that cannot be converted to a number.
            continue
    return parsed

In [15]:
def evaluate_finetuned_model(start_row=0, end_row=250):
    """Generate and score answers for a window of dataset rows.

    For each row in ``[start_row, end_row)`` the fine-tuned model produces
    candidate answers, the evaluator model scores them against the reference
    answer, and the results are collected into a ``Dataset``.

    Args:
        start_row: Index of the first dataset row to evaluate. Defaults to 0.
        end_row: Index one past the last row to evaluate; clamped to the
            dataset length. Defaults to 250.

    Returns:
        ``Dataset`` with the columns "question", "correct_answer",
        "generated_answers" (list of strings) and "scores" (list of floats
        aligned with "generated_answers"; ``None`` where no score could be
        parsed for that answer).
    """
    dataset = load_medical_dataset()
    print(f"Loaded dataset with {len(dataset)} rows")

    end_row = min(end_row, len(dataset))

    # Lists to store results
    questions = []
    correct_answers = []
    generated_answers_list = []
    scores_list = []

    for i in tqdm(range(start_row, end_row), desc="Evaluating rows"):
        row = dataset[i]
        question, correct_answer = extract_question_answer(row)

        generated_answers = generate_answers_finetuned(question)

        eval_output = rank_answers(question, correct_answer, generated_answers)
        scores = safe_parse_evaluator_output(eval_output)

        questions.append(question)
        correct_answers.append(correct_answer)
        generated_answers_list.append(generated_answers)
        # Store scores positionally instead of as a dict keyed by int: Arrow
        # struct fields need string keys, so int-keyed dicts are expected to
        # make Dataset.from_dict fail after the whole loop has already run.
        scores_list.append(
            [scores.get(answer_index) for answer_index in range(len(generated_answers))]
        )

    evaluation_dataset = Dataset.from_dict({
        "question": questions,
        "correct_answer": correct_answers,
        "generated_answers": generated_answers_list,
        "scores": scores_list
    })
    return evaluation_dataset

In [None]:
# Run the evaluation loop (generation + scoring for every selected row) and
# keep the resulting Dataset for the save step in the next cell.
evaluation_dataset = evaluate_finetuned_model()

Loaded dataset with 109353 rows


Evaluating rows:   0%|          | 0/250 [00:00<?, ?it/s]

In [None]:
# Save dataset locally: write the evaluation results to the Kaggle working
# directory, creating the target folder first if it does not exist yet.
output_dir = "/kaggle/working/dataset"
os.makedirs(output_dir, exist_ok=True)
evaluation_dataset.save_to_disk(output_dir)
print(f"Evaluation dataset saved to {output_dir}")