In [19]:
import datasets
import pandas as pd
import torch
import transformers

In [8]:
# Load the MedRAG textbook corpus from the Hugging Face Hub, then convert its
# "train" split into a pandas DataFrame.
dataset = datasets.load_dataset("MedRAG/textbooks")
train_split = dataset["train"]
df = train_split.to_pandas()

In [49]:
# Pick the accelerator at runtime instead of hard-coding Apple's MPS backend,
# so the notebook also runs on CUDA or CPU-only machines. MPS is checked first
# to keep the behaviour unchanged on machines where it is available.
if torch.backends.mps.is_available():
    device = "mps"
elif torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

model = transformers.AutoModelForCausalLM.from_pretrained("google/gemma-2b").to(device)

Loading checkpoint shards: 100%|██████████| 2/2 [00:22<00:00, 11.03s/it]


In [50]:
import json
from pathlib import Path

import requests
import tqdm

url = "https://raw.githubusercontent.com/Teddy-XiongGZ/MIRAGE/main/benchmark.json"
benchmark_path = Path("benchmark.json")

# Download only when there is no local copy, so re-running the notebook does
# not repeat a slow transfer. Delete benchmark.json to force a fresh download.
if not benchmark_path.exists():
    # Stream into a temporary file and rename it at the end, so an interrupted
    # download can never be mistaken for a complete benchmark file.
    partial_path = benchmark_path.with_name(benchmark_path.name + ".part")
    with requests.get(url, stream=True, timeout=30) as r:
        # Fail loudly on 4xx/5xx instead of saving an error page as JSON.
        r.raise_for_status()
        # Content-Length can be missing; tqdm accepts total=None in that case.
        # NOTE(review): for a compressed response the header presumably gives
        # the on-the-wire size, so the decoded byte count may exceed it - the
        # total is only an estimate.
        content_length = r.headers.get("Content-Length")
        total = int(content_length) if content_length is not None else None
        with open(partial_path, "wb") as f, tqdm.tqdm(unit="B", unit_scale=True, total=total) as pbar:
            for chunk in r.iter_content(chunk_size=64 * 1024):
                if chunk:
                    f.write(chunk)
                    pbar.update(len(chunk))
    partial_path.replace(benchmark_path)

with open(benchmark_path, "r", encoding="utf-8") as f:
    benchmark = json.load(f)

4296735B [15:01, 4764.21B/s]              


In [51]:
# Flatten the nested benchmark mapping ({dataset name: {entry key: record}})
# into one row per record, keeping only the fields used downstream.
#
# The loop variable is deliberately not called `dataset`: an earlier cell binds
# that name to the loaded Hugging Face dataset, and rebinding it to a string
# here would break that cell if it were re-run afterwards.
rows = [
    {
        'dataset': dataset_name,
        'question': record['question'],
        'options': record['options'],
        'answer': record['answer'],
    }
    for dataset_name, records in benchmark.items()
    for record in records.values()
]

benchmark_df = pd.DataFrame(rows)
benchmark_df

Unnamed: 0,dataset,question,options,answer
0,medqa,A junior orthopaedic surgery resident is compl...,{'A': 'Disclose the error to the patient and p...,B
1,medqa,A 67-year-old man with transitional cell carci...,"{'A': 'Inhibition of proteasome', 'B': 'Hypers...",D
2,medqa,Two weeks after undergoing an emergency cardia...,"{'A': 'Renal papillary necrosis', 'B': 'Choles...",B
3,medqa,A 39-year-old woman is brought to the emergenc...,"{'A': 'Coagulase-positive, gram-positive cocci...",D
4,medqa,A 35-year-old man comes to the physician becau...,"{'A': 'Erythromycin ointment', 'B': 'Ketotifen...",B
...,...,...,...,...
7658,mmlu,A 64-year-old female presents to the office wi...,"{'A': 'amyotrophic lateral sclerosis', 'B': 'F...",A
7659,mmlu,Four days after undergoing open reduction and ...,"{'A': 'Adverse effect of medication', 'B': 'Al...",B
7660,mmlu,A 57-year-old woman comes to the physician bec...,"{'A': 'Donepezil therapy', 'B': 'Ferrous sulfa...",D
7661,mmlu,A 40-year-old man with paranoid schizophrenia ...,{'A': 'Administration of ipecac to induce vomi...,D


In [53]:
# make a multiple-choice question from a row.
def make_question(row):
    """Render one benchmark row as a multiple-choice question string.

    Args:
        row: Mapping-like object (for example a DataFrame row) with a
            "question" entry and an "options" entry; "options" must provide
            ``.items()`` yielding (label, text) pairs.

    Returns:
        The question, a blank line, then one "label: text" line per option.
    """
    stem = row["question"]
    option_lines = []
    for label, text in row["options"].items():
        option_lines.append(f"{label}: {text}")
    body = "\n".join(option_lines)
    return f"{stem}\n\n{body}"


print(make_question(benchmark_df.iloc[0]))

# Adjacent string literals are concatenated verbatim, so every sentence must
# carry its own terminating period and trailing space; otherwise the sentences
# run together in the prompt.
system_prompt = (
    "You are a helpful medical expert, and your task is to answer a multi-choice medical question. "
    "Please first think step-by-step and then choose the answer from the provided options. "
    "Your responses will be used for research purposes only, so please have a definite answer. "
    'Your output should be of the form "The answer is [A/B/C/D/E]."'
)

# ask gemma a question
tokenizer = transformers.AutoTokenizer.from_pretrained("google/gemma-2b")
# Separate the instructions from the question with a blank line so the last
# instruction sentence does not run straight into the question text.
prompt = f"{system_prompt}\n\n{make_question(benchmark_df.iloc[0])}"
# Put the inputs on whatever device the model lives on, not a hard-coded one.
tokens = tokenizer([prompt], return_tensors="pt").to(model.device)
# max_length counts the prompt tokens as well, so a long question could leave
# little or no room for the answer; max_new_tokens bounds only the completion.
output = model.generate(**tokens, max_new_tokens=256)

A junior orthopaedic surgery resident is completing a carpal tunnel repair with the department chairman as the attending physician. During the case, the resident inadvertently cuts a flexor tendon. The tendon is repaired without complication. The attending tells the resident that the patient will do fine, and there is no need to report this minor complication that will not harm the patient, as he does not want to make the patient worry unnecessarily. He tells the resident to leave this complication out of the operative report. Which of the following is the correct next action for the resident to take?

A: Disclose the error to the patient and put it in the operative report
B: Tell the attending that he cannot fail to disclose this mistake
C: Report the physician to the ethics committee
D: Refuse to dictate the operative report


In [54]:
# Convert the first generated sequence back into text; the decoded string is
# the cell's displayed value.
first_sequence = output[0]
tokenizer.decode(first_sequence)

'<bos>You are a helpful medical expert, and your task is to answer a multi-choice medical question using the relevant documents. Please first think step-by-step and then choose the answer from the provided options.Your final output should be of the form, "The answer is [A/B/C/D/E]."Your responses will be used for research purposes only, so please have a definite answer.A junior orthopaedic surgery resident is completing a carpal tunnel repair with the department chairman as the attending physician. During the case, the resident inadvertently cuts a flexor tendon. The tendon is repaired without complication. The attending tells the resident that the patient will do fine, and there is no need to report this minor complication that will not harm the patient, as he does not want to make the patient worry unnecessarily. He tells the resident to leave this complication out of the operative report. Which of the following is the correct next action for the resident to take?\n\nA: Disclose the 