In [1]:
import pandas as pd
import numpy as np
import pickle
import csv
import os
import time

In [2]:
from google.colab import drive, auth

In [3]:
print(auth.authenticate_user(project_id='mit-mlhc'))

# Mount Google Drive
drive.mount('/content/drive', force_remount=True)

None
Mounted at /content/drive


In [4]:
import gspread
from google.auth import default
creds, _ = default()

In [5]:
gc = gspread.authorize(creds)

In [6]:
path_to_worksheet = 'MCQ2.0'
worksheet = gc.open(path_to_worksheet).sheet1
rows = worksheet.get_all_values()
prompt_df = pd.DataFrame.from_records(rows)
prompt_df.columns = prompt_df.iloc[0]  # Set the first row as column names
prompt_df.drop(prompt_df.index[0], inplace=True)  # Drop the first row since it's now the header
prompt_df

Unnamed: 0,question_id,source,labels,topic,question,answer,explanation
1,1a,apdvs,"cs, lq",carotid disorders,A 75-year-old male smoke presents with recent ...,D,Right carotid endarterectomy. The patient has ...
2,1b,apdvs,"cs, sq",carotid disorders,A 75-year-old man had temporary vision loss in...,D,Right carotid endarterectomy. The patient has ...
3,2a,apdvs,"gk, lq",carotid disorders,What is the first muscle layer encountered in...,C,The platysma is the most superficial muscle in...
4,2b,apdvs,"gk, sq",carotid disorders,"During a carotid endarterectomy, which neck mu...",C,The platysma is the most superficial muscle in...
5,3a,apdvs,"gk, lq",lower extremity arterial disease,A 65-year old male with a history of hyperten...,B,Claudication is defined as reproducible pain o...
...,...,...,...,...,...,...,...
447,56.ten,case_studies,cs,venuous disease,A 19 year old female was brought in to the Eme...,C,
448,57.2,case_studies,gk,venuous disease,Which of the following statements regarding th...,"B, C, E",
449,57.3,case_studies,gk,venuous disease,Which of the following statements regarding th...,"A, B, D",
450,57.4,case_studies,gk,venuous disease,What are the long-term results of derivative a...,"A, C",


In [22]:
id_question = {}
for row in prompt_df.iterrows():
  id_question[row[1]["question_id"]] = row[1]

In [9]:
pre_prompts = {
1: "",
2: """You are a vascular surgeon responsible for teaching medical students. Provide concise, medically accurate, and ethically appropriate answers based on current standards of care. Focus only on clinical education relevant to surgery.""",
3: """You are a helpful medical knowledge assistant specializing in teaching vascular surgery to medical students. Provide useful, complete, and scientifically-grounded answers to clinical questions relevant to surgery and medical care.""",
4: """Answer the following multiple choice question from the medical domain based on these instructions. Choose one or more of the provided answer choices which are correct and output them under the heading "Answer".""",
5: """You are a helpful medical knowledge assistant specializing in teaching vascular surgery to medical students.
Answer the following multiple choice question from the medical domain based on the following instructions.
1. Select the one or more correct options and provide the option(s) under the heading "Answer".
2. Always select one or more of the provided options as the answer.
3. If the options are ambiguous or the question does not have enough context, select the ones that best answer the question.
4. If the question is not clinically relevant to vascular surgery, please respond with, "I'm sorry, I cannot answer this question." """
}

few_shot = """Question:
Which of the following is the most common site for peripheral arterial disease (PAD)?
A. Carotid artery
B. Femoral-popliteal artery
C. Renal artery
D. Subclavian artery
Answer:
B. Femoral-popliteal artery

Question:
Which of the following are common risk factors for developing an abdominal aortic aneurysm (AAA)?
A. Smoking
B. Male gender
C. Hypertension
D. Type 1 diabetes mellitus
E. Family history of AAA
Answer:
A. Smoking
B. Male gender
C. Hypertension
E. Family history of AAA
"""

pre_prompts[6] = few_shot

In [49]:
# final prompts
pre_prompts = {
1: "",
2: """You are a vascular surgeon responsible for teaching medical students. Provide concise, medically accurate, and ethically appropriate answers based on current standards of care. Focus only on clinical education relevant to surgery.""",
3: """You are a helpful medical knowledge assistant specializing in teaching vascular surgery to medical students. Provide useful, complete, and scientifically-grounded answers to clinical questions relevant to surgery and medical care.""",
4: """Answer the following multiple choice question from the medical domain based on these instructions. Choose one or more of the provided answer choices which are correct and output them under the heading "Answer".""",
5: """You are a helpful medical knowledge assistant specializing in teaching vascular surgery to medical students.
Answer the following multiple choice question from the medical domain based on the following instructions.
1. Select the one or more correct options and provide the option(s) under the heading "Answer".
2. Always select one or more of the provided options as the answer.
3. If the options are ambiguous or the question does not have enough context, select the ones that best answer the question.
4. If the question is not clinically relevant to vascular surgery, please respond with, "I'm sorry, I cannot answer this question." """
}

few_shot = """Question:
Which of the following is the most common site for peripheral arterial disease (PAD)?
A. Carotid artery
B. Femoral-popliteal artery
C. Renal artery
D. Subclavian artery
Answer:
B. Femoral-popliteal artery

Question:
Which of the following are common risk factors for developing an abdominal aortic aneurysm (AAA)?
A. Smoking
B. Male gender
C. Hypertension
D. Type 1 diabetes mellitus
E. Family history of AAA
Answer:
A. Smoking
B. Male gender
C. Hypertension
E. Family history of AAA
"""

pre_prompts[6] = few_shot

In [None]:
print(f'{changed_qids = }')

changed_qids = ['5a', '75', '90', '369', '399', '402', '404', '433', '14.6', '18.2', '18.3', '18.4', '18.5', '18.7', '18.8', '18.9', '18.ten', '20.1', '21.5', '27.1', '27.5', '27.6', '27.8', '27.9', '27.ten', '27.11', '32.1', '32.7', '37.1', '37.2', '37.3', '42.5', '43.9', '43.ten', '46.5', '48.5', '49.5', '49.8', '49.9', '53.5']


In [45]:
models = ["FreedomIntelligence/Apollo-0.5B", "johnsnowlabs/JSL-MedPhi2-2.7B", "skumar9/Llama-medx_v3.2", "microsoft/Phi-4-mini-instruct", "gemini-2.0-flash-001"]

In [None]:
p_toks = {}
model_toks = {}
model_name = models[0]
for model_name, tokenizer in model_tok.items():
  for prompt in id_question.values():
    key = (prompt["prompt_id"], prompt["question_id"], model_name)
    query = prompt["query"]['content']
    tokens = tokenizer(query)
    print(tokens)
    num_toks = len(tokens['input_ids'])
    p_toks[key] = (tokens, num_toks)
    model_toks.setdefault(model_name, []).append(num_toks)





Output hidden; open in https://colab.research.google.com to view.

In [None]:
worksheet = gc.open('My cool spreadsheet').sheet1
worksheet.append_row(["prompt_id", "question_id", "model", "answer", "response", "correct"])

{'spreadsheetId': '1kUPTZMyOBMOp6C8EGnCybytruMc-A3tGPh-Kfxpo2nU',
 'updates': {'spreadsheetId': '1kUPTZMyOBMOp6C8EGnCybytruMc-A3tGPh-Kfxpo2nU',
  'updatedRange': 'Sheet1!A1:F1',
  'updatedRows': 1,
  'updatedColumns': 6,
  'updatedCells': 6}}

In [12]:
path_to_worksheet = 'MCQ2.0'
worksheet = gc.open(path_to_worksheet).get_worksheet(4)
rows = worksheet.get_all_values()
eval_df = pd.DataFrame.from_records(rows)
eval_df.columns = eval_df.iloc[0]  # Set the first row as column names
eval_df.drop(eval_df.index[0], inplace=True)  # Drop the first row since it's now the header
eval_df

Unnamed: 0,prompt_id,question_id,model,answer,response,correct
1,2,10a,FreedomIntelligence/Apollo-0.5B,A. Using anatomic landmarks to confirm cannula...,### Instructions\nYou are a vascular surgeon r...,Y
2,3,10a,FreedomIntelligence/Apollo-0.5B,A. Using anatomic landmarks to confirm cannula...,### Correct Answer: D. Using the Seldinger tec...,N
3,4,10a,FreedomIntelligence/Apollo-0.5B,A. Using anatomic landmarks to confirm cannula...,D. Using the Seldinger technique for first acc...,N
4,5,10a,FreedomIntelligence/Apollo-0.5B,A. Using anatomic landmarks to confirm cannula...,### Correct Answer:\nD. Using the Seldinger te...,N
5,6,10a,FreedomIntelligence/Apollo-0.5B,A. Using anatomic landmarks to confirm cannula...,### Instructions\nQuestion:\nWhich of the foll...,N
...,...,...,...,...,...,...
2550,6,9b,skumar9/Llama-medx_v3.2,C. Consider endovenous ablation of refluxing s...,B Perform surgical ligation of perforator vein...,N
2551,5,9b,skumar9/Llama-medx_v3.2,C. Consider endovenous ablation of refluxing s...,C Consider endovenous ablation of refluxing su...,Y
2552,2,9b,skumar9/Llama-medx_v3.2,C. Consider endovenous ablation of refluxing s...,C Consider endovenous ablation of refluxing su...,Y
2553,3,9b,skumar9/Llama-medx_v3.2,C. Consider endovenous ablation of refluxing s...,C Consider endovenous ablation of refluxing su...,Y


In [14]:
# Do some stats stuff

path_to_worksheet = 'MCQ2.0'
worksheet = gc.open(path_to_worksheet).get_worksheet(3)
rows = worksheet.get_all_values()
eval_df2 = pd.DataFrame.from_records(rows)
eval_df2.columns = eval_df2.iloc[0]  # Set the first row as column names
eval_df2.drop(eval_df2.index[0], inplace=True)  # Drop the first row since it's now the header
eval_df2

Unnamed: 0,prompt_id,question_id,model,answer,response,correct
1,1,423,skumar9/Llama-medx_v3.2,A. Aortic stent graft.\n B. PTA and stenting o...,"D Extra-anatomic revascularization, e.g. femor...",N
2,1,28.3,skumar9/Llama-medx_v3.2,A. The best patency can be achieved using a ve...,D Conservative treatment with long term chroni...,N
3,1,56.8,skumar9/Llama-medx_v3.2,B. Compression bandage,B Compression bandage [A],Y
4,1,75,skumar9/Llama-medx_v3.2,D. Rupture of alveolar bleb,A Lung collapse [A],N
5,1,30.5,skumar9/Llama-medx_v3.2,A. Regular clinic review with attention to med...,A Regular clinic review with attention to medi...,Y
...,...,...,...,...,...,...
2251,1,35.4,gemini-2.0-flash-001,"C. When the carotid body tumor is growing, enc...",Let's analyze each statement:\n \n * **A. A Sh...,N
2252,1,373,gemini-2.0-flash-001,C. Transfer to the operating room for emergent...,The correct answer is **C. Transfer to the ope...,Y
2253,1,11a,gemini-2.0-flash-001,C. Transfer to the operating room for emergent...,The correct answer is **C. Transfer to the ope...,Y
2254,1,57.3,gemini-2.0-flash-001,A. Lymphangiography is currently the best diag...,Let's analyze each statement:\n \n * **A. Lymp...,N


In [15]:
model_qids = {}
model_prompts = {}
model_keys = {}
all_prompt_qs = set()

key_row = {}
for idx, row in eval_df.iterrows():
  assert row["correct"] == "Y" or row["correct"] == "N"
  mod = row["model"]
  qid = row["question_id"]
  model_qids.setdefault(mod, set()).add(qid)
  model_prompts.setdefault(mod, set()).add(row["prompt_id"])
  model_keys.setdefault(mod, set()).add((qid, row["prompt_id"]))
  key_row[(row["prompt_id"], qid, mod)] = row
  if row["prompt_id"] != "1":
    all_prompt_qs.add(row["question_id"])

for idx, row in eval_df2.iterrows():
  assert row["correct"] == "Y" or row["correct"] == "N", f'{row.to_dict()}'
  mod = row["model"]
  qid = row["question_id"]
  model_qids.setdefault(mod, set()).add(qid)
  model_prompts.setdefault(mod, set()).add(row["prompt_id"])
  model_keys.setdefault(mod, set()).add((qid, row["prompt_id"]))
  key_row[(row["prompt_id"], qid, mod)] = row

for model in model_qids:
  print(model, len(model_qids[model]))
  print(model, len(model_prompts[model]))
  print(model, len(model_keys[model]))

FreedomIntelligence/Apollo-0.5B 451
FreedomIntelligence/Apollo-0.5B 6
FreedomIntelligence/Apollo-0.5B 1286
johnsnowlabs/JSL-MedPhi2-2.7B 451
johnsnowlabs/JSL-MedPhi2-2.7B 6
johnsnowlabs/JSL-MedPhi2-2.7B 1286
skumar9/Llama-medx_v3.2 451
skumar9/Llama-medx_v3.2 6
skumar9/Llama-medx_v3.2 1286
microsoft/Phi-4-mini-instruct 451
microsoft/Phi-4-mini-instruct 1
microsoft/Phi-4-mini-instruct 451
gemini-2.0-flash-001 451
gemini-2.0-flash-001 1
gemini-2.0-flash-001 451


In [18]:
# overall accuracy
#models = ["FreedomIntelligence/Apollo-0.5B", "johnsnowlabs/JSL-MedPhi2-2.7B", "skumar9/Llama-medx_v3.2"]
columns = ["Model", "Num Correct", "Num Questions", "% Accuracy"]
rows = []



for model in models:
  res = [row["correct"] == "Y" for row in key_row.values() if row["model"] == model]
  rows.append([model, sum(res), len(res), round(100*sum(res)/len(res) if res else 0, 1)])
pd.DataFrame(rows, columns=columns)

Unnamed: 0,Model,Num Correct,Num Questions,% Accuracy
0,FreedomIntelligence/Apollo-0.5B,233,1286,18.1
1,johnsnowlabs/JSL-MedPhi2-2.7B,432,1286,33.6
2,skumar9/Llama-medx_v3.2,490,1286,38.1
3,microsoft/Phi-4-mini-instruct,195,451,43.2
4,gemini-2.0-flash-001,235,451,52.1


In [19]:
# base accuracy
#models = ["FreedomIntelligence/Apollo-0.5B", "johnsnowlabs/JSL-MedPhi2-2.7B", "skumar9/Llama-medx_v3.2"]
columns = ["Model", "Num Correct", "Num Questions", "% Accuracy"]
rows = []

for model in models:
  res = [row["correct"] == "Y" for row in key_row.values() if row["model"] == model and row["prompt_id"]=="1"]
  rows.append([model, sum(res), len(res), round(100*sum(res)/len(res) if res else 0, 1)])
pd.DataFrame(rows, columns=columns)

Unnamed: 0,Model,Num Correct,Num Questions,% Accuracy
0,FreedomIntelligence/Apollo-0.5B,109,451,24.2
1,johnsnowlabs/JSL-MedPhi2-2.7B,152,451,33.7
2,skumar9/Llama-medx_v3.2,162,451,35.9
3,microsoft/Phi-4-mini-instruct,195,451,43.2
4,gemini-2.0-flash-001,235,451,52.1


In [20]:
# make table
#models = ["FreedomIntelligence/Apollo-0.5B", "johnsnowlabs/JSL-MedPhi2-2.7B", "skumar9/Llama-medx_v3.2"]
columns = ["Model", "Prompt", "Num Correct", "Num Questions", "% Accuracy"]
rows = []

for model in models:
  for prompt in pre_prompts:
    res = [row["correct"] == "Y" for row in key_row.values()
    if row["model"] == model and row["prompt_id"] == str(prompt) and row["question_id"] in all_prompt_qs]
    rows.append([model, prompt, sum(res), len(res), round(100*sum(res)/len(res) if res else 0, 1)])
pd.DataFrame(rows, columns=columns)

Unnamed: 0,Model,Prompt,Num Correct,Num Questions,% Accuracy
0,FreedomIntelligence/Apollo-0.5B,1,35,167,21.0
1,FreedomIntelligence/Apollo-0.5B,2,29,167,17.4
2,FreedomIntelligence/Apollo-0.5B,3,32,167,19.2
3,FreedomIntelligence/Apollo-0.5B,4,35,167,21.0
4,FreedomIntelligence/Apollo-0.5B,5,27,167,16.2
5,FreedomIntelligence/Apollo-0.5B,6,1,167,0.6
6,johnsnowlabs/JSL-MedPhi2-2.7B,1,52,167,31.1
7,johnsnowlabs/JSL-MedPhi2-2.7B,2,54,167,32.3
8,johnsnowlabs/JSL-MedPhi2-2.7B,3,56,167,33.5
9,johnsnowlabs/JSL-MedPhi2-2.7B,4,59,167,35.3


In [38]:
columns = ["Model", "Source", "Num Correct", "Num Questions", "% Accuracy"]
rows = []
for source in sources:
  for model in models:


    res = [row["correct"] == "Y" for row in key_row.values()
    if row["model"] == model and id_question[row["question_id"]]["source"] == source and row["prompt_id"] == "1"]
    rows.append([model, source, sum(res), len(res), round(100*sum(res)/len(res) if res else 0, 1)])
pd.DataFrame(rows, columns=columns)

Unnamed: 0,Model,Source,Num Correct,Num Questions,% Accuracy
0,FreedomIntelligence/Apollo-0.5B,case_studies,77,317,24.3
1,johnsnowlabs/JSL-MedPhi2-2.7B,case_studies,102,317,32.2
2,skumar9/Llama-medx_v3.2,case_studies,109,317,34.4
3,microsoft/Phi-4-mini-instruct,case_studies,133,317,42.0
4,gemini-2.0-flash-001,case_studies,142,317,44.8
5,FreedomIntelligence/Apollo-0.5B,apdvs,26,61,42.6
6,johnsnowlabs/JSL-MedPhi2-2.7B,apdvs,34,61,55.7
7,skumar9/Llama-medx_v3.2,apdvs,39,61,63.9
8,microsoft/Phi-4-mini-instruct,apdvs,43,61,70.5
9,gemini-2.0-flash-001,apdvs,48,61,78.7


In [33]:
#models = ["FreedomIntelligence/Apollo-0.5B", "johnsnowlabs/JSL-MedPhi2-2.7B", "skumar9/Llama-medx_v3.2"]
raw_tags = []
[raw_tags.extend([x.strip() for x in q["labels"].split(",")]) for q in id_question.values()]
tags = sorted(list(set(raw_tags)), key=lambda x: raw_tags.count(x), reverse=True)
print(tags)
columns = ["Model", "Tag", "Num Correct", "Num Questions", "% Accuracy"]
rows = []
for tag in tags:
  for model in models:

    res = [row["correct"] == "Y" for row in key_row.values()
    if row["model"] == model and tag in id_question[row["question_id"]]["labels"] and row["prompt_id"] == "1"]
    rows.append([model, tag, sum(res), len(res), round(100*sum(res)/len(res) if res else 0, 1)])
pd.DataFrame(rows, columns=columns)

['gk', 'cs', 'sq', 'lq', 'fu']


Unnamed: 0,Model,Tag,Num Correct,Num Questions,% Accuracy
0,FreedomIntelligence/Apollo-0.5B,gk,40,235,17.0
1,johnsnowlabs/JSL-MedPhi2-2.7B,gk,62,235,26.4
2,skumar9/Llama-medx_v3.2,gk,65,235,27.7
3,FreedomIntelligence/Apollo-0.5B,cs,69,216,31.9
4,johnsnowlabs/JSL-MedPhi2-2.7B,cs,90,216,41.7
5,skumar9/Llama-medx_v3.2,cs,97,216,44.9
6,FreedomIntelligence/Apollo-0.5B,sq,7,13,53.8
7,johnsnowlabs/JSL-MedPhi2-2.7B,sq,9,13,69.2
8,skumar9/Llama-medx_v3.2,sq,8,13,61.5
9,FreedomIntelligence/Apollo-0.5B,lq,7,13,53.8


In [30]:
#models = ["FreedomIntelligence/Apollo-0.5B", "johnsnowlabs/JSL-MedPhi2-2.7B", "skumar9/Llama-medx_v3.2"]
raw_topics = []
[raw_topics.extend([x.strip() for x in q["topic"].split(",")]) for q in id_question.values()]
topics = sorted(list(set(raw_topics)), key=lambda x: raw_topics.count(x), reverse=True)
print(topics)
columns = ["Model", "Topic", "Num Correct", "Num Questions", "% Accuracy"]
rows = []
for topic in topics:
  for model in models:

    res = [row["correct"] == "Y" for row in key_row.values()
    if row["model"] == model and topic == id_question[row["question_id"]]["topic"] and row["prompt_id"] == "1"]
    rows.append([model, topic, sum(res), len(res), round(100*sum(res)/len(res) if res else 0, 1)])
pd.DataFrame(rows, columns=columns)

['lower extremity arterial disease', 'venuous disease', 'aortic disorders', 'other', 'carotid disorders', 'endovascular surgery']


Unnamed: 0,Model,Topic,Num Correct,Num Questions,% Accuracy
0,FreedomIntelligence/Apollo-0.5B,lower extremity arterial disease,35,128,27.3
1,johnsnowlabs/JSL-MedPhi2-2.7B,lower extremity arterial disease,48,128,37.5
2,skumar9/Llama-medx_v3.2,lower extremity arterial disease,55,128,43.0
3,microsoft/Phi-4-mini-instruct,lower extremity arterial disease,54,128,42.2
4,gemini-2.0-flash-001,lower extremity arterial disease,71,128,55.5
5,FreedomIntelligence/Apollo-0.5B,venuous disease,35,126,27.8
6,johnsnowlabs/JSL-MedPhi2-2.7B,venuous disease,42,126,33.3
7,skumar9/Llama-medx_v3.2,venuous disease,42,126,33.3
8,microsoft/Phi-4-mini-instruct,venuous disease,56,126,44.4
9,gemini-2.0-flash-001,venuous disease,67,126,53.2


In [39]:
columns = ["Model", "source", "Tag", "Num Correct", "Num Questions", "% Accuracy"]
rows = []

for source in sources:
  for tag in tags:
    for model in models:
      if tag not in "gk, cs":
        continue
      res = [row["correct"] == "Y" for row in key_row.values()
      if row["model"] == model  and id_question[row["question_id"]]["source"] == source and tag in id_question[row["question_id"]]["labels"] and row["prompt_id"] == "1"]
      if not res:
        continue
      rows.append([model, source, tag, sum(res), len(res), round(100*sum(res)/len(res) if res else 0, 1)])
pd.DataFrame(rows, columns=columns)

Unnamed: 0,Model,source,Tag,Num Correct,Num Questions,% Accuracy
0,FreedomIntelligence/Apollo-0.5B,case_studies,gk,28,155,18.1
1,johnsnowlabs/JSL-MedPhi2-2.7B,case_studies,gk,43,155,27.7
2,skumar9/Llama-medx_v3.2,case_studies,gk,48,155,31.0
3,microsoft/Phi-4-mini-instruct,case_studies,gk,62,155,40.0
4,gemini-2.0-flash-001,case_studies,gk,66,155,42.6
5,FreedomIntelligence/Apollo-0.5B,case_studies,cs,49,162,30.2
6,johnsnowlabs/JSL-MedPhi2-2.7B,case_studies,cs,59,162,36.4
7,skumar9/Llama-medx_v3.2,case_studies,cs,61,162,37.7
8,microsoft/Phi-4-mini-instruct,case_studies,cs,71,162,43.8
9,gemini-2.0-flash-001,case_studies,cs,76,162,46.9


In [43]:
#models = ["FreedomIntelligence/Apollo-0.5B", "johnsnowlabs/JSL-MedPhi2-2.7B", "skumar9/Llama-medx_v3.2"]
#raw_tags = []
#[raw_tags.extend([x.strip() for x in q["tags"]]) for q in id_question.values()]
#tags = sorted(list(set(raw_tags)), key=lambda x: raw_tags.count(x), reverse=True)
#print(tags)
columns = ["Model", "source", "Tag", "#c>1", "Num Correct", "Num Questions", "% Accuracy"]
rows = []
for model in models:
  for source in sources:
    for tag in tags:
      if tag not in "gk, cs":
        continue
      res = [row["correct"] == "Y" for row in key_row.values()
      if row["model"] == model  and  len(id_question[row["question_id"]]["answer"].split(",")) == 1 and
        id_question[row["question_id"]]["source"] == source and tag in id_question[row["question_id"]]["labels"] and row["prompt_id"] == "1"]
      if res:
        rows.append([model, source, tag, "1", sum(res), len(res), round(100*sum(res)/len(res) if res else 0, 1)])
      res = [row["correct"] == "Y" for row in key_row.values()
      if row["model"] == model  and  len(id_question[row["question_id"]]["answer"].split(",")) > 1 and
        id_question[row["question_id"]]["source"] == source and tag in id_question[row["question_id"]]["labels"] and row["prompt_id"] == "1"]
      if res:
        rows.append([model, source, tag, ">1", sum(res), len(res), round(100*sum(res)/len(res) if res else 0, 1)])
pd.DataFrame(rows, columns=columns)

Unnamed: 0,Model,source,Tag,#c>1,Num Correct,Num Questions,% Accuracy
0,FreedomIntelligence/Apollo-0.5B,case_studies,gk,1,24,74,32.4
1,FreedomIntelligence/Apollo-0.5B,case_studies,gk,>1,4,81,4.9
2,FreedomIntelligence/Apollo-0.5B,case_studies,cs,1,47,109,43.1
3,FreedomIntelligence/Apollo-0.5B,case_studies,cs,>1,2,53,3.8
4,FreedomIntelligence/Apollo-0.5B,apdvs,gk,1,10,20,50.0
5,FreedomIntelligence/Apollo-0.5B,apdvs,cs,1,16,41,39.0
6,FreedomIntelligence/Apollo-0.5B,racs,gk,1,1,5,20.0
7,FreedomIntelligence/Apollo-0.5B,racs,gk,>1,1,51,2.0
8,FreedomIntelligence/Apollo-0.5B,prep,cs,1,4,13,30.8
9,FreedomIntelligence/Apollo-0.5B,abvm,gk,1,0,3,0.0


In [55]:

raw_num_answers = [len(q["answer"].split(",")) for q in id_question.values() if any(k[1] == q["question_id"] for k in key_row)]
num_answers = sorted(list(set(raw_num_answers)), key=lambda x: raw_num_answers.count(x), reverse=True)
columns = ["Model", "Num Answers", "Num Q Correct", "Num Questions", "% Accuracy"]
rows = []
for model in models:
  for num in num_answers:
    res = [row["correct"] == "Y" for row in key_row.values()
    if row["model"] == model and len(id_question[row["question_id"]]["answer"].split(",")) == num and row["prompt_id"] == "1"]
    rows.append([model, num, sum(res), len(res), round(100*sum(res)/len(res) if res else 0, 1)])
pd.DataFrame(rows, columns=columns)

Unnamed: 0,Model,Num Answers,Num Q Correct,Num Questions,% Accuracy
0,FreedomIntelligence/Apollo-0.5B,1,102,265,38.5
1,FreedomIntelligence/Apollo-0.5B,2,0,78,0.0
2,FreedomIntelligence/Apollo-0.5B,3,0,57,0.0
3,FreedomIntelligence/Apollo-0.5B,4,5,36,13.9
4,FreedomIntelligence/Apollo-0.5B,5,2,12,16.7
5,FreedomIntelligence/Apollo-0.5B,7,0,2,0.0
6,FreedomIntelligence/Apollo-0.5B,6,0,1,0.0
7,johnsnowlabs/JSL-MedPhi2-2.7B,1,131,265,49.4
8,johnsnowlabs/JSL-MedPhi2-2.7B,2,9,78,11.5
9,johnsnowlabs/JSL-MedPhi2-2.7B,3,8,57,14.0


In [47]:
# filter to single correct answer, no paper specific questions
#models = ["FreedomIntelligence/Apollo-0.5B", "johnsnowlabs/JSL-MedPhi2-2.7B", "skumar9/Llama-medx_v3.2"]
columns = ["Model", "Num Correct", "Num Questions", "% Accuracy"]
rows = []
for model in models:
  res = [row["correct"] == "Y" for row in key_row.values()
  if (row["model"] == model and len(id_question[row["question_id"]]["answer"].split(",")) == 1 and row["prompt_id"] == "1")]
  rows.append([model, sum(res), len(res), round(100*sum(res)/len(res) if res else 0, 1)])
pd.DataFrame(rows, columns=columns)

Unnamed: 0,Model,Num Correct,Num Questions,% Accuracy
0,FreedomIntelligence/Apollo-0.5B,102,265,38.5
1,johnsnowlabs/JSL-MedPhi2-2.7B,131,265,49.4
2,skumar9/Llama-medx_v3.2,155,265,58.5
3,microsoft/Phi-4-mini-instruct,159,265,60.0
4,gemini-2.0-flash-001,174,265,65.7


In [52]:
for id, prompt in pre_prompts.items():
  #print(id)
  print(prompt)
  print("")



You are a vascular surgeon responsible for teaching medical students. Provide concise, medically accurate, and ethically appropriate answers based on current standards of care. Focus only on clinical education relevant to surgery.

You are a helpful medical knowledge assistant specializing in teaching vascular surgery to medical students. Provide useful, complete, and scientifically-grounded answers to clinical questions relevant to surgery and medical care.

Answer the following multiple choice question from the medical domain based on these instructions. Choose one or more of the provided answer choices which are correct and output them under the heading "Answer".

You are a helpful medical knowledge assistant specializing in teaching vascular surgery to medical students.
Answer the following multiple choice question from the medical domain based on the following instructions.
1. Select the one or more correct options and provide the option(s) under the heading "Answer".
2. Always 

In [None]:
models = ["FreedomIntelligence/Apollo-0.5B", "johnsnowlabs/JSL-MedPhi2-2.7B", "skumar9/Llama-medx_v3.2"]
raw_source = [q["source"] for q in id_question.values()]
sources = sorted(list(set(raw_source)), key=lambda x: raw_source.count(x), reverse=True)

columns = ["Source", "Tag", "Num Questions"]
rows = []
for source in sources:
  for tag in [''] + tags:
    res = [row for row in key_row.values()
    if id_question[row["question_id"]]["source"] == source and row["prompt_id"] == "1" and row["model"] == models[0] and tag in " ".join(id_question[row["question_id"]]["tags"])]
    rows.append([source, tag, len(res)])
pd.DataFrame(rows, columns=columns)

Unnamed: 0,Source,Tag,Num Questions
0,case_studies,,317
1,case_studies,gk,155
2,case_studies,cs,162
3,case_studies,lq,0
4,case_studies,sq,0
5,case_studies,fu,0
6,apdvs,,61
7,apdvs,gk,20
8,apdvs,cs,41
9,apdvs,lq,13
