In [None]:
!pip install transformers



In [None]:
!pip install datasets



In [None]:
from datasets import load_dataset

# Fetch MedQA from the Hub. The "tw" configuration is what is requested here
# (presumably the Taiwan subset rather than the US one -- confirm if the US
# questions were intended).
dataset = load_dataset(path="VodLM/medqa", name="tw")

# Show the DatasetDict so the available splits and their sizes are visible.
print(dataset)


DatasetDict({
    train: Dataset({
        features: ['idx', 'uid', 'question', 'metamap', 'target', 'answers'],
        num_rows: 11298
    })
    validation: Dataset({
        features: ['idx', 'uid', 'question', 'metamap', 'target', 'answers'],
        num_rows: 1412
    })
    test: Dataset({
        features: ['idx', 'uid', 'question', 'metamap', 'target', 'answers'],
        num_rows: 1413
    })
})


In [None]:
import pandas as pd

# Show the first raw training record before any conversion.
print(dataset["train"][0])

# Materialise every split as a pandas DataFrame for the cells below.
train_data, test_data, validation_data = (
    dataset[split].to_pandas() for split in ("train", "test", "validation")
)




{'idx': 0, 'uid': 'train-0', 'question': 'After the reaction physiology Which is not bedridden patients not moving (immobilization)?', 'metamap': '', 'target': 2, 'answers': ['Muscle atrophy', 'Weakness', 'Ligamentous laxity, increased ductility', 'Poor motor coordination']}


In [None]:
# Preview the first rows of the training DataFrame.
train_data.head()

Unnamed: 0,idx,uid,question,metamap,target,answers
0,0,train-0,After the reaction physiology Which is not bed...,,2,"[Muscle atrophy, Weakness, Ligamentous laxity,..."
1,1,train-1,"Humans, vitamin D3 is converted to 25-hydroxyc...",,2,"[skin, kidney, liver, parathyroid gland]"
2,2,train-2,Esotropia eye when doing alternate cover test ...,,1,"[In the exhibition (adduction), Abduction (Abd..."
3,3,train-3,"Suppose there is a popular city Influenza A, 1...",,2,"[0.1, 0.002, 0.2, 0.005]"
4,4,train-4,Healthcare nuclear emergency physician is noti...,,2,[To pre-hospital patient's death does not requ...


In [None]:
# with pd.ExcelWriter("dataset.xlsx", engine="openpyxl") as writer:
#     train_data.to_excel(writer, sheet_name="Train", index=False)
#     test_data.to_excel(writer, sheet_name="Test", index=False)
#     validation_data.to_excel(writer, sheet_name="Validation", index=False)

# print("Excel file 'dataset.xlsx' created with 3 sheets: Train, Test, and Validation.")

In [None]:
!pip install sacremoses



In [None]:
import ast
import random
import re

import pandas as pd
import torch
from datasets import load_dataset
from sklearn.metrics import (
    classification_report,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
)
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM

# ---------------------------
# 1. Setup and Load Model
# ---------------------------

# Reproducibility: seed every RNG *before* the model is instantiated. Loading
# this checkpoint reports some weights as "newly initialized", i.e. drawn from
# torch's RNG at load time, so seeding afterwards leaves them different on
# every run.
SEED = 42
random.seed(SEED)
torch.manual_seed(SEED)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
MODEL_NAME = "GanjinZero/biobart-v2-base"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token

# NOTE(review): loading this checkpoint through AutoModelForCausalLM emits a
# warning that `lm_head.weight` and `model.decoder.embed_tokens.weight` are
# newly initialized. The output head is therefore untrained and predictions
# should be treated as a chance-level baseline until the checkpoint is
# fine-tuned or loaded with a head that matches its architecture.
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME).to(device)
model.config.pad_token_id = tokenizer.pad_token_id
model.eval()  # inference only: disables dropout

# real max input length (avoid sys.maxsize overflow)
max_input_len = model.config.max_position_embeddings  # e.g. 1024

# # ---------------------------
# # 2. Load MedQA Dataset
# # ---------------------------

# # you may need to `pip install datasets` first
# dataset = load_dataset("VodLM/medqa")
# train_data      = pd.DataFrame(dataset["train"])
# validation_data = pd.DataFrame(dataset["validation"])

# ---------------------------
# 3. Helper Functions
# ---------------------------

def format_example(question, options, reasoning_steps, final_answer):
    """Render one worked example as a block of text.

    Args:
        question: The question text.
        options: Mapping of option label (e.g. "A") to option text; iterated
            in its own order.
        reasoning_steps: Iterable of reasoning lines, emitted one per line.
        final_answer: Label placed in the closing "Therefore, ..." sentence.

    Returns:
        The formatted example; every line, including the last, ends in "\\n".
    """
    lines = [f"Question: {question}", "Options:"]
    lines.extend(f"{label}. {text}" for label, text in options.items())
    lines.append("Let's think step by step.")
    lines.extend(f"{step}" for step in reasoning_steps)
    lines.append(f"Therefore, the answer is {final_answer}.")
    return "\n".join(lines) + "\n"

def select_few_shot_examples(df, num_shots):
    """Pick `num_shots` formatted few-shot examples from `df`.

    A fixed-seed sample of up to ``3 * num_shots`` rows is scanned in order.
    Rows whose answer letter has already been used are skipped while fewer
    than `num_shots` distinct letters have been seen, so the examples cover
    different answers where possible. If that filter leaves too few examples,
    the skipped rows are used to top the list up.

    Args:
        df: DataFrame with "question", "answers" (a sequence of option texts,
            or its string literal form) and "target" (0-based answer index).
        num_shots: Number of examples wanted.

    Returns:
        A shuffled list of at most `num_shots` formatted example strings
        (fewer only when `df` does not hold enough rows).
    """
    # Oversample 3x to leave room for diversifying answers, but never request
    # more rows than exist (DataFrame.sample raises in that case).
    rows = df.sample(n=min(num_shots * 3, len(df)), random_state=42)
    reasoning = [
        "1. Identify key details in the question.",
        "2. Compare each option against those details.",
        "3. Choose the most clinically appropriate option."
    ]
    examples = []
    skipped = []  # formatted rows rejected by the diversity filter
    seen = set()
    for _, row in rows.iterrows():
        opts = row["answers"]
        if isinstance(opts, str):
            # Parse the literal safely; eval() would execute arbitrary code.
            opts = ast.literal_eval(opts)
        options = {chr(65 + i): opt for i, opt in enumerate(opts)}
        answer = chr(65 + int(row["target"]))
        example = format_example(row["question"], options, reasoning, answer)
        if answer in seen and len(seen) < num_shots:
            skipped.append(example)
            continue
        seen.add(answer)
        examples.append(example)
        if len(examples) == num_shots:
            break
    # The diversity filter can never be satisfied when num_shots exceeds the
    # number of distinct answers; fall back to the skipped rows.
    examples.extend(skipped[: num_shots - len(examples)])
    random.shuffle(examples)
    return examples

def build_prompt(examples, question, options):
    """Assemble the full few-shot prompt for one new question.

    Args:
        examples: Already-formatted example strings, numbered from 1 in the
            order given.
        question: Text of the question to be answered.
        options: Mapping of option label to option text.

    Returns:
        The prompt: an examples section, an instruction paragraph, the new
        question with its options, and the step-by-step cue as the last line.
    """
    shots = "".join(
        f"Example {number}:\n{shot}\n" for number, shot in enumerate(examples, 1)
    )
    instructions = (
        "### New Question ###\n"
        "Please ignore the example answers above. "
        "For the question below, provide brief chain‑of‑thought reasoning, then state *only* the letter of your answer.\n\n"
    )
    choices = "".join(f"{label}. {text}\n" for label, text in options.items())
    return (
        "### Few‑Shot Examples ###\n"
        + shots
        + instructions
        + f"Question: {question}\nOptions:\n"
        + choices
        + "Let's think step by step.\n"
    )

def parse_final_answer(text):
    """Extract the answer letter from a "Therefore, the answer is X" sentence.

    Matching is case-insensitive and the first occurrence wins. The letter
    must be a standalone token: "the answer is clearly B" does not yield "C".

    Args:
        text: Text to scan; ``None`` or an empty string is accepted.

    Returns:
        The upper-cased letter "A"-"D", or ``None`` when no match is found.
    """
    if not text:
        return None
    # \b stops the first letter of an ordinary word from being read as a label.
    m = re.search(r"therefore, the answer is\s*([A-D])\b", text, re.IGNORECASE)
    return m.group(1).upper() if m else None

def predict_with_prompt(n_shot, question, options):
    """Answer one question by sampling a continuation of an n-shot prompt.

    Args:
        n_shot: Number of few-shot examples placed in the prompt.
        question: Question text.
        options: Mapping of option label to option text.

    Returns:
        The label parsed from the generated text when it is one of
        `options`; otherwise a label chosen uniformly at random from
        `options`.
    """
    max_new_tokens = 50  # generation budget, as in the original +50 allowance

    examples = select_few_shot_examples(train_data, n_shot)
    prompt   = build_prompt(examples, question, options)
    enc = tokenizer(
        prompt,
        return_tensors="pt",
        padding="longest",
        truncation=True,
        # Reserve room for the continuation; a prompt truncated to the full
        # window would leave no positions for new tokens.
        # NOTE(review): truncation presumably drops the *end* of the prompt,
        # which is where the new question sits -- confirm prompts fit.
        max_length=max(1, max_input_len - max_new_tokens),
    ).to(device)
    prompt_len = enc["input_ids"].shape[1]

    output_ids = model.generate(
        input_ids      = enc["input_ids"],
        attention_mask = enc["attention_mask"],
        max_new_tokens = max_new_tokens,
        temperature    = 0.7,
        num_return_sequences = 1,
        do_sample      = True,
        pad_token_id   = tokenizer.pad_token_id,
        eos_token_id   = tokenizer.eos_token_id,
    )

    # A decoder-only model returns prompt + continuation. Parsing that whole
    # sequence would match the "Therefore, the answer is X" line of the first
    # few-shot example rather than the model's own answer, so keep only the
    # newly generated tokens.
    generated = output_ids[0]
    if not model.config.is_encoder_decoder:
        generated = generated[prompt_len:]

    out  = tokenizer.decode(generated, skip_special_tokens=True)
    pred = parse_final_answer(out)
    if pred in options:
        return pred
    # No usable answer in the continuation: fall back to a random guess.
    return random.choice(list(options.keys()))

def score_with_likelihood(question, options):
    """Pick the option whose question/answer text the model scores best.

    Each option is rendered as "Question: ...\\nAnswer: ..." and passed to the
    model with its own token ids as labels; the negated loss is the score.

    Args:
        question: Question text.
        options: Mapping of option label to option text.

    Returns:
        The label with the highest score (lowest loss).
    """
    def _negated_loss(answer_text):
        # Score a single candidate answer; no gradients are needed.
        encoded = tokenizer(
            f"Question: {question}\nAnswer: {answer_text}",
            return_tensors="pt",
            padding="longest",
            truncation=True,
            max_length=max_input_len,
        ).to(device)
        with torch.no_grad():
            outputs = model(**encoded, labels=encoded["input_ids"])
        return -outputs.loss.item()

    scores = {label: _negated_loss(text) for label, text in options.items()}
    return max(scores, key=scores.get)

def ensemble_predict(question, options):
    """Combine three prompted predictions and one likelihood-based prediction.

    Votes are collected from 1-, 2- and 3-shot prompting, followed by the
    likelihood scorer. The most frequent label wins when it received more
    than one vote; otherwise the likelihood scorer's choice is returned.

    Args:
        question: Question text.
        options: Mapping of option label to option text.

    Returns:
        The chosen option label.
    """
    from collections import Counter

    votes = [predict_with_prompt(n_shot, question, options) for n_shot in (1, 2, 3)]
    votes.append(score_with_likelihood(question, options))

    top_label, top_count = Counter(votes).most_common(1)[0]
    if top_count > 1:
        return top_label
    # All four votes disagree: defer to the likelihood-based prediction.
    return votes[-1]

# ---------------------------
# 4. Run Ensemble & Evaluate
# ---------------------------

# Number of validation rows to evaluate. The progress bar total and the
# printed summary are both derived from the rows actually used, so they can
# no longer disagree with the slice.
N_EVAL = 250
ANSWER_LABELS = ["A", "B", "C", "D"]

eval_rows = validation_data.head(N_EVAL)
predictions, ground_truths = [], []

for _, row in tqdm(eval_rows.iterrows(),
                   total=len(eval_rows), desc="Ensembling"):
    q    = row["question"]
    opts = row["answers"]
    if isinstance(opts, str):
        # Parse the literal safely; eval() would execute arbitrary code.
        opts = ast.literal_eval(opts)
    options = {chr(65+i): opt for i,opt in enumerate(opts)}
    # "target" is a 0-based index; map it to the matching letter.
    ground_truths.append(chr(65 + int(row["target"])))
    predictions.append(ensemble_predict(q, options))

# Accuracy
accuracy = sum(p == t for p, t in zip(predictions, ground_truths)) / len(ground_truths)
print(f"Ensemble accuracy on {len(ground_truths)} examples: {accuracy:.2%}\n")

# Detailed metrics. `labels` is passed explicitly so the report rows always
# line up with `target_names`, even if a class is never predicted or present.
print("Classification Report:\n")
print(classification_report(ground_truths, predictions,
                            labels=ANSWER_LABELS, target_names=ANSWER_LABELS))
print(f"Macro‑avg Precision: {precision_score(ground_truths, predictions, labels=ANSWER_LABELS, average='macro'):.4f}")
print(f"Macro‑avg Recall:    {recall_score(ground_truths, predictions, labels=ANSWER_LABELS, average='macro'):.4f}")
print(f"Macro‑avg F1:        {f1_score(ground_truths, predictions, labels=ANSWER_LABELS, average='macro'):.4f}\n")

# Confusion matrix (rows: true label, columns: predicted label)
cm = confusion_matrix(ground_truths, predictions, labels=ANSWER_LABELS)
print("Confusion Matrix:")
print(cm)


Some weights of BartForCausalLM were not initialized from the model checkpoint at GanjinZero/biobart-v2-base and are newly initialized: ['lm_head.weight', 'model.decoder.embed_tokens.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Ensembling:  25%|██▌       | 250/1000 [08:09<24:29,  1.96s/it]

Ensemble accuracy on 1000 examples: 26.00%

Classification Report:

              precision    recall  f1-score   support

           A       0.33      0.06      0.10        54
           B       0.24      0.16      0.19        63
           C       0.33      0.07      0.12        69
           D       0.25      0.73      0.38        64

    accuracy                           0.26       250
   macro avg       0.29      0.26      0.20       250
weighted avg       0.29      0.26      0.20       250

Macro‑avg Precision: 0.2912
Macro‑avg Recall:    0.2553
Macro‑avg F1:        0.1960

Confusion Matrix:
[[ 3  7  2 42]
 [ 3 10  4 46]
 [ 2 12  5 50]
 [ 1 12  4 47]]



