#### MedMCQA

In [1]:
import os
import json

from datasets import load_dataset

# Load the MedMCQA dataset by its repository identifier.
medmcqa_repo = "openlifescienceai/medmcqa"
ds = load_dataset(medmcqa_repo)

# Display the loaded splits together with their columns and row counts.
print(ds)

  from .autonotebook import tqdm as notebook_tqdm


DatasetDict({
    train: Dataset({
        features: ['id', 'question', 'opa', 'opb', 'opc', 'opd', 'cop', 'choice_type', 'exp', 'subject_name', 'topic_name'],
        num_rows: 182822
    })
    test: Dataset({
        features: ['id', 'question', 'opa', 'opb', 'opc', 'opd', 'cop', 'choice_type', 'exp', 'subject_name', 'topic_name'],
        num_rows: 6150
    })
    validation: Dataset({
        features: ['id', 'question', 'opa', 'opb', 'opc', 'opd', 'cop', 'choice_type', 'exp', 'subject_name', 'topic_name'],
        num_rows: 4183
    })
})


In [2]:
# Convert the MedMCQA validation split into a JSONL file: one JSON object per
# line holding the formatted question, the gold answer letter (wrapped in a
# list) and the explanation / subject / topic metadata.
data = ds['validation']

# `cop` is treated as the 0-based position of the correct option
# (opa, opb, opc, opd), so it maps directly onto the answer letters.
answer_letters = {0: "A", 1: "B", 2: "C", 3: "D"}

final_data = []
for item in data:
    options = (item['opa'], item['opb'], item['opc'], item['opd'])
    # Plain string concatenation (rather than str()/f-strings) is deliberate:
    # a non-string option raises TypeError instead of being written as "None".
    question = item['question'] + "\nAnswer Choices: " + " ".join(
        "(" + letter + ") " + text for letter, text in zip("ABCD", options)
    )

    letter = answer_letters.get(item['cop'])
    if letter is None:
        raise ValueError("Invalid label")

    # Fields are read straight from `item` so no global named `id` is created
    # that would shadow the builtin for the rest of the session.
    final_data.append({
        "id": item['id'],
        "question": question,
        "label": [letter],
        "label_rationale": item['exp'],
        "subject_name": item['subject_name'],
        "topic_name": item['topic_name'],
    })

output_path = "medmcqa/input/medmcqa_input.jsonl"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
# ensure_ascii=False writes non-ASCII characters verbatim, so the file must be
# opened as UTF-8 explicitly instead of relying on the platform default.
with open(output_path, 'w', encoding='utf-8') as f:
    for item in final_data:
        f.write(json.dumps(item, ensure_ascii=False) + "\n")

#### MedQA

In [3]:
# Load the 4-option MedQA (USMLE) dataset by its repository identifier.
medqa_repo = "GBaker/MedQA-USMLE-4-options-hf"
ds = load_dataset(medqa_repo)

# Display the loaded splits together with their columns and row counts.
print(ds)

DatasetDict({
    train: Dataset({
        features: ['id', 'sent1', 'sent2', 'ending0', 'ending1', 'ending2', 'ending3', 'label'],
        num_rows: 10178
    })
    validation: Dataset({
        features: ['id', 'sent1', 'sent2', 'ending0', 'ending1', 'ending2', 'ending3', 'label'],
        num_rows: 1272
    })
    test: Dataset({
        features: ['id', 'sent1', 'sent2', 'ending0', 'ending1', 'ending2', 'ending3', 'label'],
        num_rows: 1273
    })
})


In [4]:
# Convert the MedQA test split into a JSONL file: one JSON object per line
# holding the id, the formatted question and the gold answer letter (wrapped
# in a list).
data = ds['test']

# `label` is treated as the 0-based position of the correct ending
# (ending0..ending3), so it maps directly onto the answer letters.
answer_letters = {0: "A", 1: "B", 2: "C", 3: "D"}

final_data = []
for item in data:
    endings = (item['ending0'], item['ending1'], item['ending2'], item['ending3'])
    # NOTE(review): only `sent1` is used as the question text; the `sent2`
    # column is ignored -- confirm it carries no content for this dataset.
    # Plain string concatenation is deliberate: a non-string ending raises
    # TypeError instead of being written as "None".
    question = item['sent1'] + "\nAnswer Choices: " + " ".join(
        "(" + letter + ") " + text for letter, text in zip("ABCD", endings)
    )

    letter = answer_letters.get(item['label'])
    if letter is None:
        raise ValueError("Invalid label")

    # Read the id straight from `item` so no global named `id` is created that
    # would shadow the builtin for the rest of the session.
    final_data.append({"id": item['id'], "question": question, "label": [letter]})

output_path = "medqa/input/medqa_input.jsonl"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
# ensure_ascii=False writes non-ASCII characters verbatim, so the file must be
# opened as UTF-8 explicitly instead of relying on the platform default.
with open(output_path, 'w', encoding='utf-8') as f:
    for item in final_data:
        f.write(json.dumps(item, ensure_ascii=False) + "\n")

#### MMLU Medical

In [5]:
# MMLU subject configurations that make up the medical subset.
subject_list = [
    "anatomy",
    "clinical_knowledge",
    "college_biology",
    "college_medicine",
    "medical_genetics",
    "professional_medicine",
]
# One loaded dataset per subject, in the same order as subject_list.
data = [load_dataset("cais/mmlu", name) for name in subject_list]

In [6]:
# Convert the test split of every loaded MMLU medical subject into a single
# JSONL file: one JSON object per line holding the formatted question, the
# gold answer letter (wrapped in a list) and the subject name.

# `answer` is treated as the 0-based position of the correct entry in
# `choices`, so it maps directly onto the answer letters.
answer_letters = {0: "A", 1: "B", 2: "C", 3: "D"}

final_data = []
# The loop variable is deliberately not called `ds`, so the notebook-wide
# dataset handle of that name is not overwritten by this cell.
for subject_ds in data:
    for item in subject_ds['test']:
        choices = item['choices']
        # Explicit check instead of `assert`, which is skipped under `python -O`.
        if len(choices) != 4:
            raise ValueError("Expected exactly 4 answer choices, got " + str(len(choices)))
        # Plain string concatenation is deliberate: a non-string choice raises
        # TypeError instead of being written as "None".
        question = item['question'] + "\nAnswer Choices: " + " ".join(
            "(" + letter + ") " + text for letter, text in zip("ABCD", choices)
        )

        letter = answer_letters.get(item['answer'])
        if letter is None:
            raise ValueError("Invalid label")

        final_data.append({"question": question, "label": [letter], "subject": item['subject']})

output_path = "mmlu_medical/input/mmlu_medical_input.jsonl"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
# ensure_ascii=False writes non-ASCII characters verbatim, so the file must be
# opened as UTF-8 explicitly instead of relying on the platform default.
with open(output_path, 'w', encoding='utf-8') as f:
    for item in final_data:
        f.write(json.dumps(item, ensure_ascii=False) + "\n")