In [1]:
import subprocess
import os
from datasets import load_dataset, Dataset, interleave_datasets, concatenate_datasets
# Best-effort proxy setup: source /etc/network_turbo in a bash subprocess, list the
# resulting environment, and copy every line mentioning "proxy" into this process so
# later downloads go through it. If the script (or bash) is missing, stdout is simply
# empty and the environment is left untouched.
try:
    # Invoke bash directly with an argv list; no need for shell=True to wrap a
    # second shell around an explicit `bash -c`.
    result = subprocess.run(
        ['bash', '-c', 'source /etc/network_turbo && env | grep proxy'],
        capture_output=True,
        text=True,
    )
    output = result.stdout
except OSError:
    # bash itself could not be started; keep the step best-effort.
    output = ''
for line in output.splitlines():
    # Split on the first '=' only: proxy URLs may themselves contain '='.
    var, sep, value = line.partition('=')
    if sep:
        os.environ[var] = value

import re
# Load and prep dataset
# System prompts keyed by language code. Each one tells the model to reply as
# <think>...</think> followed by <answer>...</answer> and to keep the answer terse
# (a number, yes/no/maybe, or a choice letter).
# Only the "en" entry is referenced by the dataset-building code visible in this notebook.
# NOTE(review): lines 2+ of each triple-quoted string are indented, so those four leading
# spaces are part of the prompt text sent to the model — confirm this is intended.
# NOTE(review): "response part" presumably means the <answer> content — confirm wording.
SYSTEM_PROMPT = {
    "en": """Respond with <think>your reasoning process</think> followed by <answer>your final answer</answer>. 
    The response part should be as concise as possible: 
    - For math problems, only provide the numerical result
    - For judgment questions, only answer yes/no/maybe
    - For multiple-choice questions, only provide the letter (A/B/C/D)
    When not in thinking mode, only provide the content within <answer> tags""",
    
    "zh": """请用<think>你的思考过程</think>和<answer>最终答案</answer>的格式回答。
    其中response部分填写尽量简短：
    - 数学题只需要回答数字结果
    - 判断题只需要回答yes/no/maybe
    - 选择题只需要填写A/B/C/D
    非思考模式时，只需要提供<answer>标签包含的内容"""
}

# Completion template with two str.format placeholders, {reasoning} and {answer},
# each placed on its own line between its opening and closing tag.
# Not referenced by the cells visible in this notebook.
XML_COT_FORMAT = (
    "<think>\n"
    "{reasoning}\n"
    "</think>\n"
    "<answer>\n"
    "{answer}\n"
    "</answer>\n"
)

# def extract_xml_answer(text: str) -> str:
#     answer = text.split("<answer>")[-1]
#     answer = answer.split("</answer>")[0]
#     return answer.strip()

def extract_hash_answer(text: str) -> str | None:
    if "####" not in text:
        return None
    return text.split("####")[1].strip()

def get_datasets(split = "train", samples_per_source: int = 5000) -> Dataset:
    """Build the combined GSM8K / PubMedQA / Health_Benchmarks prompt dataset.

    Each source is cut down to its first ``samples_per_source`` rows, mapped to a
    common set of columns, and the three results are concatenated in that order
    (no shuffling happens here).

    Columns of the returned dataset:
        prompt: chat messages, the English system prompt followed by one user turn.
        answer: reference answer (GSM8K: text after ``####``; PubMedQA:
            ``final_decision``; Health_Benchmarks: ``Answers``).
        db_set: source tag, one of ``'gsm8k'``, ``'pubmedqa'`` or ``'med_mc'``.

    Args:
        split: Split name used for GSM8K and PubMedQA. Health_Benchmarks does not
            use it: every category config is read from the split named after the
            category itself.
        samples_per_source: Maximum number of rows taken from each source. A source
            with fewer rows contributes all of them instead of raising.

    Returns:
        The concatenated dataset with the columns described above.
    """
    def _head(source):
        # Clamp the request: select() raises on out-of-range indices, which would
        # break any split/source holding fewer rows than requested.
        return source.select(range(min(samples_per_source, len(source))))

    # --- GSM8K: math word problems; the reference answer follows '####' ---
    data = load_dataset('openai/gsm8k', 'main',cache_dir='data/gsm8k')[split] # type: ignore
    data = _head(data).map(lambda x: { # type: ignore
        'prompt': [
            {'role': 'system', 'content': SYSTEM_PROMPT['en']},
            {'role': 'user', 'content': x['question']}
        ],
        'answer': extract_hash_answer(x['answer']),
        'db_set':'gsm8k'
    }) # type: ignore
    data = data.remove_columns(['question'])

    # --- PubMedQA: yes/no/maybe questions over a scientific context ---
    data_qa = load_dataset("qiaojin/PubMedQA", "pqa_artificial",cache_dir='data/PubMedQA')[split]
    # Drop examples whose joined context is 1024+ characters to avoid long traces.
    data_qa = data_qa.filter(lambda x: len("\n".join(x['context']['contexts'])) < 1024)
    data_qa = _head(data_qa).map(lambda x: { # type: ignore
        'prompt': [
            {'role': 'system', 'content': SYSTEM_PROMPT['en']},
            {
                "role": "user",
                "content": "Given the scientific context below:\n" +
                          "\n".join(x['context']['contexts']) +
                          "\n\nAnswer the following question:\n" +
                          x['question'] +
                          " with 'yes', 'no' or 'maybe'. You need to carefully review the context and reason before answering."
            },
        ],
        'answer': x['final_decision'],
        'db_set': 'pubmedqa'
    }) # type: ignore
    data_qa = data_qa.remove_columns(['pubid', 'question', 'context', 'long_answer', 'final_decision'])

    # --- Health_Benchmarks: multiple-choice questions, one config per category ---
    categories = [
        'Lab_Medicine', 'Wearables', 'Dermatology', 'Gastroenterology', 'Internal_Medicine',
        'Oncology', 'Orthopedics', 'General_Surgery', 'Ophthalmology', 'Audiology',
        'Head_Neck_Surgery', 'Elderly_Care', 'Pediatrics', 'Allergy_Immunology', 'Rheumatology',
        'Pharmacy', 'Obstetrics_Gynecology', 'Microbiology', 'Dentistry',
        'Physical_Medicine_and_Rehabilitation', 'Neurology', 'Psychiatry', 'Pathology', 'Genetics',
        'Rare_Diseases', 'Hematology', 'Emergency', 'Endocrinology', 'Radiology', 'Cardiology',
        'Pulmonology', 'Infectious_Diseases', 'Critical_Care', 'Pediatric_Surgery', 'Neuroscience',
        'Epidemiology', 'Fitness_Sports', 'Health_Education', 'Health_Economics',
        'Health_Entrepreneurship', 'Hospital_Management', 'Mental_Health', 'Nutrition',
        'Palliative_Care', 'Preventive_Medicine', 'Public_Health', 'Social_Media_Addiction',
        'Sleep', 'Supplements', 'Vaccination', 'Work_Health', 'Wellbeing',
    ]
    data_mc = concatenate_datasets([
        load_dataset("yesilhealth/Health_Benchmarks", category, cache_dir='data/Health_Benchmarks')[category]
        for category in categories
    ])
    # NOTE(review): rows are taken from the front of the category-ordered concatenation,
    # so the sample is filled by the earliest categories in the list and later ones may
    # not be represented at all — consider shuffling before truncating.
    data_mc = _head(data_mc).map(lambda x: { # type: ignore
        'prompt': [
            {'role': 'system', 'content': SYSTEM_PROMPT['en']},
            {
                "role": "user",
                "content": "\n\nAnswer the following question:\n" +
                          x['Questions'] +
                          "\n With 'A', 'B', 'C' or 'D'. You need to carefully review the context and reason before answering."
            },
        ],
        'answer': x['Answers'],
        'db_set': 'med_mc'
    }) # type: ignore
    data_mc = data_mc.remove_columns(['Answers', 'Questions'])

    dataset = concatenate_datasets([data, data_qa, data_mc])
    return dataset

In [2]:
# Build the combined dataset, shuffle it, and hold out 10% as the test split.
# The same seed is passed to both the shuffle and the split: train_test_split
# shuffles again by default, so without its own seed the train/test membership
# would differ on every run.
SPLIT_SEED = 42
dataset = get_datasets()
dataset = dataset.shuffle(seed=SPLIT_SEED)
train_test_split = dataset.train_test_split(test_size=0.1, seed=SPLIT_SEED)
train_dataset = train_test_split["train"]
test_dataset = train_test_split["test"]
# Every row must land in exactly one of the two splits.
assert len(train_dataset) + len(test_dataset) == len(dataset)
print(f"train size: {len(train_dataset)}, test size: {len(test_dataset)}")

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

train size: 13500, test size: 1500


In [3]:
# Peek at one held-out record to sanity-check the prompt/answer layout.
# NOTE(review): the saved output below shows a system prompt starting with
# "Respond in the following format:", which is not the SYSTEM_PROMPT['en'] text defined
# in this notebook — presumably a stale output from an earlier version; confirm with
# Restart Kernel -> Run All.
test_dataset[0]

{'answer': 'yes',
 'prompt': [{'content': '\nRespond in the following format:\n<think>\n...\n</think>\n<answer>\n...\n</answer>\n',
   'role': 'system'},
  {'content': "Given the scientific context below:\nThis study described information management incidents and adverse event reporting choices of health professionals.\nHospital adverse events reported in an anonymous electronic reporting system were analysed using directed content analysis and descriptive and inferential statistics. The data consisted of near miss and adverse event incident reports (n = 3075) that occurred between January 2008 and the end of December 2009.\nA total of 824 incidents were identified. The most common information management incident was failure in written information transfer and communication, when patient data were copied or documented incorrectly. Often patient data were transferred using paper even though an electronic patient record was in use. Reporting choices differed significantly among professio

In [4]:
# Second held-out record, for comparison with the first.
test_dataset[1]

{'answer': 'yes',
 'prompt': [{'content': '\nRespond in the following format:\n<think>\n...\n</think>\n<answer>\n...\n</answer>\n',
   'role': 'system'},
  {'content': "Given the scientific context below:\nThe use of doxorubicin (DOX) as a chemotherapeutic agent is limited by cardiac injury. Iloprost, a stable synthetic analogue of prostacyclin, has previously been shown to protect against DOX-induced cardiomyocyte injury in vitro. Here, we addressed whether iloprost is cardioprotective in vivo and whether it compromises the anti-tumour efficacy of DOX.\nLewis Lung Carcinoma cells were implanted subcutaneously in the flank of C57BL/6 mice. DOX treatment was commenced from when tumours became visible. Iloprost was administered from prior to DOX treatment until sacrifice. Echocardiography and invasive haemodynamic measurements were performed immediately before sacrifice. As expected, DOX induced cardiac cell apoptosis and cardiac dysfunction, both of which were attenuated by iloprost. Al

In [5]:
# Persist both splits to directories named after the variables (train first, then test)
# so later cells or sessions can reload them without rebuilding.
for _split_dir, _split_data in (("train_dataset", train_dataset), ("test_dataset", test_dataset)):
    _split_data.save_to_disk(_split_dir)

Saving the dataset (0/1 shards):   0%|          | 0/13500 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1500 [00:00<?, ? examples/s]

In [6]:
# Imported in this cell rather than with the other imports — presumably so the cell
# can be run on its own in a fresh kernel once the splits exist on disk.
from datasets import load_from_disk

# Load the previously saved test_dataset (and train_dataset) back from disk,
# replacing the in-memory splits of the same names.
test_dataset = load_from_disk("test_dataset")
train_dataset = load_from_disk("train_dataset")

In [7]:
# Display the column names and row count of the reloaded training split.
train_dataset

Dataset({
    features: ['answer', 'prompt', 'db_set'],
    num_rows: 13500
})

In [8]:
# Re-inspect record 0 after reloading from disk to check the save/load round trip.
test_dataset[0]

{'answer': 'yes',
 'prompt': [{'content': '\nRespond in the following format:\n<think>\n...\n</think>\n<answer>\n...\n</answer>\n',
   'role': 'system'},
  {'content': "Given the scientific context below:\nThis study described information management incidents and adverse event reporting choices of health professionals.\nHospital adverse events reported in an anonymous electronic reporting system were analysed using directed content analysis and descriptive and inferential statistics. The data consisted of near miss and adverse event incident reports (n = 3075) that occurred between January 2008 and the end of December 2009.\nA total of 824 incidents were identified. The most common information management incident was failure in written information transfer and communication, when patient data were copied or documented incorrectly. Often patient data were transferred using paper even though an electronic patient record was in use. Reporting choices differed significantly among professio

In [9]:
# Re-inspect record 1 after reloading from disk.
test_dataset[1]

{'answer': 'yes',
 'prompt': [{'content': '\nRespond in the following format:\n<think>\n...\n</think>\n<answer>\n...\n</answer>\n',
   'role': 'system'},
  {'content': "Given the scientific context below:\nThe use of doxorubicin (DOX) as a chemotherapeutic agent is limited by cardiac injury. Iloprost, a stable synthetic analogue of prostacyclin, has previously been shown to protect against DOX-induced cardiomyocyte injury in vitro. Here, we addressed whether iloprost is cardioprotective in vivo and whether it compromises the anti-tumour efficacy of DOX.\nLewis Lung Carcinoma cells were implanted subcutaneously in the flank of C57BL/6 mice. DOX treatment was commenced from when tumours became visible. Iloprost was administered from prior to DOX treatment until sacrifice. Echocardiography and invasive haemodynamic measurements were performed immediately before sacrifice. As expected, DOX induced cardiac cell apoptosis and cardiac dysfunction, both of which were attenuated by iloprost. Al

In [10]:
# Inspect record 3; in the saved output this is a multiple-choice ('med_mc') example.
test_dataset[3]

{'answer': 'B',
 'prompt': [{'content': '\nRespond in the following format:\n<think>\n...\n</think>\n<answer>\n...\n</answer>\n',
   'role': 'system'},
  {'content': "\n\nAnswer the following question:\nWhich of the following genetic conditions is characterized by disproportionate short stature and severe immunodeficiency due to adenine deaminase deficiency? A:Bloom’s syndrome   B:Adenine deaminase deficiency   C:Ataxia-telangiectasia D:Anhidrotic ectodermal dysplasia\n With 'A', 'B', 'C' or 'D'. You need to carefully review the context and reason before answering.",
   'role': 'user'}],
 'db_set': 'med_mc'}

In [11]:
# Display the column names and row count of the reloaded test split.
test_dataset


Dataset({
    features: ['answer', 'prompt', 'db_set'],
    num_rows: 1500
})