# Set Up

In [2]:
from datasets import load_dataset, concatenate_datasets, ClassLabel
import matplotlib.pyplot as plt
import numpy as np
import json
import pandas as pd
from openai import OpenAI
import ast
import xml.etree.ElementTree as ET

  from .autonotebook import tqdm as notebook_tqdm


# RareBench Dataset

In [7]:
datasets = ["RAMEDIS", "MME", "HMS", "LIRICAL"]
combined_data = []

for dataset in datasets:
    data = load_dataset("chenxz/RareBench", dataset, split="test")
    combined_data.append(data)

combined_dataset = concatenate_datasets(combined_data)
print(combined_dataset)
print(combined_dataset[0])

Dataset({
    features: ['Phenotype', 'RareDisease', 'Department'],
    num_rows: 1122
})
{'Phenotype': ['HP:0001522', 'HP:0001942', 'HP:0003210', 'HP:0003819'], 'RareDisease': ['OMIM:251000', 'ORPHA:27', 'CCRD:71'], 'Department': None}


In [10]:
#mapping phenotype
with open("phenotype_mapping.json", "r") as f:
    pheno_mapping = json.load(f)

def replace_codes_with_names(example):
    updated_pheno = []
    for code in example["Phenotype"]:
        if code in pheno_mapping:
            updated_pheno.append(pheno_mapping[code])
        else:
            updated_pheno.append(code)
            print(code)
    example["Phenotype"] = updated_pheno
    return example

combined_dataset = combined_dataset.map(replace_codes_with_names)

print(combined_dataset[4])

{'Phenotype': ['Hydronephrosis', 'Delayed speech and language development', 'Abnormality of prenatal development or birth', 'Intellectual disability', 'Motor delay', 'Generalized hypotonia', 'Death in infancy', 'Fever', 'Opisthotonus', 'Athetosis', 'Kyphoscoliosis', 'Neonatal death', 'Death in childhood', 'Hypervalinemia', 'Hyperleucinemia', 'Hyperisoleucinemia', 'Death in adolescence', 'Feeding difficulties', 'Elevated urinary carboxylic acid'], 'RareDisease': ['OMIM:248600', 'ORPHA:511', 'CCRD:67'], 'Department': None}


In [11]:
#mapping disease
with open("raw_data/raw_rarebench_data/disease_mapping.json", "r") as f:
    disease_mapping = json.load(f)

def replace_codes_with_names(example):
    updated_diseases = []
    for code in example["RareDisease"]:
        if code in disease_mapping:
            updated_diseases.append(disease_mapping[code])
        else:
            updated_diseases.append(code)
    updated_diseases = list(set(updated_diseases))
    example["RareDisease"] = updated_diseases
    return example

combined_dataset = combined_dataset.map(replace_codes_with_names)

print(combined_dataset[8])

{'Phenotype': ['Macrocephaly', 'Cystic hygroma', 'Lethargy', 'Generalized hypotonia', 'Dystonia', 'Death in infancy', 'Fever', 'Vomiting', 'Diarrhea', 'Dyspnea', 'Drowsiness', 'Sleep disturbance', 'Glutaric aciduria', 'Neonatal death', 'Decreased plasma free carnitine', 'Dyskinesia'], 'RareDisease': ['戊二酸血症 I 型/Glutaric acidemia type I; GA-I/Glutaryl-CoA dehydrogenase deficiency/Glutaric acidemia I'], 'Department': None}


In [None]:
#annotate with deepseek

results = []
client = OpenAI(api_key="urmom", base_url="https://api.deepseek.com")

for example in combined_dataset:
    symptoms = example["Phenotype"]
    diseases = example["RareDisease"]
    response = client.chat.completions.create(
        model="deepseek-reasoner",
        messages=[
            {"role": "system", "content": "You are a specialist in the field of rare diseases. Your task is to analyze patient symptoms and provide a precise, structured diagnosis with step-by-step reasoning."},
            {"role": "user", "content": "A patient shows the symptoms " + str(symptoms) + ". Diagnose the patient with " + str(diseases) + ". Let's think step by step. For each diagnosed disease, enter in the format Disease: Reasoning on a new line"},
        ],
        stream=False
    )
    results.append({
        "symptoms": symptoms,
        "diseases": diseases,
        "diagnosis": response.choices[0].message.content
    })

results_df = pd.DataFrame(results)
results_df.to_csv("intermediate_data/diagnosis_results.csv", index=False)

In [12]:
results = []

for i in range(512, len(combined_dataset)):
    example = combined_dataset[i]
    symptoms = example["Phenotype"]
    diseases = example["RareDisease"]
    response = client.chat.completions.create(
        model="deepseek-reasoner",
        messages=[
            {"role": "system", "content": "You are a specialist in the field of rare diseases. Your task is to analyze patient symptoms and provide a precise, structured diagnosis with step-by-step reasoning."},
            {"role": "user", "content": "A patient shows the symptoms " + str(symptoms) + ". Diagnose the patient with " + str(diseases) + ". Let's think step by step. For each diagnosed disease, enter in the format Disease: Reasoning on a new line"},
        ],
        stream=False
    )
    results.append({
        "symptoms": symptoms,
        "diseases": diseases,
        "diagnosis": response.choices[0].message.content
    })

results_df2 = pd.DataFrame(results)
results_df2.to_csv("intermediate_data/diagnosis_results2.csv", index=False)

In [2]:
rarebench_data = pd.concat( 
    [pd.read_csv('intermediate_data/translated_diagnosis_results.csv'), pd.read_csv('intermediate_data/translated_diagnosis_results2.csv')], ignore_index=True) 
rarebench_data = rarebench_data.sample(frac=1)
display(rarebench_data)
rarebench_data.to_csv("intermediate_data/rarebench_data.csv", index=False)

Unnamed: 0,symptoms,diseases,diagnosis
774,"['Intellectual disability', 'Hypotonia', 'Glob...",['TBCK-related intellectual disability syndrom...,Disease: TBCK-related intellectual disability ...
184,"['Death in infancy', 'Aminoaciduria', 'Abnorma...",['3-Methylcrotonyl-CoA carboxylase 1 deficienc...,Disease: 3-Methylcrotonyl-CoA carboxylase 1 de...
712,"['Weight loss', 'Arthralgia', 'Elevated circul...","['Scleroderma, familial progressive', 'Systemi...",Systemic sclerosis/Systemic scleroderma; SSc/S...
421,"['Seizure', 'Spastic diplegia', 'Encephalopath...",['Glutaric acidemia type I; GA-I/Glutaryl-CoA ...,Disease: Glutaric acidemia type I (GA-I)/Gluta...
15,"['Wide mouth', 'Epicanthus', 'Carious teeth', ...","['Hyperinsulinemic hypoglycemia, familial, 2',...","Disease: Hyperinsulinemic hypoglycemia, famili..."
...,...,...,...
220,"['Hypospadias', 'Microcephaly', 'Motor delay',...",['Smith-Lemli-Opitz syndrome/Smith-Lemli-Opitz...,Smith-Lemli-Opitz syndrome: \n1. **Biochemica...
926,"['Microcephaly', 'Ptosis', 'Myopia', 'Autism',...",['Intellectual developmental disorder with dys...,Disease: Intellectual developmental disorder w...
72,"['Death in infancy', 'Fever', 'Vomiting', 'Dia...",['Isovaleric acedemia; IVA/Isovaleric acidemia...,Disease: Isovaleric acidemia (IVA) \nReasonin...
885,"['Cleft palate', 'Choanal atresia', 'Elevated ...","['Bamforth-Lazarus syndrome/Hypothyroidism, at...",Disease: Bamforth-Lazarus syndrome/Hypothyroid...


In [4]:
rb_counts = {}

for _, example in rarebench_data.iterrows():
  diseases = ast.literal_eval(example["diseases"])
  for i in diseases:
    if i in rb_counts:
      rb_counts[i] += 1
    else:
      rb_counts[i] = 1

rb_test_counts = {}
rarebench_data_train = []
rarebench_data_test = []

for _, example in rarebench_data.iterrows():
  flag = False
  diseases = ast.literal_eval(example["diseases"])
  for i in diseases:
    if i not in rb_test_counts or rb_test_counts[i] < 0.10*rb_counts[i]:
      flag = True
      break
  if flag:
    for i in diseases:
      if i in rb_test_counts:
        rb_test_counts[i] += 1
      else:
        rb_test_counts[i] = 1
    rarebench_data_test.append(example)
  else:
    rarebench_data_train.append(example)

print(len(rarebench_data_train))
print(len(rarebench_data_test))

pd.DataFrame(rarebench_data_train).to_csv("intermediate_data/rarebench_data_train.csv", index=False)
pd.DataFrame(rarebench_data_test).to_csv("intermediate_data/rarebench_data_test.csv", index=False)
      

701
420


# ReDis-QA Formatted

In [3]:
redis_data = load_dataset("guan-wang/ReDis-QA")
redis_data = redis_data["test"]

print(redis_data)
print(redis_data[0])

Dataset({
    features: ['question', 'opa', 'opb', 'opc', 'opd', 'cop', 'rare disease', 'input'],
    num_rows: 1360
})
{'question': 'All are features of Abetalipoproteinemia, EXCEPT:', 'opa': 'Plasma levels of cholesterol and triglyceride are extremely low', 'opb': 'Manifest in early childhood with diarrhea', 'opc': 'Progressive pigmented retinopathy seen', 'opd': 'Neurological manifestation as ataxia in first decade', 'cop': 3, 'rare disease': ['Abetalipoproteinemia'], 'input': 'Question: All are features of Abetalipoproteinemia, EXCEPT:\nChoices:\nA. Plasma levels of cholesterol and triglyceride are extremely low\nB. Manifest in early childhood with diarrhea\nC. Progressive pigmented retinopathy seen\nD. Neurological manifestation as ataxia in first decade\nAnswer:'}


In [4]:
redis_data_train = []
for i in range(0, 1360):
    answer = "opa"
    if redis_data[i]["cop"] == 0:
        answer = "opa"
    elif redis_data[i]["cop"] == 1:
        answer = "opb"
    elif redis_data[i]["cop"] == 2:
        answer = "opc"
    elif redis_data[i]["cop"] == 3:
        answer = "opd"
    redis_data_train.append({
        "question": redis_data[i]["input"],
        "answer": redis_data[i][answer]
    })

redis_data_train = pd.DataFrame(redis_data_train)
display(redis_data_train)
redis_data_train.to_csv("raw_data/redis_data_train.csv", index=False)

Unnamed: 0,question,answer
0,Question: All are features of Abetalipoprotein...,Neurological manifestation as ataxia in first ...
1,Question: Abetalipoproteinemia is due to defic...,Mitochondrial Triglyceride Transfer Protein
2,Question: Small intestinal biopsy is diagnosti...,abd
3,Question: Intermittent dysphagia is caused by ...,de
4,Question: Following are radiological evidence ...,Exaggerated peristalsis
...,...,...
1355,Question: An infectious disease investigator i...,90/100.
1356,Question: Acute Myeloid Leukemia is a malignan...,Platelet
1357,Question: Chikungunya has spread widely from A...,Mutation in the virus allowing replication in ...
1358,Question: Measles has been controlled in many ...,By MMR vaccine


# Combined Training Data

In [56]:
train_data = pd.read_csv("intermediate_data/rarebench_data_train.csv")
train_data = train_data.drop("diseases", axis = 1)

for i in range(0, len(train_data)):
    train_data.iloc[i]['symptoms'] = "A patient shows the symptoms " + train_data.iloc[i]['symptoms'] + ". Diagnose the patient with a rare diseases. Let's think step by step."

train_data = train_data.rename(columns={'symptoms': 'question', 'diagnosis': 'answer'})
display(train_data)

Unnamed: 0,question,answer
0,A patient shows the symptoms ['Death in infanc...,Disease: Methylmalonic acidemia; MMA; Methylma...
1,A patient shows the symptoms ['Sloping forehea...,Disease: TBCK-related intellectual disability ...
2,"A patient shows the symptoms ['Lethargy', 'Gen...",Disease: Maple syrup urine disease (MSUD) \nR...
3,A patient shows the symptoms ['Urinary inconti...,Disease: Autosomal recessive spastic paraplegi...
4,A patient shows the symptoms ['Intellectual di...,Disease: Vitamin B12-unresponsive methylmaloni...
...,...,...
696,A patient shows the symptoms ['Death in infanc...,Disease: Phenylketonuria (PKU) \nReasoning: ...
697,"A patient shows the symptoms ['Vomiting', 'Eth...",Disease: Short chain acyl-CoA dehydrogenase de...
698,"A patient shows the symptoms ['Hypospadias', '...",Smith-Lemli-Opitz syndrome: \n1. **Biochemica...
699,A patient shows the symptoms ['Death in infanc...,Disease: Isovaleric acidemia (IVA) \nReasonin...


In [57]:
train_data = pd.concat([train_data, pd.read_csv("raw_data/redis_data_train.csv")], ignore_index=True)
display(train_data)

Unnamed: 0,question,answer
0,A patient shows the symptoms ['Death in infanc...,Disease: Methylmalonic acidemia; MMA; Methylma...
1,A patient shows the symptoms ['Sloping forehea...,Disease: TBCK-related intellectual disability ...
2,"A patient shows the symptoms ['Lethargy', 'Gen...",Disease: Maple syrup urine disease (MSUD) \nR...
3,A patient shows the symptoms ['Urinary inconti...,Disease: Autosomal recessive spastic paraplegi...
4,A patient shows the symptoms ['Intellectual di...,Disease: Vitamin B12-unresponsive methylmaloni...
...,...,...
2056,Question: An infectious disease investigator i...,90/100.
2057,Question: Acute Myeloid Leukemia is a malignan...,Platelet
2058,Question: Chikungunya has spread widely from A...,Mutation in the virus allowing replication in ...
2059,Question: Measles has been controlled in many ...,By MMR vaccine


In [58]:
train_data = train_data.assign(text = [1]*len(train_data))
print(len(train_data))
for i in range(0, len(train_data)):
    train_data.at[i, "text"] = "[INST] <<SYS>> You are a specialist in the field of rare diseases. <</SYS>> " + train_data.iloc[i]["question"] + " [/INST] " + train_data.iloc[i]["answer"] + " </s>"
train_data = train_data.drop(columns=['question', 'answer'])
train_data = train_data.sample(frac=1, ignore_index=True)
display(train_data)

2061


Unnamed: 0,text
0,[INST] <<SYS>> You are a specialist in the fie...
1,[INST] <<SYS>> You are a specialist in the fie...
2,[INST] <<SYS>> You are a specialist in the fie...
3,[INST] <<SYS>> You are a specialist in the fie...
4,[INST] <<SYS>> You are a specialist in the fie...
...,...
2056,[INST] <<SYS>> You are a specialist in the fie...
2057,[INST] <<SYS>> You are a specialist in the fie...
2058,[INST] <<SYS>> You are a specialist in the fie...
2059,[INST] <<SYS>> You are a specialist in the fie...


In [59]:
train_data.to_csv("final_traindata/final_train_data.csv", index = False)

# Orphadata Phenotypes

In [6]:
tree = ET.parse('raw_data/orphadata_phenotypes.xml')  
root = tree.getroot()
results = []

for child in root[1]:
    print(child[0][2].text)
    prompt= "[INST] <<SYS>> You are a specialist in the field of rare diseases. <</SYS>> What rare disease expresses phenotypes"
    for pheno in child[0][5]:
        prompt += " " + pheno[0][1].text + " as " + pheno[1][0].text + ","
    prompt = prompt[:len(prompt)-1] + "? [/INST] " + child[0][2].text + " </s>"
    print(prompt)
    results.append({"text": prompt})

results_df = pd.DataFrame(results)

withorpha_train_data = pd.concat([results_df, pd.read_csv("final_traindata/final_train_data.csv")], ignore_index=True)
withorpha_train_data = withorpha_train_data.sample(frac=1, ignore_index=True)
withorpha_train_data.to_csv("final_traindata/withorpha_train_data.csv", index=False)


Alexander disease
[INST] <<SYS>> You are a specialist in the field of rare diseases. <</SYS>> What rare disease expresses phenotypes Macrocephaly as Very frequent (99-80%), Intellectual disability as Very frequent (99-80%), Seizure as Very frequent (99-80%), Spasticity as Very frequent (99-80%), Agenesis of corpus callosum as Very frequent (99-80%), Hyperreflexia as Very frequent (99-80%), Megalencephaly as Very frequent (99-80%), Failure to thrive as Very frequent (99-80%), Frontal bossing as Very frequent (99-80%), Nausea and vomiting as Very frequent (99-80%), Abnormality of speech or vocalization as Very frequent (99-80%), Clonus as Very frequent (99-80%), EEG abnormality as Very frequent (99-80%), Sleep abnormality as Very frequent (99-80%), Scoliosis as Very frequent (99-80%), Abnormal pyramidal sign as Very frequent (99-80%), Large face as Very frequent (99-80%), Abnormality of eye movement as Frequent (79-30%), Ptosis as Frequent (79-30%), Nystagmus as Frequent (79-30%), Diplop