In [19]:
import pandas as pd
import json

# read in scraped mayo symptoms
with open("mayo_data.json", 'r') as j:
     data = json.loads(j.read())

symptoms=[]
specialties=[]
diseases=[]

for disease in list(data.keys()):
    list_of_symptoms=data[disease]['symptoms']
    if len(list_of_symptoms)==0:
        pass
    symptoms.append(list_of_symptoms)
    diseases.append([disease]*len(list_of_symptoms))
    list_of_specialties=data[disease]['specialties']
    specialtystr=','.join(list_of_specialties)
    specialties.append([specialtystr]*len(list_of_symptoms))

def flatten(l):
    return [item for sublist in l for item in sublist]

diseases=flatten(diseases)
symptoms=flatten(symptoms)
specialties=flatten(specialties)

lst = [diseases,symptoms,specialties]
df = pd.DataFrame(
    {'disease': diseases,
     'symptoms': symptoms,
     'specialties': specialties
    })

In [20]:
input_value = df.groupby('disease', as_index=False).agg(lambda x: ' '.join(x.unique()))
# df['symptoms'] = df['symptoms'].str.split()
input_value

Unnamed: 0,disease,symptoms,specialties
0,ACL injury,"Rapid swelling A loud pop or a ""popping"" sensa...","Physical Medicine and Rehabilitation,Sports Me..."
1,ARDS,Severe shortness of breath Labored and unusual...,"Pulmonary Medicine,Critical Care,Extracorporea..."
2,Abdominal aortic aneurysm,"Back pain Deep, constant pain in the belly are...","Vascular centers,Vascular and Endovascular Sur..."
3,Achalasia,"Inability to swallow (dysphagia), which may fe...","Pediatric Surgery,Gastroenterology and Hepatol..."
4,Achilles tendinitis,Achilles tendinitis Achilles tendinitis is an ...,"Physical Medicine and Rehabilitation,Sports Me..."
...,...,...,...
531,Vulvodynia,Rawness Vulva Burning Painful intercourse (dys...,"Physical Medicine and Rehabilitation,Obstetric..."
532,Waldenstrom macroglobulinemia,Easy bruising Fever Shortness of breath Bleedi...,"Transplant Center,Hematology,Bone Marrow Trans..."
533,Wet macular degeneration,Decreased intensity or brightness of colors In...,Ophthalmology
534,Wilms' tumor,Fever Shortness of breath An abdominal mass yo...,"Pediatric Hematology/Oncology in Minnesota,Chi..."


In [21]:
file = open("UserInputSymptoms.plk",'rb')
output = pd.read_pickle(file)
output

Unnamed: 0,disease,userquery
0,Aortic dissection,"\n-sudden and severe chest pain, often describ..."
1,Chronic daily headaches,\nThere is no definitive answer to this questi...
2,Nephrotic syndrome,\n-Fluid retention (edema)\n-Weight gain\n-Foa...
3,Dizziness,\nThe symptoms of dizziness can vary depending...
4,Emphysema,\nThe main symptom of emphysema is shortness o...
...,...,...
531,Pancreatic neuroendocrine tumors,\nThe most common symptom of a pancreatic neur...
532,Hearing loss,\nSome symptoms of hearing loss include strugg...
533,Retinoblastoma,\n-One or both eyes may look enlarged\n-White ...
534,Rumination syndrome,\nThere is no one definitive symptom of rumina...


In [25]:
output['type'] = 'userquery'
input_value['type'] = 'system'
output = output.rename(columns={'userquery': 'symptoms'})
result = pd.concat([input_value, output], axis=0, join='inner')
display(result)

Unnamed: 0,disease,symptoms,type
0,ACL injury,"Rapid swelling A loud pop or a ""popping"" sensa...",system
1,ARDS,Severe shortness of breath Labored and unusual...,system
2,Abdominal aortic aneurysm,"Back pain Deep, constant pain in the belly are...",system
3,Achalasia,"Inability to swallow (dysphagia), which may fe...",system
4,Achilles tendinitis,Achilles tendinitis Achilles tendinitis is an ...,system
...,...,...,...
531,Pancreatic neuroendocrine tumors,\nThe most common symptom of a pancreatic neur...,userquery
532,Hearing loss,\nSome symptoms of hearing loss include strugg...,userquery
533,Retinoblastoma,\n-One or both eyes may look enlarged\n-White ...,userquery
534,Rumination syndrome,\nThere is no one definitive symptom of rumina...,userquery


In [26]:
result = result.replace('\n|-','', regex=True)
result['symptoms'] = result['symptoms'].str.split()
result

Unnamed: 0,disease,symptoms,type
0,ACL injury,"[Rapid, swelling, A, loud, pop, or, a, ""poppin...",system
1,ARDS,"[Severe, shortness, of, breath, Labored, and, ...",system
2,Abdominal aortic aneurysm,"[Back, pain, Deep,, constant, pain, in, the, b...",system
3,Achalasia,"[Inability, to, swallow, (dysphagia),, which, ...",system
4,Achilles tendinitis,"[Achilles, tendinitis, Achilles, tendinitis, i...",system
...,...,...,...
531,Pancreatic neuroendocrine tumors,"[The, most, common, symptom, of, a, pancreatic...",userquery
532,Hearing loss,"[Some, symptoms, of, hearing, loss, include, s...",userquery
533,Retinoblastoma,"[One, or, both, eyes, may, look, enlargedWhite...",userquery
534,Rumination syndrome,"[There, is, no, one, definitive, symptom, of, ...",userquery


In [27]:
import itertools

## Jaccard similarity functions obtained from https://stackoverflow.com/questions/71635040/get-jaccard-similarity-by-comparing-all-rows-in-a-pandas-dataframe-while-keeping
def jaccard_similarity(x, y):
    """ returns the jaccard similarity between two lists """
    intersection_cardinality = len(set.intersection(*[set(x), set(y)]))
    union_cardinality = len(set.union(*[set(x), set(y)]))
    if union_cardinality == 0:
        return 1
    return intersection_cardinality/float(union_cardinality)

wordings_users = list(zip(result["symptoms"], result["disease"], result["type"]))

result = []

for item in list(itertools.combinations(wordings_users, 2)):
    similarity = jaccard_similarity(item[0][0], item[1][0])
    data = {"disease1": item[0][1], "disease2": item[1][1], "disease1_type": item[0][2], "disease2_type": item[1][2], "similarity": similarity}
    result.append(data)

result = pd.DataFrame(result)
result

Unnamed: 0,disease1,disease2,disease1_type,disease2_type,similarity
0,ACL injury,ARDS,system,system,0.071429
1,ACL injury,Abdominal aortic aneurysm,system,system,0.150000
2,ACL injury,Achalasia,system,system,0.112903
3,ACL injury,Achilles tendinitis,system,system,0.054545
4,ACL injury,Achilles tendon rupture,system,system,0.172414
...,...,...,...,...,...
574051,Hearing loss,Rumination syndrome,userquery,userquery,0.063492
574052,Hearing loss,Bell's palsy,userquery,userquery,0.142857
574053,Retinoblastoma,Rumination syndrome,userquery,userquery,0.042553
574054,Retinoblastoma,Bell's palsy,userquery,userquery,0.069767


In [29]:
result = result[result['disease1_type'] != result['disease2_type']]
result = result.sort_values("similarity", ascending=False)
result = result.drop_duplicates(subset=["disease2"], keep="first")
result[result['disease1'] == result['disease2']]

Unnamed: 0,disease1,disease2,disease1_type,disease2_type,similarity
96874,Bunions,Bunions,system,userquery,0.500000
407381,Testicular cancer,Testicular cancer,system,userquery,0.395833
281745,Iritis,Iritis,system,userquery,0.388889
414405,Trigger finger,Trigger finger,system,userquery,0.345455
421286,Uterine fibroids,Uterine fibroids,system,userquery,0.307692
...,...,...,...,...,...
22261,Alcohol use disorder,Alcohol use disorder,system,userquery,0.123153
406783,Tension headache,Tension headache,system,userquery,0.120000
335999,Obstructive sleep apnea,Obstructive sleep apnea,system,userquery,0.119403
227022,Gingivitis,Gingivitis,system,userquery,0.115385


> Comment: Got 69 right out of the 536 inputs, about 12.87% accuracy. 

## Simple Test on Jaccard Similarity

In [4]:
def Convert(string):
    li = list(string.split(" "))
    return li

## symptoms from aortic dissection disease
str1 = "Sudden severe chest or upper back pain, often described as a tearing or ripping sensation, that spreads to the neck or down the back \
Sudden severe stomach pain Loss of consciousness Shortness of breath Symptoms similar to those of a stroke, including sudden vision problems, difficulty speaking, and weakness or loss of movement (paralysis) on one side of your body \
Weak pulse in one arm or thigh compared with the other Leg pain Difficulty walking"

## sample user input 
str2 = "I have shortness of breath, leg pain, loss of consciousness"
list1 = Convert(str1)
list2 = Convert(str2)

def jaccard_similarity(list1, list2):
    intersection = len(list(set(list1).intersection(list2)))
    union = (len(set(list1)) + len(set(list2))) - intersection
    return float(intersection) / union

print(jaccard_similarity(list1, list2))

0.06349206349206349


## Jaccard Similarity between Diseases

In [6]:
import itertools
import pandas as pd

# copied from OP above
def jaccard_similarity(x, y):
    """ returns the jaccard similarity between two lists """
    intersection_cardinality = len(set.intersection(*[set(x), set(y)]))
    union_cardinality = len(set.union(*[set(x), set(y)]))
    if union_cardinality == 0:
        return 1
    return intersection_cardinality/float(union_cardinality)

# create list of tuples like [(symptoms, disease), (symptoms, disease)]
wordings_users = list(zip(df["symptoms"], df["disease"]))

result = []

# create list of all possible combinations between sets of (symptoms, disease) and loop through them
for item in list(itertools.combinations(wordings_users, 2)):
    similarity = jaccard_similarity(item[0][0], item[1][0])
    data = {"disease1": item[0][1], "disease2": item[1][1], "similarity": similarity}
    result.append(data)

df_final = pd.DataFrame(result)
df_final = df_final.sort_values(by = 'similarity', ascending=False)
df_final.head(20)

Unnamed: 0,disease1,disease2,similarity
80187,Dry macular degeneration,Wet macular degeneration,1.0
94167,Floor of the mouth cancer,Soft palate cancer,0.8125
114182,Immune thrombocytopenia (ITP),Thrombocytopenia (low platelet count),0.779661
87683,Epilepsy,Seizures,0.766667
124112,Lymphoma,Non,0.75
135850,Prediabetes,Type 2 diabetes,0.724138
110798,Hodgkin's lymphoma (Hodgkin's disease),Lymphoma,0.7
110837,Hodgkin's lymphoma (Hodgkin's disease),Non,0.641509
60415,Chronic kidney disease,End,0.633333
41907,Broken hand,Broken wrist,0.625


> Comments: Some of the diseases are very similar in symptoms, for example 'floor of the mouth cancer' and 'soft palate cancer', 'colon cancer' and 'rectal cancer', 'cervical cancer and 'vaginal cancer' etc. 