In [None]:
import os
import random
import re

import pandas as pd
import spacy



In [None]:
# Shell escape: download the en_core_web_md spaCy model package that is loaded by name in the next cell.
!python -m spacy download en_core_web_md 

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en-core-web-md==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.5.0/en_core_web_md-3.5.0-py3-none-any.whl (42.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 MB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')


In [None]:
# Load the downloaded model once; the resulting pipeline is passed to
# get_sentence_vectors() to embed symptom text.
nlp = spacy.load('en_core_web_md')

In [None]:
def get_sentence_vectors(text, nlp):
    """Embed ``text`` and return the embedding as a plain Python list.

    Parameters
    ----------
    text : str
        The sentence or phrase to embed.
    nlp : callable
        Called as ``nlp(text)``; the result must expose a ``.vector``
        attribute that supports ``.tolist()``.

    Returns
    -------
    list
        ``nlp(text).vector`` converted with ``.tolist()``.
    """
    doc = nlp(text)
    return doc.vector.tolist()

In [None]:
# Read the three input tables: illnesses, symptoms, and the illness-symptom links.
illness_df, symptom_df, links_df = (
    pd.read_csv(csv_path)
    for csv_path in ('/content/dia_t.csv', '/content/sym_t.csv', '/content/diffsydiw.csv')
)

In [None]:
# Attach the illness columns (joined on "did") and then the symptom columns
# (joined on "syd") to every illness-symptom link.
links_with_illness = pd.merge(links_df, illness_df, on="did")
source_data = pd.merge(links_with_illness, symptom_df, on="syd")

In [None]:
# Display the merged link/illness/symptom table.
source_data

Unnamed: 0,syd,did,wei,diagnose,symptom
0,1.0,163.0,2.0,Cholecystitisinflammation of the gallbladder,Upper abdominal pain
1,1.0,164.0,2.0,Choledocholithiasisstone in bile duct,Upper abdominal pain
2,1.0,165.0,1.0,Cholelithiasisgallstones,Upper abdominal pain
3,1.0,187.0,2.0,Constipation,Upper abdominal pain
4,1.0,306.0,2.0,Gastric ulcerstomach ulcer,Upper abdominal pain
...,...,...,...,...,...
5563,106.0,827.0,0.0,Vulvar squamous cell carcinomaskin cancer on ...,Vaginal bleeding after menopause
5564,186.0,966.0,2.0,Hair lossalopecia,Hair loss (Baldness)
5565,186.0,1415.0,0.0,Hypoparathyroidism,Hair loss (Baldness)
5566,186.0,1415.0,0.0,Hypoparathyroidism,Hair loss (Baldness)


In [None]:
# Drop rows with a missing symptom or diagnosis, keep only the columns we need,
# and give them descriptive names.
# Renaming by mapping (rather than assigning a positional list of names) keeps
# each new name tied to its source column, and .rename() returns a new frame, so
# later column assignments do not operate on a slice of the merged table.
source_data = (
    source_data
    .dropna(subset=['symptom', 'diagnose'])
    .loc[:, ['did', 'syd', 'diagnose', 'symptom']]
    .rename(columns={'did': 'illness_id',
                     'syd': 'symptom_id',
                     'diagnose': 'illness'})
)

In [None]:
# Count the missing values remaining in each column.
source_data.isna().sum()

illness_id    0
symptom_id    0
illness       0
symptom       0
dtype: int64

In [None]:
# Replace vertical-tab characters ('\x0b') in the text columns with a space.
for text_column in ('illness', 'symptom'):
    source_data[text_column] = source_data[text_column].str.replace('\x0b', ' ')

In [None]:
# Keep only rows that have a symptom name; .copy() gives an independent frame
# so the column assignments below do not write into a slice of the loaded table.
symptom_df = symptom_df.loc[symptom_df['symptom'].notna()].copy()

# Replace vertical-tab characters *before* embedding, so that the stored symptom
# text and its vector are derived from the same string.
symptom_df['symptom'] = symptom_df['symptom'].str.replace('\x0b', ' ')

# One embedding (a plain list of floats) per symptom name.
symptom_df['symptom_vector'] = symptom_df['symptom'].apply(
    lambda symptom_text: get_sentence_vectors(symptom_text, nlp)
)

# Rename by mapping rather than by position; this is a no-op if the column has
# already been renamed, so the cell can be re-run safely.
symptom_df = symptom_df.rename(columns={'syd': 'symptom_id'})

In [None]:
# Display the symptom table with its embedding column.
symptom_df

Unnamed: 0,symptom_id,symptom,symptom_vector
0,1,Upper abdominal pain,"[-0.3931533396244049, -1.122189998626709, -2.3..."
1,2,Lower abdominal pain,"[0.11224666982889175, -1.3984565734863281, -2...."
2,3,Abscess (Collection of pus),"[-4.394866943359375, -5.325353145599365, 3.294..."
3,4,Alcohol abuse,"[-1.0100150108337402, -3.2876999378204346, -1...."
4,5,Anxiety (Nervousness),"[-2.1256749629974365, -4.137800216674805, 4.55..."
...,...,...,...
267,295,Nipple discharge,"[-1.7614949941635132, 0.5206300020217896, -1.9..."
268,301,Shoulder stiffness or tightness,"[-1.2457798719406128, 2.181957483291626, -6.26..."
269,303,Arm stiffness or tightness,"[-1.7361524105072021, 1.2958674430847168, -4.8..."
270,304,High blood pressure,"[-0.959559977054596, 0.45383667945861816, -2.6..."


In [None]:
# Persist the cleaned tables. Create the output directory first so that the
# writes do not fail when 'data/' does not exist yet (e.g. on a fresh runtime).
os.makedirs('data', exist_ok=True)
source_data.to_pickle('data/source_data.pkl')
symptom_df.to_pickle('data/symptoms.pkl')

In [None]:
# list of illnesses, in order of first appearance
illnesses = list(source_data['illness'].drop_duplicates())

# Distinct symptoms linked to each illness, collected in a single pass over
# source_data instead of re-filtering the whole frame once per illness.
symptoms_by_illness = source_data.groupby('illness', sort=False)['symptom'].unique()

# One 0/1 vector per illness, aligned with the rows of symptom_df: position i is
# 1 when the i-th symptom in symptom_df is linked to that illness.
# The flags are computed from the symptom column directly, so symptom_df is not
# modified (no scratch "related_to_illness" column is left behind on it).
all_symptoms = symptom_df['symptom']
symptom_vectors = [
    all_symptoms.isin(symptoms_by_illness.loc[illness]).astype(int).tolist()
    for illness in illnesses
]

diagnosis_data = pd.DataFrame({"illness": illnesses,
                               "illness_vector": symptom_vectors})

In [None]:
# Display each illness with its 0/1 symptom-membership vector.
diagnosis_data

Unnamed: 0,illness,illness_vector
0,Cholecystitis inflammation of the gallbladder,"[1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,Choledocholithiasis stone in bile duct,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,Cholelithiasis gallstones,"[1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, ..."
3,Constipation,"[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,Gastric ulcer stomach ulcer,"[1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...
1151,Skin swelling,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1152,Eye trauma injury,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1153,Endometrial cancer cancer of the lining of the...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1154,Hair loss alopecia,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [None]:
number_of_symptoms = [1, 2, 3, 4]
start_of_description = [
    "I have",
    "I'm suffering from",
    "I have really bad",
    "My symptoms are",
    "For the last few days I have had",
    "My husband is suffering from" ,
    "My wife is suffering from",
    "My son is suffering from",
    "My daughter is suffering from",
    "My child is suffering from",
    "I don't feel well, I have"
]


def _tag_symptom(symptom):
    """Lower-case ``symptom``, drop any parenthesised aside, and wrap it as ``[text](symptom)``."""
    text = re.sub(r"\([^)]+\)", "", symptom.lower()).strip()
    return f"[{text}](symptom)"


def _join_symptoms(tagged):
    """Join tagged symptoms into one phrase.

    Two symptoms are joined with " and ", three with commas plus a final
    ", and "; any other count is a plain comma-separated list.
    """
    if len(tagged) == 2:
        return f"{tagged[0]} and {tagged[1]}"
    if len(tagged) == 3:
        return f"{tagged[0]}, {tagged[1]}, and {tagged[2]}"
    return ", ".join(tagged)


# get some examples of users describing different numbers of symptoms
for symptons_count in number_of_symptoms:

    # make 100 examples of each number of symptoms
    for ex in range(1, 101):

        description_beginning = random.choice(start_of_description)

        # Draw exactly as many symptoms as this example needs. A single
        # .sample(n) call picks distinct rows, so one sentence never lists the
        # same symptom row twice (independent draws could repeat one).
        symptoms = symptom_df['symptom'].sample(symptons_count)

        # remove parentheses from symptoms and add the necessary entity tags
        symptoms_entity = [_tag_symptom(symptom) for symptom in symptoms]

        # create the training sample string
        symptom_string = f"- {description_beginning} {_join_symptoms(symptoms_entity)}"

        print(symptom_string)

- My symptoms are [fainting](symptom)
- My daughter is suffering from [eye redness](symptom)
- I'm suffering from [shoulder stiffness or tightness](symptom)
- My child is suffering from [lower abdominal pain](symptom)
- For the last few days I have had [chest pain](symptom)
- My husband is suffering from [lethargy](symptom)
- I don't feel well, I have [vomiting](symptom)
- I have really bad [increased facial hair](symptom)
- I have [psychiatric problem](symptom)
- My son is suffering from [delusions or hallucinations](symptom)
- I have really bad [vaginal pain](symptom)
- My wife is suffering from [darkening of the skin](symptom)
- I have [melena](symptom)
- My son is suffering from [lethargy](symptom)
- I have [shoulder ache or pain](symptom)
- For the last few days I have had [numbness](symptom)
- I have really bad [confusion](symptom)
- My wife is suffering from [penis inflammation or swelling](symptom)
- For the last few days I have had [fever of unknown origin](symptom)
- My son i