**CSCI 4022 - Final Project**
**Disease Prediction Using Symptoms**

Team Members: Toby Savage & Sahand Setareh

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

**Dataset**

The dataset that we have been working with is the Disease Symptom Prediction set provided on Kaggle. The dataset consists of diseases and their associated symptoms (such as Malaria and corresponding chills, high fever, etc.).  Additionally, the dataset provides adjacent CSVs concerning symptom severity, symptom descriptions, and symptom precautions. They collectively provide insight into various diseases, the details about those symptoms, the severity of those symptoms, and the proper precautions to be taken as a precaution. 

We are using the A-priori algorithm to determine frequent pairings of symptoms and their corresponding diseases. Using this knowledge, we hope to provide diagnostic predictions based on any pairing of symptoms.

In [2]:
df = pd.read_csv('./data/dataset.csv')
# df2 = pd.read_csv('./data/symptom_precaution.csv')

In [3]:
df
# df2

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,Fungal infection,itching,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,
1,Fungal infection,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
2,Fungal infection,itching,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
3,Fungal infection,itching,skin_rash,dischromic _patches,,,,,,,,,,,,,,
4,Fungal infection,itching,skin_rash,nodal_skin_eruptions,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4915,(vertigo) Paroymsal Positional Vertigo,vomiting,headache,nausea,spinning_movements,loss_of_balance,unsteadiness,,,,,,,,,,,
4916,Acne,skin_rash,pus_filled_pimples,blackheads,scurring,,,,,,,,,,,,,
4917,Urinary tract infection,burning_micturition,bladder_discomfort,foul_smell_of urine,continuous_feel_of_urine,,,,,,,,,,,,,
4918,Psoriasis,skin_rash,joint_pain,skin_peeling,silver_like_dusting,small_dents_in_nails,inflammatory_nails,,,,,,,,,,,


In [4]:
df['Disease'].unique()

array(['Fungal infection', 'Allergy', 'GERD', 'Chronic cholestasis',
       'Drug Reaction', 'Peptic ulcer diseae', 'AIDS', 'Diabetes ',
       'Gastroenteritis', 'Bronchial Asthma', 'Hypertension ', 'Migraine',
       'Cervical spondylosis', 'Paralysis (brain hemorrhage)', 'Jaundice',
       'Malaria', 'Chicken pox', 'Dengue', 'Typhoid', 'hepatitis A',
       'Hepatitis B', 'Hepatitis C', 'Hepatitis D', 'Hepatitis E',
       'Alcoholic hepatitis', 'Tuberculosis', 'Common Cold', 'Pneumonia',
       'Dimorphic hemmorhoids(piles)', 'Heart attack', 'Varicose veins',
       'Hypothyroidism', 'Hyperthyroidism', 'Hypoglycemia',
       'Osteoarthristis', 'Arthritis',
       '(vertigo) Paroymsal  Positional Vertigo', 'Acne',
       'Urinary tract infection', 'Psoriasis', 'Impetigo'], dtype=object)

In [5]:
# Creating a dictionary where diseases are the keys and all of their possible symptoms are the values
diseases = {}
for d in df['Disease'].unique():
    diseases[d] = []
for d in diseases.keys():
    for i, row in enumerate(df[df['Disease'] == d]):
        diseases[d].append([s for s in df[df['Disease'] == d].iloc[i, 1:] if not pd.isna(s)])
    diseases[d] = {s.lstrip() for l in diseases[d] for s in l}
diseases

{'Fungal infection': {'dischromic _patches',
  'itching',
  'nodal_skin_eruptions',
  'skin_rash'},
 'Allergy': {'chills',
  'continuous_sneezing',
  'shivering',
  'watering_from_eyes'},
 'GERD': {'acidity',
  'chest_pain',
  'cough',
  'stomach_pain',
  'ulcers_on_tongue',
  'vomiting'},
 'Chronic cholestasis': {'abdominal_pain',
  'itching',
  'loss_of_appetite',
  'nausea',
  'vomiting',
  'yellowing_of_eyes',
  'yellowish_skin'},
 'Drug Reaction': {'burning_micturition',
  'itching',
  'skin_rash',
  'spotting_ urination',
  'stomach_pain'},
 'Peptic ulcer diseae': {'abdominal_pain',
  'indigestion',
  'internal_itching',
  'loss_of_appetite',
  'passage_of_gases',
  'vomiting'},
 'AIDS': {'extra_marital_contacts',
  'high_fever',
  'muscle_wasting',
  'patches_in_throat'},
 'Diabetes ': {'blurred_and_distorted_vision',
  'excessive_hunger',
  'fatigue',
  'increased_appetite',
  'irregular_sugar_level',
  'lethargy',
  'obesity',
  'polyuria',
  'restlessness',
  'weight_loss'},


In [6]:
# All possible symptoms in the dataset
symptoms = [list(diseases.values())[i] for i in range(len(diseases.values()))]
symptoms = {s.lstrip() for l in symptoms for s in l}
symptoms = list(symptoms)
symptoms

['yellowish_skin',
 'blackheads',
 'brittle_nails',
 'malaise',
 'stiff_neck',
 'prominent_veins_on_calf',
 'continuous_sneezing',
 'swollen_extremeties',
 'visual_disturbances',
 'depression',
 'altered_sensorium',
 'painful_walking',
 'swollen_blood_vessels',
 'polyuria',
 'spotting_ urination',
 'nodal_skin_eruptions',
 'fatigue',
 'swelling_of_stomach',
 'swelled_lymph_nodes',
 'bloody_stool',
 'patches_in_throat',
 'indigestion',
 'throat_irritation',
 'chills',
 'family_history',
 'increased_appetite',
 'foul_smell_of urine',
 'silver_like_dusting',
 'internal_itching',
 'skin_rash',
 'irritation_in_anus',
 'redness_of_eyes',
 'palpitations',
 'swollen_legs',
 'fluid_overload',
 'runny_nose',
 'puffy_face_and_eyes',
 'anxiety',
 'continuous_feel_of_urine',
 'high_fever',
 'ulcers_on_tongue',
 'fast_heart_rate',
 'muscle_wasting',
 'drying_and_tingling_lips',
 'neck_pain',
 'passage_of_gases',
 'red_sore_around_nose',
 'bladder_discomfort',
 'skin_peeling',
 'dischromic _patches',

**Implementation of A-priori**

When running the A-priori algorithm, we opted to use 85th percentile support for frequent occurrences of single symptoms. This was also used to calculate frequent symptom pairings. The results of the frequent pairs are as follows:

In [7]:
# generateFrequentSingles() takes in the list of diseases and a percentile-based threshold
# using this, we obtain frequent singleton occurences of specific symptoms from the dataframe
# Inspired by and modified Homework 5 implementation
# def generateFrequentSingles(diseases, percentile):

# NOTE: This was an experiment in treating the patients as "baskets"
    
#     counts = np.zeros(len(symptoms)) # frequent items matrix
#     for index, row in df.iterrows():
#         for s in row[1:]:
#             if not pd.isna(s):
#                 counts[symptoms.index(s.lstrip())] += 1 # if the symptom is not 'nan', increment count by 1
#     frequency_table = {}
#     for d in diseases.keys():
#         frequent_symptoms = list()
#         for s in diseases[d]:
#             if counts[symptoms.index(s)] >= np.percentile(counts, percentile): # if the item at 'index' is a frequent item
#                 frequency_table[s] = counts[symptoms.index(s)] 
#                 frequent_symptoms.append(s)
#     return frequency_table

In [8]:
# generateFrequentSingles() takes in the list of diseases and a percentile-based threshold
# using this, we obtain frequent singleton occurences of specific symptoms from the dataframe
# Inspired by and modified Homework 5 implementation

# NOTE: This is our implementation where we treat the diseases and their possible symptoms as "baskets"

def generateFrequentSingles(diseases, percentile):
    
    counts = np.zeros(len(symptoms)) # frequent items matrix
    for d in diseases.keys():
        for s in diseases[d]:
                counts[symptoms.index(s.lstrip())] += 1 
    frequency_table = {}
    for d in diseases.keys():
        frequent_symptoms = list()
        for s in diseases[d]:
            if counts[symptoms.index(s)] >= np.percentile(counts, percentile): # if the item at 'index' is a frequent item
                frequency_table[s] = counts[symptoms.index(s)] 
                frequent_symptoms.append(s)
    return frequency_table

In [9]:
# Frequent symptom singletons with number of occurences
frequency_table = generateFrequentSingles(diseases, 85)
print("List of sorted frequent symptoms and their counts: \n")
print(sorted([[i, int(j)] for i, j in zip(frequency_table.keys(), 
                                     frequency_table.values())], key = lambda x: x[1], reverse = True))

List of sorted frequent symptoms and their counts: 

[['vomiting', 17], ['fatigue', 17], ['high_fever', 12], ['loss_of_appetite', 10], ['nausea', 10], ['headache', 10], ['abdominal_pain', 9], ['yellowish_skin', 8], ['skin_rash', 7], ['chills', 7], ['yellowing_of_eyes', 7], ['itching', 6], ['chest_pain', 6], ['sweating', 6], ['malaise', 6], ['joint_pain', 6], ['cough', 5], ['diarrhoea', 5], ['dark_urine', 5], ['weight_loss', 4], ['excessive_hunger', 4], ['lethargy', 4], ['breathlessness', 4], ['irritability', 4], ['muscle_pain', 4]]


In [10]:
# Generate frequent item (symptom) pairs from frequent symptom singles using an 
# 85 percentile item occurence threshold
# Inspired by and modified Homework 5 implementation

frequency_table = generateFrequentSingles(diseases, 85)

frequent_items = [i for i in frequency_table.keys()]
C2 = np.zeros((len(frequent_items), len(frequent_items))) # upper-triangular matrix (frequent pairs)

trips = list()
for index_1 in range(0, len(frequent_items)):
    for index_2 in range(index_1 + 1, len(frequent_items)):
            trips.append((frequent_items[index_1], frequent_items[index_2], 0))
            
for d in diseases.keys():
    for i, t in enumerate(trips):
        if (t[0] in diseases[d]) and (t[1] in diseases[d]):
            trips[i] = (t[0], t[1], t[2] + 1)
            
for t in trips:
    C2[frequent_items.index(t[0])][frequent_items.index(t[1])] = t[2]

In [11]:
print('Frequent Pairs\n')
print([[t[0], t[1]] for t in trips])
print('\n-------------------------------------------------------------')
print('\nFrequent Pairs Matrix\n')
print(C2)
print('\n-------------------------------------------------------------\n')

# 15 most frequent pairs
print('15 Most frequent symptom pairs sorted by number of occurences: \n')
for t in sorted(trips, key = lambda x: x[2], reverse = True)[0:15]:
    print("{",t[0],",",t[1],"}, ", t[2])

Frequent Pairs

[['skin_rash', 'itching'], ['skin_rash', 'chills'], ['skin_rash', 'cough'], ['skin_rash', 'chest_pain'], ['skin_rash', 'vomiting'], ['skin_rash', 'yellowish_skin'], ['skin_rash', 'abdominal_pain'], ['skin_rash', 'yellowing_of_eyes'], ['skin_rash', 'loss_of_appetite'], ['skin_rash', 'nausea'], ['skin_rash', 'high_fever'], ['skin_rash', 'weight_loss'], ['skin_rash', 'excessive_hunger'], ['skin_rash', 'lethargy'], ['skin_rash', 'fatigue'], ['skin_rash', 'diarrhoea'], ['skin_rash', 'breathlessness'], ['skin_rash', 'headache'], ['skin_rash', 'irritability'], ['skin_rash', 'dark_urine'], ['skin_rash', 'sweating'], ['skin_rash', 'muscle_pain'], ['skin_rash', 'malaise'], ['skin_rash', 'joint_pain'], ['itching', 'chills'], ['itching', 'cough'], ['itching', 'chest_pain'], ['itching', 'vomiting'], ['itching', 'yellowish_skin'], ['itching', 'abdominal_pain'], ['itching', 'yellowing_of_eyes'], ['itching', 'loss_of_appetite'], ['itching', 'nausea'], ['itching', 'high_fever'], ['itchi

**Results of preliminary analysis**

Looking at the table above, we believe that the pairings make sense. These results are the firststep to providing patient-specific diagnoses and disease attribtuion based on the symptom occurences.

It is important to note that our frequent pairings are based on each unique disease and its associated set of symptoms. As we progress through the development of our project, we aim to execute the algorithm to work with patient-centric data, where we can take their symptoms and identify their illness based on those symptoms, as well as which “precautions” or treatments to take afterwards.

**Part 2 - Generating likely diseases**

In [12]:
# Generate frequent item (symptom) pairs from frequent symptom singles using a 
# 25 percentile item occurence threshold
# Inspired by and modified Homework 5 implementation

frequency_table = generateFrequentSingles(diseases, 25)

frequent_items = [i for i in frequency_table.keys()]
C2 = np.zeros((len(frequent_items), len(frequent_items)))

trips = list()
for index_1 in range(0, len(frequent_items)):
    for index_2 in range(index_1 + 1, len(frequent_items)):
            trips.append((frequent_items[index_1], frequent_items[index_2], 0))
            
for d in diseases.keys():
    for i, t in enumerate(trips):
        if (t[0] in diseases[d]) and (t[1] in diseases[d]):
            trips[i] = (t[0], t[1], t[2] + 1)
            
for t in trips:
    C2[frequent_items.index(t[0])][frequent_items.index(t[1])] = t[2]

In [13]:
# print('Frequent Pairs\n')
# print([[t[0], t[1]] for t in trips])
# print('\n-------------------------------------------------------------')
# print('\nFrequent Pairs Matrix\n')
# print(C2)
# print('\n-------------------------------------------------------------\n')

# 15 most frequent pairs
# print('15 Most frequent symptom pairs sorted by number of occurences: \n')
# for t in sorted(trips, key = lambda x: x[2], reverse = True)[0:15]:
#     print("{",t[0],",",t[1],"}, ", t[2])

In [14]:
# Linking frequent pairs with their associated diseases 
freq_pairs_to_diseases = {}

for t in trips:
    # If a pair actually occurs
    if t[2] != 0:
        freq_pairs_to_diseases[t[0] + ',' + t[1]] = [d for d in diseases.keys() if t[0] and t[1] in diseases[d]]

In [15]:
# Finds an associated disease based on any two symptom pairings provided by the user
def findDisease(s1, s2):
    key = s1 + ',' + s2
    if (key in freq_pairs_to_diseases.keys()):
        return freq_pairs_to_diseases[key]
    
    key = s2 + ',' + s1
    if (key in freq_pairs_to_diseases.keys()):
        return freq_pairs_to_diseases[key]
    
    return None

# Uses the three user-provided symptoms combinations and obtains a list of associated diseases
def parseSymptoms(user_symptoms):
    potential_diseases01 = findDisease(user_symptoms[0], user_symptoms[1])
    potential_diseases12 = findDisease(user_symptoms[1], user_symptoms[2])
    potential_diseases02 = findDisease(user_symptoms[0], user_symptoms[2])
    return [potential_diseases01, potential_diseases12, potential_diseases02]

def main():
    # Parsing user input and cleaning the strings for leading spaces
    user_symptoms = input('Please enter three of your symptoms seperated by commas: ') # user input
    user_symptoms = user_symptoms.split(',')
    user_symptoms = [s.lstrip() for s in user_symptoms]
    
    try: 
        print('Based on your symptoms, it appears that you may have one of the following: \n')
        # Obtain the intersection of the three symptoms to get a list of likely diseases
        print(set(parseSymptoms(user_symptoms)[0]) & \
              set(parseSymptoms(user_symptoms)[1]) & \
              set(parseSymptoms(user_symptoms)[2]))
    except:
        # If no disease was associated with the triple symptom combinations
        print('There is no general consensus for your set of symptoms. Please see a doctor!')

In [16]:
main()

Please enter three of your symptoms seperated by commas: headache, fatigue, nausea
Based on your symptoms, it appears that you may have one of the following: 

{'Chicken pox', 'Dengue', 'Common Cold', 'Hypoglycemia', 'Typhoid'}


In [17]:
# Confidence and interest calculations - not particularly helpful given the small number of associations
# in a dataset of this size
# Inspired by and modified Homework 5 implementation

confidence_vomiting_nausea = C2[frequent_items.index('vomiting')][frequent_items.index('nausea')] / np.sum([1 if 'vomiting' in diseases[d] else 0 for d in diseases.keys()])
int_vomiting_nausea = confidence_vomiting_nausea - np.sum(['nausea' in diseases[d] for d in diseases.keys()]) / len(diseases.keys())

confidence_nausea_vomiting = C2[frequent_items.index('vomiting')][frequent_items.index('nausea')]\
            / np.sum([1 if 'nausea' in diseases[d] else 0 for d in diseases.keys()])
int_nausea_vomiting = confidence_vomiting_nausea - np.sum(['vomiting' in diseases[d] for d in diseases.keys()]) / len(diseases.keys())


confidence_hf_fatigue = C2[frequent_items.index('high_fever')][frequent_items.index('fatigue')] / np.sum([1 if 'high_fever' in diseases[d] else 0 for d in diseases.keys()])
int_hf_fatigue = confidence_vomiting_nausea - np.sum(['fatigue' in diseases[d] for d in diseases.keys()]) / len(diseases.keys())

confidence_fatigue_hf = C2[frequent_items.index('high_fever')][frequent_items.index('fatigue')] / np.sum([1 if 'fatigue' in diseases[d] else 0 for d in diseases.keys()])
int_fatigue_hf = confidence_vomiting_nausea - np.sum(['high_fever' in diseases[d] for d in diseases.keys()]) / len(diseases.keys())



print('Confidence(Vomiting --> Nausea): ', confidence_vomiting_nausea)
print('Interest(Vomiting --> Nausea): ', int_vomiting_nausea, '\n')
print('Confidence(Nausea --> Vomiting): ', confidence_nausea_vomiting)
print('Interest(Nausea --> Vomiting): ', int_nausea_vomiting, '\n')

print('\n-------------------------------------------------------------\n')

print('Confidence(High Fever --> Fatigue): ', confidence_hf_fatigue)
print('Interest(High Fever --> Fatigue): ', int_hf_fatigue, '\n')
print('Confidence(Fatigue --> High Fever): ', confidence_fatigue_hf)
print('Interest(Fatigue --> High Fever): ', int_fatigue_hf, '\n')

Confidence(Vomiting --> Nausea):  0.5294117647058824
Interest(Vomiting --> Nausea):  0.2855093256814921 

Confidence(Nausea --> Vomiting):  0.9
Interest(Nausea --> Vomiting):  0.11477761836441897 


-------------------------------------------------------------

Confidence(High Fever --> Fatigue):  0.75
Interest(High Fever --> Fatigue):  0.11477761836441897 

Confidence(Fatigue --> High Fever):  0.5294117647058824
Interest(Fatigue --> High Fever):  0.23672883787661408 

