In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors as colors
import seaborn as sns
from IPython.display import HTML
import random
import re
import networkx as nx
# import nltk
# from nltk.corpus import stopwords

%matplotlib inline

In [2]:
df = pd.read_excel('symptoms.xlsx')

# we replace the whitespaces and then the '_' character by an space
for col in df.columns:
    if 'Symptom' in col:
        df[col] = df[col].str.replace(' ', '').str.replace('_', ' ')

In [3]:
df_desc = pd.read_excel('symptoms.xlsx', sheet_name='symptom_Description')
df_prec = pd.read_excel('symptoms.xlsx', sheet_name='symptom_precaution')

# Analyzing the data and building some analysis

In [4]:
symptom_freqs = df.iloc[:,1:].stack().value_counts()
symptom_freqs = pd.DataFrame(symptom_freqs) 
symptom_freqs.index.name = 'Symptom'
symptom_freqs = symptom_freqs.reset_index() 
symptom_freqs = symptom_freqs.rename(columns={'count':'frequency'})


In [5]:
symptoms = list(symptom_freqs['Symptom'].unique())
diseases = list(df['Disease'].unique())

In [6]:
print('There are {} symptoms described in the dataset'.format(len(symptoms)))
print('and also {} diseases described in the dataset'.format(len(diseases)))

There are 131 symptoms described in the dataset
and also 41 diseases described in the dataset


In [7]:
adj_mat = np.zeros((len(symptoms),len(diseases)))

for i in range(len(df)):
    for j in range(1, 18):
        disease = df.iloc[i,0]
        symptom = df.iloc[i,j]
        if pd.notnull(symptom):
            symptom = symptom.strip()  # strip leading and trailing whitespace (after and before each symptom names)
            dis_index = diseases.index(disease)
            sym_index = symptoms.index(symptom)
            adj_mat[sym_index, dis_index] += 1
        else:
            pass

# Naive Bayes Classifier

In [8]:
non_diagnosis_responses = ['I cannot give you a possible diagnosis','Please, try it again','Please, give me more details','I do not understand what you mean']

def bayesian_classifier(adj_mat, symptom_list, symptoms, diseases):
    
    # Use re.sub() to remove the special characters from each symptom in the symptom_list
    cleaned_symptom_list = [re.sub(r'[:;¿?¡!-]', '', s).strip().lower() for s in symptom_list]

    # Convert the cleaned symptom list to indices, assuming the symptoms are found in the cleaned list
    sym = [symptoms.index(s) for s in cleaned_symptom_list if s in symptoms]

    # sym = [symptoms.index(s) for s in symptom_list]
    p_dis = adj_mat.sum(axis=0) / adj_mat.sum()
    p_sym = adj_mat.sum(axis=1) / adj_mat.sum()
    dist = []

    for i in range(len(diseases)):
        # computing the bayes probability
        prob = np.prod((adj_mat[:,i] / adj_mat[:,i].sum())[sym]) * p_dis[i] / np.prod(p_sym[sym])
        dist.append(prob)
    
    if sum(dist) == 0:
        return non_diagnosis_responses[random.randrange(4)]
    else:
        idx = dist.index(max(dist))
        return diseases[idx]

In [9]:
def print_precautions(diseases, df_prec):

    precautions = df_prec[df_prec['Disease'].str.lower() == diseases.lower()].iloc[0]
    print('Recommended precautions:')
    for i in range(1, 5):
        print(f"- {precautions[f'Precaution_{i}']}")

In [10]:
def print_description(disease, df_desc):

    desc = df_desc['Disease'].str.lower() == disease.lower()
    if desc.any():
        description = df_desc.loc[desc, 'Description'].iloc[0]
        print(f'{description}\n')
    else:
        ''

In [11]:
print('Please enter your symptoms separated by commas from the list below:')
# print(", ".join(symptoms))

# Get user input and process it
user_input = input('Enter symptoms: ')
# remove this special characters r'''[.:;¿?¡!'"=+/\[\]{}()`~@$%^&*\d|]'''
cleaned_input = re.sub(r'''[.:;¿?¡!\<>'"=+/\[\]{}()`~@$%^&*|\d\\]''', '', user_input)
cleaned_input = cleaned_input.replace('_', ' ').replace('-', ' ')
user_symptoms = [sym.strip() for sym in cleaned_input.split(',') if sym.strip() in symptoms]

# Check if the user entered symptoms that are in the list
if not user_symptoms:
    non_diagnosis_responses[random.randrange(4)]
else:
    # Call the bayesian_classifier function
    diagnosis = bayesian_classifier(adj_mat, user_symptoms, symptoms, diseases)
    print(f'\n{user_symptoms}')
    print(f'\nThe most likely diagnosis is: {diagnosis}\n')
    print_description(diagnosis, df_desc)
    print_precautions(diagnosis, df_prec)

# I have an continuous sneezing', fatigue"!?, and high fever!
# continuous sneezing,shivering, chills,
# diarrhoea, vomiting

Please enter your symptoms separated by commas from the list below:

['continuous sneezing', 'shivering', 'chills']

The most likely diagnosis is: Allergy

An allergy is an immune system response to a foreign substance that's not typically harmful to your body.They can include certain foods, pollen, or pet dander. Your immune system's job is to keep you healthy by fighting harmful pathogens.

Recommended precautions:
- apply calamine
- cover area with bandage
- nan
- use ice to compress itching
