*Disease Detection using Symptoms and Treatment recommendation*

In [1]:
import warnings
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split, cross_val_score
from statistics import mean
from nltk.corpus import wordnet 
import requests
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from itertools import combinations
from time import time
from collections import Counter
import operator
from xgboost import XGBClassifier
import math
from sklearn.linear_model import LogisticRegression
warnings.simplefilter("ignore")

Download resources required for NLTK pre-processing

In [2]:
import nltk
nltk.download('all')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_ru is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package basque_grammars to
[nltk_data]    |     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]    |   Package basqu

True

*synonyms function* that finds the synonymous terms of a symptom entered by the user.
*Synonyms are searched on Thesaurus.com and NLTK Wordnet*

In [3]:
def synonyms(term):
    synonyms = []
    response = requests.get('https://www.thesaurus.com/browse/{}'.format(term))
    soup = BeautifulSoup(response.content,  "html.parser")
    try:
        container=soup.find('section', {'class': 'MainContentContainer'}) 
        row=container.find('div',{'class':'css-191l5o0-ClassicContentCard'})
        row = row.find_all('li')
        for x in row:
            synonyms.append(x.get_text())
    except:
        None
    for syn in wordnet.synsets(term):
        synonyms+=syn.lemma_names()
    return set(synonyms)

In [4]:
# utlities for pre-processing
stop_words = stopwords.words('english')
lemmatizer = WordNetLemmatizer()
splitter = RegexpTokenizer(r'\w+')

*df_comb -> Dataframe consisting of dataset generated by combining symptoms for each disease.*

In [6]:
df_comb = pd.read_csv("Dataset/dis_sym_dataset_comb.csv")
df_norm = pd.read_csv("Dataset/dis_sym_dataset_norm.csv")


X = df_comb.iloc[:, 1:]
Y = df_comb.iloc[:, 0:1]

Using *Logistic Regression (LR) Classifier* as it gives better accuracy compared to other classification models 
Cross validation is done on dataset with cv = 5

In [7]:
lr = LogisticRegression()
lr = lr.fit(X, Y)
scores = cross_val_score(lr, X, Y, cv=5)

In [8]:
X = df_norm.iloc[:, 1:]
Y = df_norm.iloc[:, 0:1]

In [9]:
# List of symptoms
dataset_symptoms = list(X.columns)

Symptoms initially taken from user.

In [10]:
user_symptoms = str(input("Please enter symptoms separated by comma(,):\n")).lower().split(',')
# Preprocessing the input symptoms
processed_user_symptoms=[]
for sym in user_symptoms:
    sym=sym.strip()                       # remove leading/trailing white spaces
    sym=sym.replace('-',' ')              # replace hyphen with space
    sym=sym.replace("'",'')               # remove apostrophe
    sym = ' '.join([lemmatizer.lemmatize(word) for word in splitter.tokenize(sym)])
    processed_user_symptoms.append(sym)

Pre-processing on symptoms entered by user is done.

In [11]:
# Taking each user symptom and finding all its synonyms and 
#appending it to the pre-processed symptom string
user_symptoms = []
for user_sym in processed_user_symptoms:
    user_sym = user_sym.split()
    str_sym = set()
    for comb in range(1, len(user_sym)+1):
        for subset in combinations(user_sym, comb):
            subset=' '.join(subset)
            subset = synonyms(subset) 
            str_sym.update(subset)
    str_sym.add(' '.join(user_sym))
    user_symptoms.append(' '.join(str_sym).replace('_',' '))
# query expansion performed by joining synonyms found for each symptoms initially entered
print("After query expansion done by using the symptoms entered")
print(user_symptoms)

After query expansion done by using the symptoms entered
["fall out tactual sensation blow over lapse ephemeral go on spend feeling care feel passage flavour reach conk go along pass by palpate passing play touch sensation travel by slip away extend communicate egest go past happen similar go through casual passing game same evanesce short-lived hand pass along drop dead transitory pass off feeling like passing sink slide by pass buy the farm choke guide authorise exit overtake perish experience release fade eliminate loss the like expire transcend turn over transient extremely elapse cash in one's chips surpass clear comparable find croak perfunctory overhaul authorize glide by occur die go by give-up the ghost pop off opinion cursory flavor take place kick the bucket put across like overtaking ilk alike impression notion top look fugacious pass on come about passing snuff it touch legislate exceedingly sense corresponding decease smell go excrete slip by spirit the likes of qualifyin

The below procedure is done in order to show the symptom synonmys found for the symptoms entered by the user.

In [12]:
# Loop over all the symptoms in dataset and check its
# similarity score to the synonym string of the user-input 
# symptoms. If similarity>0.5, add the symptom to the final list
found_symptoms = set()
for idx, data_sym in enumerate(dataset_symptoms):
    data_sym_split=data_sym.split()
    for user_sym in user_symptoms:
        count=0
        for symp in data_sym_split:
            if symp in user_sym.split():
                count+=1
        if count/len(data_sym_split)>0.5:
            found_symptoms.add(data_sym)
found_symptoms = list(found_symptoms)

*Asking the user to select the relevant symptoms by entering the corresponding indices.*

In [13]:
print("Top matching symptoms from your search!")
for idx, symp in enumerate(found_symptoms):
    print(idx,":",symp)

select_list = input("\nPlease select the relevant symptoms. Enter indices (separated-space):\n").split()

# Find other relevant symptoms from the dataset based on user symptoms based on the highest co-occurance with the
# ones that is input by the user
dis_list = set()
final_symp = [] 
counter_list = []
for idx in select_list:
    symp=found_symptoms[int(idx)]
    final_symp.append(symp)
    dis_list.update(set(df_norm[df_norm[symp]==1]['label_dis']))
   
for dis in dis_list:
    row = df_norm.loc[df_norm['label_dis'] == dis].values.tolist()
    row[0].pop(0)
    for idx,val in enumerate(row[0]):
        if val!=0 and dataset_symptoms[idx] not in final_symp:
            counter_list.append(dataset_symptoms[idx])

Top matching symptoms from your search!
0 : shortness breath
1 : feeling like passing
2 : progressive muscle weakness
3 : loss smell
4 : fatigue
5 : feeling tired time
6 : muscle weakness
7 : feeling tired


Final Symptom list

In [14]:
# Create query vector based on symptoms selected by the user
print("\nFinal list of Symptoms that will be used for prediction:")
sample_x = [0 for x in range(0,len(dataset_symptoms))]
for val in final_symp:
    print(val)
    sample_x[dataset_symptoms.index(val)]=1


Final list of Symptoms that will be used for prediction:
feeling like passing
muscle weakness
feeling tired
shortness breath


here prediction of disease is done

In [15]:
# Predict disease
lr = LogisticRegression()
lr = lr.fit(X, Y)
prediction = lr.predict_proba([sample_x])

Show top k diseases and their probabilities to the user

In [16]:
k = 10
diseases = list(set(Y['label_dis']))
diseases.sort()
topk = prediction[0].argsort()[-k:][::-1]

*Result the list of top k diseases to the user with their prediction probabilities.*

In [17]:
print(f"\nTop {k} diseases predicted based on symptoms")
topk_dict = {}
# Show top k highly probable diseases to the user.
for idx,t in  enumerate(topk):
    match_sym=set()
    row = df_norm.loc[df_norm['label_dis'] == diseases[t]].values.tolist()
    row[0].pop(0)

    for idx,val in enumerate(row[0]):
        if val!=0:
            match_sym.add(dataset_symptoms[idx])
    prob = (len(match_sym.intersection(set(final_symp)))+1)/(len(set(final_symp))+1)
    prob *= mean(scores)
    topk_dict[t] = prob
j = 0
topk_index_mapping = {}
topk_sorted = dict(sorted(topk_dict.items(), key=lambda kv: kv[1], reverse=True))
for key in topk_sorted:
  prob = topk_sorted[key]*100
  print(str(j) + " Disease name:",diseases[key], "\tProbability:",str(round(prob, 2))+"%")
  topk_index_mapping[j] = key
  j += 1

#input("Press -1 to exit: ")


Top 10 diseases predicted based on symptoms
0 Disease name: Anaemia 	Probability: 89.19%
1 Disease name: Iron Deficiency Anemia 	Probability: 71.35%
2 Disease name: Congestive heart disease 	Probability: 53.51%
3 Disease name: Botulism 	Probability: 53.51%
4 Disease name: Scurvy 	Probability: 53.51%
5 Disease name: Hypotonia 	Probability: 35.68%
6 Disease name: Chronic obstructive pulmonary disease (COPD) 	Probability: 35.68%
7 Disease name: Hepatitis D 	Probability: 35.68%
8 Disease name: Fibromyalgia 	Probability: 35.68%
9 Disease name: Neonatal Respiratory Disease Syndrome(NRDS) 	Probability: 35.68%
