# VIRTUAL MEDIC CHATBOT FOR DISEASE AND DRUG CLASSIFICATION

## PRESENTED BY:

## Shyam R -19MIC0017
## Niketha S-19MIC0035
## Nivethitha S-19MIC0030
## Ram Gnaneshwaran -19MIC0104
## Nihaal Ahmed -19MIC0038

## AIM:

## In this project, we aim to build an end-to-end open source information retriveval medical chatbot
## Our chatbot incorporates best practices in IR with a transformer-based reader to give answers quickly and efficiently from a huge corpus of medically related data.


In [1]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/shyamr/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/shyamr/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
import numpy as np
import pandas as pd
import math
from sklearn.model_selection import train_test_split,cross_val_score
import math
import operator
import pickle
import re
from nltk.stem import WordNetLemmatizer
import numpy as np
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import RegexpTokenizer
from statistics import mean
from nltk.corpus import wordnet 
import requests
from bs4 import BeautifulSoup
from itertools import combinations
from time import time
from collections import Counter
import operator
import warnings
from Treatment import diseaseDetail
warnings.simplefilter("ignore")

In [3]:
df=pd.read_csv("Dataset/dis_sym_dataset_norm.csv")
documentname_list=list(df['label_dis'])
df=df.iloc[:,1:]
columns_name=list(df.columns)
documentname_list=list(documentname_list)

N=len(df)
M=len(columns_name)
idf={}
for col in columns_name:
  temp=np.count_nonzero(df[col])
  idf[col]=np.log(N/temp)
tf={}
for i in range(N):
  for col in columns_name:
    key=(documentname_list[i],col)
    tf[key]=df.loc[i,col]
tf_idf={}
for i in range(N):
  for col in columns_name:
    key=(documentname_list[i],col)
    tf_idf[key]=float(idf[col])*float(tf[key])

D = np.zeros((N, M),dtype='float32')
for i in tf_idf:
    sym = columns_name.index(i[1])
    dis=documentname_list.index(i[0])
    D[dis][sym] = tf_idf[i]

def cosine_dot(a, b):
    if np.linalg.norm(a) == 0 or np.linalg.norm(b) == 0:
        return 0
    else:
        temp = np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
        return temp


def convert_tolowercase(data):
    return data.lower()


def regextokenizer_func(data):
    tokenizer = RegexpTokenizer(r'\w+')
    data = tokenizer.tokenize(data)
    return data


def gen_vector(tokens):
    Q = np.zeros(M)
    counter = Counter(tokens)
    query_weights = {}
    for token in np.unique(tokens):
        tf = counter[token]
        try:
          idf_temp=idf[token]
        except:
          pass
        try:
            ind = columns_name.index(token)
            Q[ind] = tf*idf_temp
        except:
            pass
    return Q


def tf_idf_score(k, query):
    query_weights = {}
    for key in tf_idf:
        if key[1] in query:
            try:
                query_weights[key[0]] += tf_idf[key]
            except:
                query_weights[key[0]] = tf_idf[key]
    query_weights = sorted(query_weights.items(), key=lambda x: x[1], reverse=True)
  
    l = []
    for i in query_weights[:k]:
        l.append(i)
    return l

 
def cosine_similarity(k, query):
    d_cosines = []
    query_vector = gen_vector(query)
    for d in D:
        d_cosines.append(cosine_dot(query_vector, d))
    out = np.array(d_cosines).argsort()[-k:][::-1]
  
    final_display_disease={}
    for lt in set(out):
      final_display_disease[lt] = float(d_cosines[lt])
    return final_display_disease

In [4]:

def synonyms(term):
    synonyms = []
    response = requests.get('https://www.thesaurus.com/browse/{}'.format(term))
    soup = BeautifulSoup(response.content,  "html.parser")
    try:
        container=soup.find('section', {'class': 'MainContentContainer'}) 
        row=container.find('div',{'class':'css-191l5o0-ClassicContentCard'})
        row = row.find_all('li')
        for x in row:
            synonyms.append(x.get_text())
    except:
        None
    for syn in wordnet.synsets(term):
        synonyms+=syn.lemma_names()
    return set(synonyms)

In [5]:
splitter = RegexpTokenizer(r'\w+')
stop_words = stopwords.words('english')
lemmatizer = WordNetLemmatizer()

In [6]:
df_comb = pd.read_csv("Dataset/dis_sym_dataset_comb.csv") # Disease combination
df_norm = pd.read_csv("Dataset/dis_sym_dataset_norm.csv") # Individual Disease
Y = df_norm.iloc[:, 0:1]
X = df_norm.iloc[:, 1:]
dataset_symptoms = list(X.columns)
diseases = list(set(Y['label_dis']))
diseases.sort()

In [7]:
print("Hello! Welcome to our Virtual-Chatbot")
user_symptoms = str(input("\nEnter symptoms:\n")).lower().split(',')
processed_user_symptoms=[]
for sym in user_symptoms:
    sym=sym.strip()
    sym=sym.replace('-',' ')
    sym=sym.replace("'",'')
    sym = ' '.join([lemmatizer.lemmatize(word) for word in splitter.tokenize(sym)])
    processed_user_symptoms.append(sym)  

Hello! Welcome to our Virtual-Chatbot

Enter symptoms:
headache,body pain,


In [8]:
user_symptoms = []
for user_sym in processed_user_symptoms:
    user_sym = user_sym.split()
    str_sym = set()
    for comb in range(1, len(user_sym)+1):
        for subset in combinations(user_sym, comb):
            subset=' '.join(subset)
            subset = synonyms(subset) 
            str_sym.update(subset)
    str_sym.add(' '.join(user_sym))
    user_symptoms.append(' '.join(str_sym).replace('_',' '))
print("Analysing the Symptoms entered")
print(user_symptoms)

Analysing the Symptoms entered
['concern cephalalgia headache vexation head ache worry', 'pain in the ass pain soundbox trunk personify body pain eubstance ail pain sensation painful sensation consistence torso annoyance consistency physical structure pain in the neck painfulness infliction body dead body hurting botheration anguish nuisance hurt organic structure trouble bother', '']


In [9]:
found_symptoms = set()
for idx, data_sym in enumerate(dataset_symptoms):
    data_sym_split=data_sym.split()
    for user_sym in user_symptoms:
        count=0
        for symp in data_sym_split:
            if symp in user_sym.split():
                count+=1
        if count/len(data_sym_split)>0.5:
            found_symptoms.add(data_sym)
found_symptoms = list(found_symptoms)

In [10]:
print("Matching symptoms from the data entered")
for idx, symp in enumerate(found_symptoms):
    print(idx,":",symp)

select_list = input("\n Hello User! Select more relevant symptoms \n").split()

dis_list = set()
final_symp = [] 
counter_list = []
for idx in select_list:
    symp=found_symptoms[int(idx)]
    final_symp.append(symp)
    dis_list.update(set(df_norm[df_norm[symp]==1]['label_dis']))
   
for dis in dis_list:
    row = df_norm.loc[df_norm['label_dis'] == dis].values.tolist()
    row[0].pop(0)
    for idx,val in enumerate(row[0]):
        if val!=0 and dataset_symptoms[idx] not in final_symp:
            counter_list.append(dataset_symptoms[idx])
             
dict_symp = dict(Counter(counter_list))
dict_symp_tup = sorted(dict_symp.items(), key=operator.itemgetter(1),reverse=True)   

Matching symptoms from the data entered
0 : neck
1 : painful
2 : headache
3 : trouble sensation

 Hello User! Select more relevant symptoms 
1 2


In [11]:
found_symptoms=[]
count=0
for tup in dict_symp_tup:
    count+=1
    found_symptoms.append(tup[0])
    if count%5==0 or count==len(dict_symp_tup):
        print("\nCommon symptoms:")
        for idx,ele in enumerate(found_symptoms):
            print(idx,":",ele)
        select_list = input("Do you have have of these symptoms? If Yes, enter the indices (space-separated), 'no' to stop, '-1' to skip:\n").lower().split();
        if select_list[0]=='no':
            break
        if select_list[0]=='-1':
            found_symptoms = [] 
            continue
        for idx in select_list:
            final_symp.append(found_symptoms[int(idx)])
        found_symptoms = []    


Common symptoms:
0 : fever
1 : testicular pain
2 : vomiting
3 : confusion
4 : maculopapular rash
Do you have have of these symptoms? If Yes, enter the indices (space-separated), 'no' to stop, '-1' to skip:
0 2 

Common symptoms:
0 : dizziness
1 : nausea
2 : sore throat
3 : muscle weakness
4 : red
Do you have have of these symptoms? If Yes, enter the indices (space-separated), 'no' to stop, '-1' to skip:
1 2

Common symptoms:
0 : problem vision
1 : seizure
2 : tiredness
3 : barky cough
4 : muscle joint pain
Do you have have of these symptoms? If Yes, enter the indices (space-separated), 'no' to stop, '-1' to skip:
2 4

Common symptoms:
0 : eyestrain
1 : red eye
2 : swollen lymph node
3 : chest pain
4 : dry damp skin
Do you have have of these symptoms? If Yes, enter the indices (space-separated), 'no' to stop, '-1' to skip:
-1

Common symptoms:
0 : high body temperature
1 : chill
2 : yellow skin
3 : mental change
4 : vary depending part brain involved
Do you have have of these symptoms?

In [12]:
k = 10

print("Final list of Symptoms used for prediction are : ")
for val in final_symp:
    print(val)

Final list of Symptoms used for prediction are : 
painful
headache
fever
vomiting
nausea
sore throat
tiredness
muscle joint pain


In [13]:
topk1=tf_idf_score(k,final_symp)
topk2=cosine_similarity(k,final_symp)
print(f"\nTop {k} diseases predicted based on TF_IDF Matching :\n")
i = 0
topk1_index_mapping = {}
for key, score in topk1:
  print(f"{i}. Disease : {key} \t Score : {round(score, 2)}")
  topk1_index_mapping[i] = key
  i += 1

select = input("\nTo enter more details about the disease? Enter index of disease or '-1' to discontinue:\n")
if select!='-1':
    dis=topk1_index_mapping[int(select)]
    print()
    print(diseaseDetail(dis))


Top 10 diseases predicted based on TF_IDF Matching :

0. Disease : Influenza 	 Score : 11.37
1. Disease : Mononucleosis 	 Score : 8.73
2. Disease : Dengue 	 Score : 8.11
3. Disease : Chickenpox 	 Score : 7.42
4. Disease : Lyme disease 	 Score : 7.42
5. Disease : Anthrax 	 Score : 7.13
6. Disease : Hepatitis A 	 Score : 7.13
7. Disease : Ebola 	 Score : 6.5
8. Disease : Scarlet fever 	 Score : 6.5
9. Disease : Acute encephalitis syndrome 	 Score : 5.92

To enter more details about the disease? Enter index of disease or '-1' to discontinue:
2

Dengue
Other names -  Dengue, breakbone fever   
Pronunciation -       /  ˈ  d  ɛ  ŋ  ɡ  i  ,   -  ɡ  eɪ  /         
Specialty -  Infectious disease 
Symptoms -  Fever, headache, muscle and joint pain, rash   
Complications -  Bleeding, low levels of blood platelets, dangerously low blood pressure   
Usual onset -  3–14 days after exposure   
Duration -  2–7 days   
Causes -  Dengue virus by  Aedes  mosquitos   
Diagnostic method -  Detecting anti

In [14]:
print(f"Top {k} disease based on Cosine Similarity Matching :\n ")
topk2_sorted = dict(sorted(topk2.items(), key=lambda kv: kv[1], reverse=True))
j = 0
topk2_index_mapping = {}
for key in topk2_sorted:
  print(f"{j}. Disease : {diseases[key]} \t Score : {round(topk2_sorted[key], 2)}")
  topk2_index_mapping[j] = diseases[key]
  j += 1

    
select = input("\nMore details about the disease? Enter index of disease or '-1' to discontinue and close the system:\n")
if select!='-1':
    dis=topk2_index_mapping[int(select)]
    print()
    print(diseaseDetail(dis))

Top 10 disease based on Cosine Similarity Matching :
 
0. Disease : Dengue 	 Score : 0.46
1. Disease : Influenza 	 Score : 0.44
2. Disease : Mononucleosis 	 Score : 0.41
3. Disease : Malaria 	 Score : 0.36
4. Disease : Impetigo 	 Score : 0.32
5. Disease : Lyme disease 	 Score : 0.32
6. Disease : Campylobacter infection 	 Score : 0.27
7. Disease : Diphtheria 	 Score : 0.26
8. Disease : Bubonic plague 	 Score : 0.24
9. Disease : Rocky Mountain spotted fever 	 Score : 0.24

More details about the disease? Enter index of disease or '-1' to discontinue and close the system:
1

Influenza
Other names -  Flu, the flu, Grippe 
Specialty -  Infectious disease 
Symptoms -  Fever, runny nose, sore throat, muscle pain, headache, coughing, fatigue 
Usual onset -  1–4 days after exposure 
Duration -  2–8 days 
Causes -  Influenza viruses 
Prevention -  Hand washing, flu vaccines 
Medication -  Antiviral drugs such as oseltamivir 
Frequency -  3–5 million severe cases per year   
Deaths -  >,290,000–6