Disease Prediction and Drug recommendation

In [18]:
import pandas as pd 
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# **Disease Prediction**

In [2]:
# Reading the symptoms dataset
sym_data = pd.read_csv("symptoms_data.csv")

In [3]:
sym_data.head()

Unnamed: 0,Disease,Heberden's node,Murphy's sign,Stahli's line,abdomen acute,abdominal bloating,abdominal tenderness,abnormal sensation,abnormally hard consistency,abnormally hard consistency.1,...,vomiting,weepiness,weight gain,welt,wheelchair bound,wheezing,withdraw,worry,yellow sputum,yellow sputum.1
0,Alzheimer's disease,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Alzheimer's disease,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Alzheimer's disease,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Alzheimer's disease,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,Alzheimer's disease,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [4]:
sym_data = pd.concat([sym_data]*2, ignore_index=True)
cols = sym_data.columns
cols = cols[1:]

In [5]:
x = sym_data[cols]
y = sym_data['Disease']

In [7]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)

## Training the data

In [8]:
features = cols
feature_dict = {}
for i,f in enumerate(features):
    feature_dict[f] = i

In [20]:
mnb = MultinomialNB()
clf_mnb=mnb.fit(x_train,y_train)
print ("Acurracy: ", clf_mnb.score(x_test,y_test))

Acurracy:  0.8878992628992629


In [21]:
input_features = [feature_dict['dizziness'], feature_dict['fatigue'], feature_dict['shortness of breath']]
input_features

[95, 125, 360]

In [22]:
arr =[]
for i in range(len(features)):
  if i == input_features[0]:
    i = int(i/input_features[0])
  elif i == input_features[1]:
    i = int(i/input_features[1])
  elif i == input_features[2]:
    i = int(i/input_features[2])
  else:
    i = 0
  arr.append(i)

In [23]:
arr = np.array(arr).reshape(-1,len(arr))

**Predicted Disease**

In [24]:
predicted_disease = (mnb.predict(arr))
print("The disease predicted based on given symptoms is : " + predicted_disease[0])

The disease predicted based on given symptoms is : anemia


Importing the preprocessed and merged dataset containing the drug names for every disease, the reviews, symptoms, rating and useful count of all the drugs.



# **Sentiment Analysis**

In [14]:


merged_data = pd.read_csv("Merged_Dataset.csv")
merged_data.head()

Unnamed: 0,Drug,Disease,Review,Rating,UsefulCount,Symptoms
0,Olanzapine,schizophrenia,"""This drug saved my life, I had been on almost...",10,6,"['hallucinations auditory', 'hypersomnolence',..."
1,Ziprasidone,schizophrenia,"""Geodon is a very effective drug for me. Comp...",10,33,"['hallucinations auditory', 'hypersomnolence',..."
2,Loxapine,schizophrenia,"""This medicine completely changed my life in t...",10,22,"['hallucinations auditory', 'hypersomnolence',..."
3,Ziprasidone,schizophrenia,"""I&#039;ve been on geodon for about three week...",7,6,"['hallucinations auditory', 'hypersomnolence',..."
4,Abilify,schizophrenia,"""I switched from Risperidal to Abilify two yea...",10,53,"['hallucinations auditory', 'hypersomnolence',..."


**Performing sentiment analysis of drug review using VADER.**

In [15]:
#Importing and installing the necessary library for VADER.

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyser = SentimentIntensityAnalyzer()

In [16]:
#Making a list of reviews to give it as input for vader analysis.
review_list = list(merged_data.Review)

#Initializing the necessary files.
sentiments = []
positive = []
negative = []
neutral = []
compound = []

In [17]:
for text in review_list:

  #Performing Vader Analysis on each review.
  com = analyser.polarity_scores(text)["compound"]
  pos = analyser.polarity_scores(text)["pos"]
  neu = analyser.polarity_scores(text)["neu"]
  neg = analyser.polarity_scores(text)["neg"]

  #Adding each value to the corresponding array
  positive.append(pos)
  negative.append(neg)
  neutral.append(neu)
  compound.append(com)
  sentiments.append({"Review":text,
                     "Positive": pos,
                     "Negative": neu,
                     "Neutral": neg,
                     "Compound": com})

sentiments_data = pd.DataFrame.from_dict(sentiments)

In [18]:
#Visualizing data from vader analysis.
sentiments_data

Unnamed: 0,Review,Positive,Negative,Neutral,Compound
0,"""This drug saved my life, I had been on almost...",0.180,0.748,0.072,0.8900
1,"""Geodon is a very effective drug for me. Comp...",0.207,0.793,0.000,0.5256
2,"""This medicine completely changed my life in t...",0.033,0.934,0.034,-0.0129
3,"""I&#039;ve been on geodon for about three week...",0.000,0.960,0.040,-0.1875
4,"""I switched from Risperidal to Abilify two yea...",0.038,0.933,0.029,0.1154
...,...,...,...,...,...
8901,"""I was very pleased with Integra F to treat my...",0.136,0.823,0.041,0.7344
8902,"""I have pots syndrome And low blood plasma..\r...",0.023,0.817,0.159,-0.8456
8903,"""Was taking twice a day while I was pregnant. ...",0.170,0.830,0.000,0.6514
8904,"""I had a haematocrit of 18 and I started takin...",0.052,0.924,0.024,0.3182


In [19]:
#Adding the sentiment analysis columns to the merged dataset.
merged_data["Positive"] = positive
merged_data["Negative"] = negative
merged_data["Neutral"] = neutral
merged_data["Compound"] = compound
merged_data["Review_Sentiment"] = ''

In [20]:
#Visualizing the merged dataset with the sentiment analysis results.
merged_data.head()

Unnamed: 0,Drug,Disease,Review,Rating,UsefulCount,Symptoms,Positive,Negative,Neutral,Compound,Review_Sentiment
0,Olanzapine,schizophrenia,"""This drug saved my life, I had been on almost...",10,6,"['hallucinations auditory', 'hypersomnolence',...",0.18,0.072,0.748,0.89,
1,Ziprasidone,schizophrenia,"""Geodon is a very effective drug for me. Comp...",10,33,"['hallucinations auditory', 'hypersomnolence',...",0.207,0.0,0.793,0.5256,
2,Loxapine,schizophrenia,"""This medicine completely changed my life in t...",10,22,"['hallucinations auditory', 'hypersomnolence',...",0.033,0.034,0.934,-0.0129,
3,Ziprasidone,schizophrenia,"""I&#039;ve been on geodon for about three week...",7,6,"['hallucinations auditory', 'hypersomnolence',...",0.0,0.04,0.96,-0.1875,
4,Abilify,schizophrenia,"""I switched from Risperidal to Abilify two yea...",10,53,"['hallucinations auditory', 'hypersomnolence',...",0.038,0.029,0.933,0.1154,


Based on the compound value we can determine whether the overall sentiment of the review is postive, negative or neutral. Below will be the threshold we shall be using for classifying the review sentiment class:

Positive sentiment: compound score >= 0.05

Neutral sentiment : -0.05 < compound score < 0.05

Negative sentiment: compound score <= *-0.05*

In [21]:
merged_data.loc[merged_data['Compound'] >= 0.05, 'Review_Sentiment'] = 'Positive'
merged_data.loc[merged_data['Compound'] <= -0.05, 'Review_Sentiment'] = 'Negative'
merged_data["Review_Sentiment"].replace('', 'Neutral', inplace = True)

In [22]:
#Dropping the columns Positive, Negative, Neutral and Compound.
merged_data = merged_data.drop(columns = ["Positive", "Negative", "Neutral", "Compound"])

In [23]:
# Rearranging the columns in different order.
merged_data = merged_data[["Disease", "Drug", "Symptoms", "Review", "Review_Sentiment", "Rating", "UsefulCount"]]

In [24]:
#Visualizing the merged dataset after adding the sentiment analysis results.
merged_data.head()

Unnamed: 0,Disease,Drug,Symptoms,Review,Review_Sentiment,Rating,UsefulCount
0,schizophrenia,Olanzapine,"['hallucinations auditory', 'hypersomnolence',...","""This drug saved my life, I had been on almost...",Positive,10,6
1,schizophrenia,Ziprasidone,"['hallucinations auditory', 'hypersomnolence',...","""Geodon is a very effective drug for me. Comp...",Positive,10,33
2,schizophrenia,Loxapine,"['hallucinations auditory', 'hypersomnolence',...","""This medicine completely changed my life in t...",Neutral,10,22
3,schizophrenia,Ziprasidone,"['hallucinations auditory', 'hypersomnolence',...","""I&#039;ve been on geodon for about three week...",Negative,7,6
4,schizophrenia,Abilify,"['hallucinations auditory', 'hypersomnolence',...","""I switched from Risperidal to Abilify two yea...",Positive,10,53


In [25]:
#Exporting this merged dataset as csv file.
merged_data.to_csv('Sentiment_analysis.csv', index=False)

## **Weighted average of rating and useful count**

In [26]:
#Reading the dataset with sentiment analysis of reviews
data = pd.read_csv('Sentiment_analysis.csv')

In [27]:
data.head()

Unnamed: 0,Disease,Drug,Symptoms,Review,Review_Sentiment,Rating,UsefulCount
0,schizophrenia,Olanzapine,"['hallucinations auditory', 'hypersomnolence',...","""This drug saved my life, I had been on almost...",Positive,10,6
1,schizophrenia,Ziprasidone,"['hallucinations auditory', 'hypersomnolence',...","""Geodon is a very effective drug for me. Comp...",Positive,10,33
2,schizophrenia,Loxapine,"['hallucinations auditory', 'hypersomnolence',...","""This medicine completely changed my life in t...",Neutral,10,22
3,schizophrenia,Ziprasidone,"['hallucinations auditory', 'hypersomnolence',...","""I&#039;ve been on geodon for about three week...",Negative,7,6
4,schizophrenia,Abilify,"['hallucinations auditory', 'hypersomnolence',...","""I switched from Risperidal to Abilify two yea...",Positive,10,53


In [28]:
#Sorting the data based on drug name
data = data.sort_values(['Drug'])

In [29]:
# Creating a function to calculate weighted average
def wavg(group, avg_name, weight_name):
    d = group[avg_name]
    w = group[weight_name]
    try:
        return (d * w).sum() / w.sum()
    except ZeroDivisionError:
        return d.mean()

In [30]:
data.groupby(["Disease", "Drug"]).apply(wavg, "Rating", "UsefulCount")

Disease        Drug           
anemia         Epoetin alfa        4.285714
               Epogen              9.000000
               Ferralet 90        10.000000
               Integra            10.000000
               Integra F           9.000000
                                    ...    
schizophrenia  Trifluoperazine     9.965909
               Vraylar             8.288793
               Ziprasidone         7.776451
               Zyprexa             7.338843
               Zyprexa Zydis       6.304348
Length: 344, dtype: float64

In [31]:
# Creating a dataframe of Drug and its average rating
data_wavg = pd.DataFrame(data.groupby(["Drug"]).apply(wavg, "Rating", "UsefulCount").reset_index())

In [32]:
data_wavg = data_wavg.rename(columns={0: "Rating_Wavg"})

In [33]:
data_wavg.head()

Unnamed: 0,Drug,Rating_Wavg
0,Abilify,7.350122
1,Abilify Maintena,8.714286
2,Acetaminophen / chlorpheniramine,8.0
3,Acetaminophen / phenyltoloxamine,10.0
4,Acetazolamide,6.376068


In [34]:
merged_wavg = pd.merge(data_wavg, data, on='Drug')

In [35]:
# Merging the weighted average column with the dataset
merged_wavg.drop(columns=['Symptoms','Rating'], inplace=True)
merged_wavg = merged_wavg[['Disease','Drug', 'Review', 'Review_Sentiment', 'Rating_Wavg', 'UsefulCount']]

In [36]:
merged_wavg.head()

Unnamed: 0,Disease,Drug,Review,Review_Sentiment,Rating_Wavg,UsefulCount
0,schizophrenia,Abilify,"""I was taking a lot of pills, some of which we...",Positive,7.350122,1
1,schizophrenia,Abilify,"""Excellent I don&#039;t hear voices. I feel no...",Positive,7.350122,52
2,schizophrenia,Abilify,"""I have recently been placed on Abilify 20mg f...",Negative,7.350122,33
3,schizophrenia,Abilify,"""My now 23 yr old son has been on Abilify sinc...",Neutral,7.350122,33
4,schizophrenia,Abilify,"""I was on one Abilify for three years, for sch...",Positive,7.350122,28


In [74]:
merged_wavg['Disease'].unique()

array(['schizophrenia', 'pneumonia', 'osteoporosis', 'obesity',
       'neutropenia', 'melanoma', 'lymphoma', 'influenza', 'hypoglycemia',
       'hyperbilirubinemia', 'hemorrhoids', 'gout', 'glaucoma',
       'gastroenteritis', 'epilepsy', 'diverticulitis', 'dementia',
       'bronchitis', 'asthma', 'anemia'], dtype=object)

# **Drug Recommendation**

In [38]:
#Sorting dataset and grouping by disease
merged_wavg = merged_wavg.sort_values(['Disease','Rating_Wavg'],ascending=False, ignore_index=True).groupby('Disease').head(10060)
merged_wavg

Unnamed: 0,Disease,Drug,Review,Review_Sentiment,Rating_Wavg,UsefulCount
0,schizophrenia,Mellaril,"""Moochie used to take Melleril, was always cal...",Positive,10.000000,22
1,schizophrenia,Stelazine,"""Out of all the medications my son uses, this ...",Positive,10.000000,42
2,schizophrenia,Stelazine,"""This medication saved my life. I had allergic...",Positive,10.000000,38
3,schizophrenia,Stelazine,"""My wife used this, initially higher dose and ...",Positive,10.000000,2
4,schizophrenia,Trifluoperazine,"""My wife used this, initially higher dose and ...",Positive,9.965909,2
...,...,...,...,...,...,...
8901,anemia,Epoetin alfa,"""I have pots syndrome And low blood plasma..\r...",Negative,4.285714,0
8902,anemia,Epoetin alfa,"""Went from 8.5 hemoglobin to 11 and then back ...",Neutral,4.285714,3
8903,anemia,Procrit,"""I have pots syndrome And low blood plasma..\r...",Negative,3.500000,0
8904,anemia,Procrit,"""I have had three injections for my Low red bl...",Positive,3.500000,3


In [39]:
# Exporting the dataset with weighted average
merged_wavg = merged_wavg.to_csv("Drug_Recommender.csv", index=False)
merged_wavg = pd.read_csv("Drug_Recommender.csv")
merged_wavg

Unnamed: 0,Disease,Drug,Review,Review_Sentiment,Rating_Wavg,UsefulCount
0,schizophrenia,Mellaril,"""Moochie used to take Melleril, was always cal...",Positive,10.000000,22
1,schizophrenia,Stelazine,"""Out of all the medications my son uses, this ...",Positive,10.000000,42
2,schizophrenia,Stelazine,"""This medication saved my life. I had allergic...",Positive,10.000000,38
3,schizophrenia,Stelazine,"""My wife used this, initially higher dose and ...",Positive,10.000000,2
4,schizophrenia,Trifluoperazine,"""My wife used this, initially higher dose and ...",Positive,9.965909,2
...,...,...,...,...,...,...
8901,anemia,Epoetin alfa,"""I have pots syndrome And low blood plasma..\r...",Negative,4.285714,0
8902,anemia,Epoetin alfa,"""Went from 8.5 hemoglobin to 11 and then back ...",Neutral,4.285714,3
8903,anemia,Procrit,"""I have pots syndrome And low blood plasma..\r...",Negative,3.500000,0
8904,anemia,Procrit,"""I have had three injections for my Low red bl...",Positive,3.500000,3


In [40]:
# Taking drugs only with positive reviews for recommendation
merged_wavg = merged_wavg.drop( merged_wavg[merged_wavg['Review_Sentiment'] == "Negative"].index)
merged_wavg = merged_wavg.drop( merged_wavg[merged_wavg['Review_Sentiment'] == "Neutral"].index)

In [41]:
merged_wavg.head()

Unnamed: 0,Disease,Drug,Review,Review_Sentiment,Rating_Wavg,UsefulCount
0,schizophrenia,Mellaril,"""Moochie used to take Melleril, was always cal...",Positive,10.0,22
1,schizophrenia,Stelazine,"""Out of all the medications my son uses, this ...",Positive,10.0,42
2,schizophrenia,Stelazine,"""This medication saved my life. I had allergic...",Positive,10.0,38
3,schizophrenia,Stelazine,"""My wife used this, initially higher dose and ...",Positive,10.0,2
4,schizophrenia,Trifluoperazine,"""My wife used this, initially higher dose and ...",Positive,9.965909,2


In [42]:
# Taking predicted disease as input and recommending drug based on highest weighted average and useful count of ratings
groupedByCount = merged_wavg.groupby(['Disease', 'Drug', 'Rating_Wavg'])['UsefulCount'].sum().reset_index()

In [43]:
groupedByCount

Unnamed: 0,Disease,Drug,Rating_Wavg,UsefulCount
0,anemia,Epoetin alfa,4.285714,3
1,anemia,Ferralet 90,10.000000,9
2,anemia,Integra,10.000000,13
3,anemia,Integra F,9.000000,29
4,anemia,Maxaron Forte,5.238095,5
...,...,...,...,...
276,schizophrenia,Trifluoperazine,9.965909,85
277,schizophrenia,Vraylar,8.288793,151
278,schizophrenia,Ziprasidone,7.776451,415
279,schizophrenia,Zyprexa,7.338843,533


In [44]:
groupedByDisease = groupedByCount.groupby('Disease')

Recommending drug based on the disease predicted above

In [45]:
recommended_drug = pd.DataFrame(groupedByDisease.get_group((predicted_disease[0]).lower()).nlargest(3, ['Rating_Wavg', 'UsefulCount']))
recommended_drug

Unnamed: 0,Disease,Drug,Rating_Wavg,UsefulCount
2,anemia,Integra,10.0,13
1,anemia,Ferralet 90,10.0,9
3,anemia,Integra F,9.0,29


In [102]:
print("Advocated drugs for this disease are:\n ", recommended_drug["Drug"].unique())

Advocated drugs for this disease are:
  ['INTEGRA' 'FERRALET 90' 'INTEGRA F']
