In [4]:
import pandas as pd
from webscrape import web_scrape

In [5]:
drug_data = pd.read_csv("/content/Drug_Data.csv")

In [6]:
drug_data.head()

Unnamed: 0,drugName,Prescribed_for,Drug_Review,User_Rating,Date,Count_of_Reviews
0,Mirtazapine,Depression,"""I&#039;ve tried a few antidepressants over th...",10,28-Feb-12,22
1,Mesalamine,"Crohn's Disease, Maintenance","""My son has Crohn&#039;s disease and has done ...",8,17-May-09,17
2,Bactrim,Urinary Tract Infection,"""Quick reduction of symptoms""",9,29-Sep-17,3
3,Contrave,Weight Loss,"""Contrave combines drugs that were used for al...",9,05-Mar-17,35
4,Cyclafem 1 / 35,Birth Control,"""I have been on this birth control for one cyc...",9,22-Oct-15,4


In [7]:
diseases = drug_data.Prescribed_for.unique()

In [48]:
# considering subset of data
subset_of_diseases0 = list(diseases[:])
subset_of_diseases=[]
for i in subset_of_diseases0:
  if type(i)!=float:
    subset_of_diseases.append(i)


In [49]:
subset_of_diseases.sort()
subset_of_diseases

['0</span> users found this comment helpful.',
 '100</span> users found this comment helpful.',
 '105</span> users found this comment helpful.',
 '10</span> users found this comment helpful.',
 '11</span> users found this comment helpful.',
 '12</span> users found this comment helpful.',
 '135</span> users found this comment helpful.',
 '13</span> users found this comment helpful.',
 '14</span> users found this comment helpful.',
 '15</span> users found this comment helpful.',
 '16</span> users found this comment helpful.',
 '17</span> users found this comment helpful.',
 '18</span> users found this comment helpful.',
 '1</span> users found this comment helpful.',
 '20</span> users found this comment helpful.',
 '21</span> users found this comment helpful.',
 '22</span> users found this comment helpful.',
 '23</span> users found this comment helpful.',
 '24</span> users found this comment helpful.',
 '25</span> users found this comment helpful.',
 '27</span> users found this comment he

In [50]:
# clenaing the unnecesary disease type is yet to be processed

disease_list = []
for disease in subset_of_diseases:
  if "</span>" not in disease:
    disease_list.append(disease)


In [51]:
#  only considering drugs which has medium to high rating
filtered_df = drug_data[(drug_data['Prescribed_for'].isin(disease_list)) & (drug_data['User_Rating']>=5)]
aggregated_df = filtered_df.groupby('Prescribed_for').agg(DrugName_list=('drugName', 'unique')).reset_index(drop=False)

In [52]:
aggregated_df

Unnamed: 0,Prescribed_for,DrugName_list
0,ADHD,"[Clonidine, Bupropion, Vyvanse, Dexmethylpheni..."
1,AIDS Related Wasting,[Serostim]
2,Abnormal Uterine Bleeding,"[Ethinyl estradiol / levonorgestrel, Mirena, L..."
3,Abortion,[Misoprostol]
4,Acetaminophen Overdose,[Mucomyst-10]
...,...,...
629,moterol),"[Budesonide / formoterol, Formoterol, Arformot..."
630,mulation) (phenylephrine),[Phenylephrine]
631,tic (mycophenolic acid),[Mycophenolic acid]
632,von Willebrand's Disease,[Desmopressin]


In [53]:
diseases_final = list(aggregated_df['Prescribed_for'])

## Web scraping to retrive symptoms for subset of diseases from Wikipedia

In [54]:
disease_with_symptoms = web_scrape(diseases_final)

In [55]:
prescribed_for, symptoms = [], []

In [56]:
for key, value in disease_with_symptoms.items():
    prescribed_for.append(key)
    symptoms.append(value)

In [57]:
symptoms_df = pd.DataFrame({'Prescribed_for': prescribed_for, 'Symptoms': symptoms}, index=range(1, len(disease_with_symptoms) + 1))

In [58]:
subset_df = aggregated_df.merge(symptoms_df,on='Prescribed_for', how='left')

In [59]:
subset_df

Unnamed: 0,Prescribed_for,DrugName_list,Symptoms
0,ADHD,"[Clonidine, Bupropion, Vyvanse, Dexmethylpheni...",Inattention carelessness hyperactivity executi...
1,AIDS Related Wasting,[Serostim],Early : Flu-like illness Later : Large lymph n...
2,Abnormal Uterine Bleeding,"[Ethinyl estradiol / levonorgestrel, Mirena, L...","Irregular, abnormally frequent, prolonged, or ..."
3,Abortion,[Misoprostol],
4,Acetaminophen Overdose,[Mucomyst-10],"Early : Non specific, feeling tired, abdominal..."
...,...,...,...
629,moterol),"[Budesonide / formoterol, Formoterol, Arformot...",
630,mulation) (phenylephrine),[Phenylephrine],
631,tic (mycophenolic acid),[Mycophenolic acid],
632,von Willebrand's Disease,[Desmopressin],Easy and prolonged bleeding


In [84]:
# Filter out rows with NaN values in 'Symptoms' column
nan_symptoms_df = subset_df[subset_df['Symptoms'].isna()]

# Select only the desired columns excluding 'Symptoms'
new_df = nan_symptoms_df[['Prescribed_for', 'DrugName_list']]

# Reset index if needed
new_df.reset_index(drop=True, inplace=True)

# Display the new DataFrame
new_df

Unnamed: 0,Prescribed_for,DrugName_list
0,Abortion,[Misoprostol]
1,Agitation,"[Olanzapine, Loxapine, Citalopram]"
2,Amenorrhea,"[Medroxyprogesterone, Provera, Depo-Provera, N..."
3,Anesthesia,"[Propofol, Lidocaine, Diprivan, Rocuronium, Ke..."
4,Aphthous Ulce,"[Dexamethasone, Triamcinolone, Benzocaine]"
...,...,...
130,moterol / mometasone),[Formoterol / mometasone]
131,moterol),"[Budesonide / formoterol, Formoterol, Arformot..."
132,mulation) (phenylephrine),[Phenylephrine]
133,tic (mycophenolic acid),[Mycophenolic acid]


## Webscrapping the undiscovered symptoms from seattlechildrens.org

In [72]:
pip install beautifulsoup4



In [73]:
import requests
from bs4 import BeautifulSoup
base_url = 'https://www.seattlechildrens.org'

In [78]:
diseases = list(new_df['Prescribed_for'])
symptoms = {}

In [80]:
url = base_url + '/conditions/a-z/'
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')
disease_links = []

In [81]:
for link in soup.find_all('a'):
  if link.text in diseases:
    disease_links.append(link.get('href'))
    print(disease_links)

['/conditions/a-z/diaper-rash/']
['/conditions/a-z/diaper-rash/', '/conditions/a-z/sunburn/']


In [86]:
import re
def get_symptoms(url):
  page = requests.get(url)
  soup = BeautifulSoup(page.content, 'html.parser')
  headings = soup.find_all(re.compile('h[1-6]'), text=re.compile('.*Symptom.*'))
  for h in headings:
    ul = h.find_next('ul')
    if ul:
      symptoms = [li.text for li in ul.find_all('li')]
    return symptoms
  return None
  # Extract symptoms from each disease page
for link in disease_links:
  page = requests.get(base_url + link)
  soup = BeautifulSoup(page.content, 'html.parser')
  disease = soup.find('h1').text
  symptoms[disease] = get_symptoms(base_url + link) # Remove diseases with None values to_remove = [] for disease, symptom_list in symptoms.items(): if symptom_list is None: to_remove.append(disease) for disease in to_remove: del symptoms[disease] print(symptoms)


In [88]:
print(symptoms)



In [89]:
import pandas as pd
processed_symptoms = {}
for disease, symptom_list in symptoms.items():
  for symptom in symptom_list:
    processed_symptoms[disease] = symptom
    df = pd.DataFrame.from_dict(processed_symptoms, orient='index')
    df = df.reset_index().rename(columns={'index':'Disease', 0:'Symptoms'})
print(df)

       Disease                                           Symptoms
0  Diaper Rash  Pink rashes are not painful, but raw ones can ...
1      Sunburn  Lesson: if you think your child got too much s...


## Merging both the dataframes of dataset obtained from wikipedia and seattlechildrens.org

In [91]:
# Merge df and subset_df on Disease and Prescribed_for
merged_df = pd.merge(subset_df, df, how='left', left_on='Prescribed_for', right_on='Disease')

# Update Symptoms in subset_df where Disease matches Prescribed_for
merged_df.loc[merged_df['Symptoms_x'].isnull(), 'Symptoms_x'] = merged_df['Symptoms_y']

# Drop the redundant 'Symptoms_y' column
merged_df.drop(columns=['Symptoms_y'], inplace=True)

# Rename 'Symptoms_x' column to 'Symptoms'
merged_df.rename(columns={'Symptoms_x': 'Symptoms'}, inplace=True)

# Update subset_df with merged_df
subset_df = merged_df[['Prescribed_for', 'DrugName_list', 'Symptoms']]

subset_df

Unnamed: 0,Prescribed_for,DrugName_list,Symptoms
0,ADHD,"[Clonidine, Bupropion, Vyvanse, Dexmethylpheni...",Inattention carelessness hyperactivity executi...
1,AIDS Related Wasting,[Serostim],Early : Flu-like illness Later : Large lymph n...
2,Abnormal Uterine Bleeding,"[Ethinyl estradiol / levonorgestrel, Mirena, L...","Irregular, abnormally frequent, prolonged, or ..."
3,Abortion,[Misoprostol],
4,Acetaminophen Overdose,[Mucomyst-10],"Early : Non specific, feeling tired, abdominal..."
...,...,...,...
629,moterol),"[Budesonide / formoterol, Formoterol, Arformot...",
630,mulation) (phenylephrine),[Phenylephrine],
631,tic (mycophenolic acid),[Mycophenolic acid],
632,von Willebrand's Disease,[Desmopressin],Easy and prolonged bleeding
