In [52]:
import pandas as pd
from web_scrapping import web_scrape_wikipedia, web_scrape_seattle_children

### Loading raw data

In [76]:
# Load the raw drug dataset; the path is relative to the notebook's working directory.
drug_data = pd.read_csv("../data/drug_data.csv")

In [77]:
# Sanity check: (rows, columns) of the frame as loaded.
drug_data.shape

(53766, 6)

### Data Cleaning

In [119]:
# Drop rows whose 'Prescribed_for' is missing (NaN) or an empty string.
# This is done with one non-mutating filter rather than
# `drug_data['Prescribed_for'].fillna('', inplace=True)`: that chained in-place
# call acts on a column selection, raises a FutureWarning in pandas 2.2 and no
# longer updates `drug_data` under Copy-on-Write, which would let the NaN rows
# slip through the `!= ''` test below.
drug_data = drug_data[drug_data['Prescribed_for'].fillna('') != '']

In [120]:
# Sanity check: (rows, columns) after dropping rows with no 'Prescribed_for' value.
drug_data.shape

(53471, 6)

In [122]:
# Regexes that mark a 'Prescribed_for' value as junk rather than a usable condition name.
patterns_to_remove = [
    r"^\d+</span> users found this comment helpful\.$",  # looks like scraped review-footer text
    r'Not Listed',                                       # presumably a placeholder value
    r"<[^>]+>",                                          # any HTML-style tag
    r'\([^)]*\)|\(.*|.*\)'                               # any parenthesis, balanced or not
]

# One alternation so a single pass over the column tests every pattern.
combined_pattern = '|'.join(patterns_to_remove)

# Keep only the rows whose condition matches none of the patterns (case-insensitive search).
cleaned_df = drug_data.loc[
    lambda frame: ~frame['Prescribed_for'].str.contains(combined_pattern, regex=True, case=False)
]



In [124]:
# Sanity check: (rows, columns) left after removing the junk 'Prescribed_for' values.
cleaned_df.shape

(52957, 6)

In [125]:
# Distinct condition names that survived cleaning, in order of first appearance.
disease_list = cleaned_df['Prescribed_for'].unique().tolist()


In [127]:
# Keep rows for the retained conditions whose user rating is at least 5
# (treated here as "medium to high"; the rating scale itself is not shown in this notebook).
filtered_df = drug_data.loc[
    (drug_data['User_Rating'] >= 5) & drug_data['Prescribed_for'].isin(disease_list)
]

In [128]:
# One row per condition, holding the array of distinct drug names reviewed for it.
aggregated_df = (
    filtered_df
    .groupby('Prescribed_for')['drugName']
    .unique()
    .rename('DrugName_list')
    .reset_index()
)

In [129]:
# Condition names (one per aggregated row) that will be looked up by the scrapers.
diseases_final = aggregated_df['Prescribed_for'].tolist()

In [130]:
# Report how many distinct conditions remain after cleaning and rating filtering.
print("Total Number of unique prescribed_for in the  dataset is {}".format(len(diseases_final)))

Total Number of unique prescribed_for in the  dataset is 620


### Web scraping to retrieve symptoms from Wikipedia

In [136]:
# Look up every condition via the project helper imported from `web_scrapping`.
# The result is used below as a dict keyed by condition name whose values are
# indexed as value[0] (disease) and value[1] (symptoms).
# NOTE(review): presumably performs network requests, so this cell may be slow
# and its output can change between runs — confirm against the helper.
disease_with_symptoms = web_scrape_wikipedia(diseases_final)

In [137]:
# Number of conditions for which the Wikipedia scrape returned an entry.
print("Number of diseases retrived from wikipedia is {}".format(len(disease_with_symptoms)))

Number of diseases retrived from wikipedia is 498


In [138]:
# Conditions with no entry in the Wikipedia results; these are passed to the fallback scraper.
unretrived_diseases = [name for name in diseases_final if name not in disease_with_symptoms]

In [139]:
# Report how many conditions still lack symptoms after the Wikipedia pass.
print("Number of diseases unretrived from wikipedia is {}".format(len(unretrived_diseases)))

Number of diseases unretrived from wikipedia is 122


### Web scraping to retrieve the remaining diseases' symptoms from Seattle Children's

In [140]:
# Root URL handed to the Seattle Children's scraper.
base_url = 'https://www.seattlechildrens.org'

# Second-pass lookup, restricted to the conditions Wikipedia did not cover.
# The result is merged with the Wikipedia dict in the next cell via `**` unpacking,
# so it is expected to be a mapping — presumably with the same value layout; confirm.
unretrived_diseases_symptoms = web_scrape_seattle_children(base_url, unretrived_diseases)

In [141]:
# Union of both scrapes; on a duplicate key the Seattle Children's entry wins.
final_disease_symptoms = dict(disease_with_symptoms)
final_disease_symptoms.update(unretrived_diseases_symptoms)

In [142]:
# Total number of conditions with symptoms once both sources are combined.
print("Final number of diseases available with symptoms {}".format(len(final_disease_symptoms)))

Final number of diseases available with symptoms 500


### Creating the final dataframe with each disease, its symptoms, and the list of drugs prescribed for it

In [149]:
# Flatten the scraped results into three parallel columns.
# Iterate over `final_disease_symptoms` (Wikipedia plus the Seattle Children's
# fallback) rather than the Wikipedia-only dict, so the fallback results are not
# silently left out of the final frame.
# NOTE(review): assumes the fallback scraper's values use the same
# (disease, symptoms) layout as the Wikipedia ones — confirm in `web_scrapping`.
prescribed_for, disease, symptoms = [], [], []
for key, value in final_disease_symptoms.items():
    prescribed_for.append(key)
    disease.append(value[0])
    symptoms.append(value[1])

# The 1-based index is sized from the collected rows themselves, so it cannot
# drift out of step with the column lengths.
symptoms_df = pd.DataFrame(
    {'Prescribed_for': prescribed_for, 'Disease': disease, 'Symptoms': symptoms},
    index=range(1, len(prescribed_for) + 1),
)

In [152]:
# Left join: every aggregated condition is kept; ones without scraped symptoms get NaN.
final_df = pd.merge(aggregated_df, symptoms_df, how='left', on='Prescribed_for')

### Adding a Symptoms_Length column that holds the character length of Symptoms


In [153]:
# Normalise 'Symptoms' to text and record its length in characters.
# The left merge leaves NaN for conditions with no scraped symptoms; blank those
# first, otherwise `astype(str)` turns them into the literal string 'nan' and
# they are reported with a bogus length of 3 instead of 0.
final_df['Symptoms'] = final_df['Symptoms'].fillna('').astype(str)
final_df['Symptoms_Length'] = final_df['Symptoms'].str.len()

### Number of drugs for each disease

In [154]:
# Count of distinct drugs per condition (length of each row's drug-name array).
final_df['Number_of_Drugs'] = final_df['DrugName_list'].map(len)

In [155]:
# Preview the first three rows of the assembled frame.
final_df.head(3)

Unnamed: 0,Prescribed_for,DrugName_list,Disease,Symptoms,Symptoms_Length,Number_of_Drugs
0,ADHD,"[Clonidine, Bupropion, Vyvanse, Dexmethylpheni...",Attention deficit hyperactivity disorder,Inattention carelessness hyperactivity executi...,134,46
1,AIDS Related Wasting,[Serostim],HIV/AIDS,Early : Flu-like illness Later : Large lymph n...,71,1
2,Abnormal Uterine Bleeding,"[Ethinyl estradiol / levonorgestrel, Mirena, L...",Abnormal uterine bleeding,"Irregular, abnormally frequent, prolonged, or ...",83,38


In [158]:
# Persist the processed table; index=False leaves the row index out of the file.
final_df.to_csv("../data/data_processed.csv",index=False)