In [51]:
import json
import string

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

from web_scrapping import web_scrape_wikipedia, web_scrape_seattle_children

### Loading raw data

In [52]:
# Read the raw drug dataset from the repository's data directory (path is relative to this notebook)
drug_data = pd.read_csv("../data/drug_data.csv")

In [53]:
# (rows, columns) of the raw data, before any cleaning
drug_data.shape

(53766, 6)

### Data Cleaning

In [54]:
# Drop rows whose 'Prescribed_for' is missing (NaN) or an empty string.
# The mask is built on the column and applied to the frame in one step; the
# earlier `drug_data[col].fillna('', inplace=True)` form is a chained in-place
# assignment, which warns on recent pandas and does not modify `drug_data` at
# all under Copy-on-Write.
prescribed_for_raw = drug_data['Prescribed_for']
drug_data = drug_data[prescribed_for_raw.notna() & (prescribed_for_raw != '')]

In [55]:
# (rows, columns) after removing rows without a 'Prescribed_for' value
drug_data.shape

(53471, 6)

In [57]:
# Remove rows whose 'Prescribed_for' value is not a usable disease name:
# leaked "users found this comment helpful" text, 'Not Listed', HTML tags,
# and anything containing parentheses.
patterns_to_remove = [
    r"^\d+</span> users found this comment helpful\.$",
    r'Not Listed',
    r"<[^>]+>",
    r'\([^)]*\)|\(.*|.*\)'
]

# One alternation so the column is scanned a single time
combined_pattern = '|'.join(patterns_to_remove)
is_unwanted = drug_data['Prescribed_for'].str.contains(
    combined_pattern,
    case=False,
    regex=True,
)
cleaned_df = drug_data[~is_unwanted]



In [58]:
# (rows, columns) after removing the unwanted 'Prescribed_for' values
cleaned_df.shape

(52957, 6)

In [59]:
# Distinct 'Prescribed_for' values that survived cleaning
disease_list = cleaned_df['Prescribed_for'].unique().tolist()


In [60]:
# Keep only reviews for a cleaned disease name with a medium-to-high rating (>= 5)
is_known_disease = drug_data['Prescribed_for'].isin(disease_list)
is_rated_well = drug_data['User_Rating'] >= 5
filtered_df = drug_data[is_known_disease & is_rated_well]

In [61]:
# One row per disease, holding the array of distinct drug names reviewed for it
aggregated_df = (
    filtered_df
    .groupby('Prescribed_for')
    .agg(DrugName_list=('drugName', 'unique'))
    .reset_index()
)

In [62]:
# Disease names that have at least one well-rated drug
diseases_final = aggregated_df['Prescribed_for'].tolist()

In [63]:
n_prescribed_for = len(diseases_final)
print("Total Number of unique prescribed_for in the  dataset is {}".format(n_prescribed_for))

Total Number of unique prescribed_for in the  dataset is 620


### Web scraping to retrieve symptoms from Wikipedia

In [None]:
# Scrape symptoms for every disease in diseases_final using the helper imported from web_scrapping.
# NOTE(review): this cell has no execution count, and a later cell rebinds
# disease_with_symptoms from ../data/symptoms.json — presumably a saved copy of this result; confirm.
disease_with_symptoms = web_scrape_wikipedia(diseases_final)

In [64]:
# Load the stored disease -> symptoms mapping from disk.
# The encoding is given explicitly so reading the JSON does not depend on the
# platform's default text encoding.
with open("../data/symptoms.json", 'r', encoding='utf-8') as file:
    disease_with_symptoms = json.load(file)

In [65]:
n_wikipedia_diseases = len(disease_with_symptoms)
print("Number of diseases retrived from wikipedia is {}".format(n_wikipedia_diseases))

Number of diseases retrived from wikipedia is 536


In [66]:
# Diseases from diseases_final that have no entry in the Wikipedia result
unretrived_diseases = [
    disease_name
    for disease_name in diseases_final
    if disease_name not in disease_with_symptoms
]

In [67]:
n_unretrived = len(unretrived_diseases)
print("Number of diseases unretrived from wikipedia is {}".format(n_unretrived))

Number of diseases unretrived from wikipedia is 84


### Web scraping to retrieve the remaining diseases' symptoms from Seattle Children's

In [68]:
# Root URL handed to the Seattle Children's scraper
base_url = 'https://www.seattlechildrens.org'

# Try the second source for the diseases that have no Wikipedia entry.
# NOTE(review): the result is later merged with disease_with_symptoms via {**a, **b},
# so it is presumably a dict with the same key/value layout — confirm in web_scrapping.
unretrived_diseases_symptoms = web_scrape_seattle_children(base_url, unretrived_diseases)

In [69]:
# Start from the Wikipedia results, then add the second-source results;
# on a duplicate key the second source's value replaces the first.
final_disease_symptoms = dict(disease_with_symptoms)
final_disease_symptoms.update(unretrived_diseases_symptoms)

In [70]:
n_diseases_with_symptoms = len(final_disease_symptoms)
print("Final number of diseases available with symptoms {}".format(n_diseases_with_symptoms))

Final number of diseases available with symptoms 537


### Creating the final dataframe with each disease, its symptoms, and the list of drugs prescribed for it

In [71]:
# Flatten the merged scrape results into three parallel lists.
# This iterates final_disease_symptoms (Wikipedia + Seattle Children's) rather
# than disease_with_symptoms, so the second-source results are not dropped.
# NOTE(review): assumes every value is indexable as [disease_name, symptoms]
# for both sources — confirm against web_scrapping.
prescribed_for, disease, symptoms = [], [], []
for key, value in final_disease_symptoms.items():
    prescribed_for.append(key)
    disease.append(value[0])
    symptoms.append(value[1])

# One row per disease with a 1-based index; the index length is taken from the
# lists themselves so it always matches the data.
symptoms_df = pd.DataFrame(
    {'Prescribed_for': prescribed_for, 'Disease': disease, 'Symptoms': symptoms},
    index=range(1, len(prescribed_for) + 1),
)

In [73]:
# Left join: keep every aggregated disease, attaching symptoms where they exist
final_df = pd.merge(
    aggregated_df,
    symptoms_df,
    how='left',
    on='Prescribed_for',
)

### Inserting a length column that holds the length of the Symptoms text


In [74]:
# Cast symptoms to text but keep missing values as NaN. A plain astype(str)
# would turn diseases without symptoms (NaN from the left merge) into the
# literal string 'nan', giving them a length of 3 and hiding them from later
# null checks.
symptoms_raw = final_df['Symptoms']
final_df['Symptoms'] = symptoms_raw.astype(str).where(symptoms_raw.notna())

# Character length of the symptom text; diseases without symptoms get 0
final_df['Symptoms_Length'] = final_df['Symptoms'].str.len().fillna(0).astype(int)

### Number of drugs for each disease

In [75]:
# Count the distinct drugs collected for each disease
drug_counts = final_df['DrugName_list'].apply(len)
final_df['Number_of_Drugs'] = drug_counts

In [76]:
# Collapse each collection of drug names into one comma-separated string
joined_drug_names = final_df['DrugName_list'].apply(','.join)
final_df['DrugName_list'] = joined_drug_names

### Preprocessing symptoms

In [78]:
# Find rows whose Symptoms text contains "hlist" (a sign the symptoms were not scraped properly)
search_result = final_df[final_df['Symptoms'].str.contains('hlist', case=False, na=False)]
# Hand-written replacement symptoms for Glaucoma
new_symptoms = "intense eye pain, nausea and vomiting ,a red eye, a headache tenderness around the eyes, seeing rings around lights, blurred vision"

# Locate the row corresponding to Glaucoma disease
glaucoma_row_index = final_df[final_df['Disease'] == 'Glaucoma'].index

# Update the Symptoms column with the new list of symptoms
final_df.loc[glaucoma_row_index, 'Symptoms'] = new_symptoms

# Symptoms_Length was computed before this overwrite, so refresh it for the
# corrected row(s) to keep it consistent with the Symptoms text.
if 'Symptoms_Length' in final_df.columns:
    final_df.loc[glaucoma_row_index, 'Symptoms_Length'] = len(new_symptoms)

In [80]:
# Initialize the WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

# Download the NLTK data used for tokenizing, stop-word removal and lemmatizing.
# 'punkt_tab' is the tokenizer data that word_tokenize looks up on recent NLTK
# releases; 'punkt' is kept for older releases.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(nltk_resource)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/shobanasiranjeevilu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/shobanasiranjeevilu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/shobanasiranjeevilu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [81]:
def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, reading the corpus only on first use."""
    cached = getattr(_english_stop_words, "_cache", None)
    if cached is None:
        cached = frozenset(stopwords.words('english'))
        _english_stop_words._cache = cached
    return cached


def preprocess_text(text):
    """Normalize a piece of symptom text.

    Steps: lowercase, remove every character in ``string.punctuation``,
    tokenize with ``word_tokenize``, drop English stop words, lemmatize each
    token with the module-level ``lemmatizer``, and join the tokens with
    single spaces.

    Parameters
    ----------
    text : str or null
        Text to normalize. Null values (as judged by ``pd.isnull``) are
        passed through as ``None``.

    Returns
    -------
    str or None
        The normalized text, or ``None`` when ``text`` is null.
    """
    if pd.isnull(text):
        return None
    # Lowercase the text
    text = text.lower()
    # Remove punctuation (characters are deleted, not replaced by spaces)
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Tokenize the text
    tokens = word_tokenize(text)
    # Remove stopwords; the set is cached so it is not rebuilt for every row
    stop_words = _english_stop_words()
    tokens = [word for word in tokens if word not in stop_words]
    # Lemmatize the words
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    # Join tokens back into a string
    preprocessed_text = ' '.join(tokens)
    return preprocessed_text

# Run every symptom text through preprocess_text and store the result in a new column
preprocessed_symptoms = final_df['Symptoms'].apply(preprocess_text)
final_df['Preprocessed_Symptoms'] = preprocessed_symptoms

In [83]:
# Write the processed table to the data directory, without the index column
final_df.to_csv("../data/data_processed.csv",index=False)