In [1]:
import requests
import numpy as np
import pandas as pd
import regex as re

In [58]:
# https://bar.rady.ucsd.edu/Web_Scraping.html

from bs4 import BeautifulSoup

# need to spoof a browser in order to not get blocked when making request
headers = requests.utils.default_headers()
headers.update({
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0',
})

# requests applies no timeout by default, so a stalled connection would hang the kernel
REQUEST_TIMEOUT = 30  # seconds
# host that serves the review pages; used to pick the right tab on a drug page
REVIEWS_HOST = 'reviews.webmd.com'

# list of depression drugs from WebMD
url = 'https://www.webmd.com/depression/depression-medications-antidepressants'
listing_page = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
listing_page.raise_for_status()  # fail here instead of parsing an HTTP error page
response = listing_page.content

soup = BeautifulSoup(response, 'lxml')
drugs_section = soup.find('div', class_='article-page active-page')
if drugs_section is None:
    raise RuntimeError(f'Could not find the drug list section on {url}')

review_url = None
for drug in drugs_section.find_all('p'):
    # a paragraph without a link cannot point at a drug page; skip it
    anchor = drug.find('a', href=True)
    if anchor is None:
        continue
    link = anchor['href']

    drug_response = requests.get(link, headers=headers, timeout=REQUEST_TIMEOUT)
    drug_response.raise_for_status()
    drug_page = BeautifulSoup(drug_response.content, 'lxml')

    tabs = drug_page.find('ul', class_='auto-tabs')
    if tabs is None:
        raise RuntimeError(f'No tab bar (ul.auto-tabs) found on drug page {link}')

    tab_urls = [tab.a.get('href') for tab in tabs.find_all('li') if tab.a is not None]
    tab_urls = [tab_url for tab_url in tab_urls if tab_url]
    if not tab_urls:
        raise RuntimeError(f'No tab links found on drug page {link}')

    # Prefer the tab that links to the reviews host so a change in tab order
    # does not break the lookup; fall back to the last tab (previous behaviour).
    review_url = next(
        (tab_url for tab_url in tab_urls if REVIEWS_HOST in tab_url),
        tab_urls[-1],
    )
    print(review_url)

    # prototype: stop after the first drug that has a link
    break

if review_url is None:
    raise RuntimeError(f'No drug link found in the drug list section of {url}')

https://reviews.webmd.com/drugs/drugreview-64439-abilify


In [64]:
# Show the <title> text of the page currently held in `soup`.
# NOTE(review): in a top-to-bottom run `soup` is still the medications listing
# page at this point, yet the saved output shows the Abilify review-page title.
# This cell was presumably last executed after `soup` was rebound to the review
# page further down - confirm, and move it below that cell if so.
soup.title.text

'Abilify Oral Reviews and User Ratings: Effectiveness, Ease of Use, and Satisfaction'

In [69]:
# Download the review page for the drug selected above and parse it.
# A timeout is passed explicitly because requests never times out by default.
review_page = requests.get(review_url, headers=headers, timeout=30)
review_page.raise_for_status()  # fail here instead of parsing an HTTP error page
response = review_page.content
soup = BeautifulSoup(response, 'lxml')
reviews_html = soup.find_all('div', class_='review-details') # get elements that hold each review

# The drug name is the run of words in the page title that precedes ' Reviews'.
# The pattern is a raw string: '\w' and '\s' are invalid escape sequences in a
# normal string literal and trigger a warning on recent Python versions.
page_title = soup.title.text if soup.title is not None else ''
drug_name_match = re.search(r'\w+(?:\s\w+)*(?=\sReviews)', page_title)
if drug_name_match is None:
    raise ValueError(f'Could not extract a drug name from page title {page_title!r}')
drug_name = drug_name_match.group(0)

In [70]:
# Build a data frame with one row per review: parsed metadata, ratings and full text.

cols = ['drug_name',
        'date',
        'age',
        'gender',
        'time_on_drug',
        'reviewer_type',
        'condition',
        'rating_overall',
        'rating_effectiveness',
        'rating_ease_of_use',
        'rating_satisfaction',
        'text']

# The previous loop appended a row and then stopped once `i > 10`, i.e. it kept
# at most the first 12 reviews; the same cap is kept here under a name.
MAX_REVIEWS = 12

# Order in which the per-category rating sections are read from a review.
RATING_CATEGORIES = ['effectiveness', 'ease_of_use', 'satisfaction']


def _first_match(pattern, text):
    """Return the first match of `pattern` in `text`, or NaN if there is none."""
    matches = re.findall(pattern, text)
    return matches[0] if matches else np.nan


def _child_text(parent, tag_name, css_class):
    """Return the text of the first matching child of `parent`, or '' if it is absent."""
    element = parent.find(tag_name, class_=css_class)
    return element.text if element is not None else ''


def parse_review(review, drug_name):
    """Extract one review element into a dict keyed by the names in `cols`.

    Parameters
    ----------
    review : element with a bs4-style ``find`` method
        One of the 'review-details' elements.
    drug_name : str
        Stored unchanged in the 'drug_name' field.

    Returns
    -------
    dict
        Every field that cannot be found in the markup is NaN.
        'rating_overall' is a float, the three category ratings are ints.
    """
    record = dict.fromkeys(cols, np.nan)
    record['drug_name'] = drug_name
    record['date'] = _first_match(r'\d+/\d+/\d+', _child_text(review, 'div', 'date'))

    # The patterns below read individual fields out of the 'details' text; they
    # use variable-length lookbehinds, which need the `regex` module
    # (imported as `re` in this notebook).
    details = _child_text(review, 'div', 'details')
    record['age'] = _first_match(r'(?<=\|\s+)\d+-\d+', details)
    # The alternation is grouped so the lookbehind applies to both words; without
    # the group the pattern means "(lookbehind + Male) OR Female anywhere".
    record['gender'] = _first_match(r'(?<=\|\s+)(?:Male|Female)', details)
    record['time_on_drug'] = _first_match(r'(?<=On\smedication\sfor\s)\w+(?:\s\w+)*', details)
    record['reviewer_type'] = _first_match(r'(?<=\|\s+)\w+(?:\s\w+)*(?=\s+$)', details)

    record['condition'] = _first_match(
        r'(?<=Condition:\s)\w+(?:\s\w+)*',
        _child_text(review, 'strong', 'condition'),
    )

    overall = review.find('div', class_='overall-rating')
    overall_line = overall.strong.text if overall is not None and overall.strong is not None else ''
    # The dot is escaped so only a real decimal such as '3.7' matches; the value
    # is stored as a number instead of a one-element list of strings.
    overall_match = _first_match(r'\d+\.\d+', overall_line)
    record['rating_overall'] = float(overall_match) if isinstance(overall_match, str) else np.nan

    categories = review.find('div', class_='categories')
    sections = categories.find_all('section') if categories is not None else []
    for cat, section in zip(RATING_CATEGORIES, sections):
        widget = section.find('div', class_='webmd-rate on-mobile')
        value = widget.get('aria-valuenow') if widget is not None else None
        if value is not None:
            record[f'rating_{cat}'] = int(value)

    description = review.find('p', class_='description-text')
    if description is not None:
        record['text'] = description.text

    return record


# Collect all rows first and build the frame once: growing a DataFrame with
# pd.concat inside a loop copies the whole frame on every iteration.
records = [parse_review(review, drug_name) for review in reviews_html[:MAX_REVIEWS]]
reviews_df = pd.DataFrame(records, columns=cols)

In [71]:
# Display the parsed reviews. The cell above keeps at most 12 reviews, so
# showing the whole frame stays small.
reviews_df

Unnamed: 0,drug_name,date,age,gender,time_on_drug,reviewer_type,condition,rating_overall,rating_effectiveness,rating_ease_of_use,rating_satisfaction,text
0,Abilify Oral,11/12/2023,35-44,Female,less than 1 month,Patient,Additional Medications to Treat Depression,5.0,5,5,5,I started abilify two weeks ago for depression...
1,Abilify Oral,11/6/2023,19-24,Female,6 months to less than 1 year,Patient,Other,3.7,4,5,2,"Great for depression, but it made me have a co..."
2,Abilify Oral,10/2/2023,55-64,Female,1 to 6 months,Patient,Additional Medications to Treat Depression,5.0,5,5,5,I felt a positive affect of level headiness wi...
3,Abilify Oral,9/22/2023,19-24,Male,less than 1 month,Patient,Other,1.3,1,2,1,I was prescribed this in the hospital for bipo...
4,Abilify Oral,9/10/2023,25-34,Male,5 to less than 10 years,Patient,Other,4.3,5,5,3,"gained weight, tiredness, but it works for psy..."
5,Abilify Oral,9/8/2023,35-44,Male,5 to less than 10 years,Patient,Bipolar I Disorder with Most Recent Episode Mixed,1.0,1,1,1,This is the worse Med ever. I dont even know w...
6,Abilify Oral,8/2/2023,45-54,Female,6 months to less than 1 year,Patient,Mania associated with Bipolar Disorder,5.0,5,5,5,This medicine has changed my life. It literall...
7,Abilify Oral,7/29/2023,55-64,Male,,Patient,Additional Medications to Treat Depression,1.0,1,1,1,I never had issues with anxiety until I took t...
8,Abilify Oral,7/5/2023,25-34,Male,1 to 6 months,Patient,Schizophrenia,2.3,3,3,1,Worked really well for like 2 months then I go...
9,Abilify Oral,6/5/2023,65-74,,2 to less than 5 years,Patient,Additional Medications to Treat Depression,4.3,4,4,5,I am a bipolar woman who has been battling dep...
