# Scraping WebMD Drug Reviews
## Introduction
While documents from medical encounters are safeguarded by laws such as HIPAA in the United States, there are a number of public websites where patients are nonetheless sharing health information that may prove valuable as a data source. One such site is WebMD, which provides a database of prescription drugs and solicits reviews from patients about their experience with the medications.

Python's `requests` and `BeautifulSoup` libraries make it relatively simple to scrape data from any webpage, and this notebook can be used with only a few modifications to collect user reviews for any medication in WebMD's database for use in downstream analysis. Each review consists of demographic information about the patient, a set of ratings on a scale of 1 to 5 stars, and unstructured text.

This notebook generates three review datasets. The first dataset is reviews of psychiatric medication. Because psychoactive medication works to alter people's thoughts, feelings, and behavior, firsthand narratives written by patients taking these medications are especially valuable as a source of insight into how well a treatment worked and possible explanations as to why. The second dataset contains reviews for medications used to treat hypertension, a condition affecting an increasing number of adults in the United States and around the world as metabolic disorders become alarmingly common. The third dataset has reviews of medications used to treat type 2 diabetes, another condition under the umbrella of endemic metabolic disorders.

## Setup

In [None]:
# imports
import requests
import numpy as np
import pandas as pd
import regex as re
from tqdm import tqdm # progress bar

In [None]:
# indicate which datasets need to be scraped on this run
# (each flag gates the matching `if scrape_...:` cell further down)
scrape_psychiatric = False
scrape_hypertension = True
scrape_diabetes = True

## Functions

In [None]:
# regular expressions for parsing data from a single review
# elements found in review-details div
def regex_date(review):
    '''Returns the first digits/digits/digits date string found in the review's date div'''
    date_text = review.find('div', class_='date').text
    found_dates = re.findall(r'\d+/\d+/\d+', date_text)
    return found_dates[0]

def regex_condition(review):
    '''Parses the condition for which the medication is used

    Parameters:
    review: element holding one review; its <strong class="condition"> child, when present,
        is expected to contain text of the form "Condition: <name>"

    Returns:
    str: the text following "Condition:" up to the end of that line, with surrounding
        whitespace removed (quotes, hyphens, commas etc. are kept), or np.nan when the
        element is missing or holds no condition name
    '''
    condition_element = review.find('strong', class_='condition')
    if condition_element is None:
        return np.nan

    # capture the rest of the line rather than runs of \w only, so that names such as
    # '"Change of Life" Signs' are neither dropped nor cut short at punctuation
    condition_match = re.search(r'Condition:\s*(\S.*)', condition_element.text)
    return condition_match.group(1).strip() if condition_match else np.nan

def regex_rating_overall(review):
    '''Parses the overall rating, the average of 3 categories

    Parameters:
    review: element holding one review; the rating is read from the <strong> tag inside
        its <div class="overall-rating">

    Returns:
    float: the first number in that text (with or without a decimal part), or np.nan when
        the element is missing or contains no number
    '''
    rating_element = review.find('div', class_='overall-rating')
    if rating_element is None or rating_element.strong is None:
        return np.nan

    # the decimal point is escaped and optional: a bare '.' would match any character,
    # and a whole-number rating would otherwise not match at all
    rating_match = re.search(r'\d+(?:\.\d+)?', rating_element.strong.text)
    return float(rating_match.group()) if rating_match else np.nan

def regex_rating_category(review, ind_cat):
    '''Reads one category's rating from its aria-valuenow attribute

    ind_cat is the position of the category in ['effectiveness', 'ease_of_use', 'satisfaction']
    '''
    sections = review.find('div', class_='categories').find_all('section')
    chosen_section = sections[ind_cat]
    stars = chosen_section.find('div', class_='webmd-rate on-mobile')
    return int(stars.get('aria-valuenow'))

def regex_text(review):
    '''Returns the text of the review's description paragraph, or NaN when there is none'''
    paragraph = review.find('p', class_='description-text')
    if paragraph is None:
        return np.nan
    return paragraph.text

In [None]:
# regular expressions for parsing data from a single review
# elements found in details div
def regex_age(details):
    '''Parses the age bracket of the medication user

    Parameters:
    details (str): text of the review's details div, with fields separated by "|"

    Returns:
    str: the bracket that follows a "|" separator, e.g. "25-34"; an open-ended bracket
        written as "<number> or over" is returned as-is; np.nan when no bracket is found
    '''
    # matching closed ranges only would silently turn an open-ended top bracket into NaN
    # NOTE(review): "<number> or over" is the wording assumed for that bracket - confirm
    # against the live page
    age_match = re.search(r'\|\s+(\d+-\d+|\d+\s+or\s+over)', details)
    return age_match.group(1) if age_match else np.nan

def regex_gender(details):
    '''Parses the gender of the medication user

    Parameters:
    details (str): text of the review's details div, with fields separated by "|"

    Returns:
    str: "Male" or "Female" when it follows a "|" separator, otherwise np.nan
    '''
    # the alternation is grouped so that the "|" separator is required in front of both
    # words; ungrouped, "Female" matched anywhere in the text (e.g. inside a user name)
    gender_match = re.search(r'\|\s+(Male|Female)', details)
    return gender_match.group(1) if gender_match else np.nan

def regex_time(details):
    '''Returns the words that follow "On medication for " in the details text, or NaN'''
    durations = re.findall(r'(?<=On\smedication\sfor\s)\w+(?:\s\w+)*', details)
    if not durations:
        return np.nan
    return durations[0]

def regex_reviewer(details):
    '''Returns the run of words that follows a "|" and ends the details text, or NaN'''
    trailing_fields = re.findall(r'(?<=\|\s+)\w+(?:\s\w+)*(?=\s*$)', details)
    if not trailing_fields:
        return np.nan
    return trailing_fields[0]

In [None]:
# parse the reviews on a single webpage
def parse_reviews_page(soup, reviews_df):
    '''Populates reviews_df data frame with records from 1 page's reviews

    Parameters:
    soup (BeautifulSoup): parsed HTML of one page of a drug's reviews
    reviews_df (pd.DataFrame): dataframe with one row per review

    Returns:
    pd.DataFrame: reviews_df dataframe with new records appended

    The column layout of each new row comes from the module-level `cols` list.
    '''
    reviews_html = soup.find_all('div', class_='review-details') # get elements that hold each review
    drug_name = re.findall(r'(.*)(?=\sReviews)', soup.title.text)[0] # page title is drug name

    # build one single-row frame per review and concatenate them all at once after the loop:
    # concatenating inside the loop re-copies the whole accumulated dataframe for every review
    new_rows = []
    for review in reviews_html:
        to_append = pd.DataFrame([pd.Series([None]*len(cols), index=cols)])

        details = review.find('div', class_='details').text

        to_append['drug_name'] = drug_name
        to_append['date'] = regex_date(review)
        to_append['age'] = regex_age(details)
        to_append['gender'] = regex_gender(details)
        to_append['time_on_drug'] = regex_time(details)
        to_append['reviewer_type'] = regex_reviewer(details)
        to_append['condition'] = regex_condition(review)
        to_append['rating_overall'] = regex_rating_overall(review)

        for ind_cat, cat in enumerate(['effectiveness', 'ease_of_use', 'satisfaction']):
            to_append[f'rating_{cat}'] = regex_rating_category(review, ind_cat)

        to_append['text'] = regex_text(review)
        new_rows.append(to_append)

    # a page without reviews leaves the dataframe untouched
    if not new_rows:
        return reviews_df

    return pd.concat([reviews_df, *new_rows], ignore_index=True)

In [None]:
def get_soup(review_url, page):
    '''Downloads one page of a drug's reviews and parses it with BeautifulSoup

    Parameters:
    review_url (str): URL of the drug's reviews
    page (int): number of the reviews page to fetch

    Returns:
    BeautifulSoup: the downloaded HTML parsed with the lxml parser
    '''
    # NOTE(review): the page number is appended with '&', which presumes review_url
    # already carries a query string - confirm against the URLs being crawled
    curr_url = f'{review_url}&page={page}'
    # without a timeout a single stalled connection would hang the crawl indefinitely
    response = requests.get(curr_url, headers=headers, timeout=30).content
    return BeautifulSoup(response, 'lxml')

In [None]:
# crawl over the review pages for one drug
def crawl_reviews_pages(review_url, reviews_df):
    '''Crawls a drug's reviews page-by-page, saving each page's reviews into reviews_df

    Parameters:
    review_url (str): URL of the drug's reviews, passed to get_soup together with a page number
    reviews_df (pd.DataFrame): dataframe with one row per review

    Returns:
    pd.DataFrame: reviews_df dataframe with new records appended; returned unchanged when
        the first page has no pagination element

    '''
    # find how many review pages there are total for the drug by parsing it from the first page
    first_soup = get_soup(review_url, 1)
    pages = first_soup.find('ul', class_='pagination')

    # proceed only if there is at least 1 review
    if pages is None:
        return reviews_df

    last_page = int(pages.find_all('li', class_='page-item')[-1].text.strip())

    for page in tqdm(range(1, last_page+1)):
        # request the page the loop is actually on (page 1 was already downloaded above
        # to read the page count, so it is reused instead of fetched a second time)
        soup = first_soup if page == 1 else get_soup(review_url, page)
        reviews_df = parse_reviews_page(soup, reviews_df)

    return reviews_df

## Scrape Drugs by Condition from WebMD Search

In [None]:
# spoof a browser User-Agent so that the requests are not blocked
# https://bar.rady.ucsd.edu/Web_Scraping.html
from bs4 import BeautifulSoup

# start from the default requests headers and swap in a desktop Firefox User-Agent
agent = 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'
headers = requests.utils.default_headers()
headers['User-Agent'] = agent

In [None]:
# make empty data frame to contain values and full text from each review
cols = [
    'drug_name', 'date', 'age', 'gender', 'time_on_drug', 'reviewer_type', 'condition',
    'rating_overall', 'rating_effectiveness', 'rating_ease_of_use', 'rating_satisfaction',
    'text',
]

# zero-row frame carrying the review columns
reviews_df = pd.DataFrame(index=[], columns=cols)

In [None]:
def get_review_urls(conditions):
    '''Collects the review-page links of the drugs listed for the given conditions

    Parameters:
    conditions (dict): maps a condition name to the URL of its drug search page; only the
        URLs (the dict values) are used

    Returns:
    np.ndarray: the unique review links, as returned by np.unique
    '''
    review_urls = []
    for url in conditions.values():
        # without a timeout a single stalled connection would hang the run indefinitely
        response = requests.get(url, headers=headers, timeout=30).content
        soup = BeautifulSoup(response, 'lxml')
        drugs_section = soup.find('div', class_='medication-results-list')

        # report a page without a results list instead of failing on None below
        if drugs_section is None:
            print(f'skipping {url}: no medication results found')
            continue

        review_urls.extend(
            drug_review_element.a.get('href')
            for drug_review_element in drugs_section.find_all('span', class_='reviews-url')
        )

    # because there is a lot of overlap in medications between conditions, get unique values
    return np.unique(review_urls)

In [None]:
def compile_reviews(reviews_df, review_urls, condition):
    '''Scrapes the reviews of every drug in review_urls and saves them to a CSV file

    Parameters:
    reviews_df (pd.DataFrame): dataframe with one row per review, used as the starting point
    review_urls (iterable of str): review-page URLs, one per drug
    condition (str): label used as the prefix of the output file name

    Returns:
    pd.DataFrame: the compiled reviews, i.e. the data written to
        '{condition}_drug_webmd_reviews.csv'
    '''
    for review_url in tqdm(review_urls):
        # without a timeout a single stalled connection would hang the run indefinitely
        response = requests.get(review_url, headers=headers, timeout=30)
        # a drug page is treated as missing when the request ends up at the 404 URL
        if response.url == 'https://www.webmd.com/404':
            print(f'skipping {review_url}: page not found')
        else:
            print(review_url)
            reviews_df = crawl_reviews_pages(review_url, reviews_df)
    reviews_df.to_csv(f'{condition}_drug_webmd_reviews.csv')
    # hand the result back as well so it can be used without re-reading the CSV
    return reviews_df

In [None]:
if scrape_psychiatric:
    # WebMD search pages listing the drugs used for each illness
    conditions_psychiatric = {
        'anxiety': 'https://www.webmd.com/drugs/2/condition-967/anxiety',
        'depression': 'https://www.webmd.com/drugs/2/condition-1022/depression',
        'anxiety with depression': 'https://www.webmd.com/drugs/2/condition-978/anxiety-with-depression',
        'bipolar disorder': 'https://www.webmd.com/drugs/2/condition-954/bipolar-disorder',
        'schizophrenia': 'https://www.webmd.com/drugs/2/condition-948/schizophrenia',
    }
    review_urls_psychiatric = get_review_urls(conditions_psychiatric)
    # pass a deep copy so the blank dataframe itself is left untouched
    compile_reviews(
        reviews_df.copy(deep=True),
        review_urls_psychiatric,
        'psychiatric',
    )

In [None]:
if scrape_hypertension:
    # WebMD search page listing the drugs used for hypertension
    conditions_hypertension = {
        'hypertension': 'https://www.webmd.com/drugs/2/condition-1432/hypertension',
    }
    review_urls_hypertension = get_review_urls(conditions_hypertension)
    # pass a deep copy so the blank dataframe itself is left untouched
    compile_reviews(
        reviews_df.copy(deep=True),
        review_urls_hypertension,
        'hypertension',
    )

In [None]:
if scrape_diabetes:
    # WebMD search page listing the drugs used for type 2 diabetes
    conditions_diabetes = {
        'type 2 diabetes mellitus': 'https://www.webmd.com/drugs/2/condition-594/type-2-diabetes-mellitus',
    }
    review_urls_diabetes = get_review_urls(conditions_diabetes)
    # pass a deep copy so the blank dataframe itself is left untouched
    compile_reviews(
        reviews_df.copy(deep=True),
        review_urls_diabetes,
        'diabetes',
    )