# Scraping WebMD Drug Reviews
## Introduction
While documents from medical encounters are safeguarded by laws such as HIPAA in the US, there are a number of public websites where patients are nonetheless sharing health information that may prove valuable as a data source. One such site is WebMD, which provides a database of prescription drugs and solicits reviews from patients about their experience with the medications.

Python's `requests` and `BeautifulSoup` libraries make it relatively simple to scrape data from any webpage, and this notebook can be used with only a few modifications to collect user reviews for any medication in WebMD's database for use in downstream analysis. Each review consists of demographic information about the patient, a set of ratings on a scale of 1 to 5 stars, and unstructured text.

This notebook focuses on compiling reviews for various medications used to treat depression. Because psychoactive medication works to alter people's thoughts, feelings, and behavior, firsthand narratives written by patients taking these medications are especially valuable as a source of insight into how well a treatment worked and possible explanations as to why.

## Setup

In [1]:
# imports
import requests
import numpy as np
import pandas as pd
import regex as re

## Functions

In [None]:
# regular expressions for parsing data from a single review
# elements found in review-details div
def regex_date(review):
    '''Extracts the posting date of a review as a slash-separated string

    Reads the text of the review's <div class="date"> element and returns the
    first digits/digits/digits token found in it. The notebook's earlier note
    described the format as dd/mm/yyyy; the order of the components is not
    checked here. Raises IndexError when the text holds no such token.
    '''
    date_text = review.find('div', class_='date').text
    date_tokens = re.findall(r'\d+/\d+/\d+', date_text)
    return date_tokens[0]

def regex_condition(review):
    '''Parses the condition for which the medication is used

    Looks up the review's <strong class="condition"> element and returns
    whatever follows the "Condition:" label on that line, with surrounding
    whitespace removed. Punctuation is kept, so names such as
    '"Change of Life" Signs' or 'Obsessive-Compulsive Disorder' come back
    whole instead of being cut at the first non-word character.

    Parameters:
    review: element for one review (anything exposing BeautifulSoup's find())

    Returns:
    str or float: the condition name, or np.nan when the element is missing
    or nothing follows the label
    '''
    condition_element = review.find('strong', class_='condition')
    if condition_element is None:
        return np.nan

    # \S pins the capture to the first visible character, so a bare
    # "Condition:" label with only whitespace after it produces no match
    condition_match = re.search(r'Condition:\s*(\S.*)', condition_element.text)
    if condition_match is None:
        return np.nan
    return condition_match.group(1).strip()

def regex_rating_overall(review):
    '''Parses the overall rating, the average of 3 categories

    Reads the <strong> text inside the review's <div class="overall-rating">
    and returns the first number in it. Like the other parsers in this
    notebook it returns a single value rather than the list of matches, and
    falls back to np.nan when no number is present.

    Parameters:
    review: element for one review (anything exposing BeautifulSoup's find())

    Returns:
    str or float: the rating exactly as written on the page (e.g. '4.3' or
    '5'), or np.nan when the text contains no number
    '''
    rating_overall_line = review.find('div', class_='overall-rating').strong.text
    # the decimal point is escaped and the fractional part optional, so a
    # whole-number rating is still picked up and '.' cannot match other text
    rating_match = re.findall(r'\d+(?:\.\d+)?', rating_overall_line)
    return rating_match[0] if len(rating_match) > 0 else np.nan

def regex_rating_category(review, ind_cat):
    '''Reads the star rating of one category from a review

    ind_cat indexes the <section> elements inside the review's
    <div class="categories">; callers use the order
    ['effectiveness', 'ease_of_use', 'satisfaction']. The value comes from the
    'aria-valuenow' attribute of that section's rating widget and is returned
    as an int.
    '''
    categories_div = review.find('div', class_='categories')
    sections = categories_div.find_all('section')
    rating_widget = sections[ind_cat].find('div', class_='webmd-rate on-mobile')
    stars = rating_widget.get('aria-valuenow')
    return int(stars)

def regex_text(review):
    '''Returns the free-text body of a review

    The text is taken from the review's <p class="description-text"> element;
    np.nan is returned when the review has no such element.
    '''
    text_element = review.find('p', class_='description-text')
    if text_element is None:
        return np.nan
    return text_element.text

In [None]:
# regular expressions for parsing data from a single review
# elements found in details div
def regex_age(details):
    '''Extracts the reviewer's age bracket from the details line

    Returns the first digits-hyphen-digits range that directly follows a '|'
    separator and whitespace, or np.nan when there is none. The look-behind
    is variable-width, which the `regex` package imported as `re` supports.
    '''
    age_ranges = re.findall(r'(?<=\|\s+)\d+-\d+', details)
    if len(age_ranges) == 0:
        return np.nan
    return age_ranges[0]

def regex_gender(details):
    '''Parses the gender of the medication user

    Returns 'Male' or 'Female' when that word appears as a field of the
    '|'-separated details line, i.e. right after a '|' and whitespace. The
    alternation is grouped so the separator requirement applies to both
    words; otherwise 'Female' would also match inside an earlier field such
    as the username.

    Parameters:
    details (str): text of the review's details line

    Returns:
    str or float: 'Male', 'Female', or np.nan when neither field is present
    '''
    # capture group instead of a look-behind: findall then yields just the
    # gender word, and the separator must precede either alternative
    gender_match = re.findall(r'\|\s+(Male|Female)\b', details)
    return gender_match[0] if len(gender_match) > 0 else np.nan

def regex_time(details):
    '''Extracts how long the reviewer has been taking the drug

    Returns the run of space-separated words that follows
    'On medication for ' in the details line, or np.nan when that phrase
    does not occur.
    '''
    durations = re.findall(r'(?<=On\smedication\sfor\s)\w+(?:\s\w+)*', details)
    if len(durations) == 0:
        return np.nan
    return durations[0]

def regex_reviewer(details):
    '''Extracts the reviewer type from the end of the details line

    Returns the run of space-separated words that follows a '|' separator
    and whitespace and is itself followed only by optional whitespace up to
    the end of the string; np.nan when nothing matches. The look-behind is
    variable-width, which the `regex` package imported as `re` supports.
    '''
    trailing_fields = re.findall(r'(?<=\|\s+)\w+(?:\s\w+)*(?=\s*$)', details)
    if len(trailing_fields) == 0:
        return np.nan
    return trailing_fields[0]

In [None]:
# parse the reviews on a single webpage
def parse_reviews_page(reviews_html, drug_name, reviews_df):
    '''Populates reviews_df data frame with records from 1 page's reviews

    Each review element is run through the regex_* parsers and turned into
    one record; all records of the page are appended to reviews_df in a
    single concatenation.

    Parameters:
    reviews_html (iterable): the page's review elements, e.g. the result of
        BeautifulSoup's find_all for the review container
    drug_name (str): the name of the drug being reviewed
    reviews_df (pd.DataFrame): dataframe with one row per review

    Returns:
    pd.DataFrame: reviews_df dataframe with new records appended (returned
    as-is when the page holds no reviews)

    '''

    rating_categories = ['effectiveness', 'ease_of_use', 'satisfaction']

    records = []
    for review in reviews_html:
        details = review.find('div', class_='details').text

        # accept either a single value or a list of matches from the parser
        rating_overall = regex_rating_overall(review)
        if isinstance(rating_overall, list):
            rating_overall = rating_overall[0] if len(rating_overall) > 0 else np.nan

        record = {
            'drug_name': drug_name,
            'date': regex_date(review),
            'age': regex_age(details),
            'gender': regex_gender(details),
            'time_on_drug': regex_time(details),
            'reviewer_type': regex_reviewer(details),
            'condition': regex_condition(review),
            'rating_overall': rating_overall,
        }

        # the position in rating_categories doubles as the index of the
        # category's section inside the review
        for ind_cat, cat in enumerate(rating_categories):
            record[f'rating_{cat}'] = regex_rating_category(review, ind_cat)

        record['text'] = regex_text(review)
        records.append(record)

    if len(records) == 0:
        return reviews_df

    return pd.concat([reviews_df, pd.DataFrame(records)], ignore_index=True)

In [None]:
# crawl over the review pages for one drug
def crawl_reviews_pages(reviews_df):
    '''Crawls a drug's reviews page-by-page, saving each page's reviews into reviews_df

    Uses the module-level names review_url (base URL of the drug's reviews)
    and headers (request headers). Pages are requested as
    review_url + '&page=N', which presumes review_url already carries a
    query string. The number of the last page is read from the page's
    pagination list; a page without one is treated as the only page.

    Parameters:
    reviews_df (pd.DataFrame): dataframe with one row per review

    Returns:
    pd.DataFrame: reviews_df dataframe with new records appended

    Raises:
    requests.HTTPError: when a page request comes back with an error status

    '''

    curr_page = 0
    while True:
        curr_page += 1

        curr_url = review_url + f'&page={curr_page}'
        response = requests.get(curr_url, headers=headers)
        # surface blocked or failed requests instead of parsing an error page
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'lxml')
        reviews_page_html = soup.find_all('div', class_='review-details') # get elements that hold each review

        # find() returns None when the page has no pagination list
        pages = soup.find('ul', class_='pagination')
        page_items = pages.find_all('li', class_='page-item') if pages is not None else []
        last_page = int(page_items[-1].text.strip()) if page_items else curr_page

        if (curr_page % 15) == 0:
            print(f'Scraping page {curr_page} of {last_page}')

        # the drug name is whatever precedes " Reviews" in the page title
        page_title = soup.title.text
        drug_name = re.findall(r'(.*)(?=\sReviews)', page_title)[0]

        reviews_df = parse_reviews_page(reviews_page_html, drug_name, reviews_df)

        if curr_page >= last_page:
            break

    return reviews_df

## Scrape Names of Depression Drugs from WebMD List

In [None]:
# need to spoof a browser in order to not get blocked when making request
# https://bar.rady.ucsd.edu/Web_Scraping.html
from bs4 import BeautifulSoup

# start from requests' default headers and override only the User-Agent;
# this module-level `headers` is what the requests.get calls in this
# notebook pass along
headers = requests.utils.default_headers()
headers.update({
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0',
})

In [None]:
# list of depression drugs from WebMD
# download the article page and parse it with the lxml parser
url = 'https://www.webmd.com/depression/depression-medications-antidepressants'
response = requests.get(url, headers=headers).content
soup = BeautifulSoup(response, 'lxml')
# the <div class="article-page active-page"> element; its <p> elements are
# walked later to pick up each drug's link
drugs_section = soup.find('div', class_='article-page active-page')

In [None]:
# column layout of the reviews table (one row per review): drug and reviewer
# metadata, the overall rating, one rating per category, and the review text
cols = [
    'drug_name',
    'date',
    'age',
    'gender',
    'time_on_drug',
    'reviewer_type',
    'condition',
    'rating_overall',
    *(f'rating_{cat}' for cat in ('effectiveness', 'ease_of_use', 'satisfaction')),
    'text',
]

# empty frame with those columns, to be filled as reviews are parsed
reviews_df = pd.DataFrame(index=[], columns=cols)

# generic pages the article links end up on when no dedicated page for the
# drug exists; defined once rather than on every loop iteration
redirected_links = ['https://www.webmd.com/depression/optimizing-depression-medicines',
                    'https://www.webmd.com/drugs/2/index']

# TODO: change parsing so that it looks for reviews.webmd.com in case ordering of tabs changes
for drug in drugs_section.find_all('p'):
    # a paragraph without a hyperlink has no drug page to follow
    if drug.a is None or drug.a.get('href') is None:
        continue

    link = drug.a.get('href')
    response = requests.get(link, headers=headers, allow_redirects=True)

    # if the link on the depression medications page redirects to a generic page (no dedicated page for the drug exists), skip it
    new_link = response.url
    if new_link in redirected_links:
        print(f'skipping 1 drug with link {new_link}')
        continue

    # the link for Vraylar leads to search results, but with only 1 result - load that result page
    if new_link == 'https://www.webmd.com/drugs/2/search?type=drugs&query=vraylar':
        vraylar_link = 'https://www.webmd.com/drugs/2/drug-170027/vraylar-oral/details'
        response = requests.get(vraylar_link, headers=headers, allow_redirects=True)

    drug_page = BeautifulSoup(response.content, 'lxml')

    # find() returns None when the page has no tab list; skip such pages
    # instead of failing on the attribute access
    tabs = drug_page.find('ul', class_='auto-tabs')
    tab_items = tabs.find_all('li') if tabs is not None else []
    if not tab_items or tab_items[-1].a is None:
        print(f'skipping 1 drug with no review tab at {new_link}')
        continue

    drug_review_element = tab_items[-1] # get the data for the last tab, Reviews
    review_url = drug_review_element.a.get('href')

    print(review_url)
    # NOTE: the crawl call below is disabled, so reviews_df is never filled
    # and the CSV written at the end holds only the header row
    # reviews_df = crawl_reviews_pages(reviews_df)

reviews_df.to_csv('psychiatric_drug_webmd_reviews.csv')