# Scraping WebMD Psychiatric Drug Reviews
## Introduction

## Setup

In [1]:
import requests
import numpy as np
import pandas as pd
import regex as re

## Functions

In [32]:
# Patterns for the fields of a review. Capture groups are used instead of
# variable-width look-behinds so that the patterns behave the same under the
# third-party `regex` module and the standard-library `re` module.
_DATE_RE = re.compile(r'\d+/\d+/\d+')
_AGE_RE = re.compile(r'\|\s+(\d+-\d+)')
# the alternation is grouped so that the leading pipe applies to both genders
_GENDER_RE = re.compile(r'\|\s+(Male|Female)')
_TIME_ON_DRUG_RE = re.compile(r'On\smedication\sfor\s(\w+(?:\s\w+)*)')
# the reviewer type is the last pipe-delimited field of the details line
_REVIEWER_TYPE_RE = re.compile(r'\|\s+(\w+(?:\s\w+)*)\s*$')
# TODO: be able to match ''"Change of Life" Signs' condition
_CONDITION_RE = re.compile(r'Condition:\s(\w+(?:\s\w+)*)')
_RATING_RE = re.compile(r'\d+(?:\.\d+)?')


def _first_match(pattern, text):
    '''Returns the first match of pattern in text, or np.nan if there is none
    
    Parameters:
    pattern (compiled regular expression): pattern with zero or one capture group
    text (str): text to search
    
    Returns:
    str or float: the capture group (or the whole match if the pattern has no group), np.nan if nothing matched
    
    '''
    
    match = pattern.search(text)
    if match is None:
        return np.nan
    group_index = 1 if pattern.groups else 0
    return match.group(group_index)


def _parse_review(review, drug_name):
    '''Extracts the fields of a single review element into a dict keyed by column name
    
    Parameters:
    review: element holding one review; must support the BeautifulSoup find() interface
    drug_name (str): the name of the drug being reviewed
    
    Returns:
    dict: one record, with np.nan for every optional field that could not be found
    
    '''
    
    details = review.find('div', class_='details').text
    
    record = {
        'drug_name': drug_name,
        'date': _first_match(_DATE_RE, review.find('div', class_='date').text),
        'age': _first_match(_AGE_RE, details),
        'gender': _first_match(_GENDER_RE, details),
        'time_on_drug': _first_match(_TIME_ON_DRUG_RE, details),
        'reviewer_type': _first_match(_REVIEWER_TYPE_RE, details),
    }
    
    # the condition element is optional
    condition_element = review.find('strong', class_='condition')
    if condition_element is not None:
        record['condition'] = _first_match(_CONDITION_RE, condition_element.text)
    else:
        record['condition'] = np.nan
    
    # store a single number (not the list returned by findall) so that a missing
    # or unusual rating line cannot raise a length-mismatch error
    rating_overall_line = review.find('div', class_='overall-rating').strong.text
    rating_match = _RATING_RE.search(rating_overall_line)
    record['rating_overall'] = float(rating_match.group()) if rating_match else np.nan
    
    # the three category ratings appear in this fixed order on the page
    rating_categories = review.find('div', class_='categories').find_all('section')
    for j, cat in enumerate(['effectiveness', 'ease_of_use', 'satisfaction']):
        div = rating_categories[j].find('div', class_='webmd-rate on-mobile')
        record[f'rating_{cat}'] = int(div.get('aria-valuenow'))
    
    # the free-text comment is optional
    text_line = review.find('p', class_='description-text')
    record['text'] = text_line.text if text_line is not None else np.nan
    
    return record


def parse_reviews_page(reviews_html, drug_name, reviews_df):
    '''Populates reviews_df data frame with records from 1 page's reviews
    
    Each review contributes one row with the columns drug_name, date, age, gender,
    time_on_drug, reviewer_type, condition, rating_overall, rating_effectiveness,
    rating_ease_of_use, rating_satisfaction and text. Optional fields that are
    absent from a review are stored as np.nan. rating_overall is stored as a float.
    
    Parameters:
    reviews_html (iterable): the review elements of one page, as extracted using BeautifulSoup
    drug_name (str): the name of the drug being reviewed
    reviews_df (pd.DataFrame): dataframe with one row per review
    
    Returns:
    pd.DataFrame: reviews_df dataframe with new records appended (reviews_df itself if the page has no reviews)
    
    '''
    
    rows = [_parse_review(review, drug_name) for review in reviews_html]
    if not rows:
        return reviews_df
    
    # build the page's rows once and concatenate once, not once per review
    return pd.concat([reviews_df, pd.DataFrame(rows)], ignore_index=True)

In [33]:
def _last_page_number(soup, default):
    '''Returns the highest page number shown in a page's pagination list
    
    Parameters:
    soup (BeautifulSoup): parsed reviews page
    default (int): value to return when the page has no numeric pagination items
    
    Returns:
    int: the highest numeric pagination item, or default if there is none
    
    '''
    
    pages = soup.find('ul', class_='pagination')
    if pages is None:
        # no pagination list at all: treat the current page as the only one
        return default
    
    labels = [item.text.strip() for item in pages.find_all('li', class_='page-item')]
    numbers = [int(label) for label in labels if label.isdecimal()]
    return max(numbers, default=default)


def crawl_reviews_pages(reviews_df):
    '''Crawls a drug's reviews page-by-page, saving each page's reviews into reviews_df
    
    The drug to crawl is taken from the module-level review_url, and requests are
    sent with the module-level headers. Each page is parsed with parse_reviews_page.
    
    Parameters:
    reviews_df (pd.DataFrame): dataframe with one row per review
    
    Returns:
    pd.DataFrame: reviews_df dataframe with new records appended
    
    '''
    
    curr_page = 0
    last_page = None
    while True:
        curr_page += 1

        if (curr_page % 15) == 0:
            print(f'Scraping page {curr_page} of {last_page}')

        # pass the page number through params so that requests joins it with '?' or '&'
        # as appropriate, whether or not review_url already carries a query string
        response = requests.get(review_url, params={'page': curr_page}, headers=headers).content
        soup = BeautifulSoup(response, 'lxml')
        reviews_page_html = soup.find_all('div', class_='review-details') # get elements that hold each review
        last_page = _last_page_number(soup, default=curr_page)

        # the page title is expected to read '<drug name> Reviews ...'
        page_title = soup.title.text
        drug_name = re.findall(r'(.*)(?=\sReviews)', page_title)[0]

        reviews_df = parse_reviews_page(reviews_page_html, drug_name, reviews_df)

        if curr_page >= last_page:
            break
            
    return reviews_df

## Scrape the List of Depression Drugs from WebMD
Source: https://bar.rady.ucsd.edu/Web_Scraping.html

In [34]:
from bs4 import BeautifulSoup

# present a browser User-Agent so that the request is not blocked
headers = requests.utils.default_headers()
headers['User-Agent'] = 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'

# WebMD's list of depression drugs; the drug links are read from this page's article section
url = 'https://www.webmd.com/depression/depression-medications-antidepressants'
print(url)
response = requests.get(url, headers=headers).content
soup = BeautifulSoup(response, features='lxml')
drugs_section = soup.find(name='div', class_='article-page active-page')

https://www.webmd.com/depression/depression-medications-antidepressants


In [36]:
# make empty data frame to contain values and full text from each review
cols = ['drug_name',
       'date',
       'age',
       'gender',
       'time_on_drug',
       'reviewer_type',
       'condition',
       'rating_overall',
       'rating_effectiveness',
       'rating_ease_of_use',
       'rating_satisfaction',
       'text']

reviews_df = pd.DataFrame(columns=cols, index=[])

# generic pages that a drug link redirects to when no dedicated page for the drug exists
redirected_links = ['https://www.webmd.com/depression/optimizing-depression-medicines',
                   'https://www.webmd.com/drugs/2/index']

# the link for Vraylar leads to search results, but with only 1 result - that result page is loaded instead
vraylar_search_link = 'https://www.webmd.com/drugs/2/search?type=drugs&query=vraylar'
vraylar_link = 'https://www.webmd.com/drugs/2/drug-170027/vraylar-oral/details'

for drug in drugs_section.find_all('p'):
    # a paragraph without a hyperlink does not point at a drug page
    if drug.a is None or not drug.a.get('href'):
        continue

    link = drug.a.get('href')
    response = requests.get(link, headers=headers, allow_redirects=True)
    
    # if the link on the depression medications page redirects to a generic page, skip it
    new_link = response.url
    if new_link in redirected_links:
        print(f'skipping 1 drug with link {new_link}')
        continue
        
    if new_link == vraylar_search_link:
        response = requests.get(vraylar_link, headers=headers, allow_redirects=True)
    
    drug_page = BeautifulSoup(response.content, 'lxml')
    
    # prefer the tab that links to reviews.webmd.com so that a change in tab ordering
    # does not matter; fall back to the last tab, Reviews, if no such link is found
    tabs = drug_page.find('ul', class_='auto-tabs')
    review_anchor = next(
        (anchor for anchor in tabs.find_all('a', href=True) if 'reviews.webmd.com' in anchor['href']),
        None,
    )
    if review_anchor is None:
        review_anchor = tabs.find_all('li')[-1].a
    review_url = review_anchor.get('href')
        
    print(review_url)
    # NOTE(review): the crawl below is disabled, so the CSV written at the end has
    # only the column headers - re-enable this line to actually collect reviews
    # reviews_df = crawl_reviews_pages(reviews_df)
    
reviews_df.to_csv('psychiatric_drug_webmd_reviews.csv')

https://reviews.webmd.com/drugs/drugreview-64439-abilify
https://reviews.webmd.com/drugs/drugreview-8647-doxepin-hcl-concentrate
https://reviews.webmd.com/drugs/drugreview-1305-clomipramine-hcl
https://reviews.webmd.com/drugs/drugreview-151973-aplenzin
https://reviews.webmd.com/drugs/drugreview-6171-asendin-tablet
https://reviews.webmd.com/drugs/drugreview-6172-aventyl-capsule
https://reviews.webmd.com/drugs/drugreview-169290-brexpiprazole-tablet
https://reviews.webmd.com/drugs/drugreview-178621-caplyta
https://reviews.webmd.com/drugs/drugreview-1701-citalopram-hbr
https://reviews.webmd.com/drugs/drugreview-91491-cymbalta
https://reviews.webmd.com/drugs/drugreview-6750-desyrel-tablet
https://reviews.webmd.com/drugs/drugreview-4896-effexor-xr
https://reviews.webmd.com/drugs/drugreview-95354-emsam-patch-24-hours
https://reviews.webmd.com/drugs/drugreview-16109-etrafon-tablet
https://reviews.webmd.com/drugs/drugreview-1807-elavil-tablet
https://reviews.webmd.com/drugs/drugreview-9159-ende