In [None]:
import requests
import numpy as np
import pandas as pd
import regex as re

In [None]:
def _first_group(pattern, text):
    """Return capture group 1 of the first match of `pattern` in `text`, or NaN if none."""
    found = re.search(pattern, text)
    return found.group(1) if found else np.nan


# populate data frame by parsing 1 page of document
def parse_reviews_page(reviews_html, drug_name, reviews_df):
    """Parse the review elements of one page and append them to `reviews_df`.

    Parameters
    ----------
    reviews_html : iterable
        Review elements of a single page. Each element must support the
        `.find(name, class_=...)` lookups used below (the caller passes
        BeautifulSoup tags).
    drug_name : str
        Value stored in the 'drug_name' column of every new row.
    reviews_df : pandas.DataFrame or None
        Reviews collected so far. It is not modified.

    Returns
    -------
    pandas.DataFrame
        The previous rows followed by one row per parsed review. Fields that
        cannot be found in a review are NaN. 'rating_overall' is a float and
        the three category ratings are ints. When `reviews_html` is empty,
        `reviews_df` itself is returned.
    """
    records = []
    for review in reviews_html:
        record = {'drug_name': drug_name}

        date_line = review.find('div', class_='date').text
        record['date'] = _first_group(r'(\d+/\d+/\d+)', date_line)

        # The details line is a '|'-separated list whose fields may be absent.
        # Capture groups are used instead of variable-width look-behinds so the
        # patterns are valid for both the stdlib `re` and the `regex` package.
        details = review.find('div', class_='details').text
        record['age'] = _first_group(r'\|\s+(\d+-\d+)', details)
        # The alternation is grouped so the '|' prefix applies to both values.
        record['gender'] = _first_group(r'\|\s+(Male|Female)', details)
        record['time_on_drug'] = _first_group(r'On\smedication\sfor\s(\w+(?:\s\w+)*)', details)
        # The reviewer type is the last field, followed only by trailing whitespace.
        record['reviewer_type'] = _first_group(r'\|\s+(\w+(?:\s\w+)*)\s+$', details)

        condition_element = review.find('strong', class_='condition')
        if condition_element is None:
            record['condition'] = np.nan
        else:
            # Take everything after the label (trimmed) so that conditions with
            # punctuation, e.g. '"Change of Life" Signs', are kept in full.
            record['condition'] = _first_group(r'Condition:\s*(.*\S)', condition_element.text)

        rating_overall_line = review.find('div', class_='overall-rating').strong.text
        # Store one number rather than the list of all matches; integer ratings are accepted.
        overall_match = re.search(r'\d+(?:\.\d+)?', rating_overall_line)
        record['rating_overall'] = float(overall_match.group()) if overall_match else np.nan

        rating_categories = review.find('div', class_='categories').find_all('section')
        for j, cat in enumerate(['effectiveness', 'ease_of_use', 'satisfaction']):
            div = rating_categories[j].find('div', class_='webmd-rate on-mobile')
            record[f'rating_{cat}'] = int(div.get('aria-valuenow'))

        text_line = review.find('p', class_='description-text')
        record['text'] = text_line.text if text_line is not None else np.nan

        records.append(record)

    if not records:
        return reviews_df

    # Build the page's rows in one go instead of concatenating once per review.
    new_rows = pd.DataFrame(records)

    if reviews_df is None or len(reviews_df) == 0:
        # Nothing to append to: keep the caller's column order and avoid
        # concatenating with an empty frame.
        existing_cols = [] if reviews_df is None else list(reviews_df.columns)
        ordered = existing_cols + [c for c in new_rows.columns if c not in existing_cols]
        return new_rows.reindex(columns=ordered)

    return pd.concat([reviews_df, new_rows], ignore_index=True)

In [None]:
# crawl page by page and save reviews from each page into reviews_df
def crawl_reviews_pages(reviews_df, url=None, timeout=30):
    """Fetch every page of one drug's reviews and append them to `reviews_df`.

    Parameters
    ----------
    reviews_df : pandas.DataFrame
        Reviews collected so far; passed through `parse_reviews_page`.
    url : str, optional
        Base review URL to which '&page=N' is appended. Defaults to the
        notebook-level `review_url`, which is what existing callers rely on.
    timeout : float, optional
        Seconds to wait for each HTTP request before giving up.

    Returns
    -------
    pandas.DataFrame
        `reviews_df` extended with the reviews of every crawled page.

    Raises
    ------
    requests.HTTPError
        If a page request returns an error status.
    """
    base_url = review_url if url is None else url
    curr_page = 0
    last_page = None  # unknown until the first page has been fetched

    while last_page is None or curr_page < last_page:
        curr_page += 1

        if (curr_page % 15) == 0:
            print(f'Scraping page {curr_page} of {last_page}')

        curr_url = base_url + f'&page={curr_page}'
        # Without a timeout a stalled connection would block the crawl forever.
        response = requests.get(curr_url, headers=headers, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'lxml')
        reviews_page_html = soup.find_all('div', class_='review-details') # get elements that hold each review

        pagination = soup.find('ul', class_='pagination')
        page_items = pagination.find_all('li', class_='page-item') if pagination is not None else []
        if page_items:
            last_page = int(page_items[-1].text.strip())
        else:
            # NOTE(review): presumably the pager is omitted when there is only
            # one page of reviews; treat the current page as the last one
            # instead of failing on a missing element.
            last_page = curr_page

        page_title = soup.title.text
        # Raw string: '\w' and '\s' are not valid escapes in a normal string literal.
        drug_name = re.findall(r'\w+(?:\s\w+)*(?=\sReviews)', page_title)[0]

        reviews_df = parse_reviews_page(reviews_page_html, drug_name, reviews_df)

    return reviews_df

In [None]:
# https://bar.rady.ucsd.edu/Web_Scraping.html

from bs4 import BeautifulSoup

# need to spoof a browser in order to not get blocked when making request
headers = requests.utils.default_headers()
headers.update({
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0',
})

# list of depression drugs from WebMD
url = 'https://www.webmd.com/depression/depression-medications-antidepressants'
print(url)
# A timeout stops the cell from hanging on a stalled connection, and the status
# check reports a blocked or failed request here rather than as a parsing error later.
index_response = requests.get(url, headers=headers, timeout=30)
index_response.raise_for_status()
response = index_response.content
soup = BeautifulSoup(response, 'lxml')
drugs_section = soup.find('div', class_='article-page active-page')
if drugs_section is None:
    # Fail in this cell with a clear message; later cells iterate over this element.
    raise RuntimeError(f"Could not find the 'article-page active-page' section in {url}")

In [None]:
# make empty data frame to contain values and full text from each review
cols = ['drug_name',
       'date',
       'age',
       'gender',
       'time_on_drug',
       'reviewer_type',
       'condition',
       'rating_overall',
       'rating_effectiveness',
       'rating_ease_of_use',
       'rating_satisfaction',
       'text']

reviews_df = pd.DataFrame(columns=cols, index=[])

# TODO: change parsing so that it looks for reviews.webmd.com in case ordering of tabs changes
for drug in drugs_section.find_all('p'):
    # Paragraphs without a link carry no drug page to follow.
    if drug.a is None or not drug.a.get('href'):
        continue
    link = drug.a.get('href')
    # A timeout stops one stalled request from blocking the whole crawl.
    response = requests.get(link, headers=headers, timeout=30).content
    drug_page = BeautifulSoup(response, 'lxml')

    tabs = drug_page.find('ul', class_='auto-tabs')
    tab_items = tabs.find_all('li') if tabs is not None else []
    if not tab_items or tab_items[-1].a is None:
        # Linked page has no tab list to read the reviews link from; skip it
        # rather than aborting the crawl of the remaining drugs.
        print(f'Skipping {link}: no reviews tab found')
        continue

    drug_review_element = tab_items[-1] # get the data for the last tab, Reviews
    # crawl_reviews_pages reads this notebook-level name for the URL to crawl.
    review_url = drug_review_element.a.get('href')
    print(review_url)
    reviews_df = crawl_reviews_pages(reviews_df)

In [None]:
# write the collected reviews (row index included) to a CSV file in the working directory
reviews_df.to_csv(path_or_buf='psychiatric_drug_webmd_reviews.csv')