In [1]:
import requests
import numpy as np
import pandas as pd
import regex as re

In [2]:
# WebMD Lexapro reviews
url = 'https://reviews.webmd.com/drugs/drugreview-63990-lexapro-oral'

# Fail fast: the timeout keeps this cell from hanging forever on a stalled
# connection, and raise_for_status() surfaces HTTP errors (403, 404, 5xx) here
# instead of letting an error page be parsed later as if it held reviews.
http_response = requests.get(url, timeout=30)
http_response.raise_for_status()

# Raw page bytes; this is what the parsing cell feeds to BeautifulSoup.
response = http_response.content

In [3]:
from bs4 import BeautifulSoup

# Parse the downloaded page with the lxml parser, then collect every
# <div> carrying the 'review-details' class -- the elements that hold each review.
soup = BeautifulSoup(markup=response, features='lxml')
reviews_html = soup.find_all(name='div', attrs={'class': 'review-details'})

In [29]:
# Parse each scraped review element into one record (a dict keyed by column
# name), then build the data frame from all records in a single step rather
# than growing it row by row with pd.concat.

cols = ['date',
        'age',
        'gender',
        'time_on_drug',
        'reviewer_type',
        'condition',
        'rating_overall',
        'rating_effectiveness',
        'rating_ease_of_use',
        'rating_satisfaction',
        'text']

# The previous loop broke out after index 11, i.e. it kept the first 12
# reviews; the same cap is kept here, but named.
MAX_REVIEWS = 12

# Order matters: the n-th <section> under the 'categories' div is read as the
# n-th rating in this list.
RATING_CATEGORIES = ['effectiveness', 'ease_of_use', 'satisfaction']


def _element_text(parent, tag, css_class):
    """Return the text of the first `tag` with class `css_class` under `parent`, or None if absent."""
    element = parent.find(tag, class_=css_class)
    return None if element is None else element.text


def _first_match(pattern, text):
    """Return the first match of `pattern` in `text`, or NaN when `text` is None or nothing matches."""
    if text is None:
        return np.nan
    matches = re.findall(pattern, text)
    return matches[0] if matches else np.nan


def _parse_review(review):
    """Extract the fields named in `cols` from a single review element.

    Parameters
    ----------
    review : element from `reviews_html`
        One 'review-details' element of the parsed page.

    Returns
    -------
    dict
        One value per name in `cols`. A field whose markup is missing or whose
        pattern does not match is left as NaN instead of aborting the scrape.

    Raises
    ------
    ValueError
        If a category rating's 'aria-valuenow' attribute is present but is not
        an integer string.
    """
    row = dict.fromkeys(cols, np.nan)

    row['date'] = _first_match(r'\d+/\d+/\d+', _element_text(review, 'div', 'date'))

    # NOTE: `re` is the third-party `regex` module in this notebook; the
    # variable-width lookbehinds below are not accepted by the stdlib `re`.
    details = _element_text(review, 'div', 'details')
    row['age'] = _first_match(r'(?<=\|\s+)\d+-\d+', details)
    # The alternation is grouped so the lookbehind applies to both words.
    # Ungrouped, the pattern read as "(lookbehind)Male  OR  Female", so
    # "Female" matched anywhere in the details text (e.g. inside a user name).
    row['gender'] = _first_match(r'(?<=\|\s+)(?:Male|Female)', details)
    row['time_on_drug'] = _first_match(r'(?<=On\smedication\sfor\s)\w+(?:\s\w+)*', details)
    row['reviewer_type'] = _first_match(r'(?<=\|\s+)\w+(?:\s\w+)*(?=\s+$)', details)

    row['condition'] = _first_match(r'(?<=Condition:\s)\w+(?:\s\w+)*',
                                    _element_text(review, 'strong', 'condition'))

    # Overall rating: the dot is escaped so it only matches a decimal point,
    # and the first match is stored as a float. Previously the whole findall()
    # list was assigned, which raised on zero or several matches and left the
    # value as a string next to integer category ratings.
    overall_div = review.find('div', class_='overall-rating')
    overall_strong = None if overall_div is None else overall_div.strong
    overall_text = None if overall_strong is None else overall_strong.text
    row['rating_overall'] = float(_first_match(r'\d+\.\d+', overall_text))

    # Category ratings: zip() stops at the shorter sequence, so a review with
    # fewer sections than expected keeps NaN for the ratings it lacks.
    categories_div = review.find('div', class_='categories')
    sections = [] if categories_div is None else categories_div.find_all('section')
    for cat, section in zip(RATING_CATEGORIES, sections):
        rate_div = section.find('div', class_='webmd-rate on-mobile')
        value = None if rate_div is None else rate_div.get('aria-valuenow')
        if value is not None:
            row[f'rating_{cat}'] = int(value)

    text = _element_text(review, 'p', 'description-text')
    if text is not None:
        row['text'] = text

    return row


# populate data frame by parsing document
reviews_df = pd.DataFrame(
    [_parse_review(review) for review in reviews_html[:MAX_REVIEWS]],
    columns=cols,
)

In [30]:
# Show the parsed reviews table as this cell's output.
reviews_df

Unnamed: 0,date,age,gender,time_on_drug,reviewer_type,condition,rating_overall,rating_effectiveness,rating_ease_of_use,rating_satisfaction,text
0,11/2/2023,45-54,Male,2 to less than 5 years,Patient,Depression,4.3,4,5,4,Please ignore comments from multiple users who...
1,10/18/2023,,Female,,Patient,Panic Disorder,2.0,2,3,1,Very disappointed this medication didn’t work ...
2,10/16/2023,25-34,Female,1 to 6 months,Patient,Depression,5.0,5,5,5,This has been the only antidepressant that has...
3,10/6/2023,25-34,Female,2 to less than 5 years,Patient,Depression,3.7,3,5,3,I was taking Lexapro for 3years! I noticed the...
4,10/5/2023,19-24,Male,2 to less than 5 years,Patient,Anxiousness associated with Depression,5.0,5,5,5,It's great for anxiety and mild depression
5,9/28/2023,25-34,Female,6 months to less than 1 year,Patient,,4.7,5,5,4,"Prior to postpartum depression and anxiety, I ..."
6,9/24/2023,35-44,Male,1 to 6 months,Patient,Depression,1.0,1,1,1,I'm better off without drugs. This worked grea...
7,9/23/2023,19-24,Male,less than 1 month,Patient,Depression,1.3,1,2,1,Was given this by my neurologist for depressio...
8,9/16/2023,45-54,Female,2 to less than 5 years,Patient,Repeated Episodes of Anxiety,5.0,5,5,5,Best thing I ever did was reaching out to GP a...
9,9/7/2023,19-24,Female,1 to 6 months,Patient,Panic Disorder,5.0,5,5,5,Lexapro changed my life! It started working al...
