In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
def getPages(link, timeout=30):
    """Build the URLs of the follow-up review pages (page 2 onwards) for a listing.

    The landing page at ``link`` is fetched and its pagination items
    (``<li class="page-item">``) are inspected to find the last page number.

    Parameters
    ----------
    link : str
        URL of the first review page; the query string for later pages is
        appended to it verbatim.
    timeout : float, optional
        Seconds to wait for the server before ``requests`` gives up.

    Returns
    -------
    list of str
        One URL per page from 2 up to the last page. Empty when the request
        does not return status 200 or when no numeric pagination item exists
        (for example a listing with a single page).

    Raises
    ------
    requests.exceptions.RequestException
        Network errors and timeouts are not caught here.
    """
    u = []
    response = requests.get(link, timeout=timeout)
    if response.status_code != 200:
        print(f"Failed to fetch the page. Status code: {response.status_code}")
        return u

    soup = BeautifulSoup(response.text, 'html.parser')

    # Scan the pagination items from the end and stop at the first one whose
    # text is a plain number. Indexing [-1] directly would fail on listings
    # without pagination or with a non-numeric trailing item.
    totalPages = 1
    for item in reversed(soup.find_all('li', class_='page-item')):
        label = item.get_text(strip=True)
        if label.isdecimal():
            totalPages = int(label)
            break

    for i in range(2, totalPages + 1):
        nextPage = '?conditionid=&sortval=1&page={}&next_page=true'.format(i)
        u.append(link + nextPage)

    return u

In [3]:
def _review_condition(review_div):
    """Return the text of the review's condition tag, or 'N/A' when absent or empty."""
    tag = review_div.find('strong', class_='condition')
    text = tag.get_text(strip=True) if tag else ''
    return text or 'N/A'


def _review_ratings(review_div):
    """Return the three ratings of a review, keyed by output column name.

    The first three ``<section>`` elements inside the ``categories`` div are
    read in order; any rating that cannot be found is reported as 'N/A'.
    """
    labels = ('Effectiveness', 'Ease of Use', 'Satisfaction')
    ratings = dict.fromkeys(labels, 'N/A')
    categories = review_div.find('div', class_='categories')
    sections = categories.find_all('section') if categories else []
    # zip() stops after three sections, so extra sections cannot unbalance the row.
    for label, section in zip(labels, sections):
        rate = section.find('div', class_='webmd-rate on-mobile')
        if rate is not None:
            ratings[label] = rate.get('aria-valuenow', 'N/A')
    return ratings


def _review_text(review_div):
    """Return the review's free text, or 'N/A' when it has no description paragraph."""
    paragraph = review_div.find('p', class_='description-text')
    if paragraph is None:
        return 'N/A'
    show_sec_span = paragraph.find('span', class_='showSec')
    hidden_sec_span = paragraph.find('span', class_='hiddenSec')
    if show_sec_span and hidden_sec_span:
        # Long reviews are split into a visible and a collapsed part; join them.
        return show_sec_span.get_text(strip=True) + hidden_sec_span.get_text(strip=True)
    return paragraph.get_text(strip=True)


def getWebMD(link, timeout=30):
    """Scrape one review page and return its reviews as a DataFrame.

    Parameters
    ----------
    link : str
        URL of a single review page.
    timeout : float, optional
        Seconds to wait for the server before ``requests`` gives up.

    Returns
    -------
    pandas.DataFrame
        One row per ``<div class="review-details">`` with the columns
        'Drug Name', 'Review Source', 'Condition', 'Drug Review',
        'Effectiveness', 'Ease of Use' and 'Satisfaction'. Ratings are the raw
        ``aria-valuenow`` strings. Any field that cannot be found is 'N/A'.
        The frame is empty (but keeps the columns) when the response status is
        not 200 or the page contains no reviews.

    Raises
    ------
    requests.exceptions.RequestException
        Network errors and timeouts are not caught here.
    """
    columns = ['Drug Name', 'Review Source', 'Condition', 'Drug Review',
               'Effectiveness', 'Ease of Use', 'Satisfaction']

    response = requests.get(link, timeout=timeout)
    if response.status_code != 200:
        print(f"Failed to fetch the page. Status code: {response.status_code}")
        # Return an empty frame so callers can concatenate without special cases.
        return pd.DataFrame(columns=columns)

    soup = BeautifulSoup(response.text, 'html.parser')

    # The drug name is read from the aria-label of the first header carrying one.
    dName = soup.find('header', {'aria-label': True})
    if dName:
        dName = dName.get('aria-label', '').strip()
    else:
        dName = 'N/A'

    # Build one complete record per review so every column always has the same
    # number of entries, even when a review lacks a condition, text or rating.
    rows = []
    for review_details_div in soup.find_all('div', class_='review-details'):
        row = {
            'Drug Name': dName,
            'Review Source': 'WebMD',
            'Condition': _review_condition(review_details_div),
            'Drug Review': _review_text(review_details_div),
        }
        row.update(_review_ratings(review_details_div))
        rows.append(row)

    return pd.DataFrame(rows, columns=columns)
    

In [4]:
# Landing pages of the drug-review listings to scrape, expressed as
# base URL + page slug.
WEBMD_DRUG_REVIEWS = 'https://reviews.webmd.com/drugs/'
links = [
    WEBMD_DRUG_REVIEWS + slug
    for slug in (
        # Disabled entries, kept for reference.
        # NOTE(review): presumably scraped in an earlier batch; confirm.
        # 'drugreview-1807-elavil-oral',
        # 'drugreview-11571-imitrex-oral',
        # 'drugreview-8440-maxalt-oral',
        # 'drugreview-5400-zomig-oral',
        # 'drugreview-1634-coreg-oral',
        # 'drugreview-6873-lisinopril-oral',
        # 'drugreview-6840-inderal-oral',
        # 'drugreview-17410-actos-oral',
        # 'drugreview-11294-glucophage-oral',
        # 'drugreview-174491-ozempic-subcutaneous',
        # 'drugreview-163054-victoza-2-pak-subcutaneous',

        'drugreview-64734-relpax-oral',
        'drugreview-1820-pamelor-oral',
        'drugreview-93242-byetta-subcutaneous',
        'drugreview-145704-januvia-oral',
        'drugreview-167025-trulicity-subcutaneous',
        'drugreview-6671-aldactone-oral',
        'drugreview-17211-avalide-oral',
        'drugreview-5310-hydrochlorothiazide-oral',
    )
]

In [5]:
# Expand each landing page into the full list of pages to scrape:
# the landing page itself, followed by whatever getPages() finds for it.
URLs = []
for link in links:
    URLs += [link]
    URLs += getPages(link)

In [6]:
# Column layout of the combined review table, and an empty frame using it.
columns = [
    'Drug Name',
    'Review Source',
    'Condition',
    'Drug Review',
    'Effectiveness',
    'Ease of Use',
    'Satisfaction',
]
# Reduced alternative layout (disabled): ['Drug Name', 'Review Source', 'Drug Review']
df = pd.DataFrame(data=None, columns=columns)

In [9]:
# Scrape every page, collecting the per-page frames in a list and
# concatenating once at the end. Calling pd.concat inside the loop copies the
# whole accumulated table on every page, and appending to the existing `df`
# would duplicate rows if this cell is run a second time.
frames = []
for url in URLs:
    print(url)
    frames.append(getWebMD(url))

# Pages without reviews yield empty frames; they add no rows, so drop them.
frames = [frame for frame in frames if not frame.empty]
if frames:
    df = pd.concat(frames, ignore_index=True)
else:
    # Nothing scraped: keep an empty table with the expected columns.
    df = pd.DataFrame(columns=columns)

https://reviews.webmd.com/drugs/drugreview-64734-relpax-oral
https://reviews.webmd.com/drugs/drugreview-64734-relpax-oral?conditionid=&sortval=1&page=2&next_page=true
https://reviews.webmd.com/drugs/drugreview-64734-relpax-oral?conditionid=&sortval=1&page=3&next_page=true
https://reviews.webmd.com/drugs/drugreview-64734-relpax-oral?conditionid=&sortval=1&page=4&next_page=true
https://reviews.webmd.com/drugs/drugreview-64734-relpax-oral?conditionid=&sortval=1&page=5&next_page=true
https://reviews.webmd.com/drugs/drugreview-64734-relpax-oral?conditionid=&sortval=1&page=6&next_page=true
https://reviews.webmd.com/drugs/drugreview-64734-relpax-oral?conditionid=&sortval=1&page=7&next_page=true
https://reviews.webmd.com/drugs/drugreview-64734-relpax-oral?conditionid=&sortval=1&page=8&next_page=true
https://reviews.webmd.com/drugs/drugreview-64734-relpax-oral?conditionid=&sortval=1&page=9&next_page=true
https://reviews.webmd.com/drugs/drugreview-64734-relpax-oral?conditionid=&sortval=1&page=10

https://reviews.webmd.com/drugs/drugreview-167025-trulicity-subcutaneous?conditionid=&sortval=1&page=12&next_page=true
https://reviews.webmd.com/drugs/drugreview-167025-trulicity-subcutaneous?conditionid=&sortval=1&page=13&next_page=true
https://reviews.webmd.com/drugs/drugreview-6671-aldactone-oral
https://reviews.webmd.com/drugs/drugreview-6671-aldactone-oral?conditionid=&sortval=1&page=2&next_page=true
https://reviews.webmd.com/drugs/drugreview-6671-aldactone-oral?conditionid=&sortval=1&page=3&next_page=true
https://reviews.webmd.com/drugs/drugreview-6671-aldactone-oral?conditionid=&sortval=1&page=4&next_page=true
https://reviews.webmd.com/drugs/drugreview-17211-avalide-oral
https://reviews.webmd.com/drugs/drugreview-17211-avalide-oral?conditionid=&sortval=1&page=2&next_page=true
https://reviews.webmd.com/drugs/drugreview-17211-avalide-oral?conditionid=&sortval=1&page=3&next_page=true
https://reviews.webmd.com/drugs/drugreview-17211-avalide-oral?conditionid=&sortval=1&page=4&next_pa

The corner case shown above occurs when people have submitted a review without any text (numeric ratings only).

In [10]:
# Peek at the last five scraped rows.
df.tail(n=5)

Unnamed: 0,Drug Name,Review Source,Condition,Drug Review,Effectiveness,Ease of Use,Satisfaction
3779,hydrochlorothiazide oral,WebMD,Condition: High Blood Pressure,Took this medicine for over 2 years but was ta...,3,4,3
3780,hydrochlorothiazide oral,WebMD,Condition: High Blood Pressure,"dizzyness, frequent urination, impotence",3,2,3
3781,hydrochlorothiazide oral,WebMD,Condition: Accumulation of Fluid Resulting fro...,,4,5,4
3782,hydrochlorothiazide oral,WebMD,Condition: Prevention of Calcium-Containing Ki...,,5,5,4
3783,hydrochlorothiazide oral,WebMD,Condition: Prevention of Calcium-Containing Ki...,I had been troubled with one or two kidney a y...,5,5,5


In [12]:
# Show up to 15 random rows. The sample size is capped at the table length so
# a small or empty table does not raise, and the seed is fixed so the
# displayed rows are the same on every run.
df.sample(n=min(15, len(df)), random_state=42)

Unnamed: 0,Drug Name,Review Source,Condition,Drug Review,Effectiveness,Ease of Use,Satisfaction
3146,hydrochlorothiazide oral,WebMD,Condition: High Blood Pressure,,4,5,5
2471,Trulicity subcutaneous,WebMD,Condition: Type 2 Diabetes Mellitus,I take my injection on sundays.. Week 1 super ...,3,4,1
2096,Januvia oral,WebMD,Condition: Type 2 Diabetes Mellitus,I have boils on my butt and under my arms. Bef...,1,1,1
3320,hydrochlorothiazide oral,WebMD,Condition: High Blood Pressure,Frequent urination can be bothersome and I str...,3,3,3
672,Byetta subcutaneous,WebMD,Condition: Additional Medication for Diabetes ...,5 mcg worked okay. Used 10 mcg for about 4 wee...,2,1,1
659,Byetta subcutaneous,WebMD,Condition: Additional Medication for Diabetes ...,Gained 40lbs in 2yrs on oral meds. Given Byett...,4,5,4
3714,hydrochlorothiazide oral,WebMD,Condition: Visible Water Retention,"I began taking this medication when ankles, fe...",5,5,4
364,Relpax oral,WebMD,Condition: Migraine Headache,Migraines are usually gone within one hour of ...,5,5,5
695,Byetta subcutaneous,WebMD,Condition: Additional Medication for Diabetes ...,I am worried as I have seen many announcements...,4,4,4
1444,Pamelor oral,WebMD,Condition: Other,I have been taking this med for years for TMJ ...,3,5,2


In [13]:
# Summarise row count, non-null counts and dtypes of the combined table.
# The rating columns are filled from the pages' 'aria-valuenow' attribute
# values by getWebMD, so they are not numeric unless converted afterwards.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3784 entries, 0 to 3783
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Drug Name      3784 non-null   object
 1   Review Source  3784 non-null   object
 2   Condition      3784 non-null   object
 3   Drug Review    3784 non-null   object
 4   Effectiveness  3784 non-null   object
 5   Ease of Use    3784 non-null   object
 6   Satisfaction   3784 non-null   object
dtypes: object(7)
memory usage: 207.1+ KB


In [14]:
# Save the combined table as CSV, relative to the current working directory.
# NOTE(review): `index` is left at its default, so the row index is written
# as an extra first column; confirm whether downstream readers expect it.
df.to_csv('WebMD_drugReviews2.csv')