In [7]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [8]:
def getPages(link, x):
    """Build the URLs of the follow-up pages of a paginated listing.

    Parameters
    ----------
    link : str
        URL of the first page (returned list does NOT include it).
    x : int
        Number of the last page to generate.

    Returns
    -------
    list of str
        ``link`` with ``?page=N`` appended for N = 2 .. x, in order.
        Empty when ``x`` is below 2.
    """
    # Page 1 is the bare link itself, so numbering starts at 2.
    return [link + '?page={}'.format(page_no) for page_no in range(2, x + 1)]

In [9]:
def getDrugs(link):
    """Scrape one Drugs.com review page into a DataFrame.

    Parameters
    ----------
    link : str
        URL of a single review page.

    Returns
    -------
    pandas.DataFrame
        One row per review with the columns 'Drug Name', 'Review Source',
        'Condition', 'Drug Review' and 'Rating'. Missing condition or rating
        values are stored as the string "N/A". The frame is empty (but still
        has all five columns) when the page cannot be fetched or holds no
        review blocks.

    Notes
    -----
    Failures are reported with ``print`` and never raised, so one bad page
    does not abort a crawl over many URLs.
    """
    columns = ['Drug Name', 'Review Source', 'Condition', 'Drug Review', 'Rating']

    try:
        # A timeout keeps one stalled connection from hanging the whole crawl.
        response = requests.get(link, timeout=30)
    except requests.RequestException as exc:
        print(f"Failed to fetch the page. Error: {exc}")
        return pd.DataFrame(columns=columns)

    if response.status_code != 200:
        print(f"Failed to fetch the page. Status code: {response.status_code}")
        # Return early: there is nothing to parse, and the frame below must
        # never be built from variables that were not assigned.
        return pd.DataFrame(columns=columns)

    soup = BeautifulSoup(response.text, 'html.parser')

    # The drug name is taken as the first word of the page heading.
    dName_tag = soup.find('h1')
    heading_words = dName_tag.get_text().split() if dName_tag is not None else []
    dName = heading_words[0] if heading_words else 'N/A'

    rows = []
    div_tags = soup.find_all('div', class_='ddc-comment ddc-box ddc-mgb-2')
    if not div_tags:
        print('No <div> tag found with the specified class.')

    for div_tag in div_tags:
        p_tag = div_tag.find('p')
        if p_tag is None:
            print('No <p> tag found within the specified <div> tag.')
            # No review text: skip the whole comment so every row keeps
            # exactly one condition, one review and one rating.
            continue

        p_text = p_tag.get_text(strip=True)
        if p_tag.b is not None:
            # The bold element holds the condition label.
            condition = p_tag.b.get_text(strip=True)
            # Remove the label once only, so the same words occurring later
            # inside the review body are preserved.
            review = p_text.replace(condition, '', 1).strip()
        else:
            condition = "N/A"
            review = p_text

        # The score sits in <span><b>..</b></span> inside the rating summary;
        # any missing level of that nesting means "no rating".
        rate = div_tag.find('div', class_='ddc-rating-summary ddc-mgb-1')
        score_tag = None
        if rate is not None and rate.span is not None:
            score_tag = rate.span.b
        rating = score_tag.get_text(strip=True) if score_tag is not None else "N/A"

        rows.append((dName, 'Drugs.com', condition, review, rating))

    return pd.DataFrame(rows, columns=columns)

In [10]:
# First review page of every drug to scrape.
links = [
    'https://www.drugs.com/comments/amitriptyline/elavil.html',
    'https://www.drugs.com/comments/sumatriptan/imitrex.html',
    'https://www.drugs.com/comments/rizatriptan/maxalt.html',
    'https://www.drugs.com/comments/zolmitriptan/zomig.html',
    'https://www.drugs.com/comments/carvedilol/coreg.html',
    'https://www.drugs.com/comments/lisinopril/',
    'https://www.drugs.com/comments/propranolol/inderal.html',
    'https://www.drugs.com/comments/pioglitazone/actos.html',
    'https://www.drugs.com/comments/metformin/glucophage.html',
    'https://www.drugs.com/comments/semaglutide/ozempic.html',
    'https://www.drugs.com/comments/liraglutide/victoza.html',
    'https://www.drugs.com/comments/eletriptan/relpax-for-migraine.html',
    'https://www.drugs.com/comments/nortriptyline/pamelor.html',
    'https://www.drugs.com/comments/dulaglutide/trulicity.html',
    'https://www.drugs.com/comments/sitagliptin/januvia.html',
    'https://www.drugs.com/comments/exenatide/byetta.html',
    'https://www.drugs.com/comments/hydrochlorothiazide/',
    'https://www.drugs.com/comments/hydrochlorothiazide-irbesartan/avalide.html',
    'https://www.drugs.com/comments/spironolactone/aldactone.html'
]
# Last page number to scrape for each entry of `links` (same order, one value
# per link); a value of 1 means only the first page is fetched.
# Earlier setting kept for reference -- presumably larger page counts for the
# first 11 links (TODO confirm): [9,9,5,4,2,25,3,2,2,45,31]
x = [5,5,5,4,2,5,3,2,2,5,5,5,2,5,5,4,5,1,2]

# The two lists are matched by position, so catch a length mismatch here
# instead of through an IndexError (or silently ignored values) later.
assert len(links) == len(x), (
    f"links has {len(links)} entries but x has {len(x)}"
)

In [11]:
# Flatten every drug into its full list of page URLs: the first page
# (the bare link) followed by pages 2..x[i] produced by getPages.
URLs = []
for i, link in enumerate(links):
    URLs += [link] + getPages(link, x[i])

In [12]:
# Column layout of the combined review table.
columns = [
    'Drug Name',
    'Review Source',
    'Condition',
    'Drug Review',
    'Rating',
]
# Start from an empty frame that carries only the column headers.
df = pd.DataFrame(columns=columns)

In [13]:
# Scrape every page first and concatenate once. Calling pd.concat inside the
# loop re-copies all accumulated rows on each iteration, and it also appends
# to the previous result, so re-running the cell would duplicate every review.
frames = [getDrugs(url) for url in URLs]
# pd.concat raises on an empty list, so keep the existing frame in that case.
df = pd.concat(frames, ignore_index=True) if frames else df

In [14]:
# Preview a random subset of rows. The fixed seed shows the same rows on every
# run, and capping n avoids a ValueError when fewer than 15 rows were scraped.
df.sample(n=min(15, len(df)), random_state=42)

Unnamed: 0,Drug Name,Review Source,Condition,Drug Review,Rating
1152,Relpax,Drugs.com,,"""I love this medication! It has completely cha...",9.0
496,Coreg,Drugs.com,For High Blood Pressure,"""I have had great results with Coreg . My bloo...",10.0
1591,Hydrochlorothiazide,Drugs.com,For High Blood Pressure,"""I am hypertensive and have been on hydrochlor...",6.0
1325,Trulicity,Drugs.com,"For Diabetes, Type 2","""I just injected Trulicity for the second time...",1.0
1242,Trulicity,Drugs.com,"For Diabetes, Type 2","""I took Trulicity for 3 months and it helped m...",1.0
1486,Byetta,Drugs.com,"For Diabetes, Type 2","""I started Byetta 5mg in January and then uppe...",5.0
188,Imitrex,Drugs.com,For Migraine,"""After I had my son I had horrible migraines, ...",9.0
1655,Hydrochlorothiazide,Drugs.com,For High Blood Pressure,"""This was one of four drugs that was prescribe...",3.0
30,Elavil,Drugs.com,For Interstitial Cystitis,"""Finally have relief of 3 years relentless pai...",9.0
247,Imitrex,Drugs.com,For Migraine,"""I started using Imitrex about two months ago....",9.0


In [15]:
# Print a summary of the table: row count, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1695 entries, 0 to 1694
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Drug Name      1695 non-null   object
 1   Review Source  1695 non-null   object
 2   Condition      1695 non-null   object
 3   Drug Review    1695 non-null   object
 4   Rating         1695 non-null   object
dtypes: object(5)
memory usage: 66.3+ KB


In [16]:
# Persist the scraped reviews. The frame has a plain 0..n-1 index that carries
# no information, so it is left out; otherwise it reappears as an
# "Unnamed: 0" column when the file is read back with pd.read_csv.
df.to_csv('Drugs_drugReviews.csv', index=False)