# Web Scraping python notebook for www.1mg.com

### Author: Savio Sajan M.

In [195]:
import requests # for fetching the html documents
from bs4 import BeautifulSoup # for parsing the fetched html documents
import re # regex for filtering text
import string
import json # to access json objects found in the html files
import pandas as pd # for creating dataframes and csv files

In [None]:
url = "https://www.1mg.com" # root url of the website; every other path used in this notebook is appended to it
base = "/drugs-all-medicines" # path (relative to the root url) of the page that shows the listing of all drugs

In [None]:
# fetch the page at a given url and return it parsed as a BeautifulSoup object

def getSoup(url, timeout=30):
    """Fetch a web page and return it parsed with BeautifulSoup.

    Parameters
    ----------
    url : str
        Full address of the page to download.
    timeout : float or tuple, optional
        Seconds to wait for the server before giving up (passed straight to
        ``requests.get``). Without a timeout a single stalled connection
        would block the whole crawl forever.

    Returns
    -------
    BeautifulSoup
        The response body parsed with the built-in ``html.parser``.

    Raises
    ------
    requests.exceptions.RequestException
        If the request cannot be completed, e.g. when the timeout expires.
    """
    # 'identity' asks the server for an uncompressed response body.
    headers = {'Accept-Encoding': 'identity'}
    response = requests.get(url, headers=headers, timeout=timeout)
    # Printing the response shows the HTTP status of every fetch as progress output.
    print(response)
    return BeautifulSoup(response.text, 'html.parser')

In [None]:
# function that builds the query string (page number and label) appended to the listing url, so that every webpage
# listing the drugs for a specific label (starting letter) can be reached by its page index under that label

def getPage(alpha, page):
    """Build the query string that selects one listing page of one label.

    alpha is the label (starting letter) and page is the page index under
    that label; the result has the form "?page=<page>&label=<alpha>".
    """
    return f"?page={page}&label={alpha}"

In [None]:
# the labels used by the drug listing: one per lowercase letter, 'a' through 'z'

als = [letter for letter in string.ascii_lowercase]

In [None]:
# function that returns a list of all links to each drug as extracted from the webpages hinted at earlier

def getLinks(divs=None):
    """Collect the link of every drug card shown in the website's listings.

    For each label in ``als`` the listing pages are requested one after
    another (page 1, 2, ...) until a page does not answer with HTTP 200 or
    contains no drug cards.

    Parameters
    ----------
    divs : any, optional
        Ignored. Kept (and made optional) only so that existing calls which
        pass an argument keep working.

    Returns
    -------
    list of str
        The ``href`` of the first link inside every drug card, in the order
        the cards were encountered.

    Raises
    ------
    requests.exceptions.RequestException
        If a request cannot be completed, e.g. when the timeout expires.
    """
    # Same request header that getSoup sends; it has to be defined here because
    # the one inside getSoup is local to that function.
    headers = {'Accept-Encoding': 'identity'}
    card_class = 'Card__container__liTc5 Card__productCard__SrdLF Card__direction__H8OmP container-fluid-padded-xl'

    links = []
    for label in als:
        page = 1
        while True:
            # for each page under each label
            final = url + base + getPage(label, page)
            response = requests.get(final, headers=headers, timeout=30)

            # any status other than 200 ends the pages of this label
            if response.status_code != 200:
                break

            soup = BeautifulSoup(response.text, 'html.parser')
            cards = soup.find_all('div', class_=card_class)
            # A successful page without a single card also ends this label;
            # otherwise the loop would request ever higher page numbers forever.
            if not cards:
                break

            for card in cards:
                # locating the link referenced from each drug card showcased in the webpage
                anchor = card.find('a')
                href = anchor.get('href') if anchor is not None else None
                if href:
                    links.append(href)
            page += 1
    return links
# getLinks never reads its argument, so None is passed instead of the name
# `divs`, which is not defined at this point on a freshly started kernel.
ls = getLinks(None)

In [246]:
# reporting how many drug links were collected from the website listings

print(f"Number of drugs found in the website are: {len(ls)}")

Number of drugs found in the website are: 217631


### CAUTION: Processing all drugs takes a very long time, so this notebook is limited to the first 60 drugs.

### To adjust the code to be able to analyze all the drugs, check the **second comment** in the below code cell

In [250]:
# function that returns a list of dictionaries that contains info of each drug listed in the website

# To analyze every drug instead of only the first 60, call getInfo(links, limit=None).

_UNAVAILABLE = "Unavailable"
_NO_INTERACTION = "No interaction found/established"
_STATE_MARKER = "window.__INITIAL_STATE__"

# Key paths into the dictionary stored in the page's window.__INITIAL_STATE__.
_DRUG = ("shellReducer", "schema", "drug")
_STATIC = ("drugPageReducerV2", "staticData")
_PRICE_BOX = ("drugPageReducerV2", "dynamicData", "priceBox")
_WARNINGS = _STATIC + ("safetyAdvice", "warnings")
_FACTS = _STATIC + ("factBoxData", "attributesData")


def _extract_initial_state(soup):
    """Return the dictionary assigned to window.__INITIAL_STATE__ in a page.

    Only the first <script> whose text mentions the marker is considered.
    An empty dictionary is returned when there is no such script or when its
    content cannot be decoded, so the caller falls back to default values.
    """
    for script in soup.find_all('script'):
        content = script.string
        if not content or _STATE_MARKER not in content:
            continue
        try:
            # The JSON object starts right after "<marker> ="; raw_decode stops
            # at the end of that object, so the statements that follow it in
            # the script (and the exact whitespace around it) do not matter.
            payload = content.split(_STATE_MARKER, 1)[1].split("=", 1)[1].lstrip()
            state, _ = json.JSONDecoder().raw_decode(payload)
        except (IndexError, ValueError):
            return {}
        return state if isinstance(state, dict) else {}
    return {}


def _field(data, path, transform=str.strip, default=_UNAVAILABLE):
    """Walk `path` (keys/indices) through `data` and return transform(value), or `default` on any failure."""
    try:
        value = data
        for key in path:
            value = value[key]
        return transform(value)
    except Exception:
        # Deliberately best effort: a missing key, an unexpected type or
        # unparseable markup must only blank this one field, not stop the run.
        return default


def _first_tag_text(html, tags):
    """Return the stripped text of the first element found for the first tag name in `tags` that occurs in `html`."""
    soup = BeautifulSoup(html, 'html.parser')
    for tag in tags:
        match = soup.find(tag)
        if match is not None:
            return match.get_text().strip()
    raise LookupError("none of the tags %r found" % (tags,))


def _benefits_text(html):
    """Return the second-to-last '.'-separated piece of the text of the <div> enclosing the first <br>."""
    soup = BeautifulSoup(html, 'html.parser')
    container = soup.find('br').find_parent('div')
    return container.get_text().split(".")[-2].strip()


def _side_effects_text(html):
    """Return the items of the first <ul> in `html`, joined with ', '."""
    items = BeautifulSoup(html, 'html.parser').find('ul').find_all('li')
    return ', '.join(item.get_text() for item in items)


def getInfo(links, limit=60):
    """Return one dictionary of details for each drug page in `links`.

    Every page is downloaded with ``getSoup(url + link)`` (``url`` being the
    notebook-level root url) and the values are read from the JSON object the
    page assigns to ``window.__INITIAL_STATE__``.

    Parameters
    ----------
    links : list of str
        Paths of the drug pages, relative to the root url.
    limit : int or None, optional
        Only the first `limit` links are processed (60 by default, because
        processing every drug takes a very long time). Pass ``None`` to
        process all links.

    Returns
    -------
    list of dict
        One dictionary per processed link with the keys name, reference,
        packing, prescription, manufacturer, salt, storage, intro, uses,
        benefits, sideEffects, instructionForUsage, function, alcoholEffects,
        pregnancyEffects, breastFeedingEffects, drivingEffects,
        kidneyEffects, liverEffects, habitForming, therapeuticClass and
        MRP(Rupees). A value that cannot be read is "Unavailable", or
        "No interaction found/established" for the six *Effects keys.
    """
    selected = links if limit is None else links[:limit]

    rows = []
    for link in selected:
        # The state is extracted afresh for every page, so a page without
        # usable data yields default values instead of the previous drug's.
        data = _extract_initial_state(getSoup(url + link))

        # NOTE(review): the safety warnings and the fact-box attributes are
        # read by position; this assumes every page lists them in the same
        # order -- confirm against the site's data.
        rows.append({
            "name": _field(data, _DRUG + ("proprietaryName",)),
            "reference": str(url) + str(link),
            "packing": _field(data, _PRICE_BOX + ("packSizes",)),
            "prescription": _field(data, _DRUG + ("prescriptionStatus",)),
            "manufacturer": _field(data, _DRUG + ("manufacturer", "legalName")),
            # salt: text of the first link inside the composition markup
            "salt": _field(data, _STATIC + ("sku", "summary", "salt_composition", "display_text"),
                           lambda html: _first_tag_text(html, ("a",))),
            "storage": _field(data, _STATIC + ("generalInformation", "attributesData", 2, "value")),
            "intro": _field(data, _STATIC + ("composition", "introduction", "short_introduction")),
            # uses: text of the first link, or of the first list item when there is no link
            "uses": _field(data, _STATIC + ("productUses", "content"),
                           lambda html: _first_tag_text(html, ("a", "li"))),
            "benefits": _field(data, _STATIC + ("productBenefits", "content"), _benefits_text),
            "sideEffects": _field(data, _STATIC + ("sideEffect", "content"), _side_effects_text),
            "instructionForUsage": _field(data, _STATIC + ("howToUse", "content")),
            "function": _field(data, _STATIC + ("howWorks", "content")),
            "alcoholEffects": _field(data, _WARNINGS + (0, "label"), default=_NO_INTERACTION),
            "pregnancyEffects": _field(data, _WARNINGS + (1, "label"), default=_NO_INTERACTION),
            "breastFeedingEffects": _field(data, _WARNINGS + (2, "label"), default=_NO_INTERACTION),
            "drivingEffects": _field(data, _WARNINGS + (3, "label"), default=_NO_INTERACTION),
            "kidneyEffects": _field(data, _WARNINGS + (4, "label"), default=_NO_INTERACTION),
            "liverEffects": _field(data, _WARNINGS + (5, "label"), default=_NO_INTERACTION),
            "habitForming": _field(data, _FACTS + (0, "value")),
            "therapeuticClass": _field(data, _FACTS + (1, "value")),
            # the first character of the price text is dropped, leaving the rest of the value
            "MRP(Rupees)": _field(data, _PRICE_BOX + ("priceList", 0, "mrp", "price"),
                                  lambda price: price[1:]),
        })
    return rows

In [251]:
# running the above function to fetch the rows (one dictionary per drug);
# every drug page is requested through getSoup, which prints each response (see the output below)

rows = getInfo(ls)

<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200

In [252]:
# converting the list of rows into a data frame: each dictionary becomes one row
# and its keys become the column names

df = pd.DataFrame(rows)

In [253]:
# displaying the head of the extracted data frame (df.head() shows the first five rows by default)

df.head()

Unnamed: 0,name,reference,packing,prescription,manufacturer,salt,storage,intro,uses,benefits,...,function,alcoholEffects,pregnancyEffects,breastFeedingEffects,drivingEffects,kidneyEffects,liverEffects,habitForming,therapeuticClass,MRP(Rupees)
0,Augmentin 625 Duo Tablet,https://www.1mg.com/drugs/augmentin-625-duo-ta...,10 tablets,prescription,Glaxo SmithKline Pharmaceuticals Ltd,Amoxycillin (500mg) + Clavulanic Acid (125mg),Store below 30°C,Augmentin 625 Duo Tablet is an antibiotic that...,Treatment of Bacterial infections,It usually makes you feel better within a few ...,...,Augmentin 625 Duo Tablet is a combination of t...,SAFE,SAFE IF PRESCRIBED,SAFE IF PRESCRIBED,UNSAFE,CAUTION,CAUTION,No,ANTI INFECTIVES,182.78
1,Aricep 5 Tablet,https://www.1mg.com/drugs/aricep-5-tablet-5606,10 tablets,prescription,Eisai Pharmaceuticals India Pvt Ltd,Donepezil (5mg),Store below 30°C,Aricep 5 Tablet is used to treat mild to moder...,Alzheimer's disease,It can take several weeks for your symptoms to...,...,Memory loss in Alzheimer’s disease occurs due ...,UNSAFE,CONSULT YOUR DOCTOR,CONSULT YOUR DOCTOR,UNSAFE,SAFE IF PRESCRIBED,CAUTION,Carbamate Derivative,No,118.0
2,Avil 25 Tablet,https://www.1mg.com/drugs/avil-25-tablet-69629,15 tablets,prescription,Sanofi India Ltd,Pheniramine (25mg),Store below 30°C,Avil 25 Tablet is an antiallergic medication u...,Treatment of Allergic conditions,If you are taking it to prevent getting sympto...,...,Avil 25 Tablet is an antiallergic medication. ...,UNSAFE,CONSULT YOUR DOCTOR,SAFE IF PRESCRIBED,UNSAFE,CONSULT YOUR DOCTOR,CONSULT YOUR DOCTOR,Pyridines Derivatives,No,10.97
3,Azithral 500 Tablet,https://www.1mg.com/drugs/azithral-500-tablet-...,5 tablets,prescription,Alembic Pharmaceuticals Ltd,Azithromycin (500mg),Store below 30°C,Azithral 500 Tablet is an antibiotic used to t...,Treatment of Bacterial infections,This will make sure that all bacteria are kill...,...,Azithral 500 Tablet is an antibiotic. It works...,UNSAFE,SAFE IF PRESCRIBED,SAFE IF PRESCRIBED,SAFE,CAUTION,CAUTION,Macrolides,No,131.94
4,Aciloc 150 Tablet,https://www.1mg.com/drugs/aciloc-150-tablet-13...,30 tablets,prescription,Cadila Pharmaceuticals Ltd,Ranitidine (150mg),Store below 30°C,Aciloc 150 Tablet is a medicine that reduces t...,Treatment of Gastroesophageal reflux disease (...,Do not eat within 3–4 hours of going to bed,...,Aciloc 150 Tablet is a histamine H2 receptor b...,UNSAFE,SAFE IF PRESCRIBED,SAFE IF PRESCRIBED,SAFE,CAUTION,CAUTION,Aralkylamines Derivative,No,44.85


## Saving the data frame as a .CSV file

In [254]:
# converting the data frame to a csv file ('drugs.csv', written to the current working directory)
# which can be accessed in MS Excel; index=False leaves the row index out of the file

df.to_csv('drugs.csv', index=False)