In [1]:
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
import json
import string

In [2]:
label=list(string.ascii_lowercase)
page=list(range(1,500))
URLS=[]
for i in label:
    for j in page:
        URLS.append("https://www.1mg.com/drugs-all-medicines?page="+str(j)+"&label="+i)

In [3]:
len(URLS)

1039974

### Strategy for scraping

- All of the required data is contained in a script tag which begins with the text "window.__INITIAL_STATE__"

- First I have extracted all script tags from the soup

- Then I have extracted the required script tag (by checking if it begins with the "window.__INITIAL_STATE__" text)

- Then I perform a bit of cleaning before loading the required data as json

Note - Each of the script tags only contain 20 of the items even though we see 30 items per page. But the remaining items are available by making requests to the next page 

In [4]:
def extract_json(soup):
    all_scripts=soup.find_all("script")
    
    #extracting the script tag as a string (storing in a list)
    all_med_data=[]
    for script in all_scripts:
        # to check if a script tag can be converted to a string and if the string has the "window.__INITIAL__STATE__"
        if (script.string) and ("window.__INITIAL_STATE__" in script.string):
            all_med_data.append(script.string)
    json_data=all_med_data[0].replace("\n    window.__INITIAL_STATE__ = ",""
                       ).replace(";\n    window.__STATUS_CODE__ = null;\n","")
    json_data=json.loads(json_data)
    
    return(json_data['allMedicinePageReducer']['data'])

In [45]:
%%time
soup=[]
for i in URLS[1:2]:
    try:
        page_req=requests.get(i)
        soup.append(BeautifulSoup(page_req.content, "html.parser"))
        print("present")
    except:
        print(f"URL not present{i}")

present
Wall time: 298 ms


In [49]:
extract_json(soup)

{'skus': [{'is_discontinued': False,
   'manufacturer_name': 'Pfizer Ltd',
   'type': 'allopathy',
   'price': 82.99,
   'name': 'Ativan 2mg Tablet',
   'id': 236513,
   'sku_id': 236513,
   'available': False,
   'pack_size_label': 'strip of 30 tablets',
   'rx_required': {'header': 'Prescription Required',
    'icon_url': 'https://onemg.gumlet.io/image/upload/w_20,h_20/q_auto,f_auto/rx_icon.png'},
   'slug': '/drugs/ativan-2mg-tablet-236513',
   'short_composition': 'Lorazepam (2mg)',
   'image_url': 'https://onemg.gumlet.io/image/upload/a_ignore,w_380,h_380,c_fit,q_auto,f_auto/v1625228213/hx2gxivwmeoxxxsc1hix.png',
   'in_stock': None,
   'quantity': 30},
  {'is_discontinued': False,
   'manufacturer_name': 'Glenmark Pharmaceuticals Ltd',
   'type': 'allopathy',
   'price': 118,
   'name': 'Ascoril D Plus Syrup Sugar Free',
   'id': 184611,
   'sku_id': 184611,
   'available': True,
   'pack_size_label': 'bottle of 100 ml Syrup',
   'rx_required': {'header': 'Prescription Required',

### Stuff to record for each card - 

1. Id and Skuid (for future fetching of data)

2. Name

3. Prescription req
    
4. is_discontinued

5. manufacturer

6. Type

7. Pack size

8. short_composition

9. rx_required.header

In [47]:
pd.DataFrame({str(i['id']):(str(i['sku_id']),i['name'],i['manufacturer_name'],
  i['type'],i['pack_size_label'],str(i['price']),
  i.get('rx_required'),
  i['short_composition'],str(i['is_discontinued'])) for i in extract_json(soup)['skus']}).T

Unnamed: 0,0,1,2,3,4,5,6,7,8
236513,236513,Ativan 2mg Tablet,Pfizer Ltd,allopathy,strip of 30 tablets,82.99,"{'header': 'Prescription Required', 'icon_url'...",Lorazepam (2mg),False
184611,184611,Ascoril D Plus Syrup Sugar Free,Glenmark Pharmaceuticals Ltd,allopathy,bottle of 100 ml Syrup,118.0,"{'header': 'Prescription Required', 'icon_url'...",Phenylephrine (5mg) + Chlorpheniramine Maleate...,False
68773,68773,Alprax 0.5mg Tablet SR,Torrent Pharmaceuticals Ltd,allopathy,strip of 15 tablet sr,60.85,"{'header': 'Prescription Required', 'icon_url'...",Alprazolam (0.5mg),False
322827,322827,Ativan 1mg Tablet,Pfizer Ltd,allopathy,strip of 30 tablets,68.36,"{'header': 'Prescription Required', 'icon_url'...",Lorazepam (1mg),False
228392,228392,Atarax 10mg Tablet,Dr Reddy's Laboratories Ltd,allopathy,strip of 15 tablets,43.56,"{'header': 'Prescription Required', 'icon_url'...",Hydroxyzine (10mg),False
63221,63221,Altraday Capsule SR,Sun Pharmaceutical Industries Ltd,allopathy,strip of 10 capsule sr,120.0,"{'header': 'Prescription Required', 'icon_url'...",Aceclofenac (200mg) + Rabeprazole (20mg),False
116839,116839,Aldactone Tablet,RPG Life Sciences Ltd,allopathy,strip of 15 tablets,31.95,"{'header': 'Prescription Required', 'icon_url'...",Spironolactone (25mg),False
141944,141944,Asthalin 100mcg Inhaler,Cipla Ltd,allopathy,packet of 200 MDI Inhaler,142.51,"{'header': 'Prescription Required', 'icon_url'...",Salbutamol (100mcg),False
114550,114550,Allegra Suspension Raspberry & Vanilla,Sanofi India Ltd,allopathy,bottle of 100 ml Oral Suspension,188.78,"{'header': 'Prescription Required', 'icon_url'...",Fexofenadine (30mg/5ml),False
324040,324040,Axcer 90mg Tablet,Sun Pharmaceutical Industries Ltd,allopathy,strip of 14 tablets,420.0,"{'header': 'Prescription Required', 'icon_url'...",Ticagrelor (90mg),False


In [40]:
extract_json(soup)['skus'][0].get('rx_required')['header']

'Prescription Required'