In [54]:
#for handling data
import numpy as np
import pandas as pd
from math import ceil

#for scraping
import requests
from bs4 import BeautifulSoup
import re
import json
import string

#for concurrency
from queue import Queue, SimpleQueue
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
import threading

### Strategy for scraping

- All of the required data is contained in a script tag which begins with the text "window.__INITIAL_STATE__"

- First I have extracted all script tags from the soup

- Then I have extracted the required script tag (by checking if it begins with the "window.__INITIAL_STATE__" text)

- Then I perform a bit of cleaning before loading the required data as json

Note - Each of the script tags only contain 20 of the items even though we see 30 items per page. But the remaining items are available by making requests to the next page 

### Logic 
For each label keep making requests for each page till you don't get a status 200 code (this does not seem to work, the pages which do not have any data also get a success status code)



In [72]:
def extract_json(soup, send_pages=False):
    all_scripts=soup.find_all("script")
    
    #extracting the script tag as a string (storing in a list)
    all_med_data=[]
    for script in all_scripts:
        # to check if a script tag can be converted to a string and if the string has the "window.__INITIAL__STATE__"
        if (script.string) and ("window.__INITIAL_STATE__" in script.string):
            all_med_data.append(script.string)
    json_data=all_med_data[0].replace("\n    window.__INITIAL_STATE__ = ",""
                       ).replace(";\n    window.__STATUS_CODE__ = null;\n","")
    json_data=json.loads(json_data)

    if send_pages:
        return ceil(json_data['allMedicinePageReducer']['meta']['total_count']/20)
    
    return(json_data['allMedicinePageReducer']['data'])

%%time
base_url="https://www.1mg.com/drugs-all-medicines"
label=list(string.ascii_lowercase)
page=list(range(1,1000))
results=[]
for i in page:
    for j in ['a','b']:
        response=requests.get(f'{base_url}?page={str(i)}&label={str(j)}')
        soup=BeautifulSoup(response.content, "html.parser")
        if (response.status_code==200) and (extract_json(soup)['skus']):
            results.append(extract_json(BeautifulSoup(response.content, "html.parser")))
        else:
            break;

### Trying to implement the Producer/Consumer model for Threading using concurrent.futures

In [73]:
tasks=Queue()
results=SimpleQueue()

In [59]:
resp = requests.get("https://www.1mg.com/drugs-all-medicines?page=1&label=b")
soup = BeautifulSoup(resp.content, "html.parser")
pages = extract_json(soup, send_pages=True)

In [69]:
print(pages['allMedicinePageReducer']['meta']['total_count'])

7792


In [75]:
#creating tasks
base_url="https://www.1mg.com/drugs-all-medicines"
labels=list(string.ascii_lowercase)
session = requests.session()
# page=list(range(1,2000))
for alpha in labels:
    resp = session.get(f'{base_url}?page=1&label={alpha}')
    soup = BeautifulSoup(resp.content, "html.parser")
    pages = extract_json(soup, send_pages=True)
    print(f'{alpha} {pages}')
    for page in range(1, pages+1):
        tasks.put(f'{base_url}?page={page}&label={alpha}')

a 1372
b 390
c 1269
d 741
e 574
f 492
g 459
h 187
i 295
j 83
k 247
l 609
m 856
n 571
o 612
p 823
q 59
r 786
s 758
t 775
u 138
v 400
w 113
x 95
y 32
z 370


In [78]:
tasks.qsize()

13106

In [79]:
thread_local = threading.local()

def get_session():
    if not hasattr(thread_local, 'session'):
        thread_local.session = requests.Session()
    return thread_local.session

In [80]:

def scraping_worker(tasks,results):
    j=0
    session = get_session()
    while True:
        try:
            task = tasks.get(block=False) 
        except queue.Empty: #
            print('Queue is empty! My work here is done. Exiting.')
            return
        tasks.task_done()
        response=session.get(task)
        soup=BeautifulSoup(response.content, "html.parser")
        if (response.status_code==200) and (extract_json(soup)['skus']):
            results.put(extract_json(BeautifulSoup(response.content, "html.parser")))
            j+=1
            print(f"result {j} stored")
        

In [81]:
%%time
with ThreadPoolExecutor(max_workers=15) as ex:
    futures = [
        ex.submit(scraping_worker, tasks, results) for _ in range(15)
    ]

result 1 stored
result 1 stored
result 1 stored
result 1 stored
result 1 stored
result 1 stored
result 1 stored
result 1 stored
result 1 stored
result 1 storedresult 1 stored
result 1 stored

result 1 stored
result 1 stored
result 2 storedresult 2 stored

result 1 stored
result 2 stored
result 2 stored
result 2 storedresult 2 stored

result 2 stored
result 2 storedresult 2 stored

result 2 stored
result 2 stored
result 2 stored
result 2 stored
result 2 stored
result 3 stored
result 3 stored
result 3 stored
result 3 stored
result 3 stored
result 3 stored
result 3 stored
result 3 storedresult 2 storedresult 3 stored

result 3 stored

result 3 stored
result 3 stored
result 3 stored
result 4 storedresult 4 stored
result 4 stored

result 4 stored
result 4 storedresult 4 stored

result 4 stored
result 4 stored
result 4 storedresult 4 stored
result 3 stored

result 4 stored
result 4 stored
result 4 stored
result 3 stored
result 5 stored
result 5 stored
result 5 stored
result 5 stored
result 5

In [82]:
all([f.done() for f in futures])

True

In [83]:
tasks.qsize()

0

In [84]:
results.qsize()

13106

### Stuff to record for each card - 

1. Id and Skuid (for future fetching of data)

2. Name

3. Prescription req
    
4. is_discontinued

5. manufacturer

6. Type

7. Pack size

8. short_composition

9. rx_required.header

In [85]:
final_df=pd.DataFrame()
for i in range(0,results.qsize()):
    result=results.get()
    temp_df=pd.DataFrame([[i.get('sku_id'),
                   i.get('name'),
                   i.get('manufacturer_name'),
                   i.get('type'),
                   i.get('pack_size_label'),
                   i.get('price'),
                   i.get('rx_required'),
                   i.get('short_composition'),
                   i.get('is_discontinued')] for i in result['skus']],
                 columns=['sku_id',
                 'name','manufacturer_name',
                 'type','pack_size_label',
                 'price','rx_required',
                 'short_composition','is_discontinued'])
    final_df=pd.concat([final_df,temp_df],axis='rows',ignore_index=True)
    

In [88]:
final_df['sku_id'].sort_values()

36620       4923
35200       4960
193         4967
152135      4974
75460       4977
           ...  
27221     723690
27300     723751
122064    723815
162614    723902
96476     724209
Name: sku_id, Length: 261869, dtype: int64

In [86]:
final_df.to_csv("drugs.csv",index=False)