In [76]:
#for handling data
import numpy as np
import pandas as pd
import math

#for scraping
import requests
from bs4 import BeautifulSoup
import re
import json
import string

#for concurrency
from queue import Queue, SimpleQueue
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
import threading

import time
import copy

### Strategy for scraping

- All of the required data is contained in a script tag which begins with the text "window.__INITIAL_STATE__"

- First I have extracted all script tags from the soup

- Then I have extracted the required script tag (by checking if it begins with the "window.__INITIAL_STATE__" text)

- Then I perform a bit of cleaning before loading the required data as json

Note - Each of the script tags only contain 20 of the items even though we see 30 items per page. But the remaining items are available by making requests to the next page 

### Logic 
For each label keep making requests for each page till you don't get a status 200 code (this does not seem to work, the pages which do not have any data also get a success status code)



In [77]:
def extract_json(response):
    json_data=json.loads(response.content)
    return(json_data['data'])

%%time
base_url="https://www.1mg.com/drugs-all-medicines"
label=list(string.ascii_lowercase)
page=list(range(1,1000))
results=[]
for i in page:
    for j in ['a','b']:
        response=requests.get(f'{base_url}?page={str(i)}&label={str(j)}')
        soup=BeautifulSoup(response.content, "html.parser")
        if (response.status_code==200) and (extract_json(soup)['skus']):
            results.append(extract_json(BeautifulSoup(response.content, "html.parser")))
        else:
            break;

### Trying to implement the Producer/Consumer model for Threading using concurrent.futures

In [78]:
tasks=Queue()
results=SimpleQueue()

In [79]:
base_url = "https://www.1mg.com/pharmacy_api_gateway/v4/drug_skus/by_prefix"
labels = list(string.ascii_lowercase)
session = requests.Session()

In [80]:
# get number of skus for the given alphabet
def get_number_skus(alphabet):
    resp = session.get(f'{base_url}?prefix_term={alphabet}&page=1&per_page=1')
    total_count = json.loads(resp.content)['meta']['total_count']
    return total_count

In [81]:
# get number of pages to traverse for total sku count
def get_number_pages(total_count, results_per_request):
    return math.ceil(total_count/results_per_request)

In [82]:
# creating tasks
results_per_request = 50 # can be 1-50
for alpha in labels:
    pages = get_number_pages(get_number_skus(alpha), results_per_request)
    for page in range(1, pages+1):
        tasks.put(f'{base_url}?prefix_term={alpha}&page={page}&per_page={results_per_request}')
    time.sleep(5)
    

In [83]:
thread_local = threading.local()

def get_session():
    if not hasattr(thread_local, 'session'):
        thread_local.session = requests.Session()
    return thread_local.session

In [100]:
global j
j=0
def scraping_worker(tasks,results):
    
    session = get_session()
    while True:
        try:
            task = tasks.get(block=False) 
        except: #
            print('Queue is empty! My work here is done. Exiting.')
            return
        tasks.task_done()
        response=session.get(task)
        if (response.status_code==200) and (extract_json(response)['skus']):
            results.put(extract_json(response))
            j+=1
            print(f"result {j} stored")
        time.sleep(5)
        

In [101]:
%%time
with ThreadPoolExecutor(max_workers=6) as ex:
    futures = [
        ex.submit(scraping_worker, tasks, results) for _ in range(6)
    ]

CPU times: user 186 ms, sys: 12 ms, total: 198 ms
Wall time: 933 ms


In [88]:
all([f.done() for f in futures])

True

In [94]:
tasks.qsize()

5176

In [95]:
results.qsize()

44

### Stuff to record for each card - 

1. Id and Skuid (for future fetching of data)

2. Name

3. Prescription req
    
4. is_discontinued

5. manufacturer

6. Type

7. Pack size

8. short_composition

9. rx_required.header

In [None]:
temp_df = []
for i in range(0,results.qsize()):
    result=results.get()
    temp_df.append(pd.DataFrame([[i.get('sku_id'),
                   i.get('name'),
                   i.get('manufacturer_name'),
                   i.get('type'),
                   i.get('pack_size_label'),
                   i.get('price'),
                   i.get('rx_required'),
                   i.get('short_composition'),
                   i.get('is_discontinued')] for i in result['skus']],
                 columns=['sku_id',
                 'name','manufacturer_name',
                 'type','pack_size_label',
                 'price','rx_required',
                 'short_composition','is_discontinued']))
final_df=pd.concat(temp_df, axis='rows',ignore_index=True)
    

In [None]:
final_df['name'].sort_values()

3312                   A 1 5mg Tablet
5929                 A 250 Suspension
10016        A 3 100 mg/500 mg Tablet
10989           A Arti 60mg Injection
10035      A Arti L 80mg/480mg Tablet
                     ...             
46685               Hyzer 10mg Tablet
46618              Hyzine 25mg Tablet
46873                Hyzix 10mg Syrup
47172    Hyzol D 30mg/40mg Capsule SR
47210            Hyzolid 600mg Tablet
Name: name, Length: 47216, dtype: object

In [None]:
final_df.to_csv("drugs.csv",index=False)