In [130]:
#for handling data
import numpy as np
import pandas as pd
from math import ceil

#for scraping
import requests
from bs4 import BeautifulSoup
import re
import json
import string

#for concurrency
from queue import Empty, Queue, SimpleQueue
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
import threading
import datetime

### Strategy for scraping

- All of the required data is contained in a script tag which begins with the text "window.__INITIAL_STATE__"

- First, I extract all script tags from the soup

- Then I pick out the required script tag (by checking whether its text contains "window.__INITIAL_STATE__")

- Then I perform a bit of cleaning before loading the required data as json

Note - Each script tag contains only 20 of the items, even though we see 30 items per page. The remaining items are available by making requests to the next page.

### Logic 
For each label, keep requesting successive pages until a page comes back without data. Checking only for a non-200 status code does not work, because pages that have no data still return a success status code; the code therefore also checks whether the page's `skus` list is empty.



In [131]:
def extract_json(soup, send_pages=False):
    """Extract the embedded ``window.__INITIAL_STATE__`` data from a listing page.

    Args:
        soup: parsed page; only ``soup.find_all("script")`` and the ``.string``
            of each returned tag are used.
        send_pages: when True, return the number of pages for the label
            instead of the page data.

    Returns:
        ``state['allMedicinePageReducer']['data']`` by default, or
        ``ceil(total_count / count)`` computed from
        ``state['allMedicinePageReducer']['meta']`` when ``send_pages`` is True.

    Raises:
        IndexError: no script tag contains ``window.__INITIAL_STATE__``.
        ValueError: the marker is present but is not followed by ``= <json>``
            (``json.JSONDecodeError`` is a ``ValueError`` subclass).
        KeyError: the decoded state lacks the expected keys.
    """
    marker = "window.__INITIAL_STATE__"

    # First script tag whose text is available and mentions the marker.
    state_script = next(
        (script.string for script in soup.find_all("script")
         if script.string and marker in script.string),
        None,
    )
    if state_script is None:
        raise IndexError(f"no <script> tag containing {marker} was found")

    # Decode exactly one JSON value right after "<marker> =". raw_decode stops
    # at the end of that value, so whatever follows it in the script (for
    # example the window.__STATUS_CODE__ assignment) and the exact whitespace
    # around the assignment no longer matter.
    assign_at = state_script.index("=", state_script.index(marker) + len(marker))
    json_data, _ = json.JSONDecoder().raw_decode(state_script[assign_at + 1:].lstrip())

    reducer = json_data['allMedicinePageReducer']
    if send_pages:
        meta = reducer['meta']
        return ceil(meta['total_count'] / meta['count'])

    return reducer['data']

%%time
# Sequential crawl: for each label, request successive pages until one comes
# back without data.
base_url="https://www.1mg.com/drugs-all-medicines"
label=list(string.ascii_lowercase)
page=list(range(1,1000))
results=[]
# Only 'a' and 'b' are crawled here (presumably a trial subset); loop over
# `label` instead to cover the whole alphabet.
for j in ['a','b']:
    # Labels form the outer loop so that `break` ends the paging of the
    # current label and moves on to the next one.
    for i in page:
        response=requests.get(f'{base_url}?page={i}&label={j}')
        if response.status_code!=200:
            break
        # Parse and extract once; the same object is tested and stored.
        page_data=extract_json(BeautifulSoup(response.content, "html.parser"))
        # Pages past the last one still answer 200, but carry no skus.
        if not page_data['skus']:
            break
        results.append(page_data)

### Trying to implement the Producer/Consumer model for Threading using concurrent.futures

In [132]:
# Shared queues for the producer/consumer crawl: `tasks` holds the URLs still
# to be fetched, `results` collects the data scraped from them.
tasks, results = Queue(), SimpleQueue()

In [133]:
# One-off probe: fetch page 1 of label "b" and derive that label's page count.
resp = requests.get("https://www.1mg.com/drugs-all-medicines?page=1&label=b")
soup = BeautifulSoup(resp.content, "html.parser")
# send_pages=True makes extract_json return ceil(total_count / count) instead of the page data.
pages = extract_json(soup, send_pages=True)

In [134]:
# Producer step: enqueue one listing URL per (label, page) pair.
base_url = "https://www.1mg.com/drugs-all-medicines"
labels = [*string.ascii_lowercase]
session = requests.session()


def _listing_url(label, page_no):
    """Build the listing URL for one page of one label."""
    return f'{base_url}?page={page_no}&label={label}'


for alpha in labels:
    # Page 1 of every label is fetched up front: extract_json derives the
    # label's page count from the metadata embedded in it.
    resp = session.get(_listing_url(alpha, 1))
    soup = BeautifulSoup(resp.content, "html.parser")
    pages = extract_json(soup, send_pages=True)
    print(f'{alpha} {pages}')
    for page in range(1, pages+1):
        tasks.put(_listing_url(alpha, page))

y 32


In [135]:
tasks.qsize()

32

In [136]:
# Per-thread storage: attributes set on this object are visible only to the
# thread that set them.
thread_local = threading.local()

def get_session():
    """Return the calling thread's ``requests.Session``, creating it on first use."""
    try:
        return thread_local.session
    except AttributeError:
        # First call from this thread: create its session and remember it.
        thread_local.session = requests.Session()
        return thread_local.session

In [137]:
# while tasks.qsize() != 0:
#     task = tasks.get(block=False)
#     response = session.get(task)
#     soup = BeautifulSoup(response.content, "html.parser")
#     if (response.status_code==200) and (extract_json(soup)['skus']):
#         results.put(extract_json(BeautifulSoup(response.content, "html.parser")))
#         print(f"result stored for task {task}") 
#     else:
#         print(f'data not available for task: {task}')
    
def scraping_worker(tasks,results):
    """Consume page URLs from ``tasks`` and store the scraped data in ``results``.

    The worker keeps taking URLs until ``tasks`` is empty, so several workers
    can share the same pair of queues.

    Args:
        tasks: queue of page URLs; items are taken with a non-blocking ``get``
            and acknowledged with ``task_done`` once handled.
        results: queue receiving the value returned by ``extract_json`` for
            every page that answered with status 200 and a non-empty
            ``'skus'`` entry.

    Returns:
        None, once the task queue is exhausted.
    """
    session = get_session()
    while True:
        try:
            task = tasks.get(block=False)
        except Empty:
            # A non-blocking get on a drained queue raises queue.Empty.
            print('Queue is empty! My work here is done. Exiting.')
            return
        try:
            response = session.get(task)
            page_data = None
            if response.status_code == 200:
                # Parse and extract once; the same object is tested and stored.
                page_data = extract_json(BeautifulSoup(response.content, "html.parser"))
            if page_data is not None and page_data['skus']:
                results.put(page_data)
                print(f"result stored for task {task}")
            else:
                print(f'data not available for task: {task}')
        finally:
            # Acknowledge the task only after it has been handled (even if
            # handling failed), so tasks.join() reflects finished work and
            # cannot hang on a failed page.
            tasks.task_done()
        

result stored for task https://www.1mg.com/drugs-all-medicines?page=1&label=y
result stored for task https://www.1mg.com/drugs-all-medicines?page=2&label=y
result stored for task https://www.1mg.com/drugs-all-medicines?page=3&label=y
result stored for task https://www.1mg.com/drugs-all-medicines?page=4&label=y
result stored for task https://www.1mg.com/drugs-all-medicines?page=5&label=y
result stored for task https://www.1mg.com/drugs-all-medicines?page=6&label=y
result stored for task https://www.1mg.com/drugs-all-medicines?page=7&label=y
result stored for task https://www.1mg.com/drugs-all-medicines?page=8&label=y
result stored for task https://www.1mg.com/drugs-all-medicines?page=9&label=y
result stored for task https://www.1mg.com/drugs-all-medicines?page=10&label=y
result stored for task https://www.1mg.com/drugs-all-medicines?page=11&label=y
result stored for task https://www.1mg.com/drugs-all-medicines?page=12&label=y
result stored for task https://www.1mg.com/drugs-all-medicine

In [138]:
%%time
# One worker function per pool thread; each drains the shared `tasks` queue.
n_workers = 4
with ThreadPoolExecutor(max_workers=n_workers) as ex:
    futures = [
        ex.submit(scraping_worker, tasks, results) for _ in range(n_workers)
    ]

# Leaving the `with` block waits for all workers, so every future is finished
# here. An exception raised inside a worker is stored on its future and would
# otherwise go unnoticed, so report it explicitly.
for worker_no, future in enumerate(futures):
    error = future.exception()
    if error is not None:
        print(f'worker {worker_no} failed: {error!r}')

In [139]:
# all([f.done() for f in futures])

In [140]:
tasks.qsize()

0

In [141]:
results.qsize()

32

### Stuff to record for each card - 

1. Id and Skuid (for future fetching of data)

2. Name

3. Prescription req
    
4. is_discontinued

5. manufacturer

6. Type

7. Pack size

8. short_composition

9. rx_required.header

In [142]:
# Columns to keep for every sku; each name is also the key looked up in the sku dict.
sku_columns = ['sku_id',
               'name','manufacturer_name',
               'type','pack_size_label',
               'price','rx_required',
               'short_composition','is_discontinued']

# Flatten every stored page into one row per sku.
# NOTE: results.get() removes the page from the queue, so re-running this cell
# without re-running the scrape finds nothing to read and yields an empty frame.
sku_rows = []
for _ in range(results.qsize()):
    result = results.get()
    # .get() leaves None where a sku lacks one of the keys.
    sku_rows.extend([sku.get(column) for column in sku_columns]
                    for sku in result['skus'])

# Building the frame once from all rows also works when there are no rows,
# where concatenating an empty list of frames would raise.
final_df = pd.DataFrame(sku_rows, columns=sku_columns)
    

In [143]:
# Display the scraped sku ids in ascending order (sort_values returns a new Series; final_df is not modified).
final_df['sku_id'].sort_values()

25      30956
0       67264
4      126903
1      145603
7      146783
        ...  
195    704697
194    708323
106    712214
312    719084
212    720391
Name: sku_id, Length: 636, dtype: int64

In [144]:
# Export the table to the current working directory; the file name carries the
# export time as month-day-year, then hour-minute.
export_stamp = datetime.datetime.now().strftime("%m-%d-%YT%H-%M")
final_df.to_csv(f'drugs{export_stamp}.csv',index=False)