<a href="https://colab.research.google.com/github/sughoshdeshpande7/Parallel-Web-Scrapper/blob/main/web_scrapper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import time

def get_links():
    """Collect absolute URLs of the currency pages linked from the Wikipedia list.

    Downloads the "List of circulating currencies" article, parses the HTML
    with the lxml parser and resolves the href of every anchor matched by the
    CSS selector below against the article URL.

    Returns:
        list: one absolute URL per matched anchor, in the order select()
        returns them.
    """
    index_url = 'https://en.wikipedia.org/wiki/List_of_circulating_currencies'
    page = requests.get(index_url)
    document = BeautifulSoup(page.text, "lxml")
    # Tables that directly follow a paragraph: anchors that are direct
    # children of the second cell, plus an anchor that is the first child of
    # the first cell.
    css_query = 'p+table td:nth-child(2) > a, p+table td:nth-child(1) > a:nth-child(1)'
    # urljoin turns each relative href into an absolute link.
    return [
        urljoin(index_url, anchor.get("href"))
        for anchor in document.select(css_query)
    ]

def fetch(link):
    """Download one page and save it as ./output/<last URL segment>.html.

    Args:
        link: URL of the page to download.

    The ./output/ directory is not created here; open() fails if it is
    missing.
    """
    # Plain requests.get: nothing is shared between calls, so every call pays
    # the full network round trip on its own.
    response = requests.get(link)
    filename = "./output/" + link.split("/")[-1] + ".html"
    # Write the body as raw bytes, exactly as received.
    with open(filename, 'wb') as f:
        f.write(response.content)

    # Flush so the progress marker shows up as soon as the page is saved
    # instead of sitting in the output buffer.
    print('...', end='', flush=True)

if __name__ == '__main__':
    links = get_links()

    # Baseline: download every page sequentially, one request at a time.
    start_time = time.time()
    for link in links:
        fetch(link)
    elapsed = time.time() - start_time

    print('\n\tTotal Time Taken:', elapsed, 'seconds')
    


	Total Time Taken: 2.6226043701171875e-06 seconds


### **Use sessions if you are processing a lot of pages from the same website**

In [None]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import time

def get_links():
    """Return the absolute URLs of the currency pages listed on Wikipedia.

    Fetches the "List of circulating currencies" article, parses it with the
    lxml parser and resolves the href of each anchor matched by the CSS
    selector against the article URL.

    Returns:
        list: one absolute URL per matched anchor.
    """
    index_url = 'https://en.wikipedia.org/wiki/List_of_circulating_currencies'
    page = requests.get(index_url)
    document = BeautifulSoup(page.text, "lxml")
    css_query = 'p+table td:nth-child(2) > a, p+table td:nth-child(1) > a:nth-child(1)'
    anchors = document.select(css_query)
    # Relative hrefs are made absolute by joining them onto the article URL.
    return [urljoin(index_url, anchor.get("href")) for anchor in anchors]

def fetch(link, s):
    """Download `link` through the session `s` and save it under ./output/.

    Args:
        link: URL of the page to download.
        s: session-like object; only its get(url) method is used, and the
            returned response must expose the body bytes as `.content`.
    """
    # Going through the caller's session instead of a bare requests.get lets
    # the same session be reused across calls.
    response = s.get(link)
    page_name = link.rsplit("/", 1)[-1]
    target_path = "./output/" + page_name + ".html"
    with open(target_path, 'wb') as out_file:
        out_file.write(response.content)

    print('...', end='', flush=True)

if __name__ == '__main__':
    links = get_links()
    start_time = time.time()

    # One Session is shared by every request of the run so that fetch() can
    # reuse it; the context manager closes it once the loop is done.
    with requests.Session() as s:
        for link in links:
            fetch(link, s)

    print('\n\tTotal Time Taken:',time.time()-start_time, 'seconds')
    


	Total Time Taken: 0.00014591217041015625 seconds


In [None]:
from multiprocessing import cpu_count

In [None]:
core_count = cpu_count()  # number of CPUs reported by multiprocessing
print('You Have', core_count, 'core CPUs!')

You Have 2 core CPUs!


In [None]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import time
from multiprocessing import cpu_count, Pool

def get_links():
    """Build the list of currency-page URLs from Wikipedia's currency table.

    The "List of circulating currencies" article is downloaded and parsed
    with the lxml parser; every anchor matched by the CSS selector has its
    href resolved against the article URL.

    Returns:
        list: one absolute URL per matched anchor.
    """
    index_url = 'https://en.wikipedia.org/wiki/List_of_circulating_currencies'
    page = requests.get(index_url)
    document = BeautifulSoup(page.text, "lxml")
    css_query = 'p+table td:nth-child(2) > a, p+table td:nth-child(1) > a:nth-child(1)'
    hrefs = (anchor.get("href") for anchor in document.select(css_query))
    # Convert each relative href into an absolute link.
    return [urljoin(index_url, href) for href in hrefs]

def fetch(link, s):
    """Fetch `link` with the session `s` and store the body in ./output/.

    Args:
        link: URL of the page to download.
        s: session-like object; only its get(url) method is used, and the
            returned response must expose the body bytes as `.content`.
    """
    # The request goes through the session handed in by the caller rather
    # than a fresh requests.get, so the same session can serve many calls.
    response = s.get(link)
    page_name = link.rsplit("/", 1)[-1]
    with open("./output/" + page_name + ".html", 'wb') as out_file:
        out_file.write(response.content)

    print('...', end='', flush=True)

if __name__ == '__main__':
    from functools import partial

    links = get_links()
    start_time = time.time()

    # Pool.map calls the worker function with a single item from `links`,
    # but fetch() needs (link, s). Bind the session up front so that only the
    # link varies per call.
    s = requests.Session()
    fetch_with_session = partial(fetch, s=s)

    # One worker process per CPU reported by cpu_count(). The bound session
    # travels to the workers together with the callable, so each worker uses
    # its own copy rather than this process's object.
    with Pool(cpu_count()) as p:
        p.map(fetch_with_session, links)

    s.close()

    print('\n\tTotal Time Taken:',time.time()-start_time, 'seconds')
    


	Total Time Taken: 0.039621829986572266 seconds


To further reduce the time, we create multiple threads: the process switches
from one thread to another while requests are waiting on the network, and the
number of threads is not limited by the number of CPU cores.

In [None]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import time
from concurrent.futures import ThreadPoolExecutor

def get_links():
    """Gather absolute links to the currency pages from the Wikipedia list.

    Downloads the "List of circulating currencies" article, parses it with
    the lxml parser, and resolves the href of every anchor matched by the CSS
    selector against the article URL.

    Returns:
        list: one absolute URL per matched anchor.
    """
    index_url = 'https://en.wikipedia.org/wiki/List_of_circulating_currencies'
    page = requests.get(index_url)
    document = BeautifulSoup(page.text, "lxml")
    css_query = 'p+table td:nth-child(2) > a, p+table td:nth-child(1) > a:nth-child(1)'
    # Each href is joined onto the article URL to make it absolute.
    return [
        urljoin(index_url, anchor.get("href"))
        for anchor in document.select(css_query)
    ]

def fetch(link, s):
    """Save the page at `link`, requested via session `s`, into ./output/.

    Args:
        link: URL of the page to download.
        s: session-like object; only its get(url) method is used, and the
            returned response must expose the body bytes as `.content`.
    """
    response = s.get(link)  # same session object on every call
    last_segment = link.rsplit("/", 1)[-1]
    destination = "./output/" + last_segment + ".html"
    with open(destination, 'wb') as out_file:
        out_file.write(response.content)

    print('...', end='', flush=True)

if __name__ == '__main__':
    links = get_links()
    start_time = time.time()

    # The thread count is not bound by the number of CPU cores.
    # NOTE(review): one Session is shared by all worker threads here; confirm
    # that this is acceptable for the requests version in use.
    with requests.Session() as s, ThreadPoolExecutor(max_workers=200) as p:
        # fetch() takes (link, s), while map() supplies only the link, so the
        # session is bound through a small wrapper.
        # list() drains the result iterator: an exception raised inside a
        # worker is re-raised here instead of being silently discarded.
        list(p.map(lambda link: fetch(link, s), links))

    print('\n\tTotal Time Taken:',time.time()-start_time, 'seconds')


	Total Time Taken: 9.131431579589844e-05 seconds
