In [323]:
pip install bs4

Note: you may need to restart the kernel to use updated packages.


In [324]:
import json
import re
import time
import urllib
import urllib.error
import urllib.parse
import urllib.request
import urllib.robotparser
from collections import deque

from bs4 import BeautifulSoup

In [325]:
def print_html_all(url:str):
    '''
    Downloads the page at the given url and prints its whole HTML, pretty-printed.
    '''
    with urllib.request.urlopen(url) as response:
        raw_html = response.read()
    print(BeautifulSoup(raw_html, 'html.parser').prettify())

In [None]:
def get_html_components(url:str) -> dict:
    '''
    Extracts all the components needed (title, description and links) from a page with a given url.

    Parameters:
        url: address of the page to download and parse.

    Returns:
        A dict with the keys:
        - 'url': the url that was requested,
        - 'title': text of the <title> tag, "" when it is missing or has no single string,
        - 'description': text of the first <p class="product-description">, "" when absent,
        - 'links': list of the href values starting with 'http' found in the page.

    Errors raised by urllib.request.urlopen are not caught here.
    '''
    with urllib.request.urlopen(url) as f:
        html_doc = f.read()
    soup = BeautifulSoup(html_doc, 'html.parser')

    # soup.title is None when there is no <title>; .string is None when the tag
    # is empty or has several children. Both cases fall back to "".
    title = ""
    title_tag = soup.title
    if title_tag is not None and title_tag.string is not None:
        # str() detaches the text from the parse tree (a NavigableString keeps a reference to it)
        title = str(title_tag.string)

    description = ""
    description_tag = soup.find('p', class_='product-description')
    if description_tag is not None:
        description = description_tag.get_text()

    # To only keep real links (absolute http/https urls); find_all always returns a list
    links = [tag.get('href') for tag in soup.find_all(href=re.compile(r'^http'))]

    output = {
        'url': url,
        'title': title,
        'description': description,
        'links': links,
    }
    return output

In [327]:
def get_robots_url(url:str) -> str:
    '''
    Builds the url of the robots.txt file of the site hosting the page with a given url
    (same scheme and network location, path replaced by /robots.txt).
    '''
    parts = urllib.parse.urlsplit(url)
    return "{}://{}/robots.txt".format(parts.scheme, parts.netloc)

In [328]:
def can_beparsed(url:str, rp:urllib.robotparser.RobotFileParser, useragent:str='*',) -> bool:
    '''
    Asks the robots.txt parser rp whether the given user agent ('*' by default)
    is allowed to fetch the page at url; True when it is, False otherwise.
    '''
    is_allowed = rp.can_fetch(useragent, url)
    return is_allowed

In [None]:
def update_queue(queue:deque, visited:set, new_urls:list, init:bool, rp:urllib.robotparser.RobotFileParser) -> tuple:
    '''
    Gives the updated queue and visited set.

    Parameters:
        queue: urls waiting to be crawled (modified in place).
        visited: urls already handled (modified in place).
        new_urls: links found on the page that has just been crawled.
        init: when False, the head of the queue is removed and marked as visited
              before the new urls are added; when True, nothing is removed.
        rp: robots.txt parser used to decide whether a url may be crawled.

    Returns:
        The (queue, visited) pair, i.e. the same two objects that were passed in.
    '''
    if not init and queue: # Nothing to remove when the queue is already empty
        old = queue.popleft()
        visited.add(old)

    seen = set() # Urls already handled in this batch, so a link repeated on the page is queued once
    for url in new_urls:
        if url in visited or url in seen:
            continue
        seen.add(url)
        if not can_beparsed(url, rp): # We skip the pages the robots.txt forbids
            continue
        o = urllib.parse.urlparse(url)
        if "product" in o.path: # To prioritize pages with 'product' token
            queue.appendleft(url)
        else:
            queue.append(url)
    return queue, visited
        

In [None]:
def crawler(url:str, n_max:int=50, delay:float=0.5) -> list:
    '''
    Crawls a number of pages starting from a given url.

    Parameters:
        url: page the crawl starts from.
        n_max: maximum number of pages to extract.
        delay: pause in seconds after each request, to avoid overloading the site.

    Returns:
        The list of dicts produced by get_html_components, one per crawled page,
        in crawl order. It holds fewer than n_max items when the queue runs out.

    Only the robots.txt of the start url's site is loaded; every discovered link
    is checked against that same file.
    '''
    data = []
    queue = deque([url])
    visited = set()

    robot_url = get_robots_url(url)
    rp = urllib.robotparser.RobotFileParser()
    rp.set_url(robot_url)
    rp.read()

    while queue and len(data) < n_max: # We stop at n_max pages or when there is nothing left to crawl
        current_url = queue.popleft() # We remove the first url from the queue
        if current_url in visited: # If the page has already been crawled, we skip it
            continue
        visited.add(current_url) # Marked here so the page is never crawled twice

        try:
            new = get_html_components(current_url) # We get the data we need from the page
        except urllib.error.URLError as error: # HTTPError is a subclass: one bad link must not stop the crawl
            print(f"Skipping {current_url}: {error}")
            new = None

        if new is not None:
            data.append(new)
            # init=True: the current url was already removed and marked visited above,
            # so update_queue must only add the new links and not pop another url.
            queue, visited = update_queue(queue, visited, new['links'], True, rp)
        time.sleep(delay) # To avoid overload

    return data
    

In [331]:
# Crawl from the start url, keeping at most n_max pages
url = "https://web-scraping.dev/products"
n_max = 50
data = crawler(url, n_max)
data.sort(key=lambda page: page['url']) # Pages ordered by url

# Save the crawled pages as indented JSON
with open('../output/products.json', 'w') as f:
    f.write(json.dumps(data, indent=4))