In [3]:
import re

def download(url,user_agent='naruto',number_retries = 2, charset='utf-8'):
    """
    Download a page and return its body decoded to text.

    url            -- address of the page to fetch
    user_agent     -- value sent in the "User-agent" request header
    number_retries -- how many more attempts are allowed when the server
                      answers with a 5xx (server side) error
    charset        -- fallback encoding used when the response headers
                      do not declare one

    Returns the decoded page as a string, or None when the download
    failed and no retry succeeded.
    """
    print("Downloading:", url)
    request = urllib.request.Request(url)
    request.add_header("User-agent", user_agent)
    try:
        # the with-block closes the connection even if read/decode fails
        with urllib.request.urlopen(request) as resp:
            # prefer the charset announced by the server and fall back to the
            # caller supplied default when none is declared
            # (see pypi.python.org/pypi/chardet for a more robust detection)
            cs = resp.headers.get_content_charset() or charset
            html = resp.read().decode(cs)
    except (URLError, HTTPError, ContentTooShortError) as e:
        print('Download error:', e.reason)
        html = None
        # only 5xx answers are worth retrying; other failures are final
        if number_retries > 0 and hasattr(e, "code") and 500 <= e.code < 600:
            # pass everything by keyword: a positional retry counter would
            # land in user_agent and leave number_retries at its default,
            # so the recursion would never run out of retries
            return download(url,
                            user_agent=user_agent,
                            number_retries=number_retries - 1,
                            charset=charset)
    return html

def link_crawler(start_url, link_regex):
    """
    Crawl outward from start_url, following every link whose text
    matches link_regex.

    Pending addresses are kept on a stack, so the most recently found
    link is downloaded next. Links are queued exactly as they appear in
    the page: they are neither resolved to absolute URLs nor checked
    against the ones already visited.
    """
    pending = [start_url]
    while pending:
        page = download(pending.pop())
        if page:
            # keep only the kind of links we were asked to follow
            pending.extend(
                href for href in get_links(page) if re.match(link_regex, href)
            )
# now, let's define the get_links function

def get_links(html):
    """
    Extract the href value of every <a ...> tag found in html.

    The values are returned as a list, in document order and exactly as
    written in the page. Tag and attribute names are matched case
    insensitively.
    """
    anchor_href = re.compile("""<a[^>]+href=["'](.*?)["']""", re.IGNORECASE)
    # group 1 is the text between the quotes that follow href=
    return [found.group(1) for found in anchor_href.finditer(html)]

The problem with the links above is that they only include the path and leave out the protocol and the server; such a link is known as a ***relative link***. Relative links work in a browser, which knows the page currently being viewed, but not in urllib, which has no such context. So let's turn each one into an ***absolute link***.

The `urljoin` function from `urllib.parse` is what we need to build absolute links.

In [4]:
from urllib.parse import urljoin

def link_crawler(start_url, link_regex):
    
    """
    we'll start from a base url and if the link match the link_regex,
    we'll continue the crawler process

    start_url  -- address of the first page to download
    link_regex -- pattern a link (as written in the page) must match
                  from its beginning to be followed

    Every followed link is converted to an absolute URL before it is
    queued. This version keeps no record of visited pages.
    """
    
    crawl_queue = [start_url]
    while crawl_queue:
        url = crawl_queue.pop()
        html = download(url)
        if not html:
            continue
        #the following code is to get the type of links we want
        for link in get_links(html):
            if re.match(link_regex, link):
                # a relative link is relative to the page it was found on,
                # so resolve it against the current url, not the start url
                abs_link = urljoin(url, link)
                crawl_queue.append(abs_link)

The problem with the above method is circular crawling: when pages link to each other, the same pages are queued and downloaded again and again.

Let's solve the circular crawling problem.

In [5]:
def link_crawler(start_url, link_regex):
    
    """
    we'll start from a base url and if the link match the link_regex,
    we'll continue the crawler process

    start_url  -- address of the first page to download
    link_regex -- pattern a link (as written in the page) must match
                  from its beginning to be followed

    Every followed link is converted to an absolute URL, and each
    absolute URL is queued at most once.
    """
    
    crawl_queue = [start_url]
    # every url that has ever been queued, so none is queued twice
    seen = set(crawl_queue)
    while crawl_queue:
        url = crawl_queue.pop()
        html = download(url)
        if not html:
            continue
        #the following code is to get the type of links we want
        for link in get_links(html):
            if re.match(link_regex, link):
                # a relative link is relative to the page it was found on,
                # so resolve it against the current url, not the start url
                abs_link = urljoin(url, link)
                #let's check whether link was already queued
                if abs_link not in seen:
                    seen.add(abs_link)
                    crawl_queue.append(abs_link)
                

Voilà, the link crawler is now complete. Enjoy your coffee!