In [1]:
import urllib.request
import re
from urllib.error import URLError, HTTPError, ContentTooShortError



In [27]:
def download(url, user_agent='wswp', num_retries=4, charset='utf-8', proxy=None):
    """ Download a given URL and return the page content
        args:
            url (str): URL
        kwargs:
            user_agent (str): user agent (default: wswp)
            charset (str): charset if website does not include one in headers
            proxy (str): proxy url, ex 'http://IP' (default: None)
            num_retries (int): number of retries if a 5xx error is seen (default: 2)
    """
    print('Downloading:', url)
    request = urllib.request.Request(url)
    request.add_header('User-agent', user_agent)
    try:
        if proxy:
            proxy_support = urllib.request.ProxyHandler({'http': proxy})
            opener = urllib.request.build_opener(proxy_support)
            urllib.request.install_opener(opener)
        resp = urllib.request.urlopen(request)
        cs = resp.headers.get_content_charset()
        if not cs:
            cs = charset
        html = resp.read().decode(cs)
    except (URLError, HTTPError, ContentTooShortError) as e:
        print('Download error:', e.reason)
        html = None
        if num_retries > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                # recursively retry 5xx HTTP errors
                return download(url, num_retries - 1)
    return html

import itertools 
from bs4 import BeautifulSoup as bs

def crawl_site(url):
    titles=[]
    for page in itertools.count(1): 
        pg_url = '{}{}{}'.format(url, page,'/?a=a&l=3&y=') 
        html = download(pg_url) 
        if html is None: 
            break 
        soup = bs(html,'html5lib')
        lik = soup.find_all('td',attrs={'class':'title'})
        tittle = [(t.text) for t in lik]
        titles.append(tittle)
        if page > 9:
            break
    return titles


In [28]:
all_tittles = crawl_site('https://www.poemhunter.com/percy-bysshe-shelley/poems/page-')

Downloading: https://www.poemhunter.com/percy-bysshe-shelley/poems/page-1/?a=a&l=3&y=
Downloading: https://www.poemhunter.com/percy-bysshe-shelley/poems/page-2/?a=a&l=3&y=
Downloading: https://www.poemhunter.com/percy-bysshe-shelley/poems/page-3/?a=a&l=3&y=


In [29]:
t = [x for tit in all_tittles for x in tit]
t=set(t)
t=list(t)
t = [x[11:-10] for x in t]


In [31]:
t[1:10]

['To Death',
 'Lines Written During The Castlereagh Administration',
 'I Stood Upon A Heaven-Cleaving Turret',
 'Epipsychidion: Passages Of The Poem, Or Connected Therewith',
 'To Italy',
 'When A Lover Clasps His Fairest',
 'The Death Knell Is Ringing',
 'To The Republicans Of North America',
 'Fragment: To A Friend Released From Prison']

In [34]:
import csv
with open('data/peom.csv', 'w') as csvfile:
    spamwriter = csv.writer(csvfile)
    spamwriter.writerow(['title'])
    for i in t:
        spamwriter.writerow([i])

    
    

In [145]:
def get_links(html):
    """return a list of links from html 
    """
    webpage_regex = re.compile("""<a[^>]+href=["'](.*?)["']""",re.IGNORECASE)
    return webpage_regex.findall(html)


In [147]:
get_links(download("http://www.ftchinese.com/channel/finance.html?page=2"))

Downloading: http://www.ftchinese.com/channel/finance.html?page=2


['http://user.ftchinese.com/register',
 '/users/mystories',
 '/users/cp',
 'http://user.ftchinese.com/logout',
 '/users/findpassword',
 'http://user.ftchinese.com/register',
 '/',
 'http://big5.ftchinese.com/',
 'http://www.ft.com/',
 '/',
 '/channel/china.html',
 '/channel/world.html',
 '/channel/economy.html',
 '/channel/markets.html',
 '/channel/business.html',
 '/channel/business.html',
 '/channel/finance.html',
 '/channel/technology.html',
 '/channel/auto.html',
 '/channel/property.html',
 '/channel/agriculture.html',
 '/channel/energy.html',
 '/channel/industrials.html',
 '/channel/airline.html',
 '/channel/pharma.html',
 '/channel/entertainment.html',
 '/channel/consumer.html',
 '/channel/media.html',
 '/channel/innovation.html',
 '/channel/opinion.html',
 '/channel/management.html',
 '/channel/lifestyle.html',
 '/channel/stream.html',
 '/channel/english.html',
 'http://user.ftchinese.com/login',
 'http://user.ftchinese.com/register',
 '/users/mystories',
 '/users/setting/index'

In [149]:
from urllib.parse import urljoin

def link_crawler(start_url,link_regex):
    crawl_queue =[start_url]
    while crawl_queue:
        url = crawl_queue.pop()
        html = download(url)
        if not html:
            continue
        for link in get_links(html):
            if re.match(link_regex,link):
                abs_link =urljoin(start_url,link)
                crawl_queue.append(abs_link)