# Web Crawler
---

### Imports

In [16]:
import requests
from bs4 import BeautifulSoup as bs
from collections import deque
import urllib.request
import re
import urllib.parse as parse
import urllib.robotparser as rp

## Initialisation

We set an initial URL in accordance with our theme, a maximum number of links to save and a maximum number of links to download the contents of. This way, if the links we get do not want to be crawled, we do not need to go back and save more links. 

In [17]:
URL = 'https://en.wikipedia.org/wiki/Chocolate'
max_links = 50     # max nb of links to crawl
max_cont = 20      # max nb of links to get content of 

## Getting the links

In [18]:
"""This method return at most max_l links, starting from an initial url and getting its children links, 
their children links and so on in a breadth-first fashion.

Returns:
    list
        a list of strings that are the links
"""

def crawl_for_links(Links=[], pointer=0, max_l=max_links):
    
    url = Links[pointer]
    html_page = urllib.request.urlopen(url)   # Open url on internet
    soup = bs(html_page, 'lxml')              # Open contents of url
    
    counter = len(Links)

    # Use regex to extract all links from url, stop if we reach maximum
    for link in soup.findAll('a', attrs={'href': re.compile("^http://")}):
        if(counter < max_l)&(link not in Links):
            Links.append(link.get('href'))
            counter += 1
        else:
            break
            
    # If we stll do not have enough links: repeat the process with the next url from the list
    if counter < max_l :
        Links = crawl_for_links(Links, pointer+1, max_l - counter)
        

    return Links

In [19]:
links_crawled = crawl_for_links([URL])
print(links_crawled)
print(len(links_crawled))

['https://en.wikipedia.org/wiki/Chocolate', 'http://ndb.nal.usda.gov/ndb/foods/show/6153?fgcd=&manu=&lfacet=&format=&count=&max=35&offset=&sort=&qlookup=Candies%2C+milk+chocolate', 'http://www.pnas.org/content/108/21/8595', 'http://www.eluniversal.com.mx/notas/526113.html', 'http://archive.fieldmuseum.org/Chocolate/history_mesoamerican7.html', 'http://www.bartleby.com/61/68/C0316800.html', 'http://antiquity.ac.uk/projgall/powis/index.html', 'http://news.sciencemag.org/2013/01/earliest-evidence-chocolate-north-america', 'http://www.museum.upenn.edu/new/news/fullrelease.php?which=306', 'http://archive.fieldmuseum.org/Chocolate/history_mesoamerican5.html', 'http://archive.fieldmuseum.org/Chocolate/history_mesoamerican3.html', 'http://archive.fieldmuseum.org/Chocolate/history_mesoamerican4.html', 'http://archive.fieldmuseum.org/chocolate/history.html', 'http://findarticles.com/p/articles/mi_m1310/is_1990_Jan/ai_8560999', 'http://www.newyorker.com/reporting/2007/10/29/071029fa_fact_buford',

In [20]:
def save_link (url, filename):
    doc = requests.get(url)
    name = "Documents_crawled/" + filename + ".html"
    with open(name, 'wb') as fOut:
        fOut.write(doc.content)

In [21]:
# Test
save_link (URL, '000')
filename = 3*'0' + '1'
filename

'0001'

In [22]:
# Convention: the filename will be the url's index in the list, on 4 characters
filename = "0000"
nb_zeros = 3
pw = 1
count = 0

for url in links_crawled:
    # Save the link
    save_link(url, filename)
    count += 1
    # Break when we have enough links
    if count >= max_cont : break
        
    # Prepare new filename
    if (count > 10**pw):
        nb_zeros -= 1
        pw += 1 
    # If the file index is above the next power of 10, we add one less 0
    # to the filename than we did before
    filename = nb_zeros*'0' + str (count) 
    

## Handling robots.txt

In [29]:
# Test on URL = wikipedia

# Getting domain url and robots.txt link
## Using urlparse
URL_parsed = parse.urlparse(URL)
print(URL_parsed.hostname)
URL_parsed

robot = URL_parsed.scheme + "://" + URL_parsed.netloc + "/robots.txt"
print(robot)

en.wikipedia.org
https://en.wikipedia.org/robots.txt


In [25]:
## Using urljoin -DOESN'T WORK 
robot = parse.urljoin(URL, "robots.txt")
print(robot)

robot = parse.urljoin(links_crawled[2], "robots.txt")
print(robot)

https://en.wikipedia.org/wiki/robots.txt
http://www.pnas.org/content/108/21/robots.txt


In [30]:
# Robot parsing

RP = rp.RobotFileParser()            # Define Robot parser object

#robot = "https://en.wikipedia.org/robots.txt"
RP.set_url(robot)                    # Set robots.txt link

RP.read()                            # Open robots.txt file
rrate = RP.request_rate("*")         # Return requests per second rate (requests, seconds)
print(RP.crawl_delay("*"))           # Returns crawl delay

RP.can_fetch("*", URL)               # Returns bool if can crawl that url

None


True

For each url:

Get robots.txt url + domain 
Test if can fetch: if yes, crawl
Save in matrix line: domain, crawl delay (20ms if none), time crawled

**Use library robot-parser**

Pseudo-code:

l_servers: list of (server name, crawl_delay) (where crawl_delay = 20ms by default)

if link's server is in list: wait for (crawl_delay time)

if ("User-agent: \*" not in robots.txt): get full contents
else:
    line = 1st line under "User-agent: \*"
    if ("User agent:\* \n Allow: \" in robots.txt; or "Allow: \" in line): get full contents of link
    if ("User agent:\* \n Disallow: \" in robots.txt; or "Disallow: \" in line): break
    else:
        while ("Allow" in line) | ("Disallow" in line):
            if "Allow" in line:
                s = root + line - "Allow: "
                if s in link: get full contents of link
            if "Disallow" in line: 
                s = root + line - "Disallow: "
                if s in link: break
            line = next line
        # If we have reached end of specifications for crawlers without explicit allowance or disallowance for our
        # url: get full contents 
        if ("Allow" not in line) & ("Disallow" not in line):
            get full contents of link