# Web Crawler
---

### Imports

In [16]:
import re
import requests
from collections import deque

from bs4 import BeautifulSoup as bs

import urllib.request
import urllib.parse as parse
import urllib.robotparser as rp

## Initialisation

We set an initial URL in accordance with our theme, a maximum number of links to save and a maximum number of links to download the contents of. This way, if the links we get do not want to be crawled, we do not need to go back and save more links. 

In [2]:
URL = 'https://en.wikipedia.org/wiki/Chocolate'
max_links = 50     # max nb of links to crawl
max_cont = 20      # max nb of links to get content of 

## Getting the links

In [17]:
# Use regex to extract all links from url, stop if we reach maximum
def get_all_links (url, counter, Links = [], max_l=max_links):
    
    html_page = urllib.request.urlopen(url)   # Open url on internet
    soup = bs(html_page, 'lxml')              # Open contents of url
    
    for link in soup.findAll('a', attrs={'href': re.compile("^http://")}):
        if(counter < max_l)&(link not in Links):
            Links.append(link.get('href'))
            counter += 1
        else:
            break
            
    return (Links, counter)

In [18]:
"""This method return at most max_l links, starting from an initial url and getting its children links, 
their children links and so on in a breadth-first fashion.

Returns:
    list
        a list of strings that are the links
"""

def crawl_for_links(Links=[], pointer=0, max_l=max_links):
    
    url = Links[pointer]
    counter = len(Links)
        
    Links, counter = get_all_links (url, counter, Links, max_l)
    #html_page = urllib.request.urlopen(url)   # Open url on internet
    #soup = bs(html_page, 'lxml')              # Open contents of url
      

    #for link in soup.findAll('a', attrs={'href': re.compile("^http://")}):
    #    if(counter < max_l)&(link not in Links):
    #        Links.append(link.get('href'))
    #        counter += 1
    #    else:
    #        break
            
    # If we stll do not have enough links: repeat the process with the next url from the list
    if counter < max_l :
        Links = crawl_for_links(Links, pointer+1, max_l - counter)
        

    return Links

In [19]:
links_crawled = crawl_for_links([URL])
print(links_crawled)
print(len(links_crawled))

['https://en.wikipedia.org/wiki/Chocolate', 'http://ndb.nal.usda.gov/ndb/foods/show/6153?fgcd=&manu=&lfacet=&format=&count=&max=35&offset=&sort=&qlookup=Candies%2C+milk+chocolate', 'http://www.pnas.org/content/108/21/8595', 'http://www.eluniversal.com.mx/notas/526113.html', 'http://archive.fieldmuseum.org/Chocolate/history_mesoamerican7.html', 'http://www.bartleby.com/61/68/C0316800.html', 'http://antiquity.ac.uk/projgall/powis/index.html', 'http://news.sciencemag.org/2013/01/earliest-evidence-chocolate-north-america', 'http://www.museum.upenn.edu/new/news/fullrelease.php?which=306', 'http://archive.fieldmuseum.org/Chocolate/history_mesoamerican5.html', 'http://archive.fieldmuseum.org/Chocolate/history_mesoamerican3.html', 'http://archive.fieldmuseum.org/Chocolate/history_mesoamerican4.html', 'http://archive.fieldmuseum.org/chocolate/history.html', 'http://findarticles.com/p/articles/mi_m1310/is_1990_Jan/ai_8560999', 'http://www.newyorker.com/reporting/2007/10/29/071029fa_fact_buford',

In [13]:
def save_link (url, filename):
    doc = requests.get(url)
    # Put here the path where you want to save the link's contents
    name = "Docs_crawled2/" + filename + ".html"
    with open(name, 'wb') as fOut:
        fOut.write(doc.content)

In [21]:
# Test
save_link (URL, '000')
filename = 3*'0' + '1'
filename

'0001'

In [14]:
# Convention: the filename will be the url's index in the list, on 4 characters

# Prepare new filename
def new_filename (count, fname = '0000', conv = [3, 1]):
    nb_zeros = conv[0]
    pw = conv[1]
    
    if (count > 10**pw):
        nb_zeros -= 1
        pw += 1 
    # If the file index is above the next power of 10, we add one less 0
    # to the filename than we did before
    filename = nb_zeros*'0' + str (count)
    
    return (filename, conv)

In [20]:
filename = "0000"
nb_zeros = 3
pw = 1

conventions = [nb_zeros, pw]

count = 0

for url in links_crawled:
    # Save the link
    save_link(url, filename)
    count += 1
    # Break when we have enough links
    if count >= max_cont : break
        
    filename, conventions = new_filename(count, filename, conventions)
    

## Handling robots.txt

In [6]:
# Test on URL = wikipedia

# Getting domain url and robots.txt link
## Using urlparse
URL_parsed = parse.urlparse(URL)
print(URL_parsed.hostname)
URL_parsed

robot = URL_parsed.scheme + "://" + URL_parsed.netloc + "/robots.txt"
print(robot)

en.wikipedia.org
https://en.wikipedia.org/robots.txt


In [25]:
## Using urljoin -DOESN'T WORK 
robot = parse.urljoin(URL, "robots.txt")
print(robot)

robot = parse.urljoin(links_crawled[2], "robots.txt")
print(robot)

https://en.wikipedia.org/wiki/robots.txt
http://www.pnas.org/content/108/21/robots.txt


In [7]:
# Robot parsing

RP = rp.RobotFileParser()            # Define Robot parser object

#robot = "https://en.wikipedia.org/robots.txt"
RP.set_url(robot)                    # Set robots.txt link

RP.read()                            # Open robots.txt file
rrate = RP.request_rate("*")         # Return requests per second rate (requests, seconds)
print(RP.crawl_delay("*"))           # Returns crawl delay

RP.can_fetch("*", URL)               # Returns bool if can crawl that url

None


True

## Function bringing it all together 

Pseudocode: Crawler (nb_max_url, [URL], Domains, Links):

Get robots.txt url + domain 
Save domain, robots.txt url and crawl delay if don't already have it

If can fetch: 

    If domain in domains & time crawled > 0:
        Wait max(0, crawl delay - (current time - time crawled))
    
    Crawl: save contents 
    Update time crawled
    Save links if we still need more
    Save in matrix domains line: domain, crawl delay (20ms if none), time crawled

If haven't crawled enough links: repeat
    
Links = list of links we can crawl
Domains: [domain, robots.txt link, crawl delay, time last crawled]
(several links can have same domain)

Auxiliary functions: extract up to M links from a page, new filename, save a link (all checked individually, work)

To be able to wait for crawler delay time, need functions get current time, wait a certain time (need to know units of said time!!!)


In [None]:
def Crawler (Domains= [[]], Links=[], pointer=0, fname = '0000', max_l=max_links, max_c = max_cont):
          
    url = Links[pointer]
    already_crawled = True
    counter = len(Links)
    
    # Get url domain
    URL_parsed = parse.urlparse(url)
    dom = URL_parsed.netloc
    
    # If we have not already tried to crawl in the domain
    if (dom not in Domains[, 0]):
        already_crawled = False
        # Get robots.txt link
        robot = URL_parsed.scheme + "://" + dom + "/robots.txt"
        Domains.append([dom, robot, 20, -1])
    
    i = Domains[, 0].index(dom)
    robot = Domains[i, 1]

    # Access robots.txt
    RP = rp.RobotFileParser()         
    RP.set_url(robot)                    
    RP.read()
    
    # Get crawl delay (if it is given and we don't already have it)
    if (not already_crawled) & (RP.crawl_delay("*") != None):
        Domains[i, 2] = RP.crawl_delay("*")
    
    # If we can crawl the file
    if (RP.can_fetch("*", URL)):
        
        # Wait appropriate time if we have already crawled the domain
        #if (already_crawled):
        #    Wait max(0, crawl delay - (current time - time crawled))
        
        # Save contents of the link
        save_link(url, fname)
        # Domains[i, 3] = current_time                  # update time we last crawled the domain
        
        # Prepare filename for next page
        pointer += 1
        fname = filename(fname, pointer, conv)
        
        # If we do not have enough links, save those referencec in the page
        if len(Links) < max_l:
            Links, counter = get_all_links(url, counter, Links, max_l)
    
    # If we have not saved enough links: repeat
    if (pointer < max_c):
        Domains, Links = Crawler (Domains, Links, pointer, fname, max(0, max_l - counter), max_c - pointer)
        
    return (Domains, Links)           

In [22]:
# Test list methods on Domains

D = [[1, "a", 20, 1], [2, "b", 20, 2], [3, "c", 20, 4]]

print(D.index(2))

ValueError: 2 is not in list