# Web Crawler
---

### Imports

In [113]:
import re
import requests
from collections import deque

from bs4 import BeautifulSoup as bs

import urllib.request
import urllib.parse as parse
import urllib.robotparser as rp

from socket import gethostbyname, gaierror

import time

import pandas as pd

from socket import gethostbyname, gaierror

## Initialisation

We set an initial URL in accordance with our theme, a maximum number of links to save and a maximum number of links to download the contents of. This way, if the links we get do not want to be crawled, we do not need to go back and save more links as often. 

In [87]:
# Seed page of the crawl, chosen in accordance with our theme
URL = "https://en.wikipedia.org/wiki/Chocolate"

# Upper bounds: number of links to collect, number of links whose content we download
max_links, max_cont = 80, 10

## Getting the links

We use one function to retrieve all external links in the page (links to the same domain do not begin with `http`, and as such are not matched by the regular expression we give to Beautiful Soup).

In [127]:
# Use regex to extract all links from url, stop if we reach maximum
def get_all_links (url, counter, Links=None, max_l=max_links):
    """Collect the external links of a page and save each newly found page to disk.

    Parameters
    ----------
    url : str
        Page to open and scan for anchors whose href starts with "http://".
    counter : int
        Number of links gathered so far. Its string form is also the file
        name handed to save_link for the next new link.
    Links : list of str, optional
        Links gathered so far. The list is extended in place; a fresh list
        is created when the argument is omitted.
    max_l : int
        No link is added once counter has reached this value.

    Returns
    -------
    tuple
        (Links, counter) after the page has been processed.
    """
    # A mutable default list would be shared, and keep growing, across calls
    if Links is None:
        Links = []

    # Parse inside the with-block so the HTTP response is always closed
    with urllib.request.urlopen(url) as html_page:
        soup = bs(html_page, 'lxml')

    # NOTE(review): only "http://" hrefs match this pattern; "https://" links are skipped
    for link in soup.find_all('a', attrs={'href': re.compile("^http://")}):
        print("Counter = {} - max_l = {}".format(counter, max_l))
        if counter >= max_l:
            break

        href = link.get('href')
        # Compare the href string, not the Tag object, so that known links are
        # really skipped; a duplicate must not end the scan of the page
        if href in Links:
            continue

        save_link(href, str(counter))
        Links.append(href)
        counter += 1

    return (Links, counter)

In [4]:
"""This method return at most max_l links, starting from an initial url and getting its children links, 
their children links and so on in a breadth-first fashion.

Returns:
    list
        a list of strings that are the links
"""

def crawl_for_links(Links=None, pointer=0, max_l=max_links):
    """Gather at most max_l links, breadth-first, starting from Links[pointer].

    The pages already in Links are expanded in list order: the links of the
    first page are appended, then those of the second page, and so on, until
    the list holds max_l links or there is no page left to expand.

    Parameters
    ----------
    Links : list of str, optional
        Seed links (usually a single start URL). Extended in place.
    pointer : int
        Index in Links of the first page to expand.
    max_l : int
        Maximum total number of links, seeds included.

    Returns
    -------
    list
        a list of strings that are the links
    """
    # A mutable default list would be shared between calls
    if Links is None:
        Links = []

    # max_l is an absolute bound on len(Links), so it is passed on unchanged
    # (subtracting the current count would stop the crawl far too early).
    # The pointer test avoids indexing past the end when pages run out.
    while pointer < len(Links) and len(Links) < max_l:
        Links, _ = get_all_links(Links[pointer], len(Links), Links, max_l)
        pointer += 1

    return Links

In [5]:
# Crawl from the seed URL, then show the links found and how many there are
links_crawled = crawl_for_links([URL])
print(links_crawled, len(links_crawled), sep="\n")

['https://en.wikipedia.org/wiki/Chocolate', 'http://ndb.nal.usda.gov/ndb/foods/show/6153?fgcd=&manu=&lfacet=&format=&count=&max=35&offset=&sort=&qlookup=Candies%2C+milk+chocolate', 'http://www.eluniversal.com.mx/notas/526113.html', 'http://archive.fieldmuseum.org/Chocolate/history_mesoamerican7.html', 'http://www.bartleby.com/61/68/C0316800.html', 'http://antiquity.ac.uk/projgall/powis/index.html', 'http://news.sciencemag.org/2013/01/earliest-evidence-chocolate-north-america', 'http://www.museum.upenn.edu/new/news/fullrelease.php?which=306', 'http://archive.fieldmuseum.org/Chocolate/history_mesoamerican5.html', 'http://archive.fieldmuseum.org/Chocolate/history_mesoamerican3.html', 'http://archive.fieldmuseum.org/Chocolate/history_mesoamerican4.html', 'http://archive.fieldmuseum.org/chocolate/history.html', 'http://findarticles.com/p/articles/mi_m1310/is_1990_Jan/ai_8560999', 'http://www.newyorker.com/reporting/2007/10/29/071029fa_fact_buford', 'http://www.exploratorium.edu/exploring/ex

In [119]:
def save_link (url, filename, out_dir="Docs_crawled4", timeout=10):
    """Download a page and write its raw content to <out_dir>/<filename>.html.

    Network failures are reported with a printed message and the function
    returns without writing anything; they never interrupt the crawl.

    Parameters
    ----------
    url : str
        Address to download (surrounding whitespace is stripped).
    filename : str
        Name of the output file, without extension.
    out_dir : str
        Directory where the file is written; created if it does not exist.
    timeout : float
        Seconds to wait for the server before giving up.
    """
    import os  # local import so this cell does not depend on another cell for it

    try:
        # Without a timeout an unresponsive server would block the crawl indefinitely
        doc = requests.get(url.strip(), timeout=timeout)
    except gaierror:
        print("Link not reachable")
        return
    except requests.exceptions.ConnectionError:
        print("Connection Error")
        return
    except requests.exceptions.RequestException as err:
        # Timeouts, invalid URLs, too many redirects, ...
        print("Request failed: {}".format(err))
        return

    # Put here the path where you want to save the link's contents
    os.makedirs(out_dir, exist_ok=True)
    name = os.path.join(out_dir, filename + ".html")
    with open(name, 'wb') as fOut:
        fOut.write(doc.content)

In [7]:
# Test: save the root page, then build a zero-padded 4-character file name
save_link (URL, '000')
filename = '1'.rjust(4, '0')
filename

'0001'

In [8]:
# Convention: the filename will be the url's index in the list, on 4 characters

# Prepare new filename
def new_filename (count, fname = '0000', conv = None):
    """Build the zero-padded file name for the link at index count.

    Parameters
    ----------
    count : int
        Index of the link in the list of links.
    fname : str
        Previous file name. Unused; kept so existing calls keep working.
    conv : list of two ints, optional
        [number of leading zeros, number of digits] for a one-digit index.
        Their sum is the total width of the name (4 with the default [3, 1]).

    Returns
    -------
    tuple
        (filename, conv), with conv returned as received.
    """
    if conv is None:
        conv = [3, 1]

    # Pad to a fixed width instead of tracking powers of ten by hand: this
    # yields '0010' for 10 and '0100' for 100, where the manual bookkeeping
    # produced 5-character names
    width = conv[0] + conv[1]
    filename = str(count).zfill(width)

    return (filename, conv)

In [9]:
# Name of the first file, and the naming convention handed to new_filename
filename = "0000"
nb_zeros, pw = 3, 1
conventions = [nb_zeros, pw]

count = 0

# Download the first links of the list, one file per link
for url in links_crawled:
    save_link(url, filename)
    count += 1

    if count < max_cont:
        # More links to save: prepare the name of the next file
        filename, conventions = new_filename(count, filename, conventions)
    else:
        # We have enough links
        break
    

## Crawling without considering robots.txt

In [10]:
def Crawler_noRespect(L = None, pointer=0, last_expanded = 0, fname = '0000', \
                      max_l=max_links, max_c = max_cont, conv = None):
    """Crawl without consulting robots.txt: save max_c pages, gathering links on the way.

    Parameters
    ----------
    L : list of str, optional
        Queue of links; defaults to a new list holding only URL. Extended in place.
    pointer : int
        Index in L of the page whose content is saved by this call.
    last_expanded : int
        Index of the last page expanded by the "get more links" step.
    fname : str
        File name under which the current page is saved.
    max_l : int
        Pages are expanded for links only while L holds fewer links than this.
    max_c : int
        Number of pages whose content remains to be saved.
    conv : list of two ints, optional
        Naming convention passed to new_filename; defaults to [3, 1].

    Returns
    -------
    list
        The queue of links L.
    """
    # Fresh defaults on every call: L is extended in place, so a mutable
    # default would carry the links of a previous run into the next one
    if L is None:
        L = [URL]
    if conv is None:
        conv = [3, 1]

    # Nothing left to save
    if pointer >= len(L):
        return (L)

    counter = len(L)
    url = L[pointer]

    # Save contents of the link
    save_link(url, fname)
    max_c -= 1

    # Prepare filename for next page
    pointer += 1
    fname, conv = new_filename(pointer, fname, conv)

    # If we do not have enough links, save those referenced in the page
    if (len(L) < max_l):
        L, counter = get_all_links(url, counter, L, max_l)

    # If we are close to the end of the queue of links and have not saved enough content yet, get more
    if (pointer >= (len(L) - 2)):
        size_before = len(L)
        # Stop when some page yielded links, or when no page is left to expand
        while (len(L) == size_before) and (last_expanded + 1 < len(L)):
            last_expanded += 1
            # NOTE(review): the counter restarts at 0 here, so this call is not
            # limited by the current length of L - confirm this is intended
            L, _ = get_all_links(L[last_expanded], 0, L, max_l)

    # If we have not saved enough links: repeat (as long as a next page exists)
    if (max_c > 0) and (pointer < len(L)):
        L = Crawler_noRespect (L, pointer, last_expanded, fname, max_l, max_c, conv)

    return (L)

In [11]:
# Test: works 100% fine

L_noRobots = Crawler_noRespect()

## Handling robots.txt

In [12]:
# Test on URL = wikipedia

# Split the URL into its components to get the domain, then rebuild
# the address of robots.txt at the root of that domain
URL_parsed = parse.urlparse(URL)
print(URL_parsed.netloc)

robot = "{}://{}/robots.txt".format(URL_parsed.scheme, URL_parsed.netloc)
print(robot)

en.wikipedia.org
https://en.wikipedia.org/robots.txt


In [13]:
## Using urljoin - DOESN'T WORK: "robots.txt" is joined to the directory of the page
## (see the output below), not to the root of the domain
for page in (URL, links_crawled[2]):
    robot = parse.urljoin(page, "robots.txt")
    print(robot)

https://en.wikipedia.org/wiki/robots.txt
http://www.eluniversal.com.mx/notas/robots.txt


In [14]:
# Robot parsing

# Create the parser directly on the robots.txt link, then fetch and parse the file
RP = rp.RobotFileParser(robot)
RP.read()

# Request rate declared for any user agent
rrate = RP.request_rate("*")

# Crawl delay declared for any user agent
print(RP.crawl_delay("*"))

# Whether any user agent is allowed to fetch URL
RP.can_fetch("*", URL)

None


False

In [15]:
# Check if robots.txt exists -> doesn't work for our second link
def does_url_exist(url, timeout=10):
    """Tell whether a HEAD request on url answers with a status code below 400.

    Parameters
    ----------
    url : str
        Address to probe.
    timeout : float
        Seconds to wait for the server before giving up.

    Returns
    -------
    bool
        True if the status code is below 400; False otherwise, including
        when the request itself fails (the error is printed).
    """
    try:
        r = requests.head(url, timeout=timeout)
    except requests.exceptions.RequestException as e:
        print(e)
        # An unreachable url is reported as not existing rather than as None
        return False

    exists = r.status_code < 400
    print(1 if exists else 0)
    return exists

## Function bringing it all together 

Pseudocode: Crawler (nb_max_url, [URL], Domains, Links):

Get robots.txt url + domain 
Save domain, robots.txt url and crawl delay if don't already have it

If can fetch: 

    If domain in domains & time crawled > 0:
        Wait max(0, crawl delay - (current time - time crawled))
    
    Crawl: save contents 
    Update time crawled
    Save links if we still need more
    Save in the domains matrix the line: domain, crawl delay (20 s if none is given), time crawled

If haven't crawled enough links: repeat
    
Links = list of links we can crawl
Domains: [domain, robots.txt link, crawl delay, time last crawled]
(several links can have same domain)

Auxiliary functions: extract up to M links from a page, new filename, save a link (all checked individually, work)

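To wait for the crawl delay, we need a way to get the current time and a way to pause for a given duration. We use `time.time()` and `time.sleep()`, which both work in seconds, so the crawl delay must be expressed in seconds as well.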


### Function with Domain matrix as list of lists

In [22]:
# Shared crawl state, kept as parallel lists with one entry per domain
domains, robots, delays, time_crawled = [], [], [], []

# Queue of links, starting with the root URL
Links = [URL]

In [128]:
# Method with D as list of lists (domains, robots, delay, time_crawled)

def Crawler (D = [domains, robots, delays, time_crawled], L=Links, pointer=0, \
             fname = '0000', max_l=max_links, max_c = max_cont, conv = None):
    """Crawl the links of L from index pointer on, honouring robots.txt and crawl delays.

    For the link at L[pointer] the function records its domain (robots.txt
    address, crawl delay, time of last crawl), reads the domain's robots.txt
    and, if fetching is allowed, waits for the crawl delay and gathers the
    links of the page with get_all_links. It then calls itself on the next
    link until L holds max_l links or the end of L is reached.

    Parameters
    ----------
    D : list of four lists
        [domains, robots, delays, time_crawled], parallel lists with one
        entry per domain. The default deliberately points at the notebook's
        shared lists; they are updated in place.
    L : list of str
        Queue of links, extended in place. Defaults to the shared Links list.
    pointer : int
        Index in L of the link to process.
    fname : str
        Current file name, updated through new_filename.
    max_l : int
        Crawling stops once this many links have been collected.
    max_c : int
        Decremented and printed for each page that may be fetched; it does
        not stop the crawl.
    conv : list of two ints, optional
        Naming convention passed to new_filename; defaults to [3, 1].

    Returns
    -------
    tuple
        (D, L)
    """
    if conv is None:
        conv = [3, 1]

    # Work on the lists carried by D, so that an explicitly passed D is
    # honoured instead of silently using the notebook globals
    domains, robots, delays, time_crawled = D

    counter = len(L)

    # No link left at this position
    if pointer >= len(L):
        return (D, L)

    already_crawled = True

    url = L[pointer]
    print(url)

    # Get url domain
    URL_parsed = parse.urlparse(url)
    dom = URL_parsed.netloc

    # If we have not already tried to crawl in the domain
    if dom not in domains:
        already_crawled = False

        domains.append(dom)
        # robots.txt link, at the root of the domain
        robots.append(URL_parsed.scheme + "://" + dom + "/robots.txt")
        # Default delay; compared below with time.time() differences, i.e. seconds
        delays.append(20)
        # -1 marks a domain that has not been crawled yet
        time_crawled.append(-1)

    i = domains.index(dom)
    r = robots[i]

    # Domains known to be dead (xocoatl refuses all robots)
    dead_domains = {"www.cfsan.fda.gov", "www.cocoatree.org", "nca.files.cms-plus.com",
                    "www.xocoatl.org"}

    if dom not in dead_domains:

        # Access robots.txt
        RP = rp.RobotFileParser()
        RP.set_url(r)
        try:
            RP.read()
            # If the robot parser correctly opens the robots.txt.
            # This test is actually too harsh, but it's better to not parse some links that could be saved rather than
            # stop the whole process (some robots.txt links actually send us directly to the home domain first, or don't exist)
            robots_ok = (len(RP.entries) != 0)
        except OSError as err:
            # Unreachable host (urllib's URLError is an OSError): skip this
            # link instead of aborting the whole crawl
            print("Could not read {}: {}".format(r, err))
            robots_ok = False

        if robots_ok:

            # Get crawl delay (if it is given and we don't already have it)
            crawl_delay = RP.crawl_delay("*")
            if (not already_crawled) and (crawl_delay is not None):
                delays[i] = crawl_delay

            # If we can crawl the file
            if RP.can_fetch("*", url):

                # Wait appropriate time if we have already crawled the domain
                if already_crawled:
                    wait = max(0, delays[i] - (time.time() - time_crawled[i]))
                    if wait > 0:
                        time.sleep(wait)

                max_c -= 1
                print("\n MAX_C - > {}".format(max_c))
                print("We are saving the contents of ", url)

                # Update time we last crawled the domain
                time_crawled[i] = time.time()

                # Prepare filename for next page
                fname, conv = new_filename(pointer + 1, fname, conv)

                # If we do not have enough links, save those referenced in the page
                if (len(L) < max_l):
                    L, counter = get_all_links(L[pointer], counter, L, max_l)
    print(D)

    # If we have not saved enough links: repeat, as long as there is a next link
    if (counter < max_l) and (pointer + 1 < len(L)):
        pointer += 1
        print("COUNTER_IF = {} - max_l = {}, pointer = {}".format(counter, max_l, pointer))
        D, L = Crawler (D, L, pointer, fname, max_l, max_c, conv)

    return (D, L)

In [129]:
# Emptying lists from previous tests. This is done in place, so that the
# default arguments of Crawler keep pointing at these same lists
for l in [domains, robots, delays, time_crawled]:
    del l[:]

# Keep only the root link in the queue
del Links[1:]

# Save contents of the root link
save_link(URL, "000")
D, Links = Crawler()

https://en.wikipedia.org/wiki/Chocolate

 MAX_C - > 9
We are saving the contents of  https://en.wikipedia.org/wiki/Chocolate
Counter = 1 - max_l = 80
Counter = 2 - max_l = 80
Counter = 3 - max_l = 80
Counter = 4 - max_l = 80
Counter = 5 - max_l = 80
Counter = 6 - max_l = 80
Counter = 7 - max_l = 80
Counter = 8 - max_l = 80
Counter = 9 - max_l = 80
Counter = 10 - max_l = 80
Counter = 11 - max_l = 80
Counter = 12 - max_l = 80
Counter = 13 - max_l = 80
Counter = 14 - max_l = 80
Counter = 15 - max_l = 80
Counter = 16 - max_l = 80
Counter = 17 - max_l = 80
Counter = 18 - max_l = 80
Counter = 19 - max_l = 80
Counter = 20 - max_l = 80
Counter = 21 - max_l = 80
Connection Error
Counter = 22 - max_l = 80
Counter = 23 - max_l = 80
Counter = 24 - max_l = 80
Counter = 25 - max_l = 80
Counter = 26 - max_l = 80
Counter = 27 - max_l = 80
Counter = 28 - max_l = 80
Counter = 29 - max_l = 80
Counter = 30 - max_l = 80
Counter = 31 - max_l = 80
Counter = 32 - max_l = 80
Counter = 33 - max_l = 80
Counter =