# Web Crawler
---

### Imports

In [3]:
import re
import requests
from collections import deque

from bs4 import BeautifulSoup as bs

import urllib.request
import urllib.parse as parse
import urllib.robotparser as rp

from socket import gethostbyname, gaierror

import time

import pandas as pd

## Initialisation

We set an initial URL in accordance with our theme, a maximum number of links to save and a maximum number of links to download the contents of. This way, if the links we get do not want to be crawled, we do not need to go back and save more links. 

In [4]:
URL = 'https://en.wikipedia.org/wiki/Chocolate'
max_links = 50     # max nb of links to crawl
max_cont = 20      # max nb of links to get content of 

## Getting the links

We use one function to retrieve all external links in the page (links to the same domain do not begin by htts, and as such are not spotted by Beautiful Soup).

In [5]:
# Use regex to extract all links from url, stop if we reach maximum
def get_all_links (url, counter, Links = [], max_l=max_links):
    
    html_page = urllib.request.urlopen(url)   # Open url on internet
    soup = bs(html_page, 'lxml')              # Open contents of url
    
    for link in soup.findAll('a', attrs={'href': re.compile("^http://")}):
        if(counter < max_l)&(link not in Links):
            Links.append(link.get('href'))
            counter += 1
        else:
            break
            
    return (Links, counter)

In [None]:
# Test
links_crawled, counter = get_all_links (url, counter)
print(len(links_crawled))
print(links_crawled)

## Saving the contents

This function saves the link under a specified filename. The path for that file can be changed in the code.

In [9]:
def save_link (url, filename):
    doc = requests.get(url)
    
    # Put here the path where you want to save the link's contents
    name = "Docs_crawled_noRobot/" + filename + ".html"
    
    with open(name, 'wb') as fOut:
        fOut.write(doc.content)

In [21]:
# Test
save_link (URL, '000')
filename = 3*'0' + '1'
filename

'0001'

We chose to have each file's name be its index in the list of links, on 4 characters (for instance: 0008 for the 9th link).

In [10]:
# Prepare new filename
def new_filename (count, fname = '0000', conv = [3, 1]):
    nb_zeros = conv[0]
    pw = conv[1]
    
    if (count > 10**pw):
        nb_zeros -= 1
        pw += 1 
    # If the file index is above the next power of 10, we add one less 0
    # to the filename than we did before
    filename = nb_zeros*'0' + str (count)
    
    return (filename, conv)

In [20]:
# Test 

filename = "0000"
nb_zeros = 3
pw = 1

conventions = [nb_zeros, pw]

count = 0

for url in links_crawled:
    # Save the link
    save_link(url, filename)
    count += 1
    # Break when we have enough links
    if count >= max_cont : break
        
    filename, conventions = new_filename(count, filename, conventions)
    

## Crawling without considering robots.txt

This method return at most max_c links and saves at most max_c, starting from an initial url and getting its children links, their children links and so on in a breadth-first fashion.   

Returns:
    list - a list of strings that are the links

In [24]:
def Crawler_noRespect(L = [URL], pointer=0, last_expanded = 0, fname = '0000', \
                      max_l=max_links, max_c = max_cont, conv = [3, 1]):
    
    counter = len(L)
    url = L[pointer]

    # Save contents of the link
    save_link(url, fname)
    max_c -= 1                 

    # Prepare filename for next page
    pointer += 1
    fname, conv = new_filename(pointer, fname, conv)

    # If we do not have enough links, save those referenced in the page
    if (len(L) < max_l):
        L, counter = get_all_links(url, counter, L, max_l)
        last_expanded +=1
    
    # If we are close to the end of the queue of links and have not saved enough content yet, get more 
    if (pointer >= (len(L) - 2)):
        l = len(L)
        while (len(L) == l):
            L, counter2 = get_all_links(L[last_expanded], 0, L, max_l)
            last_expanded +=1
            counter += counter2

    # If we have not saved enough links: repeat
    if (max_c > 0):
        L = Crawler_noRespect (L, pointer, last_expanded, fname, max_l, max_c, conv)
    
    return (L)           

In [25]:
# Test

L_noRobots = Crawler_noRespect()

## Handling robots.txt permissions and crawler delays

We save lists of domains, robots.txt urls, crawler delays and times when each url's domain was last crawled. The information on the same url will be at the same index i in all lists (i.e. in line i of the matrix D). We then use these to wait the correct amount of time before crawling each domain a second time. 

In [32]:
domains = []
robots = []
delays = []
time_crawled = []

Links = [URL]

In [89]:
def Crawler (D = [domains, robots, delays, time_crawled], L=Links, pointer=0, last_expanded = 0, \
             fname = '0000', max_l=max_links, max_c = max_cont, conv = [3, 1]):
    
        counter = len(L)
    #print("Nb links in Links: ", counter)
    print("Pointer: ", pointer)
    already_crawled = True
    
    # Issue with some links: don't parse them        
    #if not (pointer in [1, 4]):
    
    url = L[pointer]
    print(url)

    # Get url domain
    URL_parsed = parse.urlparse(url)
    dom = URL_parsed.netloc
    #print("Domain: ", dom)

    # If we have not already tried to crawl in the domain
    if not (dom in domains):

        #print("New domain")
        already_crawled = False        
        # Get robots.txt link
        r = URL_parsed.scheme + "://" + dom + "/robots.txt"
        #print("Robots.txt link: ", r)

        domains.append(dom)
        robots.append(r)
        delays.append(20)
        time_crawled.append(-1)


    i = domains.index(dom)
    #print("Domain index: ", i)
    r = robots[i]
    #print("Robots.txt link: ", r)

    # If our domain is not dead (xocoatl refuses all robots)
    if (not dom in ["www.cfsan.fda.gov", "www.cocoatree.org", "nca.files.cms-plus.com", \
                    "www.xocoatl.org"]):
        
        # Access robots.txt
        RP = rp.RobotFileParser()         
        RP.set_url(r) 
        RP.read()

        # If the robot parser correctly opens the robots.txt. 
        # This test is actually too harsh, but it's better to not parse some links that could be saved rather than 
        # stop the whole process (some robots.txt links actually send us directly to the home domain first, or don't exist)
        if (len(RP.entries) != 0):

            # Get crawl delay (if it is given and we don't already have it)
            if ((not already_crawled) & (RP.crawl_delay("*") != None)):
                delays.pop(i) 
                delays.insert(i, RP.crawl_delay("*"))

            # If we can crawl the file
            if (RP.can_fetch("*", url)):

                # Wait appropriate time if we have already crawled the domain
                if (already_crawled):
                    wait = max(0, delays[i] - (time.time() - time_crawled[i]))
                    if (wait > 0): 
                               time.sleep(wait)

                # Save contents of the link
                save_link(url, fname)
                max_c -= 1
                print("We are saving the contents of ", url)

                # Update time we last crawled the domain
                time_crawled.pop(i)
                time_crawled.insert(i, time.time())                  

                # Prepare filename for next page
                fname, conv = new_filename(pointer +1, fname, conv)

                # If we do not have enough links, save those referenced in the page
                if (len(L) < max_l):
                    L, counter = get_all_links(L[last_expanded], counter, L, max_l)
                    last_expanded += 1

    # If we are close to the end of the queue of links and have not saved enough content yet, get more 
    if (pointer >= (len(L) - 2)):
        l = len(L)            
        while (len(L) == l):
            
            # Get to the first node that we have crawled since the last node we expanded
            while(time_crawled[last_expanded] == -1):
                last_expanded +=1
            ######
            ###This is where the issue is: we are not allowed to get the links from
            ###L[last_expanded]
            ### -> check: it should be at pointer 13, 
            ### http://www.newyorker.com/reporting/2007/10/29/071029fa_fact_buford
            #######
            L, counter2 = get_all_links(L[last_expanded], 0, L, max_l)
            counter += counter2
            last_expanded += 1
        
    # Whatever happens, we move on to the next file
    pointer += 1
    
    print(D)
    #print(L)
    
    # If we have not saved enough links: repeat
    if (max_c > 0):
        D, L = Crawler (D, L, pointer, last_expanded, fname, max_l, max_c, conv)
    
    return (D, L)    

In [90]:
# Emptying lists from previous tests
for l in [domains, robots, delays, time_crawled]:
    while len(l) > 0:
            l.pop(0)

while (len(Links) > 1):
    Links.pop(1)

D, Links = Crawler()

# RecursionError: maximum recursion depth exceeded while calling a Python object
# Parser error: should be able to crawl links 2 & 3 but doesn't

Pointer:  0
https://en.wikipedia.org/wiki/Chocolate
We are saving the contens of  https://en.wikipedia.org/wiki/Chocolate
[['en.wikipedia.org'], ['https://en.wikipedia.org/robots.txt'], [20], [1584644480.0399082]]
Pointer:  1
http://ndb.nal.usda.gov/ndb/foods/show/6153?fgcd=&manu=&lfacet=&format=&count=&max=35&offset=&sort=&qlookup=Candies%2C+milk+chocolate
[['en.wikipedia.org', 'ndb.nal.usda.gov'], ['https://en.wikipedia.org/robots.txt', 'http://ndb.nal.usda.gov/robots.txt'], [20, 20], [1584644480.0399082, -1]]
Pointer:  2
http://www.eluniversal.com.mx/notas/526113.html
[['en.wikipedia.org', 'ndb.nal.usda.gov', 'www.eluniversal.com.mx'], ['https://en.wikipedia.org/robots.txt', 'http://ndb.nal.usda.gov/robots.txt', 'http://www.eluniversal.com.mx/robots.txt'], [20, 20, 20], [1584644480.0399082, -1, -1]]
Pointer:  3
http://archive.fieldmuseum.org/Chocolate/history_mesoamerican7.html
[['en.wikipedia.org', 'ndb.nal.usda.gov', 'www.eluniversal.com.mx', 'archive.fieldmuseum.org'], ['https://

We are saving the contens of  http://www.smithsonianmag.com/arts-culture/a-brief-history-of-chocolate-21860917/?no-ist
[['en.wikipedia.org', 'ndb.nal.usda.gov', 'www.eluniversal.com.mx', 'archive.fieldmuseum.org', 'www.bartleby.com', 'antiquity.ac.uk', 'news.sciencemag.org', 'www.museum.upenn.edu', 'findarticles.com', 'www.newyorker.com', 'www.exploratorium.edu', 'www.smithsonianmag.com'], ['https://en.wikipedia.org/robots.txt', 'http://ndb.nal.usda.gov/robots.txt', 'http://www.eluniversal.com.mx/robots.txt', 'http://archive.fieldmuseum.org/robots.txt', 'http://www.bartleby.com/robots.txt', 'http://antiquity.ac.uk/robots.txt', 'http://news.sciencemag.org/robots.txt', 'http://www.museum.upenn.edu/robots.txt', 'http://findarticles.com/robots.txt', 'http://www.newyorker.com/robots.txt', 'http://www.exploratorium.edu/robots.txt', 'http://www.smithsonianmag.com/robots.txt'], [20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20], [1584644480.0399082, -1, -1, -1, -1, -1, -1, -1, -1, 1584644489.221

[['en.wikipedia.org', 'ndb.nal.usda.gov', 'www.eluniversal.com.mx', 'archive.fieldmuseum.org', 'www.bartleby.com', 'antiquity.ac.uk', 'news.sciencemag.org', 'www.museum.upenn.edu', 'findarticles.com', 'www.newyorker.com', 'www.exploratorium.edu', 'www.smithsonianmag.com', 'hdl.handle.net', 'www.history.com', 'www.etymonline.com', 'eur-lex.europa.eu', 'www.cfsan.fda.gov', 'www.worldcocoafoundation.org'], ['https://en.wikipedia.org/robots.txt', 'http://ndb.nal.usda.gov/robots.txt', 'http://www.eluniversal.com.mx/robots.txt', 'http://archive.fieldmuseum.org/robots.txt', 'http://www.bartleby.com/robots.txt', 'http://antiquity.ac.uk/robots.txt', 'http://news.sciencemag.org/robots.txt', 'http://www.museum.upenn.edu/robots.txt', 'http://findarticles.com/robots.txt', 'http://www.newyorker.com/robots.txt', 'http://www.exploratorium.edu/robots.txt', 'http://www.smithsonianmag.com/robots.txt', 'http://hdl.handle.net/robots.txt', 'http://www.history.com/robots.txt', 'http://www.etymonline.com/robo

[['en.wikipedia.org', 'ndb.nal.usda.gov', 'www.eluniversal.com.mx', 'archive.fieldmuseum.org', 'www.bartleby.com', 'antiquity.ac.uk', 'news.sciencemag.org', 'www.museum.upenn.edu', 'findarticles.com', 'www.newyorker.com', 'www.exploratorium.edu', 'www.smithsonianmag.com', 'hdl.handle.net', 'www.history.com', 'www.etymonline.com', 'eur-lex.europa.eu', 'www.cfsan.fda.gov', 'www.worldcocoafoundation.org', 'www.spiritofaloha.com', 'news.bbc.co.uk', 'thecnnfreedomproject.blogs.cnn.com', 'www.pantagraph.com', 'www.bbc.co.uk', 'worldcocoafoundation.org'], ['https://en.wikipedia.org/robots.txt', 'http://ndb.nal.usda.gov/robots.txt', 'http://www.eluniversal.com.mx/robots.txt', 'http://archive.fieldmuseum.org/robots.txt', 'http://www.bartleby.com/robots.txt', 'http://antiquity.ac.uk/robots.txt', 'http://news.sciencemag.org/robots.txt', 'http://www.museum.upenn.edu/robots.txt', 'http://findarticles.com/robots.txt', 'http://www.newyorker.com/robots.txt', 'http://www.exploratorium.edu/robots.txt', 

URLError: <urlopen error [Errno 110] Connection timed out>