# Web Crawler
---

### Imports

In [1]:
import re
import requests
from collections import deque

from bs4 import BeautifulSoup as bs

import urllib.request
import urllib.parse as parse
import urllib.robotparser as rp

import time

import pandas as pd

## Initialisation

We set an initial URL in accordance with our theme, a maximum number of links to save and a maximum number of links to download the contents of. This way, if the links we get do not want to be crawled, we do not need to go back and save more links. 

In [2]:
URL = 'https://en.wikipedia.org/wiki/Chocolate'
max_links = 50     # max nb of links to crawl
max_cont = 20      # max nb of links to get content of 

## Getting the links

In [3]:
# Use regex to extract all links from url, stop if we reach maximum
def get_all_links (url, counter, Links = [], max_l=max_links):
    
    html_page = urllib.request.urlopen(url)   # Open url on internet
    soup = bs(html_page, 'lxml')              # Open contents of url
    
    for link in soup.findAll('a', attrs={'href': re.compile("^http://")}):
        if(counter < max_l)&(link not in Links):
            Links.append(link.get('href'))
            counter += 1
        else:
            break
            
    return (Links, counter)

In [30]:
"""This method return at most max_l links, starting from an initial url and getting its children links, 
their children links and so on in a breadth-first fashion.

Returns:
    list
        a list of strings that are the links
"""

def crawl_for_links(Links=[], pointer=0, max_l=max_links):
    
    url = Links[pointer]
    counter = len(Links)
        
    Links, counter = get_all_links (url, counter, Links, max_l)
    #html_page = urllib.request.urlopen(url)   # Open url on internet
    #soup = bs(html_page, 'lxml')              # Open contents of url
      

    #for link in soup.findAll('a', attrs={'href': re.compile("^http://")}):
    #    if(counter < max_l)&(link not in Links):
    #        Links.append(link.get('href'))
    #        counter += 1
    #    else:
    #        break
            
    # If we stll do not have enough links: repeat the process with the next url from the list
    if counter < max_l :
        Links = crawl_for_links(Links, pointer+1, max_l - counter)
        

    return Links

In [19]:
links_crawled = crawl_for_links([URL])
print(links_crawled)
print(len(links_crawled))

['https://en.wikipedia.org/wiki/Chocolate', 'http://ndb.nal.usda.gov/ndb/foods/show/6153?fgcd=&manu=&lfacet=&format=&count=&max=35&offset=&sort=&qlookup=Candies%2C+milk+chocolate', 'http://www.pnas.org/content/108/21/8595', 'http://www.eluniversal.com.mx/notas/526113.html', 'http://archive.fieldmuseum.org/Chocolate/history_mesoamerican7.html', 'http://www.bartleby.com/61/68/C0316800.html', 'http://antiquity.ac.uk/projgall/powis/index.html', 'http://news.sciencemag.org/2013/01/earliest-evidence-chocolate-north-america', 'http://www.museum.upenn.edu/new/news/fullrelease.php?which=306', 'http://archive.fieldmuseum.org/Chocolate/history_mesoamerican5.html', 'http://archive.fieldmuseum.org/Chocolate/history_mesoamerican3.html', 'http://archive.fieldmuseum.org/Chocolate/history_mesoamerican4.html', 'http://archive.fieldmuseum.org/chocolate/history.html', 'http://findarticles.com/p/articles/mi_m1310/is_1990_Jan/ai_8560999', 'http://www.newyorker.com/reporting/2007/10/29/071029fa_fact_buford',

In [4]:
def save_link (url, filename):
    doc = requests.get(url)
    # Put here the path where you want to save the link's contents
    name = "Docs_crawled4/" + filename + ".html"
    with open(name, 'wb') as fOut:
        fOut.write(doc.content)

In [21]:
# Test
save_link (URL, '000')
filename = 3*'0' + '1'
filename

'0001'

In [5]:
# Convention: the filename will be the url's index in the list, on 4 characters

# Prepare new filename
def new_filename (count, fname = '0000', conv = [3, 1]):
    nb_zeros = conv[0]
    pw = conv[1]
    
    if (count > 10**pw):
        nb_zeros -= 1
        pw += 1 
    # If the file index is above the next power of 10, we add one less 0
    # to the filename than we did before
    filename = nb_zeros*'0' + str (count)
    
    return (filename, conv)

In [20]:
filename = "0000"
nb_zeros = 3
pw = 1

conventions = [nb_zeros, pw]

count = 0

for url in links_crawled:
    # Save the link
    save_link(url, filename)
    count += 1
    # Break when we have enough links
    if count >= max_cont : break
        
    filename, conventions = new_filename(count, filename, conventions)
    

## Handling robots.txt

In [94]:
# Test on URL = wikipedia

# Getting domain url and robots.txt link
## Using urlparse
URL_parsed = parse.urlparse(URL)
print(URL_parsed.netloc)
URL_parsed

robot = URL_parsed.scheme + "://" + URL_parsed.netloc + "/robots.txt"
print(robot)

en.wikipedia.org
https://en.wikipedia.org/robots.txt


In [25]:
## Using urljoin -DOESN'T WORK 
robot = parse.urljoin(URL, "robots.txt")
print(robot)

robot = parse.urljoin(links_crawled[2], "robots.txt")
print(robot)

https://en.wikipedia.org/wiki/robots.txt
http://www.pnas.org/content/108/21/robots.txt


In [7]:
# Robot parsing

RP = rp.RobotFileParser()            # Define Robot parser object

#robot = "https://en.wikipedia.org/robots.txt"
RP.set_url(robot)                    # Set robots.txt link

RP.read()                            # Open robots.txt file
rrate = RP.request_rate("*")         # Return requests per second rate (requests, seconds)
print(RP.crawl_delay("*"))           # Returns crawl delay

RP.can_fetch("*", URL)               # Returns bool if can crawl that url

None


True

In [6]:
# Check if robots.txt exists -> doesn't work for our second link
def does_url_exist(url):
    try: 
        r = requests.head(url)
        if r.status_code < 400:
            print(1)
            return True
        else:
            print(0)
            return False
    except requests.exceptions.RequestException as e:
        print(e)
        # handle your exception

## Function bringing it all together 

Pseudocode: Crawler (nb_max_url, [URL], Domains, Links):

Get robots.txt url + domain 
Save domain, robots.txt url and crawl delay if don't already have it

If can fetch: 

    If domain in domains & time crawled > 0:
        Wait max(0, crawl delay - (current time - time crawled))
    
    Crawl: save contents 
    Update time crawled
    Save links if we still need more
    Save in matrix domains line: domain, crawl delay (20ms if none), time crawled

If haven't crawled enough links: repeat
    
Links = list of links we can crawl
Domains: [domain, robots.txt link, crawl delay, time last crawled]
(several links can have same domain)

Auxiliary functions: extract up to M links from a page, new filename, save a link (all checked individually, work)

To be able to wait for crawler delay time, need functions get current time, wait a certain time (need to know units of said time!!!)


In [24]:
domains = []
robots = []
delays = []
time_crawled = []

In [25]:
# Method with D as list of lists (domains, robots, delay, time_crawled)

def Crawler_list (D = [domains, robots, delays, time_crawled], Links=[URL], pointer=0, fname = '0000', max_l=max_links, max_c = max_cont, conv = [3, 1]):
    
    counter = len(Links)
    print("Nb links in Links: ", counter)
    print("Pointer: ", pointer)
    
    # Issue with 2nd link 
    if (pointer == 1):
        pointer +=1
        
    else:
    
        url = Links[pointer]
        already_crawled = True
        
        # Get url domain
        URL_parsed = parse.urlparse(url)
        dom = URL_parsed.netloc
        print("Domain: ", dom)

        # If we have not already tried to crawl in the domain
        if not (dom in domains):
            
            print("New_domain")
            already_crawled = False        
            # Get robots.txt link
            r = URL_parsed.scheme + "://" + dom + "/robots.txt"
            print("Robots.txt link: ", r)

            domains.append(dom)
            robots.append(r)
            delays.append(20)
            time_crawled.append(-1)


        i = domains.index(dom)
        print("Domain index: ", i)
        r = robots[i]
        print("Robots.txt link: ", r)

        # Access robots.txt
        RP = rp.RobotFileParser()         
        RP.set_url(r)                    
        RP.read()

        # Get crawl delay (if it is given and we don't already have it)
        if ((not already_crawled) & (RP.crawl_delay("*") != None)):
            robots.pop(i) 
            robots.insert(i, RP.crawl_delay("*"))

        # If we can crawl the file
        if (RP.can_fetch("*", URL)):

            # Wait appropriate time if we have already crawled the domain
            #if (already_crawled):
                #wait = max(0, crawl delay - (current time - time crawled))
                #if (wait >0): time.sleep(wait)

            # Save contents of the link
            save_link(url, fname)
            print("We are saving a link from domain ", dom)
            # time_crawled.pop(i)
            # time_crawled = time.time()                  # update time we last crawled the domain

            # Prepare filename for next page
            pointer += 1
            fname, conv = new_filename(pointer, fname, conv)

            # If we do not have enough links, save those referenced in the page
            if len(Links) < max_l:
                Links, counter = get_all_links(url, counter, Links, max_l)
                
        # If we can't crawl the file, we still move on to the next one
        else:
            pointer += 1
    
    print(D)
    print(Links)
    
    # If we have not saved enough links: repeat
    if (pointer < max_c):
        D2, Links2 = Crawler_list (D, Links, pointer, fname, max(0, max_l - counter), max_c, conv)
    
    return (D2, Links2)           

In [26]:
D, Links = Crawler_list()
# RecursionError: maximum recursion depth exceeded while calling a Python object


Nb links in Links:  1
Pointer:  0
Domain:  en.wikipedia.org
New_domain
Robots.txt link:  https://en.wikipedia.org/robots.txt
Domain index:  0
Robots.txt link:  https://en.wikipedia.org/robots.txt
We are saving a link from domain  en.wikipedia.org
[['en.wikipedia.org'], ['https://en.wikipedia.org/robots.txt'], [20], [-1]]
['https://en.wikipedia.org/wiki/Chocolate', 'http://ndb.nal.usda.gov/ndb/foods/show/6153?fgcd=&manu=&lfacet=&format=&count=&max=35&offset=&sort=&qlookup=Candies%2C+milk+chocolate', 'http://www.eluniversal.com.mx/notas/526113.html', 'http://archive.fieldmuseum.org/Chocolate/history_mesoamerican7.html', 'http://www.bartleby.com/61/68/C0316800.html', 'http://antiquity.ac.uk/projgall/powis/index.html', 'http://news.sciencemag.org/2013/01/earliest-evidence-chocolate-north-america', 'http://www.museum.upenn.edu/new/news/fullrelease.php?which=306', 'http://archive.fieldmuseum.org/Chocolate/history_mesoamerican5.html', 'http://archive.fieldmuseum.org/Chocolate/history_mesoamer

[['en.wikipedia.org', 'www.eluniversal.com.mx', 'archive.fieldmuseum.org'], ['https://en.wikipedia.org/robots.txt', 'http://www.eluniversal.com.mx/robots.txt', 'http://archive.fieldmuseum.org/robots.txt'], [20, 20, 20], [-1, -1, -1]]
['https://en.wikipedia.org/wiki/Chocolate', 'http://ndb.nal.usda.gov/ndb/foods/show/6153?fgcd=&manu=&lfacet=&format=&count=&max=35&offset=&sort=&qlookup=Candies%2C+milk+chocolate', 'http://www.eluniversal.com.mx/notas/526113.html', 'http://archive.fieldmuseum.org/Chocolate/history_mesoamerican7.html', 'http://www.bartleby.com/61/68/C0316800.html', 'http://antiquity.ac.uk/projgall/powis/index.html', 'http://news.sciencemag.org/2013/01/earliest-evidence-chocolate-north-america', 'http://www.museum.upenn.edu/new/news/fullrelease.php?which=306', 'http://archive.fieldmuseum.org/Chocolate/history_mesoamerican5.html', 'http://archive.fieldmuseum.org/Chocolate/history_mesoamerican3.html', 'http://archive.fieldmuseum.org/Chocolate/history_mesoamerican4.html', 'http

AttributeError: 'NoneType' object has no attribute 'delay'

In [23]:
# Test list methods on Domains: [domains, robot_links, delay, time]

domains = [1, 2, 3, 5]
robot_links = ["aba", "baa", "daa", "ett"]
delay = [20, 20, 20, 50]
time = [1, 2.3, 2.4, 3]

D = [domains, robot_links, delay, time]
print(D)

dom = 2

# pop then insert method done directly in list and D
if (dom in domains):
    i = domains.index(dom)
    
    #robot_links[i] = "g"
    robot_links.pop(i)
    robot_links.insert(i, "gg")
    print(robot_links)
    
    time.pop(i)
    time.insert(i, 4)
    print(time)
    
# append done directly in domains and D
else:
    i = len(domains)
    domains.append(dom)
    robot_links.append("c")
    delay.append(20)
    time.append(-1)

print(D)

[[1, 2, 3, 5], ['aba', 'baa', 'daa', 'ett'], [20, 20, 20, 50], [1, 2.3, 2.4, 3]]
['aba', 'gg', 'daa', 'ett']
[1, 4, 2.4, 3]
[[1, 2, 3, 5], ['aba', 'gg', 'daa', 'ett'], [20, 20, 20, 50], [1, 4, 2.4, 3]]


In [25]:
# Test list methods on Domains: [domain 1, domain 2,...] -> Obsolete

D = [[1, "a", 20, 1], [2, "b", 20, 2.3], [5, "d", 30, 2.4]]

#i= None
j = 0
#while (not i):
#    i = D[j].index(5)
#    j += 1

while (5 not in D[j]):
    j += 1

print(j)

D2 = D[][3]
print(D2)

SyntaxError: invalid syntax (<ipython-input-25-ea69e73f2215>, line 16)

In [127]:
# Method with Domains as a pandas dataframe

def Crawler (Domains = pd.DataFrame({"domains": [],"robot_links": [],"crawl_delay": [],"time": []}), Links=[URL], pointer=0, fname = '0000', max_l=max_links, max_c = max_cont, conv = [3, 1]):
          
    url = Links[pointer]
    already_crawled = True
    counter = len(Links)
    
    if (pointer != 1):
    
        # Get url domain
        URL_parsed = parse.urlparse(url)
        dom = URL_parsed.netloc

        # If we have not already tried to crawl in the domain
        if (not dom in Domains["domains"]):
            already_crawled = False
            # Get robots.txt link
            robot = URL_parsed.scheme + "://" + dom + "/robots.txt"

            D = pd.DataFrame ({
                "domains": [dom],
                "robot_links": [robot],
                "crawl_delay": [20],
                "time": [-1]
            })        
            Domains = pd.concat([Domains, D])
            Domains.reset_index(drop=True, inplace=True)
            print(Domains.head())

        # Set i the index of the row containing our current domain
        i = Domains.index[Domains["domains"] == dom].tolist()[0]

        robot = Domains.loc[i, "robot_links"]
        print(robot)

        # If robots.txt exists:
        if (does_url_exist(robot)):

            # Access robots.txt
            RP = rp.RobotFileParser()         
            RP.set_url(robot)  
            RP.read()

            # Get crawl delay (if it is given and we don't already have it)
            if (not already_crawled) & (RP.crawl_delay("*") != None):
                Domains.loc[i, "crawl_delay"] = RP.crawl_delay("*")

            # If we can crawl the file
            if (RP.can_fetch("*", URL)):

                # Wait appropriate time if we have already crawled the domain
                #if (already_crawled):
                #    Wait max(0, crawl delay - (current time - time crawled))
                # Use time.sleep(time to wait), first import time
                # time.time() = get_cpu time now

                # Save contents of the link
                save_link(url, fname)
                # Domains.loc[i, "time"] = current_time                  # update time we last crawled the domain

                # Prepare filename for next page
                pointer += 1
                fname, conv = new_filename(pointer, fname, conv)

                # If we do not have enough links, save those referencec in the page
                if len(Links) < max_l:
                    Links, counter = get_all_links(url, counter, Links, max_l)

                print(pointer)
                
    else: 
        pointer +=1
    
    # If we have not saved enough links: repeat
    if (pointer < max_c):
        Domains, Links = Crawler (Domains, Links, pointer, fname, max(0, max_l - counter), (max_c - pointer), conv)
        
    return (Domains, Links)           

In [128]:
Crawler()

            domains                          robot_links  crawl_delay  time
0  en.wikipedia.org  https://en.wikipedia.org/robots.txt         20.0  -1.0
https://en.wikipedia.org/robots.txt
1
1
            domains                          robot_links  crawl_delay  time
0  en.wikipedia.org  https://en.wikipedia.org/robots.txt         20.0  -1.0
1      www.pnas.org       http://www.pnas.org/robots.txt         20.0  -1.0
http://www.pnas.org/robots.txt
1
3
                  domains                               robot_links  \
0        en.wikipedia.org       https://en.wikipedia.org/robots.txt   
1            www.pnas.org            http://www.pnas.org/robots.txt   
2  www.eluniversal.com.mx  http://www.eluniversal.com.mx/robots.txt   

   crawl_delay  time  
0         20.0  -1.0  
1         10.0  -1.0  
2         20.0  -1.0  
http://www.eluniversal.com.mx/robots.txt
1
                  domains                               robot_links  \
0        en.wikipedia.org       https://en.wikipedia.o

(                  domains                               robot_links  \
 0        en.wikipedia.org       https://en.wikipedia.org/robots.txt   
 1            www.pnas.org            http://www.pnas.org/robots.txt   
 2  www.eluniversal.com.mx  http://www.eluniversal.com.mx/robots.txt   
 3  www.eluniversal.com.mx  http://www.eluniversal.com.mx/robots.txt   
 4  www.eluniversal.com.mx  http://www.eluniversal.com.mx/robots.txt   
 5  www.eluniversal.com.mx  http://www.eluniversal.com.mx/robots.txt   
 6  www.eluniversal.com.mx  http://www.eluniversal.com.mx/robots.txt   
 
    crawl_delay  time  
 0         20.0  -1.0  
 1         10.0  -1.0  
 2         20.0  -1.0  
 3         20.0  -1.0  
 4         20.0  -1.0  
 5         20.0  -1.0  
 6         20.0  -1.0  ,
 ['https://en.wikipedia.org/wiki/Chocolate',
  'http://ndb.nal.usda.gov/ndb/foods/show/6153?fgcd=&manu=&lfacet=&format=&count=&max=35&offset=&sort=&qlookup=Candies%2C+milk+chocolate',
  'http://www.pnas.org/content/108/21/8595',


In [83]:
# Test dataframe methods on Domains 

D = pd.DataFrame ({
    "domains": [1, 2, 3, 5],
    "robots_link": ["a", "b", "d", "e"],
    "crawl_delay": [20, 20, 20, 50],
    "time": [1, 2.3, 2.4, 3]
})

print(D.head())

#if 10 not in D["domains"]:
#    pd.melt(D, [10, "g", 20, 3.2]) #=> Melt only with df 
#print(D.head())

dom = 3

curr = D.loc[D["domains"] == 3]   # Makes copy of row!!! Can't modif that

#i = D[D["domains"] == 3].index
#print(i)
#D[i, 2] = 10
#print(D.head())

#r = D[D["domains"] == dom]
#print(r)
#print(r["time"])

r = D[D["domains"] == dom]["robots_link"]
print(r)

D[D["domains"] == dom]["robots_link"] = "h"
print(D[D["domains"] == dom]["robots_link"])

i = D.index[D["domains"] == dom].tolist()[0]
print(i)

D.loc[i, "robots_link"] = "i"
print(D.loc[i, "robots_link"])
print(D.head())


   domains robots_link  crawl_delay  time
0        1           a           20   1.0
1        2           b           20   2.3
2        3           d           20   2.4
3        5           e           50   3.0
2    d
Name: robots_link, dtype: object
2    d
Name: robots_link, dtype: object
2
i
   domains robots_link  crawl_delay  time
0        1           a           20   1.0
1        2           b           20   2.3
2        3           i           20   2.4
3        5           e           50   3.0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


KeyError: False