# **CRAWLER** 

Initializing all the libraries and parameters needed by the crawler.

In [None]:
import os 
import random
import sys
from queue import Queue
import urllib.request
import urllib.error
from bs4 import BeautifulSoup
from html import escape
import urllib.robotparser
import threading
import time
import datetime
from heapq import heapify, heappush, heappop

In [None]:
# Crawler Parameters
THREADS = 3                 # crawler threads started per batch
BACKQUEUES = 3 * THREADS    # number of back queues in the frontier
FRONTQUEUES = 5             # number of front queues (and highest priority value)
WAITTIME = 15               # intended wait, in seconds, before fetching again from the same back queue
CRAWLS = 1000               # presumably the number of URLs to crawl before stopping

__version__ = "0.2"
USAGE = "%prog [options] <url>"
VERSION = "%prog v{}".format(__version__)
AGENT = "{}/{}".format(__name__, __version__)  # value sent as the User-Agent header

out_urls = []        # urls we get from a page
visited_urls = []    # urls we have visited
lock = threading.Lock()  # shared lock handed to every crawler thread

# **FRONTIER**

Defining Frontier functions and logic for front queue and back queue operations 

In [None]:
def prioritizer(URL, f):
    """
    Assign a priority between 1 and ``f`` (both inclusive) to ``URL``.

    This is still a placeholder: ``URL`` is ignored and the priority is
    drawn at random.
    """
    # randrange excludes its upper bound, hence ``f + 1``.
    return random.randrange(1, f + 1)


def addSecs(tm, secs):
    """
    Return the time of day that lies ``secs`` seconds after ``tm``.

    Only the hour, minute and second of ``tm`` are used, so microseconds
    and tzinfo are dropped. Because only the time part of the result is
    returned, the value wraps around at midnight.
    """
    # Anchor the time on an arbitrary date so timedelta arithmetic is possible.
    anchor = datetime.datetime.combine(
        datetime.date(100, 1, 1),
        datetime.time(tm.hour, tm.minute, tm.second),
    )
    shifted = anchor + datetime.timedelta(seconds=secs)
    return shifted.time()


class frontier:
    """
    URL frontier made of prioritised front queues and per-host back queues.

    New URLs enter one of the front queues, chosen by ``prioritizer``.
    ``add_to_backqueue`` moves them into the back queues; a URL only joins a
    non-empty back queue whose head has the same host, so every back queue
    holds URLs of a single host. A min-heap of
    ``(last_access, back_queue_index)`` pairs lets ``get_URLs`` pick the back
    queue that was accessed least recently and wait until ``wait_time``
    seconds have passed since that access.
    """

    def __init__(self):
        self.seedURLs = ["https://docs.oracle.com/en/", "https://www.oracle.com/corporate/", "https://en.wikipedia.org/wiki/Machine_learning",
                        "https://www.csie.ntu.edu.tw/~cjlin/libsvm/index.html", "https://docs.oracle.com/middleware/jet210/jet/index.html",
                        "https://en.wikipedia.org/w/api.php", "https://en.wikipedia.org/api/", "https://en.wikipedia.org/wiki/Weka_(machine_learning)"]
        self.front_queue = [Queue() for _ in range(FRONTQUEUES)]
        self.back_queue = [Queue() for _ in range(BACKQUEUES)]
        # Seconds that must pass between two fetches from the same back queue.
        self.wait_time = WAITTIME
        self.curr_t = datetime.datetime.now().time()
        # The heap is keyed on time.monotonic() rather than a time of day, so
        # the ordering and the wait computation do not break at midnight.
        created = time.monotonic()
        self.heap = [(created, i) for i in range(BACKQUEUES)]
        heapify(self.heap)

    @staticmethod
    def _host_of(url):
        """Return the host part of ``url`` (third '/'-separated field), or None."""
        parts = url.split('/')
        return parts[2] if len(parts) > 2 else None

    def exists_in_fq(self, url, fq):
        """Return True when ``url`` is already stored in the front queue ``fq``."""
        return url in fq.queue

    def exists_in_bq(self, url, bq):
        """Return True when ``url`` is already stored in the back queue ``bq``."""
        return url in bq.queue

    def add_seed_urls(self):
        """Put every seed URL into a front queue, then start filling the back queues."""
        front_count = len(self.front_queue)
        for url in self.seedURLs:
            i = prioritizer(url, front_count)
            self.front_queue[i - 1].put(url)

        for _ in range(front_count):
            self.add_to_backqueue()  # fill backqueues

    def bq_empty(self):
        """Return True when at least one back queue is empty."""
        return any(bq.empty() for bq in self.back_queue)

    def add_URLs(self, URLs_list):
        """
        Add newly discovered URLs to the front queues.

        ``URLs_list`` may be empty or None (``fetch_URL`` returns None when
        robots.txt cannot be read); in both cases nothing happens. After each
        insertion one URL is moved to the back queues if any back queue is
        empty.
        """
        if not URLs_list:
            return

        front_count = len(self.front_queue)
        for url in URLs_list:
            target = self.front_queue[prioritizer(url, front_count) - 1]
            if not self.exists_in_fq(url, target):
                target.put(url)
                if self.bq_empty():
                    self.add_to_backqueue()

    def get_URLs(self):
        """
        Return the next URL to crawl, honouring the per-queue wait time.

        The least recently accessed back queue that currently holds a URL is
        selected; empty back queues are skipped and keep their heap entry. If
        every back queue is empty the least recently accessed one is used and
        the call blocks until a URL is put into it.
        """
        skipped = []
        entry = None
        while self.heap:
            candidate = heappop(self.heap)
            if not self.back_queue[candidate[1]].empty():
                entry = candidate
                break
            skipped.append(candidate)
        for item in skipped:
            heappush(self.heap, item)
        if entry is None:
            # Nothing is queued anywhere: keep the blocking behaviour of
            # Queue.get() on the least recently accessed back queue.
            entry = heappop(self.heap)

        last_access, bq_index = entry
        # Sleep for the remaining delay instead of spinning on the clock.
        remaining = last_access + self.wait_time - time.monotonic()
        if remaining > 0:
            time.sleep(remaining)

        URL = self.back_queue[bq_index].get()

        print("Access Time: ", datetime.datetime.now().time())
        heappush(self.heap, (time.monotonic(), bq_index))
        return URL

    def add_to_backqueue(self):
        """
        Move one URL from a randomly chosen non-empty front queue to a back queue.

        The URL is appended to the back queue that already serves its host,
        otherwise placed in the first empty back queue, otherwise returned to
        the front queue it came from. Invalid URLs and URLs that are already
        waiting in their host's back queue are dropped. Does nothing when all
        front queues are empty.
        """
        if all(fq.empty() for fq in self.front_queue):
            return  # nothing to move; avoids looping forever below

        front_count = len(self.front_queue)
        fq_n = prioritizer(" ", front_count)  # random fq selected
        while self.front_queue[fq_n - 1].empty():  # if fq is empty -> check another fq
            fq_n = prioritizer(" ", front_count)
        source = self.front_queue[fq_n - 1]
        url = source.get()

        if not is_valid_url(url):
            return
        host = self._host_of(url)
        if host is None:
            return  # no host part to group by

        # Look for the back queue that already serves this host. The head is
        # only peeked at, so the queue keeps its FIFO order.
        for bq in self.back_queue:
            try:
                head = bq.queue[0]
            except IndexError:
                continue  # empty back queue
            if self._host_of(head) == host:
                if not self.exists_in_bq(url, bq):
                    bq.put(url)
                return

        # Otherwise claim the first empty back queue for this host.
        for bq in self.back_queue:
            if bq.empty():
                bq.put(url)
                return

        # No back queue available: put the url back in its front queue.
        source.put(url)
        

# **FETCH URLS, PARSE AND FILTER**

Here we fetch a URL and, with the help of urllib, extract the contents of the HTML page. 
Once the contents are fetched, we parse them and retrieve the links they contain, resolving each link to an absolute URL.
Lastly, we check the robots.txt file under user agent * and only add a link to out_urls if robots.txt allows it.

In [None]:
class OpaqueDataException(Exception):
    """Raised for a response whose MIME type the crawler does not parse.

    Attributes are plain instance fields: ``mimetype`` is the reported
    content type and ``url`` the address the response came from.
    """

    def __init__(self, message, mimetype, url):
        super().__init__(message)
        self.url = url
        self.mimetype = mimetype

In [None]:
def is_valid_url(url):
    """
    Return True when ``url`` is a string that starts with the https scheme.

    The scheme prefix is tested rather than the substring "https", so
    addresses that merely contain that text somewhere (for example
    ``http://example.com/https`` or a mailto link) are rejected.
    """
    return isinstance(url, str) and url.startswith("https://")


def fetch_URL(url):
    """
    Download ``url`` and return the crawlable links found on the page.

    The URL is recorded in ``visited_urls`` first. The robots.txt of the
    URL's host is then read; if that fails, or the URL has no host part, an
    empty list is returned. HTTP/URL errors and non-HTML responses are
    reported on stderr and also yield an empty list.

    Returns a list of absolute URLs, without duplicates, that are not yet in
    ``visited_urls``, pass ``is_valid_url`` and are allowed for user agent
    ``*`` by the robots.txt that was read.
    """
    # Imported explicitly; the file only gets urllib.parse as a side effect
    # of importing urllib.request.
    from urllib.parse import urljoin

    visited_urls.append(url)
    url_split = url.split('/')
    if len(url_split) < 3:
        return []  # no host part, so there is no robots.txt to consult

    rp = urllib.robotparser.RobotFileParser()
    rp.set_url("https://" + url_split[2] + "/robots.txt")
    try:
        rp.read()
    except IOError:
        # Same result as every other failure path: nothing to crawl.
        return []

    request = urllib.request.Request(url)
    request.add_header("User-Agent", AGENT)
    opener = urllib.request.build_opener()

    tags = []
    try:
        # The context manager closes the response even when parsing is skipped.
        with opener.open(request) as data:
            mime_type = data.info().get_content_type()
            url = data.geturl()  # final address after redirects; base for relative links
            if mime_type != "text/html":
                raise OpaqueDataException("Not interested in files of type %s" % mime_type,
                                          mime_type, url)
            content = data.read().decode("utf-8", errors="replace")
        tags = BeautifulSoup(content, "html.parser")('a')
    except urllib.error.HTTPError as error:
        if error.code == 404:
            print("ERROR: %s -> %s" % (error, error.url), file=sys.stderr)
        else:
            print("ERROR: %s" % error, file=sys.stderr)
    except urllib.error.URLError as error:
        print("ERROR: %s" % error, file=sys.stderr)
    except OpaqueDataException as error:
        print("Skipping %s, has type %s" % (error.url, error.mimetype), file=sys.stderr)

    # NOTE(review): every link is checked against the robots.txt of the page's
    # own host, even when the link points to a different host.
    page_url = url
    found = []
    seen = set()
    for tag in tags:
        href = tag.get("href")
        if href is None:
            continue
        # Always resolve against the page URL, and use the href as parsed:
        # HTML-escaping it would turn '&' in query strings into '&amp;'.
        link = urljoin(page_url, href)
        if link in seen or link in visited_urls:
            continue
        if is_valid_url(link) and rp.can_fetch("*", link):
            seen.add(link)
            found.append(link)
    return found

# **RUN CRAWLER**

Initializing the frontier (front queues and back queues), loading the seed URLs, defining the thread task, and running the crawler.

In [None]:
# Create the shared frontier: front queues, back queues and the heap of
# back-queue access times.
url_frontier = frontier()
# Distribute the seed URLs over the front queues and move a first batch of
# them into the back queues.
url_frontier.add_seed_urls()
# Move one more URL from the front queues to the back queues.
url_frontier.add_to_backqueue()

In [None]:
# Thread task
def crawler_thread_task(lock):
    """
    Crawl one URL: take it from the frontier, fetch it, queue its links.

    The whole step runs while holding ``lock``. The lock is taken with a
    ``with`` block so it is released even when fetching or queueing raises;
    otherwise every later thread would block forever on ``acquire``.
    """
    with lock:
        url = url_frontier.get_URLs()
        print("Url Crawled:", url)

        url_list = fetch_URL(url)
        # fetch_URL may return None when robots.txt cannot be read.
        url_frontier.add_URLs(url_list or [])


In [None]:
# start the threads

threads = []
# Run batches of THREADS workers until CRAWLS URLs have been visited. A batch
# always runs to completion, so the final count can exceed CRAWLS by up to
# THREADS - 1.
while len(visited_urls) < CRAWLS:
    # Thread objects cannot be restarted, so a fresh set is built per batch.
    threads = [
        threading.Thread(target=crawler_thread_task, args=(lock,))
        for _ in range(THREADS)
    ]

    for worker in threads:
        worker.start()

    for worker in threads:
        worker.join()

Access Time:  13:25:07.000051
Url Crawled: https://docs.oracle.com/en/
Access Time:  13:25:07.351333
Url Crawled: https://en.wikipedia.org/api/
Access Time:  13:25:07.461041
Url Crawled: https://www.oracle.com/corporate/
Access Time:  13:25:07.806973
Url Crawled: https://www.csie.ntu.edu.tw/~cjlin/libsvm/index.html
Access Time:  13:25:09.744653
Url Crawled: https://foundation.wikimedia.org/wiki/Developer_app_guidelines
Access Time:  13:25:10.094047
Url Crawled: https://developer.oracle.com/
Access Time:  13:25:10.596464
Url Crawled: https://profile.oracle.com/myprofile/account/create-account.jspx
Access Time:  13:25:11.074765
Url Crawled: https://academy.oracle.com/en/oa-web-overview.html
Access Time:  13:25:11.439627
Url Crawled: https://partner-finder.oracle.com/corporate/covid-19.html
<ipykernel.iostream.OutStream object at 0x7f076c8cba50> ERROR: HTTP Error 302: The HTTP server returned a redirect error that would lead to an infinite loop.
The last 30x error message was:
Moved Tempo

## **End of Notebook**