# webhunt

__general.py__

In [None]:
import os

#Each website you crawl is a separate project
def create_project_dir(directory):
    """Create the project directory unless something already exists at that path.

    A progress message is printed only when the directory is actually created.
    """
    if os.path.exists(directory):
        return
    print('Creating project...' + directory)
    os.makedirs(directory)
        
# Example run: make sure the 'thevipulsharma' project directory exists.
create_project_dir('thevipulsharma')

In [67]:
# Create queue and crawled files (if not created)
def create_data_files(project_name, base_url):
    """Create the project's queue and crawled files when they are missing.

    A new queue file is seeded with base_url; a new crawled file starts
    empty. Files that already exist are left untouched.
    """
    initial_contents = (
        (project_name + '/queue.txt', base_url),  # Base url to begin with
        (project_name + '/crawled.txt', ''),
    )
    for path, contents in initial_contents:
        if not os.path.isfile(path):
            write_file(path, contents)
        
# Create a new file
def write_file(path, data):
    """Create (or overwrite) the file at path with data as its whole content.

    Args:
        path: Location of the file to write.
        data: Text to store in the file.
    """
    # The context manager closes the handle even when write() raises.
    with open(path, 'w') as f:
        f.write(data)
    
# Example run: create the 'thevipulsharma' queue/crawled files, seeding the queue with the start URL.
create_data_files('thevipulsharma', 'https://www.github.com/thevipulsharma/')

In [68]:
# Add data onto an existing file
def append_to_file(path, data):
    """Append data to the file at path, followed by a newline."""
    with open(path, mode='a') as handle:
        line = data + '\n'
        handle.write(line)
        
# Delete the contents of a file
def delete_file_contents(path):
    """Empty the file at path; the file is created if it does not exist yet."""
    # Opening in write mode truncates the file, so there is nothing to write.
    open(path, 'w').close()
    
# Read a file and convert each line to set items
def file_to_set(file_name):
    """Return the lines of file_name as a set, with newline characters removed."""
    with open(file_name, 'rt') as handle:
        return {line.replace('\n', '') for line in handle}
    
# Iterate through a set, each item will be a new line in the file
def set_to_file(links, file):
    """Overwrite file with the items of links, one per line, in sorted order.

    Args:
        links: Iterable of strings to persist.
        file: Path of the file to rewrite; any previous content is discarded.
    """
    # A single open in write mode both truncates the file and writes all the
    # lines, instead of reopening the file once per link.
    with open(file, 'w') as handle:
        handle.writelines(link + '\n' for link in sorted(links))

__link_finder.py__

In [69]:
from html.parser import HTMLParser
from urllib import parse

# Inheriting from the HTMLParser class
class LinkFinder(HTMLParser):
    """HTML parser that collects the target of every <a href="..."> it is fed.

    Args:
        base_url: URL of the site being crawled; used to resolve links only
            when page_url is empty.
        page_url: URL of the page whose HTML is fed to the parser; relative
            links are resolved against it.
    """

    def __init__(self, base_url, page_url):
        super().__init__()
        self.base_url = base_url
        self.page_url = page_url
        self.links = set()

    def handle_starttag(self, tag, attrs):
        """Record the absolute URL of each href found on an anchor tag."""
        if tag != 'a':
            return
        # A relative href is relative to the page it appears on, not to the
        # site root, so resolve against page_url when one was supplied.
        resolve_against = self.page_url or self.base_url
        for attribute, value in attrs:
            # A bare `href` attribute has the value None and an empty href
            # points at the page itself; neither is a new link.
            if attribute == 'href' and value:
                self.links.add(parse.urljoin(resolve_against, value))

    def page_links(self):
        """Return the set of absolute URLs collected so far."""
        return self.links

    def error(self, message):
        """Ignore parser errors instead of raising.

        NOTE(review): only older HTMLParser versions appear to call this
        hook -- confirm before removing it.
        """
        pass

'''
finder = LinkFinder()
finder.feed('<html><head><title>Test</title></head><body><h1>Parse me!</h1></body></html>')
'''

"\nfinder = LinkFinder()\nfinder.feed('<html><head><title>Test</title></head><body><h1>Parse me!</h1></body></html>')\n"

__spider.py__

In [70]:
from urllib.request import urlopen
# from link_finder import LinkFinder
# from general import *

class Spider:
    """Crawl pages of one site while tracking queued and crawled URLs.

    All state is stored on the class itself, so every instance and every
    direct call to the static methods works on the same queue and crawled
    sets, which are mirrored to two text files inside the project directory.

    NOTE(review): nothing in this class synchronizes access to that shared
    state -- confirm this is acceptable for callers that use several threads.
    """

    # Class variables (shared among all instances)
    project_name = ''
    base_url = ''
    domain_name = ''
    queue_file = ''      # path of the file mirroring `queue`
    crawled_file = ''    # path of the file mirroring `crawled`
    queue = set()        # URLs waiting to be crawled
    crawled = set()      # URLs that have been crawled

    def __init__(self, project_name, base_url, domain_name):
        """Configure the shared state, load the project files and crawl base_url.

        Args:
            project_name: Directory that holds the queue and crawled files.
            base_url: First page to crawl.
            domain_name: Text a URL must contain to be queued.
        """
        Spider.project_name = project_name
        Spider.base_url = base_url
        Spider.domain_name = domain_name
        Spider.queue_file = Spider.project_name + '/queue.txt'
        Spider.crawled_file = Spider.project_name + '/crawled.txt'
        self.boot()
        self.crawl_page('First Spider', Spider.base_url)

    @staticmethod
    def boot():
        """Create the project directory and files if needed, then load both sets."""
        create_project_dir(Spider.project_name)
        create_data_files(Spider.project_name, Spider.base_url)
        Spider.queue = file_to_set(Spider.queue_file)
        Spider.crawled = file_to_set(Spider.crawled_file)

    @staticmethod
    def crawl_page(thread_name, page_url):
        """Crawl page_url unless it has been crawled already.

        Links found on the page are queued, the page is moved from the queue
        to the crawled set, and both sets are written back to their files.

        Args:
            thread_name: Label printed in the progress message.
            page_url: URL of the page to crawl.
        """
        if page_url in Spider.crawled:
            return
        print(thread_name + ' now crawling: ' + page_url)
        print('Queue ' + str(len(Spider.queue)) + ' | Crawled ' + str(len(Spider.crawled)))
        Spider.add_links_to_queue(Spider.gather_links(page_url))
        # discard() rather than remove(): the page may not be in the queue
        # (for example when it was handed in directly), and that must not
        # abort the crawl with a KeyError.
        Spider.queue.discard(page_url)
        Spider.crawled.add(page_url)
        Spider.update_files()

    @staticmethod
    def gather_links(page_url):
        """Download page_url and return the set of links found in its HTML.

        Responses whose content type is not text/html yield an empty set.
        Any failure while fetching, decoding or parsing is reported on
        stdout and also yields an empty set.
        """
        html_string = ''
        try:
            # The context manager closes the connection on every path.
            with urlopen(page_url) as response:
                headers = response.headers
                # Compare the parsed media type so that variants such as
                # 'text/html' or 'text/html; charset=UTF-8' are accepted too.
                if headers.get_content_type() == 'text/html':
                    charset = headers.get_content_charset() or 'utf-8'
                    html_string = response.read().decode(charset)

            finder = LinkFinder(Spider.base_url, page_url)
            finder.feed(html_string)
        except Exception as error:
            # Best effort: one bad page must not stop the crawl, but
            # KeyboardInterrupt/SystemExit are no longer swallowed.
            print('Error: can not crawl page!')
            print(str(error))
            return set()
        return finder.page_links()

    @staticmethod
    def add_links_to_queue(links):
        """Queue every link that is new and contains the configured domain name."""
        for url in links:
            if url in Spider.queue:
                continue
            if url in Spider.crawled:
                continue
            # Plain substring test against the whole URL.
            if Spider.domain_name not in url:
                continue
            Spider.queue.add(url)

    @staticmethod
    def update_files():
        """Write the queue and crawled sets back to their files."""
        set_to_file(Spider.queue, Spider.queue_file)
        set_to_file(Spider.crawled, Spider.crawled_file)

__domain.py__

In [None]:
from urllib.parse import urlparse

# Get domain name (example.com)
def get_domain_name(url):
    """Return the last two dot-separated labels of the URL's network location.

    For 'https://name.example.com' this is 'example.com'. An empty string is
    returned when the network location has fewer than two labels or is not
    text.
    """
    try:
        results = get_sub_domain_name(url).split('.')
        return results[-2] + '.' + results[-1]
    except (IndexError, TypeError):
        # IndexError: fewer than two labels (e.g. an empty network location).
        # TypeError: the network location came back as bytes rather than str.
        return ''

# Get sub domain name (name.example.com)
def get_sub_domain_name(url):
    """Return the network location part of url (e.g. 'name.example.com').

    An empty string is returned when url cannot be parsed.
    """
    try:
        return urlparse(url).netloc
    except (ValueError, AttributeError, TypeError):
        # urlparse raises ValueError for malformed URLs and
        # AttributeError/TypeError for arguments that are not str or bytes.
        return ''
    
# Example: prints 'example.com', the last two dot-separated labels of the host.
print(get_domain_name('https://name.example.com'))

__main.py__

In [None]:
import threading
from queue import Queue
# from spider import Spider
# from domain import *
# from general import *

# Crawl configuration: the project directory, start page and derived values.
PROJECT_NAME = 'thevipulsharma'
HOMEPAGE = 'https://www.github.com/thevipulsharma/'
DOMAIN_NAME = get_domain_name(HOMEPAGE)
QUEUE_FILE = f'{PROJECT_NAME}/queue.txt'
CRAWLED_FILE = f'{PROJECT_NAME}/crawled.txt'
# Number of worker threads; depends on the os and some other factors as well
NUMBER_OF_THREADS = 8
# Job queue shared between the main thread and the workers
queue = Queue()

# Creating worker threads (will die when main exits)
def create_workers():
    """Start NUMBER_OF_THREADS daemon threads, each running work()."""
    for _ in range(NUMBER_OF_THREADS):
        # Daemon threads die as soon as the main thread exits.
        threading.Thread(target=work, daemon=True).start()
        
# Do the next job in the queue
def work():
    """Worker loop: take URLs from the job queue forever and crawl each one.

    Every job is marked done even when crawling it fails. Otherwise one
    exception would kill the worker and leave queue.join() waiting forever.
    """
    while True:
        url = queue.get()
        try:
            Spider.crawl_page(threading.current_thread().name, url)
        except Exception as error:
            # Report and move on so this worker keeps serving later jobs.
            print('Error: crawling ' + str(url) + ' failed: ' + str(error))
        finally:
            queue.task_done()

# Each queued link is new job
def _run_jobs(links):
    """Put every link on the job queue and block until the workers finish them."""
    for link in links:
        queue.put(link)
    queue.join()


def create_jobs():
    """Turn each link in the queue file into a job, then keep crawling.

    After the current batch has been processed, crawl() takes over and
    continues until the queue file is empty.
    """
    _run_jobs(file_to_set(QUEUE_FILE))
    crawl()


# Check if there are items in the queue, if so crawl them
def crawl():
    """Process the queue file batch by batch until it holds no more links.

    This is a loop rather than crawl() and create_jobs() calling each other,
    so a long crawl cannot exhaust the interpreter's recursion limit.
    """
    while True:
        queued_links = file_to_set(QUEUE_FILE)
        if not queued_links:
            break
        print(str(len(queued_links)) + ' links in the queue')
        _run_jobs(queued_links)

# Entry point. Order matters: constructing the Spider sets up the project
# files and crawls HOMEPAGE, then the worker threads are started, and finally
# crawl() feeds them the queued links.
Spider(PROJECT_NAME, HOMEPAGE, DOMAIN_NAME)
create_workers()
crawl()