Refactor: move the crawling loop out of main.py into a reusable Crawler class
(crawler.py) with its static configuration in config.py.

Review fixes applied to the original patch:
 * Dropped the committed __pycache__/*.pyc blobs — compiled bytecode does not
   belong in version control (add __pycache__/ to .gitignore instead).
 * crawler.py: `raise ("Invalid domain")` raised a str (TypeError on Python 3);
   now raises ValueError.
 * crawler.py: removed `response.close()` from the urlopen except-branch —
   `response` is unbound there when urlopen raised, causing a NameError that
   masked the real error.
 * crawler.py: __init__ now fetches robots.txt when parserobots is set; the
   refactor had dropped the checkRobots() call the old main.py performed, so
   --parserobots silently did nothing.
 * NOTE(review): the XML/regex string literals below are reconstructed — the
   markup was stripped from the source this patch was recovered from
   (sitemaps.org 0.9 envelope, `<a href>` link regex, `<url><loc>` entries).
   Confirm against the original blobs before applying.
 * NOTE(review): `argparse.ArgumentParser(version="0.1", ...)` in main.py is
   deprecated (removed in Python 3.3+); left untouched because it appears in a
   context line, and the .pyc names show the project targets CPython 3.2.

diff --git a/config.py b/config.py
new file mode 100644
index 0000000..24bf6da
--- /dev/null
+++ b/config.py
@@ -0,0 +1,10 @@
+# Sitemap XML envelope (sitemaps.org 0.9 schema) and crawler User-Agent.
+xml_header = """<?xml version="1.0" encoding="UTF-8"?>
+<urlset xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+      xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9
+      http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd"
+      xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
+"""
+xml_footer = "</urlset>"
+
+crawler_user_agent = 'Sitemap crawler'
\ No newline at end of file
diff --git a/crawler.py b/crawler.py
new file mode 100644
index 0000000..fb4a77c
--- /dev/null
+++ b/crawler.py
@@ -0,0 +1,230 @@
+import config
+import logging
+
+import re
+from urllib.request import urlopen, Request
+from urllib.robotparser import RobotFileParser
+from urllib.parse import urlparse
+
+import os
+
+class Crawler():
+
+    # Variables
+    parserobots = False
+    output = None
+    report = False
+
+    config = None
+    domain = ""
+
+    exclude = []
+    skipext = []
+    drop = []
+
+    debug = False
+
+    tocrawl = set([])
+    crawled = set([])
+    excluded = set([])
+    # TODO also search for window.location={.*?}
+    linkregex = re.compile(b'<a href=[\'|"](.*?)[\'"].*?>')
+
+    rp = None
+    response_code={}
+    nb_url=1 # Number of url.
+    nb_rp=0 # Number of url blocked by the robots.txt
+    nb_exclude=0 # Number of url excluded by extension or word
+
+    output_file = None
+
+    target_domain = ""
+
+    def __init__(self, parserobots=False, output=None, report=False ,domain="", exclude=[], skipext=[], drop=[], debug=False):
+        self.parserobots = parserobots
+        self.output = output
+        self.report = report
+        self.domain = domain
+        self.exclude = exclude
+        self.skipext = skipext
+        self.drop = drop
+        self.debug = debug
+
+        if self.debug:
+            logging.basicConfig(level=logging.DEBUG)
+
+        self.tocrawl = set([domain])
+
+        try:
+            self.target_domain = urlparse(domain)[1]
+        except:
+            # raise a real exception type; `raise ("...")` is a TypeError in Python 3
+            raise ValueError("Invalid domain")
+
+        # Fetch robots.txt up front (the pre-refactor main.py did this)
+        if self.parserobots:
+            self.checkRobots()
+
+        if self.output:
+            try:
+                self.output_file = open(self.output, 'w')
+            except:
+                logging.debug ("Output file not available.")
+                exit(255)
+
+    def run(self):
+        print (config.xml_header, file=self.output_file)
+
+        logging.debug("Start the crawling process")
+        self.__crawling()
+        logging.debug("Crawling as reach the end of all found link")
+
+        print (config.xml_footer, file=self.output_file)
+
+
+    def __crawling(self):
+        crawling = self.tocrawl.pop()
+
+        url = urlparse(crawling)
+        self.crawled.add(crawling)
+
+        try:
+            request = Request(crawling, headers={"User-Agent":config.crawler_user_agent})
+            response = urlopen(request)
+        except Exception as e:
+            if hasattr(e,'code'):
+                if e.code in self.response_code:
+                    self.response_code[e.code]+=1
+                else:
+                    self.response_code[e.code]=1
+            logging.debug ("{1} ==> {0}".format(e, crawling))
+            # no response.close() here: urlopen raised, so "response" was never bound
+            return self.__continue_crawling()
+
+        # Read the response
+        try:
+            msg = response.read()
+            if response.getcode() in self.response_code:
+                self.response_code[response.getcode()]+=1
+            else:
+                self.response_code[response.getcode()]=1
+            response.close()
+        except Exception as e:
+            logging.debug ("{1} ===> {0}".format(e, crawling))
+            return self.__continue_crawling()
+
+        print ("<url><loc>"+url.geturl()+"</loc></url>", file=self.output_file)
+        if self.output_file:
+            self.output_file.flush()
+
+        # Found links
+        links = self.linkregex.findall(msg)
+        for link in links:
+            link = link.decode("utf-8")
+            #logging.debug("Found : {0}".format(link))
+            if link.startswith('/'):
+                link = 'http://' + url[1] + link
+            elif link.startswith('#'):
+                link = 'http://' + url[1] + url[2] + link
+            elif not link.startswith('http'):
+                link = 'http://' + url[1] + '/' + link
+
+            # Remove the anchor part if needed
+            if "#" in link:
+                link = link[:link.index('#')]
+
+            # Drop attributes if needed
+            for toDrop in self.drop:
+                link=re.sub(toDrop,'',link)
+
+            # Parse the url to get domain and file extension
+            parsed_link = urlparse(link)
+            domain_link = parsed_link.netloc
+            target_extension = os.path.splitext(parsed_link.path)[1][1:]
+
+            if (link in self.crawled):
+                continue
+            if (link in self.tocrawl):
+                continue
+            if (link in self.excluded):
+                continue
+            if (domain_link != self.target_domain):
+                continue
+            if ("javascript" in link):
+                continue
+
+            # Count one more URL
+            self.nb_url+=1
+
+            # Check if the navigation is allowed by the robots.txt
+            if (not self.can_fetch(link)):
+                self.exclude_link(link)
+                self.nb_rp+=1
+                continue
+
+            # Check if the current file extension is allowed or not.
+            if (target_extension in self.skipext):
+                self.exclude_link(link)
+                self.nb_exclude+=1
+                continue
+
+            # Check if the current url doesn't contain an excluded word
+            if (not self.exclude_url(link)):
+                self.exclude_link(link)
+                self.nb_exclude+=1
+                continue
+
+            self.tocrawl.add(link)
+
+        return self.__continue_crawling()
+
+    def __continue_crawling(self):
+        if self.tocrawl:
+            self.__crawling()
+
+    def exclude_link(self,link):
+        if link not in self.excluded:
+            self.excluded.add(link)
+
+    def checkRobots(self):
+        if self.domain[len(self.domain)-1] != "/":
+            self.domain += "/"
+        request = Request(self.domain+"robots.txt", headers={"User-Agent":config.crawler_user_agent})
+        self.rp = RobotFileParser()
+        self.rp.set_url(self.domain+"robots.txt")
+        self.rp.read()
+
+    def can_fetch(self, link):
+        try:
+            if self.parserobots:
+                if self.rp.can_fetch("*", link):
+                    return True
+                else:
+                    logging.debug ("Crawling of {0} disabled by robots.txt".format(link))
+                    return False
+
+            if not self.parserobots:
+                return True
+
+            return True
+        except:
+            # On error continue!
+            logging.debug ("Error during parsing robots.txt")
+            return True
+
+    def exclude_url(self, link):
+        for ex in self.exclude:
+            if ex in link:
+                return False
+        return True
+
+    def make_report(self):
+        print ("Number of found URL : {0}".format(self.nb_url))
+        print ("Number of link crawled : {0}".format(len(self.crawled)))
+        if self.parserobots:
+            print ("Number of link block by robots.txt : {0}".format(self.nb_rp))
+        if self.skipext or self.exclude:
+            print ("Number of link exclude : {0}".format(self.nb_exclude))
+
+        for code in self.response_code:
+            print ("Nb Code HTTP {0} : {1}".format(code, self.response_code[code]))
\ No newline at end of file
diff --git a/main.py b/main.py
index 2f09373..c6c38a6 100755
--- a/main.py
+++ b/main.py
@@ -1,44 +1,9 @@
-import re
-from urllib.request import urlopen, Request
-from urllib.robotparser import RobotFileParser
-from urllib.parse import urlparse
-
 import argparse
 import os
-import time
 import json
-import logging
-
-def can_fetch(parserobots, rp, link, debug=False):
-    try:
-        if parserobots:
-            if rp.can_fetch("*", link):
-                return True
-            else:
-                if debug:
-                    logging.debug ("Crawling of {0} disabled by robots.txt".format(link))
-                return False
-
-        if not parserobots:
-            return True
-        return True
-    except:
-        # On error continue!
-        if debug:
-            logging.debug ("Error during parsing robots.txt")
-        return True
-
-
-def exclude_url(exclude, link):
-    if exclude:
-        for ex in exclude:
-            if ex in link:
-                return False
-        return True
-    else:
-        return True
+import crawler
 
 # Gestion des parametres
 parser = argparse.ArgumentParser(version="0.1",description='Crawler pour la creation de site map')
@@ -63,10 +28,7 @@ def exclude_url(exclude, link):
 		config = json.load(config_data)
 		config_data.close()
 	except Exception as e:
-		if arg.debug:
-			logging.debug ("Bad or unavailable config file")
 		config = {}
-		print(e)
 else:
 	config = {}
@@ -83,186 +45,10 @@ def exclude_url(exclude, link):
 			dict_arg[argument] = config[argument]
 		else:
 			dict_arg[argument] = config[argument]
-	else:
-		logging.error ("Unknown flag in JSON")
-
-if arg.debug:
-	logging.basicConfig(level=logging.DEBUG)
-	logging.debug ("Configuration : ")
-	logging.debug (arg)
-
-output_file = None
-if arg.output:
-	try:
-		output_file = open(arg.output, 'w')
-	except:
-		if not arg.debug:
-			logging.debug ("Output file not available.")
-			exit(255)
-		else:
-			logging.debug ("Continue without output file.")
+del(dict_arg['config'])
 
-if arg.debug or arg.report:
-	time_start = time.clock()
-
-tocrawl = set([arg.domain])
-crawled = set([])
-excluded = set([])
-# TODO also search for window.location={.*?}
-linkregex = re.compile(b'<a href=[\'|"](.*?)[\'"].*?>')
-
-header = """<?xml version="1.0" encoding="UTF-8"?>
-<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
-"""
-footer = "</urlset>"
-
-try:
-	target_domain = urlparse(arg.domain)[1]
-except:
-	logging.debug ("Invalid domain")
-
-rp = None
-if arg.parserobots:
-	if arg.domain[len(arg.domain)-1] != "/":
-		arg.domain += "/"
-	request = Request(arg.domain+"robots.txt", headers={"User-Agent":'Sitemap crawler'})
-	rp = RobotFileParser()
-	rp.set_url(arg.domain+"robots.txt")
-	rp.read()
-
-response_code={}
-nb_url=1 # Number of url.
-nb_rp=0 # Number of url blocked by the robots.txt
-nb_exclude=0 # Number of url excluded by extension or word
-print (header, file=output_file)
-while tocrawl:
-	crawling = tocrawl.pop()
-
-
-	url = urlparse(crawling)
-	crawled.add(crawling)
-
-	try:
-		request = Request(crawling, headers={"User-Agent":'Sitemap crawler'})
-		response = urlopen(request)
-	except Exception as e:
-		if hasattr(e,'code'):
-			if e.code in response_code:
-				response_code[e.code]+=1
-			else:
-				response_code[e.code]=1
-		#else:
-		#	response_code['erreur']+=1
-		if arg.debug:
-			logging.debug ("{1} ==> {0}".format(e, crawling))
-		response.close()
-		continue
-
-	# Read the response
-	try:
-		msg = response.read()
-		if response.getcode() in response_code:
-			response_code[response.getcode()]+=1
-		else:
-			response_code[response.getcode()]=1
-		response.close()
-	except Exception as e:
-		if arg.debug:
-			logging.debug ("{1} ===> {0}".format(e, crawling))
-		continue
-
-
-	print ("<url><loc>"+url.geturl()+"</loc></url>", file=output_file)
-	if output_file:
-		output_file.flush()
-
-	# Found links
-	links = linkregex.findall(msg)
-	for link in links:
-		link = link.decode("utf-8")
-		if link.startswith('/'):
-			link = 'http://' + url[1] + link
-		elif link.startswith('#'):
-			link = 'http://' + url[1] + url[2] + link
-		elif not link.startswith('http'):
-			link = 'http://' + url[1] + '/' + link
-
-		# Remove the anchor part if needed
-		if "#" in link:
-			link = link[:link.index('#')]
-
-		# Drop attributes if needed
-		if arg.drop is not None:
-			for toDrop in arg.drop:
-				link=re.sub(toDrop,'',link)
-
-		# Parse the url to get domain and file extension
-		parsed_link = urlparse(link)
-		domain_link = parsed_link.netloc
-		target_extension = os.path.splitext(parsed_link.path)[1][1:]
-
-		if (link in crawled):
-			continue
-		if (link in tocrawl):
-			continue
-		if (link in excluded):
-			continue
-		if (domain_link != target_domain):
-			continue
-		if ("javascript" in link):
-			continue
-
-		# Count one more URL
-		nb_url+=1
-
-		# Check if the navigation is allowed by the robots.txt
-		if (not can_fetch(arg.parserobots, rp, link, arg.debug)):
-			if link not in excluded:
-				excluded.add(link)
-			nb_rp+=1
-			continue
-
-		# Check if the current file extension is allowed or not.
-		if (target_extension in arg.skipext):
-			if link not in excluded:
-				excluded.add(link)
-			nb_exclude+=1
-			continue
-
-		# Check if the current url doesn't contain an excluded word
-		if (not exclude_url(arg.exclude, link)):
-			if link not in excluded:
-				excluded.add(link)
-			nb_exclude+=1
-			continue
-
-		tocrawl.add(link)
-print (footer, file=output_file)
-
-if arg.debug or arg.report:
-	time_total = time.clock() - time_start
-
-if arg.debug:
-	logging.debug ("Number of found URL : {0}".format(nb_url))
-	logging.debug ("Number of link crawled : {0}".format(len(crawled)))
-	logging.debug ("Duration : {0}s".format(time_total))
+crawl = crawler.Crawler(**dict_arg)
+crawl.run()
 
 if arg.report:
-	print ("Number of found URL : {0}".format(nb_url))
-	print ("Number of link crawled : {0}".format(len(crawled)))
-	if arg.parserobots:
-		print ("Number of link block by robots.txt : {0}".format(nb_rp))
-	if arg.skipext or arg.exclude:
-		print ("Number of link exclude : {0}".format(nb_exclude))
-
-	for code in response_code:
-		print ("Nb Code HTTP {0} : {1}".format(code, response_code[code]))
-
-	print ("Duration : {0}s".format(int(time_total)))
-
-if output_file:
-	output_file.close()
+	crawl.make_report()
\ No newline at end of file