diff --git a/__pycache__/config.cpython-32.pyc b/__pycache__/config.cpython-32.pyc
new file mode 100644
index 0000000..1845021
Binary files /dev/null and b/__pycache__/config.cpython-32.pyc differ
diff --git a/__pycache__/crawler.cpython-32.pyc b/__pycache__/crawler.cpython-32.pyc
new file mode 100644
index 0000000..817c27a
Binary files /dev/null and b/__pycache__/crawler.cpython-32.pyc differ
diff --git a/config.py b/config.py
new file mode 100644
index 0000000..24bf6da
--- /dev/null
+++ b/config.py
@@ -0,0 +1,10 @@
+xml_header = """<?xml version="1.0" encoding="UTF-8"?>
+<urlset
+      xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
+      xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+      xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9
+            http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd">
+"""
+xml_footer = "</urlset>"
+
+crawler_user_agent = 'Sitemap crawler'
\ No newline at end of file
diff --git a/crawler.py b/crawler.py
new file mode 100644
index 0000000..fb4a77c
--- /dev/null
+++ b/crawler.py
@@ -0,0 +1,227 @@
+import config
+import logging
+
+import re
+from urllib.request import urlopen, Request
+from urllib.robotparser import RobotFileParser
+from urllib.parse import urlparse
+
+import os
+
+class Crawler():
+
+ # Variables
+ parserobots = False
+ output = None
+ report = False
+
+ config = None
+ domain = ""
+
+ exclude = []
+ skipext = []
+ drop = []
+
+ debug = False
+
+ tocrawl = set([])
+ crawled = set([])
+ excluded = set([])
+ # TODO also search for window.location={.*?}
+	linkregex = re.compile(b'<a href=[\'|"](.*?)[\'"].*?>')
+
+ rp = None
+ response_code={}
+ nb_url=1 # Number of url.
+ nb_rp=0 # Number of url blocked by the robots.txt
+ nb_exclude=0 # Number of url excluded by extension or word
+
+ output_file = None
+
+ target_domain = ""
+
+ def __init__(self, parserobots=False, output=None, report=False ,domain="", exclude=[], skipext=[], drop=[], debug=False):
+ self.parserobots = parserobots
+ self.output = output
+ self.report = report
+ self.domain = domain
+ self.exclude = exclude
+ self.skipext = skipext
+ self.drop = drop
+ self.debug = debug
+
+ if self.debug:
+ logging.basicConfig(level=logging.DEBUG)
+
+ self.tocrawl = set([domain])
+
+ try:
+ self.target_domain = urlparse(domain)[1]
+ except:
+ raise ("Invalid domain")
+
+
+ if self.output:
+ try:
+ self.output_file = open(self.output, 'w')
+ except:
+ logging.debug ("Output file not available.")
+ exit(255)
+
+ def run(self):
+ print (config.xml_header, file=self.output_file)
+
+ logging.debug("Start the crawling process")
+ self.__crawling()
+ logging.debug("Crawling as reach the end of all found link")
+
+ print (config.xml_footer, file=self.output_file)
+
+
+ def __crawling(self):
+ crawling = self.tocrawl.pop()
+
+ url = urlparse(crawling)
+ self.crawled.add(crawling)
+
+ try:
+ request = Request(crawling, headers={"User-Agent":config.crawler_user_agent})
+ response = urlopen(request)
+ except Exception as e:
+ if hasattr(e,'code'):
+ if e.code in self.response_code:
+ self.response_code[e.code]+=1
+ else:
+ self.response_code[e.code]=1
+ logging.debug ("{1} ==> {0}".format(e, crawling))
+ response.close()
+ return self.__continue_crawling()
+
+ # Read the response
+ try:
+ msg = response.read()
+ if response.getcode() in self.response_code:
+ self.response_code[response.getcode()]+=1
+ else:
+ self.response_code[response.getcode()]=1
+ response.close()
+ except Exception as e:
+ logging.debug ("{1} ===> {0}".format(e, crawling))
+ return self.__continue_crawling()
+
+
+		print ("<url><loc>"+url.geturl()+"</loc></url>", file=self.output_file)
+ if self.output_file:
+ self.output_file.flush()
+
+ # Found links
+ links = self.linkregex.findall(msg)
+ for link in links:
+ link = link.decode("utf-8")
+ #logging.debug("Found : {0}".format(link))
+ if link.startswith('/'):
+ link = 'http://' + url[1] + link
+ elif link.startswith('#'):
+ link = 'http://' + url[1] + url[2] + link
+ elif not link.startswith('http'):
+ link = 'http://' + url[1] + '/' + link
+
+ # Remove the anchor part if needed
+ if "#" in link:
+ link = link[:link.index('#')]
+
+ # Drop attributes if needed
+ for toDrop in self.drop:
+ link=re.sub(toDrop,'',link)
+
+ # Parse the url to get domain and file extension
+ parsed_link = urlparse(link)
+ domain_link = parsed_link.netloc
+ target_extension = os.path.splitext(parsed_link.path)[1][1:]
+
+ if (link in self.crawled):
+ continue
+ if (link in self.tocrawl):
+ continue
+ if (link in self.excluded):
+ continue
+ if (domain_link != self.target_domain):
+ continue
+ if ("javascript" in link):
+ continue
+
+ # Count one more URL
+ self.nb_url+=1
+
+ # Check if the navigation is allowed by the robots.txt
+ if (not self.can_fetch(link)):
+ self.exclude_link(link)
+ self.nb_rp+=1
+ continue
+
+ # Check if the current file extension is allowed or not.
+ if (target_extension in self.skipext):
+ self.exclude_link(link)
+ self.nb_exclude+=1
+ continue
+
+ # Check if the current url doesn't contain an excluded word
+ if (not self.exclude_url(link)):
+ self.exclude_link(link)
+ self.nb_exclude+=1
+ continue
+
+ self.tocrawl.add(link)
+
+ return self.__continue_crawling()
+
+ def __continue_crawling(self):
+ if self.tocrawl:
+ self.__crawling()
+
+ def exclude_link(self,link):
+ if link not in self.excluded:
+ self.excluded.add(link)
+
+ def checkRobots(self):
+ if self.domain[len(self.domain)-1] != "/":
+ self.domain += "/"
+ request = Request(self.domain+"robots.txt", headers={"User-Agent":config.crawler_user_agent})
+ self.rp = RobotFileParser()
+ self.rp.set_url(self.domain+"robots.txt")
+ self.rp.read()
+
+ def can_fetch(self, link):
+ try:
+ if self.parserobots:
+ if self.rp.can_fetch("*", link):
+ return True
+ else:
+ logging.debug ("Crawling of {0} disabled by robots.txt".format(link))
+ return False
+
+ if not self.parserobots:
+ return True
+
+ return True
+ except:
+ # On error continue!
+ logging.debug ("Error during parsing robots.txt")
+ return True
+
+ def exclude_url(self, link):
+ for ex in self.exclude:
+ if ex in link:
+ return False
+ return True
+
+ def make_report(self):
+ print ("Number of found URL : {0}".format(self.nb_url))
+ print ("Number of link crawled : {0}".format(len(self.crawled)))
+ if self.parserobots:
+ print ("Number of link block by robots.txt : {0}".format(self.nb_rp))
+ if self.skipext or self.exclude:
+ print ("Number of link exclude : {0}".format(self.nb_exclude))
+
+ for code in self.response_code:
+ print ("Nb Code HTTP {0} : {1}".format(code, self.response_code[code]))
\ No newline at end of file
diff --git a/main.py b/main.py
index 2f09373..c6c38a6 100755
--- a/main.py
+++ b/main.py
@@ -1,44 +1,9 @@
-import re
-from urllib.request import urlopen, Request
-from urllib.robotparser import RobotFileParser
-from urllib.parse import urlparse
-
import argparse
import os
-import time
import json
-import logging
-
-def can_fetch(parserobots, rp, link, debug=False):
- try:
- if parserobots:
- if rp.can_fetch("*", link):
- return True
- else:
- if debug:
- logging.debug ("Crawling of {0} disabled by robots.txt".format(link))
- return False
-
- if not parserobots:
- return True
- return True
- except:
- # On error continue!
- if debug:
- logging.debug ("Error during parsing robots.txt")
- return True
-
-
-def exclude_url(exclude, link):
- if exclude:
- for ex in exclude:
- if ex in link:
- return False
- return True
- else:
- return True
+import crawler
# Gestion des parametres
parser = argparse.ArgumentParser(version="0.1",description='Crawler pour la creation de site map')
@@ -63,10 +28,7 @@ def exclude_url(exclude, link):
config = json.load(config_data)
config_data.close()
except Exception as e:
- if arg.debug:
- logging.debug ("Bad or unavailable config file")
config = {}
- print(e)
else:
config = {}
@@ -83,186 +45,10 @@ def exclude_url(exclude, link):
dict_arg[argument] = config[argument]
else:
dict_arg[argument] = config[argument]
- else:
- logging.error ("Unknown flag in JSON")
-
-if arg.debug:
- logging.basicConfig(level=logging.DEBUG)
- logging.debug ("Configuration : ")
- logging.debug (arg)
-
-output_file = None
-if arg.output:
- try:
- output_file = open(arg.output, 'w')
- except:
- if not arg.debug:
- logging.debug ("Output file not available.")
- exit(255)
- else:
- logging.debug ("Continue without output file.")
+del(dict_arg['config'])
-if arg.debug or arg.report:
- time_start = time.clock()
-
-tocrawl = set([arg.domain])
-crawled = set([])
-excluded = set([])
-# TODO also search for window.location={.*?}
-linkregex = re.compile(b'<a href=[\'|"](.*?)[\'"].*?>')
-
-header = """<?xml version="1.0" encoding="UTF-8"?>
-<urlset
-      xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
-      xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
-      xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9
-            http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd">
-"""
-footer = "</urlset>"
-
-try:
- target_domain = urlparse(arg.domain)[1]
-except:
- logging.debug ("Invalid domain")
-
-rp = None
-if arg.parserobots:
- if arg.domain[len(arg.domain)-1] != "/":
- arg.domain += "/"
- request = Request(arg.domain+"robots.txt", headers={"User-Agent":'Sitemap crawler'})
- rp = RobotFileParser()
- rp.set_url(arg.domain+"robots.txt")
- rp.read()
-
-response_code={}
-nb_url=1 # Number of url.
-nb_rp=0 # Number of url blocked by the robots.txt
-nb_exclude=0 # Number of url excluded by extension or word
-print (header, file=output_file)
-while tocrawl:
- crawling = tocrawl.pop()
-
-
- url = urlparse(crawling)
- crawled.add(crawling)
-
- try:
- request = Request(crawling, headers={"User-Agent":'Sitemap crawler'})
- response = urlopen(request)
- except Exception as e:
- if hasattr(e,'code'):
- if e.code in response_code:
- response_code[e.code]+=1
- else:
- response_code[e.code]=1
- #else:
- # response_code['erreur']+=1
- if arg.debug:
- logging.debug ("{1} ==> {0}".format(e, crawling))
- response.close()
- continue
-
- # Read the response
- try:
- msg = response.read()
- if response.getcode() in response_code:
- response_code[response.getcode()]+=1
- else:
- response_code[response.getcode()]=1
- response.close()
- except Exception as e:
- if arg.debug:
- logging.debug ("{1} ===> {0}".format(e, crawling))
- continue
-
-
-	print ("<url><loc>"+url.geturl()+"</loc></url>", file=output_file)
- if output_file:
- output_file.flush()
-
- # Found links
- links = linkregex.findall(msg)
- for link in links:
- link = link.decode("utf-8")
- if link.startswith('/'):
- link = 'http://' + url[1] + link
- elif link.startswith('#'):
- link = 'http://' + url[1] + url[2] + link
- elif not link.startswith('http'):
- link = 'http://' + url[1] + '/' + link
-
- # Remove the anchor part if needed
- if "#" in link:
- link = link[:link.index('#')]
-
- # Drop attributes if needed
- if arg.drop is not None:
- for toDrop in arg.drop:
- link=re.sub(toDrop,'',link)
-
- # Parse the url to get domain and file extension
- parsed_link = urlparse(link)
- domain_link = parsed_link.netloc
- target_extension = os.path.splitext(parsed_link.path)[1][1:]
-
- if (link in crawled):
- continue
- if (link in tocrawl):
- continue
- if (link in excluded):
- continue
- if (domain_link != target_domain):
- continue
- if ("javascript" in link):
- continue
-
- # Count one more URL
- nb_url+=1
-
- # Check if the navigation is allowed by the robots.txt
- if (not can_fetch(arg.parserobots, rp, link, arg.debug)):
- if link not in excluded:
- excluded.add(link)
- nb_rp+=1
- continue
-
- # Check if the current file extension is allowed or not.
- if (target_extension in arg.skipext):
- if link not in excluded:
- excluded.add(link)
- nb_exclude+=1
- continue
-
- # Check if the current url doesn't contain an excluded word
- if (not exclude_url(arg.exclude, link)):
- if link not in excluded:
- excluded.add(link)
- nb_exclude+=1
- continue
-
- tocrawl.add(link)
-print (footer, file=output_file)
-
-if arg.debug or arg.report:
- time_total = time.clock() - time_start
-
-if arg.debug:
- logging.debug ("Number of found URL : {0}".format(nb_url))
- logging.debug ("Number of link crawled : {0}".format(len(crawled)))
- logging.debug ("Duration : {0}s".format(time_total))
+crawl = crawler.Crawler(**dict_arg)
+crawl.run()
if arg.report:
- print ("Number of found URL : {0}".format(nb_url))
- print ("Number of link crawled : {0}".format(len(crawled)))
- if arg.parserobots:
- print ("Number of link block by robots.txt : {0}".format(nb_rp))
- if arg.skipext or arg.exclude:
- print ("Number of link exclude : {0}".format(nb_exclude))
-
- for code in response_code:
- print ("Nb Code HTTP {0} : {1}".format(code, response_code[code]))
-
- print ("Duration : {0}s".format(int(time_total)))
-
-if output_file:
- output_file.close()
+ crawl.make_report()
\ No newline at end of file