From 3d380159e96bdb3f096def3d64b267d299f60478 Mon Sep 17 00:00:00 2001
From: Brosseau Valentin
Date: Mon, 20 Aug 2012 22:08:21 +0200
Subject: [PATCH 1/3] Create the class for crawling sites

---
 __pycache__/config.cpython-32.pyc  | Bin 0 -> 519 bytes
 __pycache__/crawler.cpython-32.pyc | Bin 0 -> 7094 bytes
 config.py                          |  10 ++
 crawler.py                         | 226 +++++++++++++++++++++++++++++
 test.py                            |   3 +
 5 files changed, 239 insertions(+)
 create mode 100644 __pycache__/config.cpython-32.pyc
 create mode 100644 __pycache__/crawler.cpython-32.pyc
 create mode 100644 config.py
 create mode 100644 crawler.py
 create mode 100644 test.py

diff --git a/__pycache__/config.cpython-32.pyc b/__pycache__/config.cpython-32.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1845021097475be70224d817560b13a12be50285
Binary files /dev/null and b/__pycache__/config.cpython-32.pyc differ
diff --git a/__pycache__/crawler.cpython-32.pyc b/__pycache__/crawler.cpython-32.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d0dbb2fae57bda0bfd2ba6e308171d79f1b21340
Binary files /dev/null and b/__pycache__/crawler.cpython-32.pyc differ
diff --git a/config.py b/config.py
new file mode 100644
--- /dev/null
+++ b/config.py
@@ -0,0 +1,10 @@
+xml_header = """<?xml version="1.0" encoding="UTF-8"?>
+<urlset xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+        xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9
+        http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd"
+        xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
+
+"""
+xml_footer = "</urlset>"
+
+crawler_user_agent = 'Sitemap crawler'
\ No newline at end of file
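config.py only carries the sitemap XML envelope and the crawler's user-agent
string. For reference, a minimal sketch of how these values are meant to
combine into a sitemap; the page URLs here are made up for illustration:

    import config

    # One <url><loc>...</loc></url> entry per crawled page, bracketed by
    # the header and footer defined in config.py.
    pages = ["http://blog.lesite.us/", "http://blog.lesite.us/article-1"]

    with open("sitemap.xml", "w") as output:
        print(config.xml_header, file=output)
        for page in pages:
            print("<url><loc>" + page + "</loc></url>", file=output)
        print(config.xml_footer, file=output)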
diff --git a/crawler.py b/crawler.py
new file mode 100644
index 0000000..d4a3a1f
--- /dev/null
+++ b/crawler.py
@@ -0,0 +1,226 @@
+import config
+import logging
+import sys
+
+import re
+from urllib.request import urlopen, Request
+from urllib.robotparser import RobotFileParser
+from urllib.parse import urlparse
+
+import os
+
+class Crawler():
+
+    # Variables
+    parserobots = False
+    output = None
+    report = False
+
+    config = None
+    domain = ""
+
+    exclude = []
+    skipext = []
+    drop = []
+
+    debug = False
+
+    tocrawl = set([])
+    crawled = set([])
+    excluded = set([])
+    # TODO also search for window.location={.*?}
+    linkregex = re.compile(b'<a [^>]*href=[\'"](.*?)[\'"]')
+
+    rp = None
+    response_code={}
+    nb_url=1 # Number of URLs found.
+    nb_rp=0 # Number of URLs blocked by robots.txt.
+    nb_exclude=0 # Number of URLs excluded by extension or word.
+
+    output_file = None
+
+    target_domain = ""
+
+    def __init__(self, parserobots=False, output=None, report=False, domain="", exclude=[], skipext=[], drop=[], debug=False):
+        self.parserobots = parserobots
+        self.output = output
+        self.report = report
+        self.domain = domain
+        self.exclude = exclude
+        self.skipext = skipext
+        self.drop = drop
+        self.debug = debug
+
+        if self.debug:
+            logging.basicConfig(level=logging.DEBUG)
+
+        self.tocrawl = set([domain])
+
+        try:
+            self.target_domain = urlparse(domain)[1]
+        except:
+            raise ValueError("Invalid domain")
+
+        if self.output:
+            try:
+                self.output_file = open(self.output, 'w')
+            except:
+                logging.debug ("Output file not available.")
+                sys.exit(255)
+
+    def run(self):
+        print (config.xml_header, file=self.output_file)
+
+        logging.debug("Starting the crawling process")
+        self.__crawling()
+        logging.debug("Crawling has reached the end of all found links")
+
+        print (config.xml_footer, file=self.output_file)
+
+    def __crawling(self):
+        crawling = self.tocrawl.pop()
+
+        url = urlparse(crawling)
+        self.crawled.add(crawling)
+
+        try:
+            request = Request(crawling, headers={"User-Agent":config.crawler_user_agent})
+            response = urlopen(request)
+        except Exception as e:
+            if hasattr(e,'code'):
+                if e.code in self.response_code:
+                    self.response_code[e.code]+=1
+                else:
+                    self.response_code[e.code]=1
+            logging.debug ("{1} ==> {0}".format(e, crawling))
+            return self.__continue_crawling()
+
+        # Read the response
+        try:
+            msg = response.read()
+            if response.getcode() in self.response_code:
+                self.response_code[response.getcode()]+=1
+            else:
+                self.response_code[response.getcode()]=1
+            response.close()
+        except Exception as e:
+            logging.debug ("{1} ===> {0}".format(e, crawling))
+            return self.__continue_crawling()
+
+        print ("<url><loc>"+url.geturl()+"</loc></url>", file=self.output_file)
+        if self.output_file:
+            self.output_file.flush()
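The link-extraction step in __crawling() boils down to linkregex.findall()
over the raw response bytes. A standalone sketch of that behaviour, using an
href-matching pattern equivalent to Crawler.linkregex; the HTML sample is
made up:

    import re

    # Same approach as Crawler.linkregex: capture the href value of each <a> tag.
    linkregex = re.compile(b'<a [^>]*href=[\'"](.*?)[\'"]')

    msg = b'<a href="/contact">Contact</a> <a class="c" href="http://blog.lesite.us/post-1">Post</a>'
    links = [link.decode("utf-8") for link in linkregex.findall(msg)]
    print(links)  # ['/contact', 'http://blog.lesite.us/post-1']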
+        # Found links
+        links = self.linkregex.findall(msg)
+        for link in links:
+            link = link.decode("utf-8")
+            #logging.debug("Found : {0}".format(link))
+            if link.startswith('/'):
+                link = 'http://' + url[1] + link
+            elif link.startswith('#'):
+                link = 'http://' + url[1] + url[2] + link
+            elif not link.startswith('http'):
+                link = 'http://' + url[1] + '/' + link
+
+            # Remove the anchor part if needed
+            if "#" in link:
+                link = link[:link.index('#')]
+
+            # Drop attributes if needed
+            for toDrop in self.drop:
+                link=re.sub(toDrop,'',link)
+
+            # Parse the url to get domain and file extension
+            parsed_link = urlparse(link)
+            domain_link = parsed_link.netloc
+            target_extension = os.path.splitext(parsed_link.path)[1][1:]
+
+            if (link in self.crawled):
+                continue
+            if (link in self.tocrawl):
+                continue
+            if (link in self.excluded):
+                continue
+            if (domain_link != self.target_domain):
+                continue
+            if ("javascript" in link):
+                continue
+
+            # Count one more URL
+            self.nb_url+=1
+
+            # Check if crawling is allowed by robots.txt
+            if (not self.can_fetch(link)):
+                if link not in self.excluded:
+                    self.excluded.add(link)
+                self.nb_rp+=1
+                continue
+
+            # Check if the current file extension is allowed or not.
+            if (target_extension in self.skipext):
+                if link not in self.excluded:
+                    self.excluded.add(link)
+                self.nb_exclude+=1
+                continue
+
+            # Check that the current url doesn't contain an excluded word
+            if (not self.exclude_url(link)):
+                if link not in self.excluded:
+                    self.excluded.add(link)
+                self.nb_exclude+=1
+                continue
+
+            self.tocrawl.add(link)
+
+        return self.__continue_crawling()
+
+    def __continue_crawling(self):
+        if self.tocrawl:
+            self.__crawling()
+
+    def checkRobots(self):
+        if self.domain[-1] != "/":
+            self.domain += "/"
+        self.rp = RobotFileParser()
+        self.rp.set_url(self.domain+"robots.txt")
+        self.rp.read()
+
+    def can_fetch(self, link):
+        try:
+            if self.parserobots:
+                if self.rp.can_fetch("*", link):
+                    return True
+                else:
+                    logging.debug ("Crawling of {0} disabled by robots.txt".format(link))
+                    return False
+
+            # robots.txt parsing is disabled, everything is allowed.
+            return True
+        except:
+            # On error, continue anyway!
+            logging.debug ("Error while parsing robots.txt")
+            return True
+
+    def exclude_url(self, link):
+        for ex in self.exclude:
+            if ex in link:
+                return False
+        return True
+
+    def make_report(self):
+        print ("Number of URLs found : {0}".format(self.nb_url))
+        print ("Number of links crawled : {0}".format(len(self.crawled)))
+        if self.parserobots:
+            print ("Number of links blocked by robots.txt : {0}".format(self.nb_rp))
+        if self.skipext or self.exclude:
+            print ("Number of links excluded : {0}".format(self.nb_exclude))
+
+        for code in self.response_code:
+            print ("Nb HTTP Code {0} : {1}".format(code, self.response_code[code]))
\ No newline at end of file
diff --git a/test.py b/test.py
new file mode 100644
index 0000000..16a6969
--- /dev/null
+++ b/test.py
@@ -0,0 +1,3 @@
+import crawler
+crawl = crawler.Crawler(domain="http://blog.lesite.us",debug=True)
+crawl.run()
\ No newline at end of file
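Patch 1 leaves robots.txt handling opt-in: checkRobots() primes a
urllib.robotparser.RobotFileParser, and can_fetch() consults it for every
candidate link. A standalone sketch of that stdlib flow, assuming the example
domain from test.py; the /admin path is hypothetical:

    from urllib.robotparser import RobotFileParser

    # Mirror of what checkRobots() sets up and can_fetch() queries.
    rp = RobotFileParser()
    rp.set_url("http://blog.lesite.us/robots.txt")
    rp.read()  # fetches and parses the robots.txt

    for link in ["http://blog.lesite.us/", "http://blog.lesite.us/admin"]:
        if rp.can_fetch("*", link):
            print("allowed :", link)
        else:
            print("blocked by robots.txt :", link)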
From c712ad746cc4ba027134bff9b73130ad9ecfac78 Mon Sep 17 00:00:00 2001
From: Brosseau Valentin
Date: Tue, 21 Aug 2012 22:29:05 +0200
Subject: [PATCH 2/3] Rework the project: prepare for multithreading, move the
 engine into a class
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 __pycache__/crawler.cpython-32.pyc | Bin 7094 -> 7253 bytes
 crawler.py                         |  13 +-
 main.py                           | 215 +----------------------------
 test.py                           |   3 -
 4 files changed, 11 insertions(+), 220 deletions(-)
 delete mode 100644 test.py

diff --git a/__pycache__/crawler.cpython-32.pyc b/__pycache__/crawler.cpython-32.pyc
index d0dbb2fae57bda0bfd2ba6e308171d79f1b21340..817c27a25589021a4ca9e0cc706e1180a17144da 100644
Binary files a/__pycache__/crawler.cpython-32.pyc and b/__pycache__/crawler.cpython-32.pyc differ
diff --git a/main.py b/main.py
--- a/main.py
+++ b/main.py
-linkregex = re.compile(b'<a [^>]*href=[\'"](.*?)[\'"]')
-
-header = """<?xml version="1.0" encoding="UTF-8"?>
-<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
-"""
-footer = "</urlset>"
-
-try:
-	target_domain = urlparse(arg.domain)[1]
-except:
-	logging.debug ("Invalid domain")
-
-rp = None
-if arg.parserobots:
-	if arg.domain[len(arg.domain)-1] != "/":
-		arg.domain += "/"
-	request = Request(arg.domain+"robots.txt", headers={"User-Agent":'Sitemap crawler'})
-	rp = RobotFileParser()
-	rp.set_url(arg.domain+"robots.txt")
-	rp.read()
-
-response_code={}
-nb_url=1 # Number of url.
-nb_rp=0 # Number of url blocked by the robots.txt
-nb_exclude=0 # Number of url excluded by extension or word
-print (header, file=output_file)
-while tocrawl:
-	crawling = tocrawl.pop()
-
-	url = urlparse(crawling)
-	crawled.add(crawling)
-
-	try:
-		request = Request(crawling, headers={"User-Agent":'Sitemap crawler'})
-		response = urlopen(request)
-	except Exception as e:
-		if hasattr(e,'code'):
-			if e.code in response_code:
-				response_code[e.code]+=1
-			else:
-				response_code[e.code]=1
-		#else:
-		#	response_code['erreur']+=1
-		if arg.debug:
-			logging.debug ("{1} ==> {0}".format(e, crawling))
-		response.close()
-		continue
-
-	# Read the response
-	try:
-		msg = response.read()
-		if response.getcode() in response_code:
-			response_code[response.getcode()]+=1
-		else:
-			response_code[response.getcode()]=1
-		response.close()
-	except Exception as e:
-		if arg.debug:
-			logging.debug ("{1} ===> {0}".format(e, crawling))
-		continue
-
-	print ("<url><loc>"+url.geturl()+"</loc></url>", file=output_file)
-	if output_file:
-		output_file.flush()
-
-	# Found links
-	links = linkregex.findall(msg)
-	for link in links:
-		link = link.decode("utf-8")
-		if link.startswith('/'):
-			link = 'http://' + url[1] + link
-		elif link.startswith('#'):
-			link = 'http://' + url[1] + url[2] + link
-		elif not link.startswith('http'):
-			link = 'http://' + url[1] + '/' + link
-
-		# Remove the anchor part if needed
-		if "#" in link:
-			link = link[:link.index('#')]
-
-		# Drop attributes if needed
-		if arg.drop is not None:
-			for toDrop in arg.drop:
-				link=re.sub(toDrop,'',link)
-
-		# Parse the url to get domain and file extension
-		parsed_link = urlparse(link)
-		domain_link = parsed_link.netloc
-		target_extension = os.path.splitext(parsed_link.path)[1][1:]
-
-		if (link in crawled):
-			continue
-		if (link in tocrawl):
-			continue
-		if (link in excluded):
-			continue
-		if (domain_link != target_domain):
-			continue
-		if ("javascript" in link):
-			continue
-
-		# Count one more URL
-		nb_url+=1
-
-		# Check if the navigation is allowed by the robots.txt
-		if (not can_fetch(arg.parserobots, rp, link, arg.debug)):
-			if link not in excluded:
-				excluded.add(link)
-			nb_rp+=1
-			continue
-
-		# Check if the current file extension is allowed or not.
-		if (target_extension in arg.skipext):
-			if link not in excluded:
-				excluded.add(link)
-			nb_exclude+=1
-			continue
-
-		# Check if the current url doesn't contain an excluded word
-		if (not exclude_url(arg.exclude, link)):
-			if link not in excluded:
-				excluded.add(link)
-			nb_exclude+=1
-			continue
-
-		tocrawl.add(link)
-print (footer, file=output_file)
-
-if arg.debug:
-	logging.debug ("Number of found URL : {0}".format(nb_url))
-	logging.debug ("Number of link crawled : {0}".format(len(crawled)))
-
-if arg.report:
-	print ("Number of found URL : {0}".format(nb_url))
-	print ("Number of link crawled : {0}".format(len(crawled)))
-	if arg.parserobots:
-		print ("Number of link block by robots.txt : {0}".format(nb_rp))
-	if arg.skipext or arg.exclude:
-		print ("Number of link exclude : {0}".format(nb_exclude))
-
-	for code in response_code:
-		print ("Nb Code HTTP {0} : {1}".format(code, response_code[code]))
+del(dict_arg['config'])

-if output_file:
-	output_file.close()
+crawl = crawler.Crawler(**dict_arg)
+crawl.run()
\ No newline at end of file
diff --git a/test.py b/test.py
deleted file mode 100644
index 16a6969..0000000
--- a/test.py
+++ /dev/null
@@ -1,3 +0,0 @@
-import crawler
-crawl = crawler.Crawler(domain="http://blog.lesite.us",debug=True)
-crawl.run()
\ No newline at end of file
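The new main.py keeps its argparse front end and forwards the parsed options
straight into the class with crawler.Crawler(**dict_arg). The full option
list is not shown in this patch; a minimal sketch of the pattern, assuming
dict_arg is built with vars() and an abbreviated option list:

    import argparse
    import crawler

    # Abbreviated option list, for illustration only.
    parser = argparse.ArgumentParser(description="Sitemap crawler")
    parser.add_argument("--domain", action="store", default="", help="target domain, e.g. http://blog.lesite.us")
    parser.add_argument("--debug", action="store_true", default=False, help="enable debug output")
    arg = parser.parse_args()

    dict_arg = vars(arg)  # argparse.Namespace -> plain dict of keyword arguments
    # As in the patch, any key the Crawler constructor does not accept
    # (e.g. 'config') must be removed before the call below.

    crawl = crawler.Crawler(**dict_arg)
    crawl.run()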
From 274c392c55a5db3e3dd7d41f7609337b38b3b5d8 Mon Sep 17 00:00:00 2001
From: Brosseau Valentin
Date: Tue, 21 Aug 2012 22:33:15 +0200
Subject: [PATCH 3/3] Add the report at the end of the process

---
 main.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/main.py b/main.py
index a48982a..c6c38a6 100755
--- a/main.py
+++ b/main.py
@@ -48,4 +48,7 @@
 del(dict_arg['config'])
 
 crawl = crawler.Crawler(**dict_arg)
-crawl.run()
\ No newline at end of file
+crawl.run()
+
+if arg.report:
+	crawl.make_report()
\ No newline at end of file
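With patch 3 applied, a complete run that also prints the summary looks like
this; the domain is taken from the deleted test.py, and the report flag
mirrors main.py's --report behaviour:

    import crawler

    crawl = crawler.Crawler(domain="http://blog.lesite.us", report=True, debug=True)
    crawl.run()
    crawl.make_report()  # counts of found/crawled/excluded URLs plus HTTP codes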