From 3d380159e96bdb3f096def3d64b267d299f60478 Mon Sep 17 00:00:00 2001
From: Brosseau Valentin
Date: Mon, 20 Aug 2012 22:08:21 +0200
Subject: [PATCH 1/3] Create the class for crawling sites

---
 __pycache__/config.cpython-32.pyc  | Bin 0 -> 519 bytes
 __pycache__/crawler.cpython-32.pyc | Bin 0 -> 7094 bytes
 config.py                          |  10 ++
 crawler.py                         | 226 +++++++++++++++++++++++++++++
 test.py                            |   3 +
 5 files changed, 239 insertions(+)
 create mode 100644 __pycache__/config.cpython-32.pyc
 create mode 100644 __pycache__/crawler.cpython-32.pyc
 create mode 100644 config.py
 create mode 100644 crawler.py
 create mode 100644 test.py

diff --git a/__pycache__/config.cpython-32.pyc b/__pycache__/config.cpython-32.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1845021097475be70224d817560b13a12be50285
Binary files /dev/null and b/__pycache__/config.cpython-32.pyc differ
diff --git a/__pycache__/crawler.cpython-32.pyc b/__pycache__/crawler.cpython-32.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d0dbb2fae57bda0bfd2ba6e308171d79f1b21340
Binary files /dev/null and b/__pycache__/crawler.cpython-32.pyc differ
diff --git a/config.py b/config.py
new file mode 100644
--- /dev/null
+++ b/config.py
@@ -0,0 +1,10 @@
+xml_header = """<?xml version="1.0" encoding="UTF-8"?>
+<urlset xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+        xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9
+        http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd"
+        xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
+
+"""
+xml_footer = "</urlset>"
+
+crawler_user_agent = 'Sitemap crawler'
\ No newline at end of file
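config.py only carries the sitemap XML envelope and the crawler's user-agent
string. For reference, a minimal sketch of how these values are meant to
combine into a sitemap; the page URLs here are made up for illustration:

    import config

    # One <url><loc>...</loc></url> entry per crawled page, bracketed by
    # the header and footer defined in config.py.
    pages = ["http://blog.lesite.us/", "http://blog.lesite.us/article-1"]

    with open("sitemap.xml", "w") as output:
        print(config.xml_header, file=output)
        for page in pages:
            print("<url><loc>" + page + "</loc></url>", file=output)
        print(config.xml_footer, file=output)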
diff --git a/crawler.py b/crawler.py
new file mode 100644
index 0000000..d4a3a1f
--- /dev/null
+++ b/crawler.py
@@ -0,0 +1,226 @@
+import config
+import logging
+import sys
+
+import re
+from urllib.request import urlopen, Request
+from urllib.robotparser import RobotFileParser
+from urllib.parse import urlparse
+
+import os
+
+class Crawler():
+
+    # Variables
+    parserobots = False
+    output = None
+    report = False
+
+    config = None
+    domain = ""
+
+    exclude = []
+    skipext = []
+    drop = []
+
+    debug = False
+
+    tocrawl = set([])
+    crawled = set([])
+    excluded = set([])
+    # TODO also search for window.location={.*?}
+    linkregex = re.compile(b'<a [^>]*href=[\'"](.*?)[\'"]')
+
+    rp = None
+    response_code={}
+    nb_url=1 # Number of URLs found.
+    nb_rp=0 # Number of URLs blocked by robots.txt.
+    nb_exclude=0 # Number of URLs excluded by extension or word.
+
+    output_file = None
+
+    target_domain = ""
+
+    def __init__(self, parserobots=False, output=None, report=False, domain="", exclude=[], skipext=[], drop=[], debug=False):
+        self.parserobots = parserobots
+        self.output = output
+        self.report = report
+        self.domain = domain
+        self.exclude = exclude
+        self.skipext = skipext
+        self.drop = drop
+        self.debug = debug
+
+        if self.debug:
+            logging.basicConfig(level=logging.DEBUG)
+
+        self.tocrawl = set([domain])
+
+        try:
+            self.target_domain = urlparse(domain)[1]
+        except:
+            raise ValueError("Invalid domain")
+
+        if self.output:
+            try:
+                self.output_file = open(self.output, 'w')
+            except:
+                logging.debug ("Output file not available.")
+                sys.exit(255)
+
+    def run(self):
+        print (config.xml_header, file=self.output_file)
+
+        logging.debug("Starting the crawling process")
+        self.__crawling()
+        logging.debug("Crawling has reached the end of all found links")
+
+        print (config.xml_footer, file=self.output_file)
+
+    def __crawling(self):
+        crawling = self.tocrawl.pop()
+
+        url = urlparse(crawling)
+        self.crawled.add(crawling)
+
+        try:
+            request = Request(crawling, headers={"User-Agent":config.crawler_user_agent})
+            response = urlopen(request)
+        except Exception as e:
+            if hasattr(e,'code'):
+                if e.code in self.response_code:
+                    self.response_code[e.code]+=1
+                else:
+                    self.response_code[e.code]=1
+            logging.debug ("{1} ==> {0}".format(e, crawling))
+            return self.__continue_crawling()
+
+        # Read the response
+        try:
+            msg = response.read()
+            if response.getcode() in self.response_code:
+                self.response_code[response.getcode()]+=1
+            else:
+                self.response_code[response.getcode()]=1
+            response.close()
+        except Exception as e:
+            logging.debug ("{1} ===> {0}".format(e, crawling))
+            return self.__continue_crawling()
+
+        print ("<url><loc>"+url.geturl()+"</loc></url>", file=self.output_file)
+        if self.output_file:
+            self.output_file.flush()
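The link-extraction step in __crawling() boils down to linkregex.findall()
over the raw response bytes. A standalone sketch of that behaviour, using an
href-matching pattern equivalent to Crawler.linkregex; the HTML sample is
made up:

    import re

    # Same approach as Crawler.linkregex: capture the href value of each <a> tag.
    linkregex = re.compile(b'<a [^>]*href=[\'"](.*?)[\'"]')

    msg = b'<a href="/contact">Contact</a> <a class="c" href="http://blog.lesite.us/post-1">Post</a>'
    links = [link.decode("utf-8") for link in linkregex.findall(msg)]
    print(links)  # ['/contact', 'http://blog.lesite.us/post-1']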
+        # Found links
+        links = self.linkregex.findall(msg)
+        for link in links:
+            link = link.decode("utf-8")
+            #logging.debug("Found : {0}".format(link))
+            if link.startswith('/'):
+                link = 'http://' + url[1] + link
+            elif link.startswith('#'):
+                link = 'http://' + url[1] + url[2] + link
+            elif not link.startswith('http'):
+                link = 'http://' + url[1] + '/' + link
+
+            # Remove the anchor part if needed
+            if "#" in link:
+                link = link[:link.index('#')]
+
+            # Drop attributes if needed
+            for toDrop in self.drop:
+                link=re.sub(toDrop,'',link)
+
+            # Parse the url to get domain and file extension
+            parsed_link = urlparse(link)
+            domain_link = parsed_link.netloc
+            target_extension = os.path.splitext(parsed_link.path)[1][1:]
+
+            if (link in self.crawled):
+                continue
+            if (link in self.tocrawl):
+                continue
+            if (link in self.excluded):
+                continue
+            if (domain_link != self.target_domain):
+                continue
+            if ("javascript" in link):
+                continue
+
+            # Count one more URL
+            self.nb_url+=1
+
+            # Check if crawling is allowed by robots.txt
+            if (not self.can_fetch(link)):
+                if link not in self.excluded:
+                    self.excluded.add(link)
+                self.nb_rp+=1
+                continue
+
+            # Check if the current file extension is allowed or not.
+            if (target_extension in self.skipext):
+                if link not in self.excluded:
+                    self.excluded.add(link)
+                self.nb_exclude+=1
+                continue
+
+            # Check that the current url doesn't contain an excluded word
+            if (not self.exclude_url(link)):
+                if link not in self.excluded:
+                    self.excluded.add(link)
+                self.nb_exclude+=1
+                continue
+
+            self.tocrawl.add(link)
+
+        return self.__continue_crawling()
+
+    def __continue_crawling(self):
+        if self.tocrawl:
+            self.__crawling()
+
+    def checkRobots(self):
+        if self.domain[-1] != "/":
+            self.domain += "/"
+        self.rp = RobotFileParser()
+        self.rp.set_url(self.domain+"robots.txt")
+        self.rp.read()
+
+    def can_fetch(self, link):
+        try:
+            if self.parserobots:
+                if self.rp.can_fetch("*", link):
+                    return True
+                else:
+                    logging.debug ("Crawling of {0} disabled by robots.txt".format(link))
+                    return False
+
+            # robots.txt parsing is disabled, everything is allowed.
+            return True
+        except:
+            # On error, continue anyway!
+            logging.debug ("Error while parsing robots.txt")
+            return True
+
+    def exclude_url(self, link):
+        for ex in self.exclude:
+            if ex in link:
+                return False
+        return True
+
+    def make_report(self):
+        print ("Number of URLs found : {0}".format(self.nb_url))
+        print ("Number of links crawled : {0}".format(len(self.crawled)))
+        if self.parserobots:
+            print ("Number of links blocked by robots.txt : {0}".format(self.nb_rp))
+        if self.skipext or self.exclude:
+            print ("Number of links excluded : {0}".format(self.nb_exclude))
+
+        for code in self.response_code:
+            print ("Nb HTTP Code {0} : {1}".format(code, self.response_code[code]))
\ No newline at end of file
diff --git a/test.py b/test.py
new file mode 100644
index 0000000..16a6969
--- /dev/null
+++ b/test.py
@@ -0,0 +1,3 @@
+import crawler
+crawl = crawler.Crawler(domain="http://blog.lesite.us",debug=True)
+crawl.run()
\ No newline at end of file
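Patch 1 leaves robots.txt handling opt-in: checkRobots() primes a
urllib.robotparser.RobotFileParser, and can_fetch() consults it for every
candidate link. A standalone sketch of that stdlib flow, assuming the example
domain from test.py; the /admin path is hypothetical:

    from urllib.robotparser import RobotFileParser

    # Mirror of what checkRobots() sets up and can_fetch() queries.
    rp = RobotFileParser()
    rp.set_url("http://blog.lesite.us/robots.txt")
    rp.read()  # fetches and parses the robots.txt

    for link in ["http://blog.lesite.us/", "http://blog.lesite.us/admin"]:
        if rp.can_fetch("*", link):
            print("allowed :", link)
        else:
            print("blocked by robots.txt :", link)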
From c712ad746cc4ba027134bff9b73130ad9ecfac78 Mon Sep 17 00:00:00 2001
From: Brosseau Valentin
Date: Tue, 21 Aug 2012 22:29:05 +0200
Subject: [PATCH 2/3] Rework the project: prepare for multithreading, move the
 engine into a class
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 __pycache__/crawler.cpython-32.pyc | Bin 7094 -> 7253 bytes
 crawler.py                         |  13 +-
 main.py                           | 215 +----------------------------
 test.py                           |   3 -
 4 files changed, 11 insertions(+), 220 deletions(-)
 delete mode 100644 test.py

diff --git a/__pycache__/crawler.cpython-32.pyc b/__pycache__/crawler.cpython-32.pyc
index d0dbb2fae57bda0bfd2ba6e308171d79f1b21340..817c27a25589021a4ca9e0cc706e1180a17144da 100644
Binary files a/__pycache__/crawler.cpython-32.pyc and b/__pycache__/crawler.cpython-32.pyc differ
diff --git a/main.py b/main.py
--- a/main.py
+++ b/main.py
-linkregex = re.compile(b'<a [^>]*href=[\'"](.*?)[\'"]')
-
-header = """<?xml version="1.0" encoding="UTF-8"?>
-<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
-"""
-footer = "</urlset>"
-
-try:
-	target_domain = urlparse(arg.domain)[1]
-except:
-	logging.debug ("Invalid domain")
-
-rp = None
-if arg.parserobots:
-	if arg.domain[len(arg.domain)-1] != "/":
-		arg.domain += "/"
-	request = Request(arg.domain+"robots.txt", headers={"User-Agent":'Sitemap crawler'})
-	rp = RobotFileParser()
-	rp.set_url(arg.domain+"robots.txt")
-	rp.read()
-
-response_code={}
-nb_url=1 # Number of url.
-nb_rp=0 # Number of url blocked by the robots.txt
-nb_exclude=0 # Number of url excluded by extension or word
-print (header, file=output_file)
-while tocrawl:
-	crawling = tocrawl.pop()
-
-	url = urlparse(crawling)
-	crawled.add(crawling)
-
-	try:
-		request = Request(crawling, headers={"User-Agent":'Sitemap crawler'})
-		response = urlopen(request)
-	except Exception as e:
-		if hasattr(e,'code'):
-			if e.code in response_code:
-				response_code[e.code]+=1
-			else:
-				response_code[e.code]=1
-		#else:
-		#	response_code['erreur']+=1
-		if arg.debug:
-			logging.debug ("{1} ==> {0}".format(e, crawling))
-		response.close()
-		continue
-
-	# Read the response
-	try:
-		msg = response.read()
-		if response.getcode() in response_code:
-			response_code[response.getcode()]+=1
-		else:
-			response_code[response.getcode()]=1
-		response.close()
-	except Exception as e:
-		if arg.debug:
-			logging.debug ("{1} ===> {0}".format(e, crawling))
-		continue
-
-	print ("<url><loc>"+url.geturl()+"</loc></url>", file=output_file)
-	if output_file:
-		output_file.flush()
-
-	# Found links
-	links = linkregex.findall(msg)
-	for link in links:
-		link = link.decode("utf-8")
-		if link.startswith('/'):
-			link = 'http://' + url[1] + link
-		elif link.startswith('#'):
-			link = 'http://' + url[1] + url[2] + link
-		elif not link.startswith('http'):
-			link = 'http://' + url[1] + '/' + link
-
-		# Remove the anchor part if needed
-		if "#" in link:
-			link = link[:link.index('#')]
-
-		# Drop attributes if needed
-		if arg.drop is not None:
-			for toDrop in arg.drop:
-				link=re.sub(toDrop,'',link)
-
-		# Parse the url to get domain and file extension
-		parsed_link = urlparse(link)
-		domain_link = parsed_link.netloc
-		target_extension = os.path.splitext(parsed_link.path)[1][1:]
-
-		if (link in crawled):
-			continue
-		if (link in tocrawl):
-			continue
-		if (link in excluded):
-			continue
-		if (domain_link != target_domain):
-			continue
-		if ("javascript" in link):
-			continue
-
-		# Count one more URL
-		nb_url+=1
-
-		# Check if the navigation is allowed by the robots.txt
-		if (not can_fetch(arg.parserobots, rp, link, arg.debug)):
-			if link not in excluded:
-				excluded.add(link)
-			nb_rp+=1
-			continue
-
-		# Check if the current file extension is allowed or not.
-		if (target_extension in arg.skipext):
-			if link not in excluded:
-				excluded.add(link)
-			nb_exclude+=1
-			continue
-
-		# Check if the current url doesn't contain an excluded word
-		if (not exclude_url(arg.exclude, link)):
-			if link not in excluded:
-				excluded.add(link)
-			nb_exclude+=1
-			continue
-
-		tocrawl.add(link)
-print (footer, file=output_file)
-
-if arg.debug:
-	logging.debug ("Number of found URL : {0}".format(nb_url))
-	logging.debug ("Number of link crawled : {0}".format(len(crawled)))
-
-if arg.report:
-	print ("Number of found URL : {0}".format(nb_url))
-	print ("Number of link crawled : {0}".format(len(crawled)))
-	if arg.parserobots:
-		print ("Number of link block by robots.txt : {0}".format(nb_rp))
-	if arg.skipext or arg.exclude:
-		print ("Number of link exclude : {0}".format(nb_exclude))
-
-	for code in response_code:
-		print ("Nb Code HTTP {0} : {1}".format(code, response_code[code]))
+del(dict_arg['config'])

-if output_file:
-	output_file.close()
+crawl = crawler.Crawler(**dict_arg)
+crawl.run()
\ No newline at end of file
diff --git a/test.py b/test.py
deleted file mode 100644
index 16a6969..0000000
--- a/test.py
+++ /dev/null
@@ -1,3 +0,0 @@
-import crawler
-crawl = crawler.Crawler(domain="http://blog.lesite.us",debug=True)
-crawl.run()
\ No newline at end of file
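The new main.py keeps its argparse front end and forwards the parsed options
straight into the class with crawler.Crawler(**dict_arg). The full option
list is not shown in this patch; a minimal sketch of the pattern, assuming
dict_arg is built with vars() and an abbreviated option list:

    import argparse
    import crawler

    # Abbreviated option list, for illustration only.
    parser = argparse.ArgumentParser(description="Sitemap crawler")
    parser.add_argument("--domain", action="store", default="", help="target domain, e.g. http://blog.lesite.us")
    parser.add_argument("--debug", action="store_true", default=False, help="enable debug output")
    arg = parser.parse_args()

    dict_arg = vars(arg)  # argparse.Namespace -> plain dict of keyword arguments
    # As in the patch, any key the Crawler constructor does not accept
    # (e.g. 'config') must be removed before the call below.

    crawl = crawler.Crawler(**dict_arg)
    crawl.run()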
From 274c392c55a5db3e3dd7d41f7609337b38b3b5d8 Mon Sep 17 00:00:00 2001
From: Brosseau Valentin
Date: Tue, 21 Aug 2012 22:33:15 +0200
Subject: [PATCH 3/3] Add the report at the end of the process

---
 main.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/main.py b/main.py
index a48982a..c6c38a6 100755
--- a/main.py
+++ b/main.py
@@ -48,4 +48,7 @@
 del(dict_arg['config'])
 
 crawl = crawler.Crawler(**dict_arg)
-crawl.run()
\ No newline at end of file
+crawl.run()
+
+if arg.report:
+	crawl.make_report()
\ No newline at end of file
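With patch 3 applied, a complete run that also prints the summary looks like
this; the domain is taken from the deleted test.py, and the report flag
mirrors main.py's --report behaviour:

    import crawler

    crawl = crawler.Crawler(domain="http://blog.lesite.us", report=True, debug=True)
    crawl.run()
    crawl.make_report()  # counts of found/crawled/excluded URLs plus HTTP codes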