update
saucer-man committed Nov 11, 2018
1 parent df59a42 commit eebe58f
Showing 2 changed files with 90 additions and 136 deletions.
9 changes: 4 additions & 5 deletions README.md
@@ -1,11 +1,10 @@
UrlCrawler
author: gengyanqing
author: saucerman
description: A full-site URL crawling script written in Python; it crawls all of a site's URLs for analyzing the site's directory structure

A full-site URL crawling script written in Python; it crawls all of a site's URLs for analyzing the site's directory structure.
## Notes

The main implementation is a simple loop; the specific steps are shown in the figure below:
Multiprocessing and multithreading are not used yet, so crawling is not fast; you can modify the source code yourself.

![crawl flow diagram](https://wx3.sinaimg.cn/mw690/005GjT4tgy1fqq62nulyej30ce0kwjs2.jpg)

This approach only crawls URLs that appear as links in the page HTML; it cannot pick up URLs generated dynamically by JavaScript.
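In outline, each round of the loop fetches every URL collected in the previous round, extracts new links, and repeats until the chosen depth is reached. A minimal sketch of that loop (the `extract_links` helper is hypothetical; same-site filtering and resolution of relative links are omitted):

```python
import re
import requests

def extract_links(url):
    # hypothetical helper: fetch the page and pull href values with a regex
    try:
        html = requests.get(url, timeout=2).text
    except requests.RequestException:
        return set()
    return set(re.findall(r'(?<=href=")[^"]*', html))

def crawl(start, depth=3):
    seen, frontier = set(), {start}
    for _ in range(depth):
        new_links = set()
        for url in frontier:
            new_links |= extract_links(url) - seen
        seen |= new_links
        frontier = new_links  # the next round crawls only the newly found links
    return seen
```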

217 changes: 86 additions & 131 deletions crawler.py
@@ -1,146 +1,101 @@
# author: saucer_man
# date:2018-04-24
# python3.6

#!/usr/bin/env python3
# _*_ coding:utf-8 _*_
# author: saucerman
# project: https://github.com/saucer-man/UrlCrawler

"""
description: full-site URL crawling script
"""
import re
import time
import sys
import requests
try:
    import tldextract
except ImportError:
    print('module tldextract not found\ntry: pip install tldextract')
    sys.exit()


# get and validate the website to crawl
def url_get():
    url = input("please input the url:")
def domain_get():
    '''
    receive the url of the website to crawl
    '''
    url = input("Please input the url of website:")
    if '//' not in url:
        url = 'http://' + url
    try:
        kv = {'User-Agent': 'Mozilla/5.0'}
        requests.get(url, headers=kv)
        kv = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'}
        requests.head(url, headers=kv)
        return url
    except:
        print("your url is incorrect!!")
        return url_get()

'''
extract the host from the url,
e.g. pick out www.xiaogeng.top from https://www.xiaogeng.top/article/page/id=3
'''
def url_same(url):

    # determine whether the site uses https or http
    urlprotocol = re.findall(r'.*(?=://)', url)[0]
    print('protocol used by this site: ' + urlprotocol)

    if len(re.findall(r'/', url)) > 2:
        if urlprotocol == 'https':
            sameurl = re.findall(r'(?<=https://).*?(?=/)', url)[0]
        else:
            sameurl = re.findall(r'(?<=http://).*?(?=/)', url)[0]
    else:
        url = url + '/'
        if urlprotocol == 'https':
            sameurl = re.findall(r'(?<=https://).*?(?=/)', url)[0]
        else:
            sameurl = re.findall(r'(?<=http://).*?(?=/)', url)[0]

    print('host: ' + sameurl)
    return sameurl
return domain_get()
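# For example, a hypothetical input of 'xiaogeng.top' is turned into 'http://xiaogeng.top'
# before the HEAD request, while an input that already contains '//' is used unchanged.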


# crawl all the links in the page at url
def spiderpage(url):
    kv = {'User-Agent': 'Mozilla/5.0'}
    r = requests.get(url, headers=kv)
    r.encoding = r.apparent_encoding
    pagetext = r.text
    pagelinks = re.findall(r'(?<=href=\").*?(?=\")|(?<=href=\').*?(?=\')', pagetext)
    return pagelinks
class spider():
    def __init__(self, domain, key, depth):
        self.domain = domain       # domain (start URL) to crawl
        self.depth = depth         # crawl depth
        self.urls_all = set([])    # crawl results
        self.key = key             # domain keyword, used to exclude external links

# filter the urls in pagelinks
def url_filtrate(pagelinks):
    '''
    print("filtering now")
    '''
    # drop urls that do not belong to this site
    same_target_url = []
    for l in pagelinks:
        if re.findall(sameurl, l):
            same_target_url.append(l)
    # drop duplicate urls
    unrepect_url = []
    for l in same_target_url:
        if l not in unrepect_url:
            unrepect_url.append(l)
    return unrepect_url
# write a list to a file
def writetofile(list):
    file = open('urls.txt', 'w')
    for url in list:
        file.write(url)
        file.write('\n')
    file.close()

# URL queue, used by the crawl loop
class linkQuence:
    def __init__(self):
        # queue of visited URLs
        self.visited = []
        # queue of unvisited URLs
        self.unvisited = []
    # get the queue of visited URLs
    def getvisitedurl(self):
        return self.visited
    # get the queue of unvisited URLs
    def getunvisitedurl(self):
        return self.unvisited
    # add a URL to the visited queue
    def addvisitedurl(self, url):
        return self.visited.append(url)
    # remove a visited URL
    def removevisitedurl(self, url):
        return self.visited.remove(url)
    # take one URL from the unvisited queue
    def unvisitedurldequence(self):
        try:
            return self.unvisited.pop()
        except:
            return None
    # add a URL to the unvisited queue
    def addunvisitedurl(self, url):
        if url != "" and url not in self.visited and url not in self.unvisited:
            return self.unvisited.insert(0, url)
    # number of visited URLs
    def getvisitedurlount(self):
        return len(self.visited)
    # number of unvisited URLs
    def getunvistedurlcount(self):
        return len(self.unvisited)
    # check whether the unvisited queue is empty
    def unvisitedurlsempty(self):
        return len(self.unvisited) == 0
    def page_spider(self, url):
        '''
        crawl all the links in the page at url
        '''
        try:
            kv = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'}
            r = requests.get(url, headers=kv, timeout=2)
            r.encoding = r.apparent_encoding
            pagetext = r.text
            pagelinks = re.findall(r'(?<=href=\").*?(?=\")|(?<=href=\').*?(?=\')', pagetext)

        except:
            return set([])
        # now process the crawled links

        # 1. drop links that belong to other domains
        url_list = set([])
        for url in pagelinks:
            if self.key in url:
                url_list.add(url)

        # 2. drop links that have already been collected
        url_list = set(url_list) - self.urls_all
        self.urls_all.update(url_list)
        return url_list  # return a set
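    # For example, on a hypothetical snippet such as
    #   <a href="/article/page/id=3">next</a> <link href='style.css'>
    # the href regex above captures '/article/page/id=3' and 'style.css';
    # relative paths are kept as-is and are not resolved against the domain.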



    def run(self):
        url_list = set([self.domain])  # the first round crawls the links of the original url
        while self.depth >= 1:  # each round crawls all the links in url_list
            print("%d rounds remaining" % self.depth)
            url_list_tmp = set([])
            for url in url_list:
                url_list_tmp.update(self.page_spider(url))
            url_list = url_list_tmp
            self.depth = self.depth - 1

# the actual crawler
class Spider():
    def __init__(self, url):
        self.linkQuence = linkQuence()         # use the linkQuence queue
        self.linkQuence.addunvisitedurl(url)   # and add the url to be crawled to it

    def crawler(self):
        while not self.linkQuence.unvisitedurlsempty():  # while the unvisited queue is not empty
            print("found another url")
            visitedurl = self.linkQuence.unvisitedurldequence()  # take one url
            if visitedurl is None or visitedurl == '':
                continue
            initial_links = spiderpage(visitedurl)     # crawl all the links in that url's page
            right_links = url_filtrate(initial_links)  # keep only the qualifying links
            self.linkQuence.addvisitedurl(visitedurl)  # put the url into the visited queue
            for link in right_links:                   # put the filtered links into the unvisited queue
                self.linkQuence.addunvisitedurl(link)
            # print(self.linkQuence.visited)
        print("crawling finished")
        return self.linkQuence.visited
        file = open('result.txt', 'w')
        for url in self.urls_all:
            file.write(url)
            file.write('\n')
        file.close()




if __name__ == '__main__':
    url = url_get()
    sameurl = url_same(url)
    spider = Spider(url)
    urllist = spider.crawler()
    writetofile(urllist)

    start = time.perf_counter()
    domain = domain_get()
    print('domain:', domain)
    key = tldextract.extract(domain).domain  # get the domain keyword, e.g. 'https://xiaogeng.top' --> 'xiaogeng'
    print('key:', key)
    spider = spider(domain=domain, key=key, depth=3)
    spider.run()
    print('results have been saved to result.txt')
    print('time:', time.perf_counter() - start)
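The new version filters links by whether they contain the domain keyword returned by tldextract. A minimal sketch of how that key is derived, using the same hypothetical URL as the source comment above:

```python
import tldextract

# hypothetical URL, mirroring the example in the source comment
parts = tldextract.extract('https://www.xiaogeng.top/article/page/id=3')
print(parts.subdomain)  # 'www'
print(parts.domain)     # 'xiaogeng' -- used as the crawler's key to drop external links
print(parts.suffix)     # 'top'
```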

