Commit
1 parent df59a42 · commit eebe58f
Showing 2 changed files with 90 additions and 136 deletions.
@@ -1,11 +1,10 @@
 UrlCrawler
-author: gengyanqing
+author: saucerman
 description: A site-wide url crawler written in Python; it fetches all of a site's urls so the site's directory structure can be analyzed

-A site-wide url crawler written in Python; it fetches all of a site's urls so the site's directory structure can be analyzed
+## Notes

-The core of the implementation is a loop; the concrete steps are shown in the figure below:
+Multiprocessing and multithreading are not used yet, so the crawler is not fast; feel free to modify the source yourself

 ![diagram](https://wx3.sinaimg.cn/mw690/005GjT4tgy1fqq62nulyej30ce0kwjs2.jpg)

-This approach only finds urls that appear as links in the page; it cannot crawl urls generated dynamically by js
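Both README versions describe the same idea: crawl in rounds, following only link-type urls found in fetched pages. As a rough illustration of that loop (this is not code from the commit; `crawl`, `seed`, and `max_depth` are made-up names, though the href regex is the one the script uses):

```python
import re
import requests

def crawl(seed, max_depth=2):
    """Breadth-first link crawl: each round fetches every url found in
    the previous round and collects the href targets on those pages."""
    seen, frontier = {seed}, {seed}
    for _ in range(max_depth):
        next_frontier = set()
        for url in frontier:
            try:
                text = requests.get(url, timeout=2).text
            except requests.RequestException:
                continue  # unreachable page: skip it, keep crawling
            # Same link pattern as the script: any quoted value after href=
            for link in re.findall(r'(?<=href=\").*?(?=\")|(?<=href=\').*?(?=\')', text):
                if link.startswith('http') and link not in seen:
                    seen.add(link)
                    next_frontier.add(link)
        frontier = next_frontier
    return seen
```

Because the loop only follows hrefs present in the fetched HTML, urls that a browser would build at runtime with js never enter `frontier`, which is exactly the limitation the README notes.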
@@ -1,146 +1,101 @@
-# author: saucer_man
-# date: 2018-04-24
-# python3.6
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+# author: saucerman
+# project: https://github.com/saucer-man/UrlCrawler

 """
 description: site-wide url crawling script
 """
 import re
+import time
+import sys
 import requests
+try:
+    import tldextract
+except ImportError:
+    print('module tldextract not found\ntry: pip install tldextract')
+    sys.exit()


-# Get and validate the website to crawl
-def url_get():
-    url=input("please input the url:")
+def domain_get():
+    '''
+    Read the url of the website to crawl
+    '''
+    url = input("Please input the url of website:")
     if '//' not in url:
         url = 'http://' + url
     try:
-        kv={'User-Agent':'Mozilla/5.0'}
-        requests.get(url,headers=kv)
+        kv={'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'}
+        requests.head(url,headers=kv)
         return url
     except:
         print("your url is incorrect!!")
-        return url_get()
-
-'''
-Pick the host out of the url,
-e.g. extract www.xiaogeng.top from https://www.xiaogeng.top/article/page/id=3
-'''
-def url_same(url):
-
-    # determine whether the site uses https or http
-    urlprotocol=re.findall(r'.*(?=://)',url)[0]
-    print('Protocol used by this site: ' + urlprotocol)
-
-    if len(re.findall(r'/',url)) > 2:
-        if urlprotocol=='https':
-            sameurl = re.findall(r'(?<=https://).*?(?=/)', url)[0]
-        else:
-            sameurl = re.findall(r'(?<=http://).*?(?=/)', url)[0]
-    else:
-        url = url + '/'
-        if urlprotocol=='https':
-            sameurl = re.findall(r'(?<=https://).*?(?=/)',url)[0]
-        else:
-            sameurl = re.findall(r'(?<=http://).*?(?=/)',url)[0]
-
-    print('Domain: ' + sameurl)
-    return sameurl
+        return domain_get()


-# Crawl all links on the page at url
-def spiderpage(url):
-    kv={'User-Agent':'Mozilla/5.0'}
-    r=requests.get(url,headers=kv)
-    r.encoding=r.apparent_encoding
-    pagetext=r.text
-    pagelinks = re.findall(r'(?<=href=\").*?(?=\")|(?<=href=\').*?(?=\')',pagetext)
-    return pagelinks
+class spider():
+    def __init__(self, domain, key, depth):
+        self.domain = domain      # site to crawl
+        self.depth = depth        # crawl depth
+        self.urls_all = set([])   # accumulated results
+        self.key = key            # registered domain, used to drop external links

-# Filter the urls in pagelinks
-def url_filtrate(pagelinks):
-    '''
-    print("filtering now")
-    '''
-    # drop urls that are not on this site
-    same_target_url = []
-    for l in pagelinks:
-        if re.findall(sameurl,l):
-            same_target_url.append(l)
-    # drop duplicate urls
-    unrepect_url = []
-    for l in same_target_url:
-        if l not in unrepect_url:
-            unrepect_url.append(l)
-    return unrepect_url
-
-# Write a list to a file
-def writetofile(list):
-    file=open('urls.txt','w')
-    for url in list:
-        file.write(url)
-        file.write('\n')
-    file.close()
-
-# url queues used by the crawl loop
-class linkQuence:
-    def __init__(self):
-        # urls already visited
-        self.visited=[]
-        # urls not yet visited
-        self.unvisited=[]
-    # get the visited url queue
-    def getvisitedurl(self):
-        return self.visited
-    # get the unvisited url queue
-    def getunvisitedurl(self):
-        return self.unvisited
-    # add a url to the visited queue
-    def addvisitedurl(self,url):
-        return self.visited.append(url)
-    # remove a url from the visited queue
-    def removevisitedurl(self,url):
-        return self.visited.remove(url)
-    # take one url from the unvisited queue
-    def unvisitedurldequence(self):
-        try:
-            return self.unvisited.pop()
-        except:
-            return None
-    # add a url to the unvisited queue
-    def addunvisitedurl(self,url):
-        if url!="" and url not in self.visited and url not in self.unvisited:
-            return self.unvisited.insert(0,url)
-    # number of visited urls
-    def getvisitedurlount(self):
-        return len(self.visited)
-    # number of unvisited urls
-    def getunvistedurlcount(self):
-        return len(self.unvisited)
-    # whether the unvisited queue is empty
-    def unvisitedurlsempty(self):
-        return len(self.unvisited)==0
+    def page_spider(self, url):
+        '''
+        Crawl all links on the page at url
+        '''
+        try:
+            kv={'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'}
+            r = requests.get(url, headers=kv, timeout=2)
+            r.encoding=r.apparent_encoding
+            pagetext=r.text
+            pagelinks = re.findall(r'(?<=href=\").*?(?=\")|(?<=href=\').*?(?=\')',pagetext)
+        except:
+            return set([])
+        # now process the crawled links
+
+        # 1. first drop links that point to other domains
+        url_list = set([])
+        for url in pagelinks:
+            if self.key in url:
+                url_list.add(url)
+
+        # 2. then deduplicate against everything seen so far
+        url_list = set(url_list)-self.urls_all
+        self.urls_all.update(url_list)
+        return url_list  # returns a set

-# The actual crawl routine
-class Spider():
-    def __init__(self,url):
-        self.linkQuence = linkQuence()        # use the linkQuence class
-        self.linkQuence.addunvisitedurl(url)  # and put the url to crawl into its queue
-
-    def crawler(self):
-        while not self.linkQuence.unvisitedurlsempty():   # while the unvisited queue is not empty
-            print("crawled another url")
-            visitedurl = self.linkQuence.unvisitedurldequence()  # take one url
-            if visitedurl is None or visitedurl == '':
-                continue
-            initial_links=spiderpage(visitedurl)       # crawl all links on that url's page
-            right_links = url_filtrate(initial_links)  # keep only the qualifying links
-            self.linkQuence.addvisitedurl(visitedurl)  # move the url into the visited queue
-            for link in right_links:                   # put the filtered links into the unvisited queue
-                self.linkQuence.addunvisitedurl(link)
-            # print(self.linkQuence.visited)
-        print("done crawling")
-        return self.linkQuence.visited
+    def run(self):
+        url_list = set([self.domain])  # the first round crawls the seed url's links
+        while self.depth >= 1:         # each depth level crawls every link in url_list
+            print("%d rounds remaining"%self.depth)
+            url_list_tmp = set([])
+            for url in url_list:
+                url_list_tmp.update(self.page_spider(url))
+            url_list = url_list_tmp
+            self.depth = self.depth - 1
+        file=open('result.txt','w')
+        for url in self.urls_all:
+            file.write(url)
+            file.write('\n')
+        file.close()


 if __name__ == '__main__':
-    url=url_get()
-    sameurl=url_same(url)
-    spider=Spider(url)
-    urllist=spider.crawler()
-    writetofile(urllist)
+    time.clock()
+    domain = domain_get()
+    print('domain:', domain)
+    key = tldextract.extract(domain).domain  # registered domain: 'https://xiaogeng.top' --> 'xiaogeng'
+    print('key:', key)
+    spider = spider(domain=domain, key=key, depth=3)
+    spider.run()
+    print('Results saved to result.txt')
+    print('time:',time.clock())
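The new script's same-site filter hinges on `key`, the registered domain that `tldextract` pulls out of the seed url. A quick check of what `tldextract.extract()` returns for the sample url mentioned in the old comments:

```python
import tldextract

ext = tldextract.extract('https://www.xiaogeng.top/article/page/id=3')
print(ext.subdomain)  # 'www'
print(ext.domain)     # 'xiaogeng'  <- this is what the script stores as `key`
print(ext.suffix)     # 'top'
```

One portability note: `time.clock()`, used here for timing, was deprecated in Python 3.3 and removed in 3.8, so on current interpreters `time.perf_counter()` is the usual replacement.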