ProxyCrawl.py
# coding: utf-8
from gevent import monkey
monkey.patch_all()  # patch the stdlib before any socket-using modules are imported

import gevent
from gevent.pool import Pool
from multiprocessing import Queue, Process, Value
import time
import sys

from api.apiServer import start_api_server
from config import THREADNUM, parserList, UPDATE_TIME, MINNUM
from db.DataStore import store_data, sqlhelper
from spider.HtmlDownLoader import Html_Downloader
from spider.HtmlPraser import Html_Parser
from validator.Validator import validator, getMyIP, detect_from_db

__author__ = 'qiye'

'''
This module implements the crawler's logic: it periodically re-checks the
proxies already in the database and, when too few remain, crawls the
configured sources for fresh ones.
'''
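# For orientation, the process pipeline wired up under __main__ below; the
# arrows are the two multiprocessing Queues, inferred from the args passed
# to each Process:
#
#   startProxyCrawl --q1--> validator --q2--> store_data
#          ^                                      |
#          +------- database (sqlhelper) <-------+
#
# start_api_server runs alongside them and presumably serves the stored
# proxies over HTTP.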
def startProxyCrawl(queue, db_proxy_num):
    crawl = ProxyCrawl(queue, db_proxy_num)
    crawl.run()


class ProxyCrawl(object):
    proxies = set()

    def __init__(self, queue, db_proxy_num):
        self.crawl_pool = Pool(THREADNUM)
        self.queue = queue
        self.db_proxy_num = db_proxy_num

    def run(self):
        while True:
            self.proxies.clear()
            msg = u'IPProxyPool----->>>>>>>>beginning'
            sys.stdout.write(msg + "\r\n")
            sys.stdout.flush()

            # Re-validate every proxy currently stored in the database;
            # detect_from_db adds the ones that still work to self.proxies.
            proxylist = sqlhelper.select()
            myip = getMyIP()
            spawns = [gevent.spawn(detect_from_db, myip, proxy, self.proxies)
                      for proxy in proxylist]
            gevent.joinall(spawns)

            self.db_proxy_num.value = len(self.proxies)
            msg = u'IPProxyPool----->>>>>>>>db exists ip:%d' % len(self.proxies)

            if len(self.proxies) < MINNUM:
                msg += u'\r\nIPProxyPool----->>>>>>>>now ip num < MINNUM, start crawling...'
                sys.stdout.write(msg + "\r\n")
                sys.stdout.flush()
                # Crawl all configured sources in parallel, THREADNUM at a time.
                self.crawl_pool.map(self.crawl, parserList)
            else:
                msg += u'\r\nIPProxyPool----->>>>>>>>now ip num meets the requirement, wait UPDATE_TIME...'
                sys.stdout.write(msg + "\r\n")
                sys.stdout.flush()

            time.sleep(UPDATE_TIME)

    def crawl(self, parser):
        html_parser = Html_Parser()
        for url in parser['urls']:
            response = Html_Downloader.download(url)
            if not response:
                continue
            proxylist = html_parser.parse(response, parser)
            if not proxylist:
                continue
            for proxy in proxylist:
                proxy_str = '%s:%s' % (proxy['ip'], proxy['port'])
                if proxy_str not in self.proxies:
                    self.proxies.add(proxy_str)
                    self.queue.put(proxy)  # hand off to the validator process
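# Each entry in parserList (defined in config.py) drives one proxy source.
# A plausible shape, inferred from how crawl() and Html_Parser.parse()
# consume it; the exact keys and the URL below are assumptions -- the real
# entries live in config.py:
#
#   {
#       'urls': ['http://example-proxy-site.test/page/%d' % n for n in range(1, 3)],
#       'type': 'xpath',                      # how Html_Parser should parse the page
#       'pattern': "//table[@id='list']/tr",  # rows holding one proxy each
#       'position': {'ip': './td[1]', 'port': './td[2]'},
#   }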
if __name__ == "__main__":
    DB_PROXY_NUM = Value('i', 0)
    q1 = Queue()  # freshly crawled proxies: crawler -> validator
    q2 = Queue()  # validated proxies: validator -> storer
    p0 = Process(target=start_api_server)
    p1 = Process(target=startProxyCrawl, args=(q1, DB_PROXY_NUM))
    p2 = Process(target=validator, args=(q1, q2))
    p3 = Process(target=store_data, args=(q2, DB_PROXY_NUM))
    p0.start()
    p1.start()
    p2.start()
    p3.start()

    # spider = ProxyCrawl()
    # spider.run()
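# To exercise a single crawl pass without the full pipeline, one could
# (a sketch, untested) feed a plain Queue and inspect what the parsers
# yield; the printed dict shape is an assumption based on the 'ip'/'port'
# keys used in crawl() above:
#
#   from multiprocessing import Queue, Value
#   q = Queue()
#   spider = ProxyCrawl(q, Value('i', 0))
#   spider.crawl(parserList[0])   # crawl just the first configured source
#   print(q.get())                # e.g. {'ip': '1.2.3.4', 'port': 8080, ...}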