Skip to content

Commit 5f4a7ad

Browse files
committed
add WebSpiderSeed/fetch_worker
1 parent f83b6ae commit 5f4a7ad

File tree

1 file changed

+50
-0
lines changed

1 file changed

+50
-0
lines changed
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
'''
2+
Author: herongwei
3+
Date: 2021-06-27 09:54:36
4+
LastEditTime: 2021-06-27 10:20:22
5+
LastEditors: Please set LastEditors
6+
Description: 核心工作类
7+
FilePath: /WebSpiderSeed/bin/fetch_woker.py
8+
'''
9+
#!/usr/bin/spython
10+
# -*- coding: utf-8 -*-
11+
12+
from comlib.io_helper import IOHelper
13+
import json
14+
import traceback
15+
import multiprocessing
16+
from multiprocessing import Manager
17+
from comlib.req_helper import ReqHelper
18+
from lxml import etree
19+
from urllib.parse import urljoin
20+
import time
21+
from lib.common_helper import CommonHelper
22+
from lib.log_helper import LogHelper
23+
from lib.config_helper import ConfigHelper
24+
25+
def do_get_all_category_urls(seed_url_item, seed_url, curr_ua, data, process_index):
    """Collect the category URLs that belong to one seed URL.

    Only seed items whose ``"type"`` is ``"category_multi_page"`` produce
    URLs. For those, ``seed_url_item["xpath_str_get_category"]`` selects the
    strategy:

    * ``"is_full_category_url"``: the seed URL itself is the only result.
    * ``"is_fix_category_url"``: the seed URL plus every entry of
      ``seed_url_item["fix_category_urls"]``, de-duplicated with the
      first-seen order preserved.
    * anything else: the value is used as an XPath expression evaluated
      against the HTML fetched from ``seed_url``; each match is resolved
      against ``seed_url`` with ``urljoin``.

    Args:
        seed_url_item: Mapping describing the seed; reads ``"type"``,
            ``"xpath_str_get_category"`` and, for the fixed strategy,
            ``"fix_category_urls"``.
        seed_url: URL of the seed page; also the base for relative links.
        curr_ua: Forwarded unchanged to ``ReqHelper.get_html``
            (presumably the User-Agent to send; confirm in ReqHelper).
        data: Mapping with a required ``'web_name'`` and an optional
            ``'use_proxy'`` (defaults to ``0``).
        process_index: Worker index, used only in the log message.

    Returns:
        A list of category URLs. It is empty when the seed type is not
        ``"category_multi_page"`` or the fetch returned nothing. Errors are
        logged rather than raised, and whatever was collected before the
        failure is returned.
    """
    # Bound before the ``try`` so the final ``return`` is always valid, even
    # when the very first lookup (e.g. data['web_name']) raises.
    seed_url_category_urls = []
    try:
        web_name = data['web_name']
        use_proxy = data.get('use_proxy', 0)
        if seed_url_item["type"] == "category_multi_page":
            category_rule = seed_url_item["xpath_str_get_category"]
            if category_rule == "is_full_category_url":
                # The current page already is the complete category listing.
                seed_url_category_urls.append(seed_url)
            elif category_rule == 'is_fix_category_url':
                seed_url_category_urls.append(seed_url)
                seed_url_category_urls.extend(seed_url_item['fix_category_urls'])
                # dict.fromkeys drops duplicates while keeping first-seen
                # order; a set would make the crawl order vary between runs.
                seed_url_category_urls = list(dict.fromkeys(seed_url_category_urls))
            else:
                seed_url_html = ReqHelper.get_html(seed_url, curr_ua, use_proxy)
                if seed_url_html:
                    selector = etree.HTML(seed_url_html)
                    # Matches may be relative links, so resolve each one
                    # against the seed URL.
                    seed_url_category_urls.extend(
                        urljoin(seed_url, href)
                        for href in selector.xpath(category_rule)
                    )
        # NOTE(review): emitted once per call regardless of the branch taken;
        # confirm this matches the intended nesting of the log statement.
        LogHelper.info("ing_do_search_%s time:%s thread_index:%s category_urls:%s", web_name,
                       CommonHelper.get_time_millis(), process_index, seed_url_category_urls)
    except Exception as e:
        # Best-effort by design: log the failure and fall through to return
        # whatever was gathered so one bad seed does not stop the worker.
        LogHelper.error("do_get_all_category_urls url:%s err:%s trace_back:%s", seed_url, e, traceback.format_exc())

    return seed_url_category_urls
50+

0 commit comments

Comments
 (0)