|
| 1 | +''' |
| 2 | +Author: herongwei |
| 3 | +Date: 2021-06-27 09:54:36 |
| 4 | +LastEditTime: 2021-06-27 10:20:22 |
| 5 | +LastEditors: Please set LastEditors |
| 6 | +Description: 核心工作类 |
| 7 | +FilePath: /WebSpiderSeed/bin/fetch_woker.py |
| 8 | +''' |
| 9 | +#!/usr/bin/spython |
| 10 | +# -*- coding: utf-8 -*- |
| 11 | + |
| 12 | +from comlib.io_helper import IOHelper |
| 13 | +import json |
| 14 | +import traceback |
| 15 | +import multiprocessing |
| 16 | +from multiprocessing import Manager |
| 17 | +from comlib.req_helper import ReqHelper |
| 18 | +from lxml import etree |
| 19 | +from urllib.parse import urljoin |
| 20 | +import time |
| 21 | +from lib.common_helper import CommonHelper |
| 22 | +from lib.log_helper import LogHelper |
| 23 | +from lib.config_helper import ConfigHelper |
| 24 | + |
def do_get_all_category_urls(seed_url_item, seed_url, curr_ua, data, process_index):
    """Collect the category page URLs that belong to one seed URL.

    Only seed items whose ``"type"`` is ``"category_multi_page"`` produce
    URLs; the ``"xpath_str_get_category"`` entry then selects the strategy:

    * ``"is_full_category_url"`` -- the seed URL itself is the only
      category page.
    * ``"is_fix_category_url"`` -- the seed URL plus every entry of
      ``seed_url_item["fix_category_urls"]``, de-duplicated with first-seen
      order kept.
    * anything else -- the value is used as an XPath expression against the
      page fetched from ``seed_url``; each match is resolved against
      ``seed_url`` with ``urljoin``.

    Args:
        seed_url_item: Mapping describing the seed; read keys are ``"type"``,
            ``"xpath_str_get_category"`` and (for the fixed strategy)
            ``"fix_category_urls"``.
        seed_url: URL of the seed page, also the base for relative links.
        curr_ua: Passed through to ``ReqHelper.get_html`` (presumably the
            User-Agent string -- confirm against ReqHelper).
        data: Mapping that must contain ``"web_name"``; ``"use_proxy"`` is
            optional and defaults to ``0``.
        process_index: Only used in the log line (logged as ``thread_index``).

    Returns:
        list: The category URLs gathered so far. Any exception is logged and
        swallowed, so the list may be empty or partial, but a list is always
        returned.
    """
    # Bound before the ``try`` so the final ``return`` always has a value,
    # even when the very first lookup (``data['web_name']``) raises.
    seed_url_category_urls = []
    try:
        web_name = data['web_name']
        use_proxy = data.get('use_proxy', 0)
        if seed_url_item["type"] == "category_multi_page":
            category_rule = seed_url_item["xpath_str_get_category"]
            if category_rule == "is_full_category_url":
                # The seed page itself already is the complete category page.
                seed_url_category_urls.append(seed_url)
            elif category_rule == 'is_fix_category_url':
                seed_url_category_urls.append(seed_url)
                seed_url_category_urls.extend(seed_url_item['fix_category_urls'])
                # De-duplicate while keeping first-seen order, so the result
                # is deterministic from run to run.
                seed_url_category_urls = list(dict.fromkeys(seed_url_category_urls))
            else:
                seed_url_html = ReqHelper.get_html(seed_url, curr_ua, use_proxy)
                if seed_url_html:
                    selector = etree.HTML(seed_url_html)
                    # Matches may be relative links; resolve them against the seed URL.
                    seed_url_category_urls.extend(
                        urljoin(seed_url, href)
                        for href in selector.xpath(category_rule)
                    )
        LogHelper.info("ing_do_search_%s time:%s thread_index:%s category_urls:%s", web_name,
                       CommonHelper.get_time_millis(), process_index, seed_url_category_urls)
    except Exception as e:
        # Best-effort by design: log the failure and return what was collected.
        LogHelper.error("do_get_all_category_urls url:%s err:%s trace_back:%s", seed_url, e, traceback.format_exc())

    return seed_url_category_urls
| 50 | + |
0 commit comments