-
Notifications
You must be signed in to change notification settings - Fork 11
/
crawl.tmpl
36 lines (31 loc) · 1.2 KB
/
crawl.tmpl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
import scrapy
from scrapy.spiders import Rule
from scrapy.linkextractors import LinkExtractor
from ayugespidertools.items import AyuItem
from ayugespidertools.spiders import AyuCrawlSpider
class $classname(AyuCrawlSpider):
name = "$name"
allowed_domains = ["$domain"]
start_urls = ["http://$domain/"]
custom_settings = {
"ITEM_PIPELINES": {
"ayugespidertools.pipelines.AyuFtyMysqlPipeline": 300,
},
"DOWNLOADER_MIDDLEWARES": {
"ayugespidertools.middlewares.RandomRequestUaMiddleware": 400,
},
}
rules = (
# Rule(LinkExtractor(allow=r"Items/"), callback="parse_item", follow=True),
Rule(LinkExtractor(restrict_xpaths='//div[@class="rank_d_b_name"]/a'), callback="parse_item"),
)
def parse_item(self, response):
# 获取图书名称 - (获取的是详情页中的图书名称)
book_name_list = response.xpath('//div[@class="book-name"]//text()').extract()
book_name = "".join(book_name_list).strip()
self.slog.debug(f"book_name: {book_name}")
NovelInfoItem = AyuItem(
book_name=book_name,
_table="_article_info_list",
)
yield NovelInfoItem