chore: update basic spider template
shengchenyang committed Dec 25, 2023
1 parent f399a97 commit 91ad948
Showing 2 changed files with 29 additions and 33 deletions.
9 changes: 8 additions & 1 deletion ayugespidertools/common/typevars.py
@@ -1,7 +1,7 @@
 # Define your TypeVar here
 import threading
 from dataclasses import dataclass, field
-from typing import List, Literal, NamedTuple, Optional, TypeVar, Union
+from typing import TYPE_CHECKING, List, Literal, NamedTuple, Optional, TypeVar, Union
 
 from sqlalchemy import create_engine
@@ -16,6 +16,13 @@
     "SCRAM-SHA-1", "SCRAM-SHA-256", "MONGODB-CR", "MONGODB-X509", "PLAIN"
 ]
 
+if TYPE_CHECKING:
+    from scrapy.http.response.html import HtmlResponse
+    from scrapy.http.response.text import TextResponse
+    from scrapy.http.response.xml import XmlResponse
+
+    ScrapyResponse = Union[TextResponse, HtmlResponse, XmlResponse]
+
 
 class DatabaseSingletonMeta(type):
     _instances = {}
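Since `ScrapyResponse` is defined under `TYPE_CHECKING`, it exists only for static type checkers and must be referenced as a string annotation at runtime. A minimal sketch of the intended usage, mirroring the template change below (the `DemoSpider` class here is hypothetical):

    from typing import TYPE_CHECKING

    if TYPE_CHECKING:
        # Resolved only by type checkers such as mypy; never imported at runtime.
        from ayugespidertools.common.typevars import ScrapyResponse

    class DemoSpider:
        # The quoted annotation avoids importing scrapy response classes at runtime.
        def parse_first(self, response: "ScrapyResponse", curr_site: str) -> None:
            print(response.url, curr_site)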
53 changes: 21 additions & 32 deletions ayugespidertools/templates/spiders/basic.tmpl
@@ -1,10 +1,14 @@
-from scrapy.http import Request
-from scrapy.http.response.text import TextResponse
-from ayugespidertools.spiders import AyuSpider
-from ayugespidertools.common.utils import ToolsForAyu
+import json
+from typing import TYPE_CHECKING
+
 from ayugespidertools.items import AyuItem, DataItem
+from ayugespidertools.spiders import AyuSpider
+from scrapy.http import Request
 from sqlalchemy import text
+
+if TYPE_CHECKING:
+    from ayugespidertools.common.typevars import ScrapyResponse
 
 
 class $classname(AyuSpider):
     name = "$name"
@@ -35,34 +39,19 @@ class $classname(AyuSpider):
             dont_filter=True
         )
 
-    def parse_first(self, response: TextResponse, curr_site: str):
+    def parse_first(self, response: "ScrapyResponse", curr_site: str):
         self.slog.info(f"Current site being crawled: {curr_site}")
-        _save_table = "_article_info_list"
 
         # Parsing rules can be customized here
-        data_list = ToolsForAyu.extract_with_json(json_data=response.json(), query="data")
+        data_list = json.loads(response.text)["data"]
         for curr_data in data_list:
-            article_detail_url = ToolsForAyu.extract_with_json(
-                json_data=curr_data,
-                query="articleDetailUrl")
-
-            article_title = ToolsForAyu.extract_with_json(
-                json_data=curr_data,
-                query="articleTitle")
-
-            comment_count = ToolsForAyu.extract_with_json(
-                json_data=curr_data,
-                query="commentCount")
-
-            favor_count = ToolsForAyu.extract_with_json(
-                json_data=curr_data,
-                query="favorCount")
-
-            nick_name = ToolsForAyu.extract_with_json(
-                json_data=curr_data,
-                query="nickName")
+            article_detail_url = curr_data.get("articleDetailUrl")
+            article_title = curr_data.get("articleTitle")
+            comment_count = curr_data.get("commentCount")
+            favor_count = curr_data.get("favorCount")
+            nick_name = curr_data.get("nickName")
 
-            ArticleInfoItem = AyuItem(
+            _save_table = "_article_info_list"
+            article_item = AyuItem(
                 article_detail_url=DataItem(article_detail_url, "article detail URL"),
                 article_title=DataItem(article_title, "article title"),
                 comment_count=DataItem(comment_count, "article comment count"),
@@ -72,7 +61,7 @@ class $classname(AyuSpider):
                 # Optional: for MongoDB storage, deduplicate on article_detail_url; update the record if it exists, insert it otherwise
                 _mongo_update_rule={"article_detail_url": article_detail_url},
             )
-            self.slog.info(f"ArticleInfoItem: {ArticleInfoItem}")
+            self.slog.info(f"article_item: {article_item}")
 
             # Note: when storing to both MySQL and MongoDB, the deduplication below is not recommended, as the two stores would affect each other.
             # In that case, prefer:
@@ -88,11 +77,11 @@
                     result = self.mysql_engine_conn.execute(_sql).fetchone()
                     if not result:
                         self.mysql_engine_conn.rollback()
-                        yield ArticleInfoItem
+                        yield article_item
                     else:
                         self.slog.debug(f'Data titled "{article_title}" already exists')
                 except Exception as e:
                     self.mysql_engine_conn.rollback()
-                    yield ArticleInfoItem
+                    yield article_item
             else:
-                yield ArticleInfoItem
+                yield article_item
