From aed67a900ee4b047ebfd19bb1391d0a9c37f388b Mon Sep 17 00:00:00 2001
From: Mohammadtaher Abbasi
Date: Fri, 3 Feb 2023 02:21:02 +0330
Subject: [PATCH 1/5] fix bug of parse command; fixes #5819

---
 scrapy/utils/spider.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/scrapy/utils/spider.py b/scrapy/utils/spider.py
index 86449eeb2aa..e9cc9f62f53 100644
--- a/scrapy/utils/spider.py
+++ b/scrapy/utils/spider.py
@@ -2,6 +2,7 @@
 import logging
 
 from scrapy.spiders import Spider
+from scrapy.utils.asyncgen import collect_asyncgen
 from scrapy.utils.defer import deferred_from_coro
 from scrapy.utils.misc import arg_to_iter
 
@@ -10,7 +11,9 @@
 def iterate_spider_output(result):
     if inspect.isasyncgen(result):
-        return result
+        d = deferred_from_coro(collect_asyncgen(result))
+        d.addCallback(iterate_spider_output)
+        return d
     if inspect.iscoroutine(result):
         d = deferred_from_coro(result)
         d.addCallback(iterate_spider_output)
         return d

From f4503609b52aa7f5bc798f19c1ea96057d701744 Mon Sep 17 00:00:00 2001
From: Mohammadtaher Abbasi
Date: Fri, 3 Feb 2023 23:00:38 +0330
Subject: [PATCH 2/5] add tests for async parse command

---
 scrapy/commands/parse.py    |  26 ++++++++-
 scrapy/utils/spider.py      |   5 +-
 tests/test_command_parse.py | 110 ++++++++++++++++++++++++++++++++++--
 3 files changed, 129 insertions(+), 12 deletions(-)

diff --git a/scrapy/commands/parse.py b/scrapy/commands/parse.py
index 9c3fc86d48c..fdbd0c0908d 100644
--- a/scrapy/commands/parse.py
+++ b/scrapy/commands/parse.py
@@ -1,3 +1,4 @@
+import inspect
 import json
 import logging
 from typing import Dict
@@ -10,7 +11,11 @@
 from scrapy.exceptions import UsageError
 from scrapy.http import Request
 from scrapy.utils import display
-from scrapy.utils.spider import iterate_spider_output, spidercls_for_request
+from scrapy.utils.asyncgen import collect_asyncgen
+from scrapy.utils.defer import aiter_errback, deferred_from_coro
+from scrapy.utils.log import failure_to_exc_info
+from scrapy.utils.misc import arg_to_iter
+from scrapy.utils.spider import spidercls_for_request
 
 logger = logging.getLogger(__name__)
 
@@ -108,6 +113,23 @@ def max_level(self):
         max_requests = max(self.requests)
         return max(max_items, max_requests)
 
+    def handle_exception(self, _failure):
+        # Incomplete message
+        logger.error("", exc_info=failure_to_exc_info(_failure))
+
+    def iterate_spider_output(self, result):
+        if inspect.isasyncgen(result):
+            d = deferred_from_coro(
+                collect_asyncgen(aiter_errback(result, self.handle_exception))
+            )
+            d.addCallback(self.iterate_spider_output)
+            return d
+        if inspect.iscoroutine(result):
+            d = deferred_from_coro(result)
+            d.addCallback(self.iterate_spider_output)
+            return d
+        return arg_to_iter(deferred_from_coro(result))
+
     def add_items(self, lvl, new_items):
         old_items = self.items.get(lvl, [])
         self.items[lvl] = old_items + new_items
@@ -165,7 +187,7 @@ def _get_items_and_requests(self, spider_output, opts, depth, spider, callback):
 
     def run_callback(self, response, callback, cb_kwargs=None):
         cb_kwargs = cb_kwargs or {}
-        d = maybeDeferred(iterate_spider_output, callback(response, **cb_kwargs))
+        d = maybeDeferred(self.iterate_spider_output, callback(response, **cb_kwargs))
         return d
 
     def get_callback_from_rules(self, spider, response):
diff --git a/scrapy/utils/spider.py b/scrapy/utils/spider.py
index e9cc9f62f53..86449eeb2aa 100644
--- a/scrapy/utils/spider.py
+++ b/scrapy/utils/spider.py
@@ -2,7 +2,6 @@
 import logging
 
 from scrapy.spiders import Spider
-from scrapy.utils.asyncgen import collect_asyncgen
 from scrapy.utils.defer import deferred_from_coro
 from scrapy.utils.misc import arg_to_iter
 
@@ -11,9 +10,7 @@
 def iterate_spider_output(result):
     if inspect.isasyncgen(result):
-        d = deferred_from_coro(collect_asyncgen(result))
-        d.addCallback(iterate_spider_output)
-        return d
+        return result
     if inspect.iscoroutine(result):
         d = deferred_from_coro(result)
         d.addCallback(iterate_spider_output)
         return d
diff --git a/tests/test_command_parse.py b/tests/test_command_parse.py
index b0fb978e952..167b6f9ad57 100644
--- a/tests/test_command_parse.py
+++ b/tests/test_command_parse.py
@@ -30,14 +30,53 @@ def setUp(self):
 from scrapy.linkextractors import LinkExtractor
 from scrapy.spiders import CrawlSpider, Rule
 from scrapy.utils.test import get_from_asyncio_queue
+import asyncio 
 
-class AsyncDefAsyncioSpider(scrapy.Spider):
-    name = 'asyncdef{self.spider_name}'
+class AsyncDefAsyncioReturnSpider(scrapy.Spider):
+    name = "asyncdef_asyncio_return"
 
     async def parse(self, response):
+        await asyncio.sleep(0.2)
         status = await get_from_asyncio_queue(response.status)
-        return [scrapy.Item(), dict(foo='bar')]
+        self.logger.info(f"Got response {{status}}")
+        return [{{'id': 1}}, {{'id': 2}}]
+    
+class AsyncDefAsyncioReturnSingleElementSpider(scrapy.Spider):
+    name = "asyncdef_asyncio_return_single_element"
+
+    async def parse(self, response):
+        await asyncio.sleep(0.1)
+        status = await get_from_asyncio_queue(response.status)
+        self.logger.info(f"Got response {{status}}")
+        return {{'foo': 42}}
+    
+class AsyncDefAsyncioGenLoopSpider(scrapy.Spider):
+    name = "asyncdef_asyncio_gen_loop"
+
+    async def parse(self, response):
+        for i in range(10):
+            await asyncio.sleep(0.1)
+            yield {{'foo': i}}
+        self.logger.info(f"Got response {{response.status}}")
+    
+class AsyncDefAsyncioSpider(scrapy.Spider):
+    name = "asyncdef_asyncio" 
+
+    async def parse(self, response):
+        await asyncio.sleep(0.2)
+        status = await get_from_asyncio_queue(response.status)
+        self.logger.debug(f"Got response {{status}}")
+
+class AsyncDefAsyncioGenExcSpider(scrapy.Spider):
+    name = "asyncdef_asyncio_gen_exc"
+
+    async def parse(self, response):
+        for i in range(10):
+            await asyncio.sleep(0.1)
+            yield {{'foo': i}}
+            if i > 5:
+                raise ValueError("Stopping the processing")
 
 
 class MySpider(scrapy.Spider):
     name = '{self.spider_name}'
@@ -213,17 +252,76 @@ def test_pipelines(self):
         self.assertIn("INFO: It Works!", _textmode(stderr))
 
     @defer.inlineCallbacks
-    def test_asyncio_parse_items(self):
+    def test_async_def_asyncio_parse_items_list(self):
         status, out, stderr = yield self.execute(
             [
                 "--spider",
-                "asyncdef" + self.spider_name,
+                "asyncdef_asyncio_return",
                 "-c",
                 "parse",
                 self.url("/html"),
             ]
         )
-        self.assertIn("""[{}, {'foo': 'bar'}]""", _textmode(out))
+        self.assertIn("INFO: Got response 200", _textmode(stderr))
+        self.assertIn("{'id': 1}", _textmode(out))
+        self.assertIn("{'id': 2}", _textmode(out))
+
+    @defer.inlineCallbacks
+    def test_async_def_asyncio_parse_items_single_element(self):
+        status, out, stderr = yield self.execute(
+            [
+                "--spider",
+                "asyncdef_asyncio_return_single_element",
+                "-c",
+                "parse",
+                self.url("/html"),
+            ]
+        )
+        self.assertIn("INFO: Got response 200", _textmode(stderr))
+        self.assertIn("{'foo': 42}", _textmode(out))
+
+    @defer.inlineCallbacks
+    def test_async_def_asyncgen_parse_loop(self):
+        status, out, stderr = yield self.execute(
+            [
+                "--spider",
+                "asyncdef_asyncio_gen_loop",
+                "-c",
+                "parse",
+                self.url("/html"),
+            ]
+        )
+        self.assertIn("INFO: Got response 200", _textmode(stderr))
+        for i in range(10):
+            self.assertIn(f"{{'foo': {i}}}", _textmode(out))
+
+    @defer.inlineCallbacks
+    def test_async_def_asyncgen_parse_exc(self):
+        status, out, stderr = yield self.execute(
+            [
+                "--spider",
+                "asyncdef_asyncio_gen_exc",
+                "-c",
+                "parse",
+                self.url("/html"),
+            ]
+        )
+        self.assertIn("ValueError", _textmode(stderr))
+        for i in range(7):
+            self.assertIn(f"{{'foo': {i}}}", _textmode(out))
+
+    @defer.inlineCallbacks
+    def test_async_def_asyncio_parse(self):
+        _, _, stderr = yield self.execute(
+            [
+                "--spider",
+                "asyncdef_asyncio",
+                "-c",
+                "parse",
+                self.url("/html"),
+            ]
+        )
+        self.assertIn("DEBUG: Got response 200", _textmode(stderr))
 
     @defer.inlineCallbacks
     def test_parse_items(self):

From 933e976a374d48bdbc04ef2e5f5e586d11ab5161 Mon Sep 17 00:00:00 2001
From: Mohammadtaher Abbasi
Date: Sat, 4 Feb 2023 11:49:11 +0330
Subject: [PATCH 3/5] remove whitespaces

---
 tests/test_command_parse.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/tests/test_command_parse.py b/tests/test_command_parse.py
index 167b6f9ad57..037333c03af 100644
--- a/tests/test_command_parse.py
+++ b/tests/test_command_parse.py
@@ -30,7 +30,7 @@ def setUp(self):
 from scrapy.linkextractors import LinkExtractor
 from scrapy.spiders import CrawlSpider, Rule
 from scrapy.utils.test import get_from_asyncio_queue
-import asyncio 
+import asyncio
 
 class AsyncDefAsyncioReturnSpider(scrapy.Spider):
     name = "asyncdef_asyncio_return"
@@ -41,7 +41,7 @@ async def parse(self, response):
         status = await get_from_asyncio_queue(response.status)
         self.logger.info(f"Got response {{status}}")
         return [{{'id': 1}}, {{'id': 2}}]
-    
+
 class AsyncDefAsyncioReturnSingleElementSpider(scrapy.Spider):
     name = "asyncdef_asyncio_return_single_element"
 
@@ -50,7 +50,7 @@ async def parse(self, response):
         await asyncio.sleep(0.1)
         status = await get_from_asyncio_queue(response.status)
         self.logger.info(f"Got response {{status}}")
         return {{'foo': 42}}
-    
+
 class AsyncDefAsyncioGenLoopSpider(scrapy.Spider):
     name = "asyncdef_asyncio_gen_loop"
@@ -59,9 +59,9 @@ async def parse(self, response):
             await asyncio.sleep(0.1)
             yield {{'foo': i}}
         self.logger.info(f"Got response {{response.status}}")
-    
+
 class AsyncDefAsyncioSpider(scrapy.Spider):
-    name = "asyncdef_asyncio" 
+    name = "asyncdef_asyncio"
 
     async def parse(self, response):
         await asyncio.sleep(0.2)

From c2b4b3906b494143dd5fa3298eb33dd4116f00e5 Mon Sep 17 00:00:00 2001
From: Mohammadtaher Abbasi
Date: Sun, 5 Feb 2023 09:19:49 +0330
Subject: [PATCH 4/5] rerun checks

From 0ceebdf53c71bc23867abce4f557fdb0f560b8de Mon Sep 17 00:00:00 2001
From: Mohammadtaher Abbasi
Date: Sun, 5 Mar 2023 10:12:02 +0330
Subject: [PATCH 5/5] add log message

---
 scrapy/commands/parse.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/scrapy/commands/parse.py b/scrapy/commands/parse.py
index fdbd0c0908d..ac937e46495 100644
--- a/scrapy/commands/parse.py
+++ b/scrapy/commands/parse.py
@@ -114,8 +114,10 @@ def max_level(self):
         return max(max_items, max_requests)
 
     def handle_exception(self, _failure):
-        # Incomplete message
-        logger.error("", exc_info=failure_to_exc_info(_failure))
+        logger.error(
+            "An error is caught while iterating the async iterable",
+            exc_info=failure_to_exc_info(_failure),
+        )
 
     def iterate_spider_output(self, result):
         if inspect.isasyncgen(result):
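
A note on the helpers the series ends up relying on: collect_asyncgen (scrapy.utils.asyncgen) drains an async generator into a list, aiter_errback (scrapy.utils.defer) hands any exception raised mid-iteration to an errback, and deferred_from_coro (also scrapy.utils.defer) wraps a coroutine in a Twisted Deferred while returning non-awaitable values unchanged; that pass-through is why the closing "return arg_to_iter(deferred_from_coro(result))" line also works for plain lists and dicts. The sketch below mimics the recursion of the new Command.iterate_spider_output in plain asyncio so it runs without a Twisted reactor; the helper bodies are simplified stand-ins for illustration, not Scrapy's actual implementations.

import asyncio
import inspect


def arg_to_iter(arg):
    # Simplified stand-in for scrapy.utils.misc.arg_to_iter: dicts and
    # strings count as single items, other iterables pass through as-is.
    if arg is None:
        return []
    if hasattr(arg, "__iter__") and not isinstance(arg, (dict, str, bytes)):
        return arg
    return [arg]


async def collect_asyncgen(result):
    # Same idea as scrapy.utils.asyncgen.collect_asyncgen: drain an async
    # generator into a plain list.
    return [item async for item in result]


async def normalize_spider_output(result):
    # Mirrors the recursion of the new Command.iterate_spider_output, with
    # plain awaits standing in for deferred_from_coro/addCallback: async
    # generators are drained, coroutines are awaited, and the outcome is
    # fed back through until a plain iterable remains.
    if inspect.isasyncgen(result):
        return await normalize_spider_output(await collect_asyncgen(result))
    if inspect.iscoroutine(result):
        return await normalize_spider_output(await result)
    return arg_to_iter(result)


# The three callback shapes exercised by the new tests:
async def parse_return_list(response):    # like AsyncDefAsyncioReturnSpider
    return [{"id": 1}, {"id": 2}]


async def parse_return_single(response):  # like AsyncDefAsyncioReturnSingleElementSpider
    return {"foo": 42}


async def parse_gen(response):            # like AsyncDefAsyncioGenLoopSpider
    for i in range(3):
        yield {"foo": i}


async def main():
    print(list(await normalize_spider_output(parse_return_list(None))))    # [{'id': 1}, {'id': 2}]
    print(list(await normalize_spider_output(parse_return_single(None))))  # [{'foo': 42}]
    print(list(await normalize_spider_output(parse_gen(None))))            # [{'foo': 0}, {'foo': 1}, {'foo': 2}]


asyncio.run(main())

The series keeps this logic on the command itself and reverts the PATCH 1 change to scrapy/utils/spider.py, plausibly because the shared iterate_spider_output must keep returning async generators unchanged for the normal crawl path, which consumes them natively, while the parse command needs them drained into a list it can print.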
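
The errback wiring is exactly what test_async_def_asyncgen_parse_exc exercises: because aiter_errback wraps the spider's generator, the ValueError raised after the seventh item is routed to handle_exception and logged, while the items yielded before the failure still reach the output, hence the assertion loop over range(7). A standalone sketch of that behavior, again in plain asyncio and with a simplified aiter_errback (the real one hands a twisted.python.failure.Failure to the errback):

import asyncio


async def aiter_errback(aiterable, errback):
    # Simplified take on scrapy.utils.defer.aiter_errback: yield items
    # until the wrapped generator raises, then hand the exception to the
    # errback instead of propagating it, keeping the items yielded so far.
    iterator = aiterable.__aiter__()
    while True:
        try:
            yield await iterator.__anext__()
        except StopAsyncIteration:
            return
        except Exception as exc:
            errback(exc)
            return


async def parse(response):
    # Stand-in for AsyncDefAsyncioGenExcSpider.parse from the tests.
    for i in range(10):
        yield {"foo": i}
        if i > 5:
            raise ValueError("Stopping the processing")


async def main():
    caught = []
    items = [item async for item in aiter_errback(parse(None), caught.append)]
    print(items)   # {'foo': 0} .. {'foo': 6}: output before the error survives
    print(caught)  # [ValueError('Stopping the processing')]


asyncio.run(main())

Inside Scrapy the collected list then flows through the usual Deferred callback chain; the sketch only isolates the iteration contract that the test depends on.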