From 03896a419ef66ad3c170604d16c6f0d388d21417 Mon Sep 17 00:00:00 2001
From: Heorhii Korzh
Date: Thu, 16 Nov 2023 13:55:47 +0200
Subject: [PATCH 1/4] Handling CloseSpider exception if it has been raised in
 start_requests()

---
 scrapy/core/engine.py     | 15 +++++++++++++--
 scrapy/crawler.py         |  9 ++++++---
 tests/spiders.py          | 27 ++++++++++++++++++++++++++-
 tests/test_closespider.py | 27 ++++++++++++++++++++++++++-
 4 files changed, 71 insertions(+), 7 deletions(-)

diff --git a/scrapy/core/engine.py b/scrapy/core/engine.py
index dd1f56f8cc3..0287fdd7fb7 100644
--- a/scrapy/core/engine.py
+++ b/scrapy/core/engine.py
@@ -141,7 +141,7 @@ def _finish_stopping_engine(_: Any) -> Generator[Deferred, Any, None]:
         )
         return dfd.addBoth(_finish_stopping_engine)
 
-    def close(self) -> Deferred:
+    def close(self, close_spider_reason: str = None) -> Deferred:
         """
         Gracefully close the execution engine.
         If it has already been started, stop it. In all cases, close the spider and the downloader.
@@ -150,7 +150,7 @@ def close(self) -> Deferred:
             return self.stop()  # will also close spider and downloader
         if self.spider is not None:
             return self.close_spider(
-                self.spider, reason="shutdown"
+                self.spider, reason=close_spider_reason or "shutdown"
             )  # will also close downloader
         self.downloader.close()
         return succeed(None)
@@ -400,6 +400,17 @@ def _spider_idle(self) -> None:
             assert isinstance(ex, CloseSpider)  # typing
             self.close_spider(self.spider, reason=ex.reason)
 
+    def close_spider_on_start(self, spider: Spider, reason: str = "cancelled"):
+        if self.slot is not None:
+            raise RuntimeError("Engine slot is already assigned. Use self.close_spider")
+
+        nextcall_none = CallLaterOnce(lambda: None)
+        scheduler = create_instance(
+            self.scheduler_cls, settings=None, crawler=self.crawler
+        )
+        self.slot = Slot((), True, nextcall_none, scheduler)
+        self.close_spider(spider, reason)
+
     def close_spider(self, spider: Spider, reason: str = "cancelled") -> Deferred:
         """Close (cancel) spider and clear all its outstanding requests"""
         if self.slot is None:
diff --git a/scrapy/crawler.py b/scrapy/crawler.py
index 1d3a1120839..0a8bbf99c04 100644
--- a/scrapy/crawler.py
+++ b/scrapy/crawler.py
@@ -24,7 +24,7 @@
 from scrapy import Spider, signals
 from scrapy.addons import AddonManager
 from scrapy.core.engine import ExecutionEngine
-from scrapy.exceptions import ScrapyDeprecationWarning
+from scrapy.exceptions import CloseSpider, ScrapyDeprecationWarning
 from scrapy.extension import ExtensionManager
 from scrapy.interfaces import ISpiderLoader
 from scrapy.logformatter import LogFormatter
@@ -158,11 +158,14 @@ def crawl(self, *args: Any, **kwargs: Any) -> Generator[Deferred, Any, None]:
             start_requests = iter(self.spider.start_requests())
             yield self.engine.open_spider(self.spider, start_requests)
             yield maybeDeferred(self.engine.start)
-        except Exception:
+        except Exception as e:
             self.crawling = False
+            if isinstance(e, CloseSpider):
+                self.engine.close_spider_on_start(self.spider, reason=e.reason)
+                return None
             if self.engine is not None:
                 yield self.engine.close()
-            raise
+            raise e
 
     def _create_spider(self, *args: Any, **kwargs: Any) -> Spider:
         return self.spidercls.from_crawler(self, *args, **kwargs)
diff --git a/tests/spiders.py b/tests/spiders.py
index f29dea2a12b..00ea32314c2 100644
--- a/tests/spiders.py
+++ b/tests/spiders.py
@@ -3,12 +3,13 @@
 """
 import asyncio
 import time
+from typing import Iterable
 from urllib.parse import urlencode
 
 from twisted.internet import defer
 
 from scrapy import signals
-from scrapy.exceptions import StopDownload
+from scrapy.exceptions import CloseSpider, StopDownload
 from scrapy.http import Request
 from scrapy.item import Item
 from scrapy.linkextractors import LinkExtractor
@@ -276,6 +277,30 @@ def parse(self, response):
         self.raise_exception()
 
 
+class CloseExceptionSpider(FollowAllSpider):
+    _expected_message: str = None
+
+    def __init__(self, *args, **kwargs):
+        self._expected_message = (
+            kwargs["expected_message"] if "expected_message" in kwargs else "Error"
+        )
+        super().__init__(*args, **kwargs)
+
+
+class CloseExceptionStartSpider(CloseExceptionSpider):
+    def start_requests(self) -> Iterable[Request]:
+        raise CloseSpider(reason=self._expected_message)
+
+
+class CloseExceptionParseSpider(CloseExceptionSpider):
+    def start_requests(self) -> Iterable[Request]:
+        url = self.mockserver.url("/close_spider")
+        yield Request(url, callback=self.parse)
+
+    def parse(self, response):
+        raise CloseSpider(reason=self._expected_message)
+
+
 class BrokenStartRequestsSpider(FollowAllSpider):
     fail_before_yield = False
     fail_yielding = False
diff --git a/tests/test_closespider.py b/tests/test_closespider.py
index 38ede70e449..f2ca95dafa7 100644
--- a/tests/test_closespider.py
+++ b/tests/test_closespider.py
@@ -3,7 +3,14 @@
 
 from scrapy.utils.test import get_crawler
 from tests.mockserver import MockServer
-from tests.spiders import ErrorSpider, FollowAllSpider, ItemSpider, SlowSpider
+from tests.spiders import (
+    CloseExceptionParseSpider,
+    CloseExceptionStartSpider,
+    ErrorSpider,
+    FollowAllSpider,
+    ItemSpider,
+    SlowSpider,
+)
 
 
 class TestCloseSpider(TestCase):
@@ -64,3 +71,21 @@ def test_closespider_timeout_no_item(self):
         self.assertEqual(reason, "closespider_timeout_no_item")
         total_seconds = crawler.stats.get_value("elapsed_time_seconds")
         self.assertTrue(total_seconds >= timeout)
+
+    @defer.inlineCallbacks
+    def test_closespider_exception_handler(self):
+        expected_message_parse = "Raised on parse."
+        crawler_parse = get_crawler(CloseExceptionParseSpider)
+        yield crawler_parse.crawl(
+            mockserver=self.mockserver, expected_message=expected_message_parse
+        )
+        reason_parse = crawler_parse.spider.meta["close_reason"]
+        self.assertEqual(reason_parse, expected_message_parse)
+
+        expected_message_start = "Raised on start."
+        crawler_start = get_crawler(CloseExceptionStartSpider)
+        yield crawler_start.crawl(
+            mockserver=self.mockserver, expected_message=expected_message_start
+        )
+        reason_start = crawler_start.spider.meta["close_reason"]
+        self.assertEqual(reason_start, expected_message_start)

From 3bdb7352991793b68cd21a3a435f6d9971aecf6b Mon Sep 17 00:00:00 2001
From: Heorhii Korzh
Date: Thu, 16 Nov 2023 14:15:50 +0200
Subject: [PATCH 2/4] polished changes. removed unneded change in
 ExecutionEngine.close

---
 scrapy/core/engine.py |  4 ++--
 scrapy/crawler.py     | 15 ++++++++-------
 tests/spiders.py      |  7 +++----
 3 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/scrapy/core/engine.py b/scrapy/core/engine.py
index 0287fdd7fb7..3ea52fcd04f 100644
--- a/scrapy/core/engine.py
+++ b/scrapy/core/engine.py
@@ -141,7 +141,7 @@ def _finish_stopping_engine(_: Any) -> Generator[Deferred, Any, None]:
         )
         return dfd.addBoth(_finish_stopping_engine)
 
-    def close(self, close_spider_reason: str = None) -> Deferred:
+    def close(self) -> Deferred:
         """
         Gracefully close the execution engine.
         If it has already been started, stop it. In all cases, close the spider and the downloader.
@@ -150,7 +150,7 @@ def close(self, close_spider_reason: str = None) -> Deferred:
             return self.stop()  # will also close spider and downloader
         if self.spider is not None:
             return self.close_spider(
-                self.spider, reason=close_spider_reason or "shutdown"
+                self.spider, reason="shutdown"
             )  # will also close downloader
         self.downloader.close()
         return succeed(None)
diff --git a/scrapy/crawler.py b/scrapy/crawler.py
index 0a8bbf99c04..5878876af6b 100644
--- a/scrapy/crawler.py
+++ b/scrapy/crawler.py
@@ -155,17 +155,18 @@ def crawl(self, *args: Any, **kwargs: Any) -> Generator[Deferred, Any, None]:
             self._apply_settings()
             self._update_root_log_handler()
             self.engine = self._create_engine()
-            start_requests = iter(self.spider.start_requests())
-            yield self.engine.open_spider(self.spider, start_requests)
-            yield maybeDeferred(self.engine.start)
-        except Exception as e:
-            self.crawling = False
-            if isinstance(e, CloseSpider):
+            try:
+                start_requests = iter(self.spider.start_requests())
+                yield self.engine.open_spider(self.spider, start_requests)
+                yield maybeDeferred(self.engine.start)
+            except CloseSpider as e:
                 self.engine.close_spider_on_start(self.spider, reason=e.reason)
                 return None
+        except Exception:
+            self.crawling = False
             if self.engine is not None:
                 yield self.engine.close()
-            raise e
+            raise
 
     def _create_spider(self, *args: Any, **kwargs: Any) -> Spider:
         return self.spidercls.from_crawler(self, *args, **kwargs)
diff --git a/tests/spiders.py b/tests/spiders.py
index 00ea32314c2..5a733046bfe 100644
--- a/tests/spiders.py
+++ b/tests/spiders.py
@@ -278,12 +278,11 @@ def parse(self, response):
 
 
 class CloseExceptionSpider(FollowAllSpider):
-    _expected_message: str = None
+    _expected_message: str = "Error"
 
     def __init__(self, *args, **kwargs):
-        self._expected_message = (
-            kwargs["expected_message"] if "expected_message" in kwargs else "Error"
-        )
+        if "expected_message" in kwargs:
+            self._expected_message = kwargs["expected_message"]
         super().__init__(*args, **kwargs)
 
 

From 7b2d3e5f4c829ed43aab54643541a0e3b6843fe7 Mon Sep 17 00:00:00 2001
From: Heorhii Korzh
Date: Thu, 16 Nov 2023 16:52:48 +0200
Subject: [PATCH 3/4] call engine.start on calling
 engine.close_spider_before_start

---
 scrapy/core/engine.py | 3 ++-
 scrapy/crawler.py     | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/scrapy/core/engine.py b/scrapy/core/engine.py
index 3ea52fcd04f..bd40c9618fd 100644
--- a/scrapy/core/engine.py
+++ b/scrapy/core/engine.py
@@ -400,10 +400,11 @@ def _spider_idle(self) -> None:
             assert isinstance(ex, CloseSpider)  # typing
             self.close_spider(self.spider, reason=ex.reason)
 
-    def close_spider_on_start(self, spider: Spider, reason: str = "cancelled"):
+    def close_spider_before_start(self, spider: Spider, reason: str = "cancelled"):
         if self.slot is not None:
             raise RuntimeError("Engine slot is already assigned. Use self.close_spider")
 
+        self.start()
         nextcall_none = CallLaterOnce(lambda: None)
         scheduler = create_instance(
             self.scheduler_cls, settings=None, crawler=self.crawler
diff --git a/scrapy/crawler.py b/scrapy/crawler.py
index 5878876af6b..5104817bc38 100644
--- a/scrapy/crawler.py
+++ b/scrapy/crawler.py
@@ -160,7 +160,7 @@ def crawl(self, *args: Any, **kwargs: Any) -> Generator[Deferred, Any, None]:
                 yield self.engine.open_spider(self.spider, start_requests)
                 yield maybeDeferred(self.engine.start)
             except CloseSpider as e:
-                self.engine.close_spider_on_start(self.spider, reason=e.reason)
+                self.engine.close_spider_before_start(self.spider, reason=e.reason)
                 return None
             except Exception:
                 self.crawling = False

From 7db2755d021f00e761d7969c22706ca7cff2f1b3 Mon Sep 17 00:00:00 2001
From: Heorhii Korzh
Date: Fri, 17 Nov 2023 12:08:23 +0200
Subject: [PATCH 4/4] yield close_spider_before_start

---
 scrapy/core/engine.py | 2 +-
 scrapy/crawler.py     | 5 +++--
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/scrapy/core/engine.py b/scrapy/core/engine.py
index bd40c9618fd..beb88089459 100644
--- a/scrapy/core/engine.py
+++ b/scrapy/core/engine.py
@@ -410,7 +410,7 @@ def close_spider_before_start(self, spider: Spider, reason: str = "cancelled"):
             self.scheduler_cls, settings=None, crawler=self.crawler
         )
         self.slot = Slot((), True, nextcall_none, scheduler)
-        self.close_spider(spider, reason)
+        return self.close_spider(spider, reason)
 
     def close_spider(self, spider: Spider, reason: str = "cancelled") -> Deferred:
         """Close (cancel) spider and clear all its outstanding requests"""
diff --git a/scrapy/crawler.py b/scrapy/crawler.py
index 5104817bc38..43c4aa79e86 100644
--- a/scrapy/crawler.py
+++ b/scrapy/crawler.py
@@ -160,8 +160,9 @@ def crawl(self, *args: Any, **kwargs: Any) -> Generator[Deferred, Any, None]:
                 yield self.engine.open_spider(self.spider, start_requests)
                 yield maybeDeferred(self.engine.start)
             except CloseSpider as e:
-                self.engine.close_spider_before_start(self.spider, reason=e.reason)
-                return None
+                yield self.engine.close_spider_before_start(
+                    self.spider, reason=e.reason
+                )
             except Exception:
                 self.crawling = False
             if self.engine is not None:
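
For reference, the behaviour this patch series is after can be sketched with a spider that raises CloseSpider from start_requests(), mirroring the CloseExceptionStartSpider test case added in PATCH 1/4. The snippet below is an illustration only and is not part of the patches; the spider name, reason string, and settings are made up for the example. With the series applied, the crawl should end with the given close reason instead of the exception escaping Crawler.crawl().

# Illustrative usage sketch, not part of the patch series. The spider name
# and reason string are hypothetical; only the CloseSpider-from-start_requests
# scenario itself comes from the patches and tests above.
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.exceptions import CloseSpider


class AbortBeforeStartSpider(scrapy.Spider):
    name = "abort_before_start"

    def start_requests(self):
        # With the patches applied, raising CloseSpider here should close the
        # spider with reason "nothing to crawl" rather than propagating out of
        # Crawler.crawl().
        raise CloseSpider(reason="nothing to crawl")

    def parse(self, response):
        pass


if __name__ == "__main__":
    process = CrawlerProcess(settings={"LOG_LEVEL": "INFO"})
    process.crawl(AbortBeforeStartSpider)
    process.start()  # spider_closed should report the custom close reason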