Merge pull request #5077 from wRAR/deferred-typing
Add typing for middleware and coroutine related code.
wRAR committed Apr 13, 2021
2 parents 15edb3f + 08e4eaf commit 9bf9ab7
Showing 7 changed files with 93 additions and 61 deletions.
12 changes: 8 additions & 4 deletions scrapy/core/downloader/middleware.py
@@ -3,8 +3,12 @@
See documentation in docs/topics/downloader-middleware.rst
"""
from typing import Callable, Union

from twisted.internet import defer
from twisted.python.failure import Failure

from scrapy import Spider
from scrapy.exceptions import _InvalidOutput
from scrapy.http import Request, Response
from scrapy.middleware import MiddlewareManager
@@ -29,9 +33,9 @@ def _add_middleware(self, mw):
if hasattr(mw, 'process_exception'):
self.methods['process_exception'].appendleft(mw.process_exception)

def download(self, download_func, request, spider):
def download(self, download_func: Callable, request: Request, spider: Spider):
@defer.inlineCallbacks
def process_request(request):
def process_request(request: Request):
for method in self.methods['process_request']:
response = yield deferred_from_coro(method(request=request, spider=spider))
if response is not None and not isinstance(response, (Response, Request)):
@@ -44,7 +48,7 @@ def process_request(request):
return (yield download_func(request=request, spider=spider))

@defer.inlineCallbacks
def process_response(response):
def process_response(response: Union[Response, Request]):
if response is None:
raise TypeError("Received None in process_response")
elif isinstance(response, Request):
@@ -62,7 +66,7 @@ def process_response(response):
return response

@defer.inlineCallbacks
def process_exception(failure):
def process_exception(failure: Failure):
exception = failure.value
for method in self.methods['process_exception']:
response = yield deferred_from_coro(method(request=request, exception=exception, spider=spider))
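
For context, here is a minimal sketch (not part of this commit) of a downloader middleware whose hook signatures line up with the types the annotated chain above expects; the class name and header are illustrative.

```python
from typing import Optional, Union

from scrapy import Spider
from scrapy.http import Request, Response


class ExampleHeaderMiddleware:
    """Illustrative downloader middleware; not part of this commit."""

    def process_request(self, request: Request,
                        spider: Spider) -> Optional[Union[Request, Response]]:
        # Returning None lets the request continue down the middleware chain.
        request.headers.setdefault('X-Spider-Name', spider.name)
        return None

    def process_response(self, request: Request, response: Response,
                         spider: Spider) -> Union[Request, Response]:
        # Must return a Response or a Request; anything else is rejected by the chain above.
        return response

    def process_exception(self, request: Request, exception: Exception,
                          spider: Spider) -> Optional[Union[Request, Response]]:
        # Returning None lets the next process_exception method handle the error.
        return None
```
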
14 changes: 8 additions & 6 deletions scrapy/core/scraper.py
@@ -3,12 +3,14 @@

import logging
from collections import deque
from collections.abc import Iterable
from typing import Union

from itemadapter import is_item
from twisted.internet import defer
from twisted.python.failure import Failure

from scrapy import signals
from scrapy import signals, Spider
from scrapy.core.spidermw import SpiderMiddlewareManager
from scrapy.exceptions import CloseSpider, DropItem, IgnoreRequest
from scrapy.http import Request, Response
@@ -118,7 +120,7 @@ def _scrape_next(self, spider):
response, request, deferred = self.slot.next_response_request_deferred()
self._scrape(response, request, spider).chainDeferred(deferred)

def _scrape(self, result, request, spider):
def _scrape(self, result: Union[Response, Failure], request: Request, spider: Spider):
"""
Handle the downloaded response or failure through the spider callback/errback
"""
@@ -129,7 +131,7 @@ def _scrape(self, result, request, spider):
dfd.addCallback(self.handle_spider_output, request, result, spider)
return dfd

def _scrape2(self, result, request, spider):
def _scrape2(self, result: Union[Response, Failure], request: Request, spider: Spider):
"""
Handle the different cases of request's result been a Response or a Failure
"""
@@ -139,7 +141,7 @@ def _scrape2(self, result, request, spider):
dfd = self.call_spider(result, request, spider)
return dfd.addErrback(self._log_download_errors, result, request, spider)

def call_spider(self, result, request, spider):
def call_spider(self, result: Union[Response, Failure], request: Request, spider: Spider):
if isinstance(result, Response):
if getattr(result, "request", None) is None:
result.request = request
@@ -154,7 +156,7 @@ def call_spider(self, result, request, spider):
dfd.addErrback(request.errback)
return dfd.addCallback(iterate_spider_output)

def handle_spider_error(self, _failure, request, response, spider):
def handle_spider_error(self, _failure: Failure, request: Request, response: Response, spider: Spider):
exc = _failure.value
if isinstance(exc, CloseSpider):
self.crawler.engine.close_spider(spider, exc.reason or 'cancelled')
@@ -175,7 +177,7 @@ def handle_spider_error(self, _failure, request, response, spider):
spider=spider
)

def handle_spider_output(self, result, request, response, spider):
def handle_spider_output(self, result: Iterable, request: Request, response: Response, spider: Spider):
if not result:
return defer_succeed(None)
it = iter_errback(result, self.handle_spider_error, request, response, spider)
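
The `Union[Response, Failure]` annotations above capture the two possible outcomes of a download. A hypothetical spider below shows how each outcome is routed: `call_spider` sends a `Response` to the request's callback and a `Failure` to its errback (spider name and URL are illustrative).

```python
from twisted.python.failure import Failure

from scrapy import Request, Spider
from scrapy.http import Response


class ExampleSpider(Spider):
    name = 'example'

    def start_requests(self):
        yield Request('https://example.com', callback=self.parse_page,
                      errback=self.handle_failure)

    def parse_page(self, response: Response):
        # call_spider() routes a downloaded Response to the callback.
        yield {'url': response.url, 'status': response.status}

    def handle_failure(self, failure: Failure):
        # call_spider() routes a download Failure to the errback.
        self.logger.error('Request failed: %s', failure.value)
```
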
34 changes: 23 additions & 11 deletions scrapy/core/spidermw.py
@@ -4,18 +4,25 @@
See documentation in docs/topics/spider-middleware.rst
"""
from itertools import islice
from typing import Any, Callable, Generator, Iterable, Union

from twisted.internet.defer import Deferred
from twisted.python.failure import Failure

from scrapy import Request, Spider
from scrapy.exceptions import _InvalidOutput
from scrapy.http import Response
from scrapy.middleware import MiddlewareManager
from scrapy.utils.conf import build_component_list
from scrapy.utils.defer import mustbe_deferred
from scrapy.utils.python import MutableChain


def _isiterable(possible_iterator):
return hasattr(possible_iterator, '__iter__')
ScrapeFunc = Callable[[Union[Response, Failure], Request, Spider], Any]


def _isiterable(o) -> bool:
return isinstance(o, Iterable)


class SpiderMiddlewareManager(MiddlewareManager):
@@ -37,7 +44,8 @@ def _add_middleware(self, mw):
process_spider_exception = getattr(mw, 'process_spider_exception', None)
self.methods['process_spider_exception'].appendleft(process_spider_exception)

def _process_spider_input(self, scrape_func, response, request, spider):
def _process_spider_input(self, scrape_func: ScrapeFunc, response: Response, request: Request,
spider: Spider) -> Any:
for method in self.methods['process_spider_input']:
try:
result = method(response=response, spider=spider)
@@ -51,7 +59,8 @@ def _process_spider_input(self, scrape_func, response, request, spider):
return scrape_func(Failure(), request, spider)
return scrape_func(response, request, spider)

def _evaluate_iterable(self, response, spider, iterable, exception_processor_index, recover_to):
def _evaluate_iterable(self, response: Response, spider: Spider, iterable: Iterable,
exception_processor_index: int, recover_to: MutableChain) -> Generator:
try:
for r in iterable:
yield r
@@ -62,7 +71,8 @@ def _evaluate_iterable(self, response, spider, iterable, exception_processor_ind
raise
recover_to.extend(exception_result)
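
The `_evaluate_iterable` hunk above wraps an iterable in a generator so that an exception raised mid-iteration is converted into recovered output rather than propagated. A standalone sketch of that pattern, outside Scrapy and with made-up names, might look like this:

```python
from typing import Generator, Iterable, List


def evaluate_with_recovery(iterable: Iterable, recover_to: List, on_error) -> Generator:
    try:
        for item in iterable:
            yield item
    except Exception as exc:
        # Instead of propagating, turn the exception into replacement results
        # that are collected on a separate "recovered" chain.
        recover_to.extend(on_error(exc))


def flaky():
    yield 1
    yield 2
    raise ValueError('boom')


recovered: List = []
main = list(evaluate_with_recovery(flaky(), recovered, lambda exc: [-1]))
print(main, recovered)  # [1, 2] [-1]
```
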

def _process_spider_exception(self, response, spider, _failure, start_index=0):
def _process_spider_exception(self, response: Response, spider: Spider, _failure: Failure,
start_index: int = 0) -> Union[Failure, MutableChain]:
exception = _failure.value
# don't handle _InvalidOutput exception
if isinstance(exception, _InvalidOutput):
@@ -84,7 +94,8 @@ def _process_spider_exception(self, response, spider, _failure, start_index=0):
raise _InvalidOutput(msg)
return _failure

def _process_spider_output(self, response, spider, result, start_index=0):
def _process_spider_output(self, response: Response, spider: Spider,
result: Iterable, start_index: int = 0) -> MutableChain:
# items in this iterable do not need to go through the process_spider_output
# chain, they went through it already from the process_spider_exception method
recovered = MutableChain()
@@ -110,21 +121,22 @@ def _process_spider_output(self, response, spider, result, start_index=0):

return MutableChain(result, recovered)

def _process_callback_output(self, response, spider, result):
def _process_callback_output(self, response: Response, spider: Spider, result: Iterable) -> MutableChain:
recovered = MutableChain()
result = self._evaluate_iterable(response, spider, result, 0, recovered)
return MutableChain(self._process_spider_output(response, spider, result), recovered)

def scrape_response(self, scrape_func, response, request, spider):
def process_callback_output(result):
def scrape_response(self, scrape_func: ScrapeFunc, response: Response, request: Request,
spider: Spider) -> Deferred:
def process_callback_output(result: Iterable) -> MutableChain:
return self._process_callback_output(response, spider, result)

def process_spider_exception(_failure):
def process_spider_exception(_failure: Failure) -> Union[Failure, MutableChain]:
return self._process_spider_exception(response, spider, _failure)

dfd = mustbe_deferred(self._process_spider_input, scrape_func, response, request, spider)
dfd.addCallbacks(callback=process_callback_output, errback=process_spider_exception)
return dfd

def process_start_requests(self, start_requests, spider):
def process_start_requests(self, start_requests, spider: Spider) -> Deferred:
return self._process_chain('process_start_requests', start_requests, spider)
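
The new `ScrapeFunc` alias describes any callable taking a `Union[Response, Failure]`, the originating `Request`, and the `Spider`. A hypothetical function satisfying the alias (re-declared here so the sketch stays self-contained) could be:

```python
from typing import Any, Callable, Union

from twisted.python.failure import Failure

from scrapy import Request, Spider
from scrapy.http import Response

ScrapeFunc = Callable[[Union[Response, Failure], Request, Spider], Any]


def my_scrape(result: Union[Response, Failure], request: Request, spider: Spider) -> Any:
    # Mirrors the Response/Failure split handled in Scraper.call_spider.
    if isinstance(result, Response):
        return {'status': result.status}
    return {'error': repr(result.value)}


scrape_func: ScrapeFunc = my_scrape  # a type checker such as mypy accepts this assignment
```
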
25 changes: 15 additions & 10 deletions scrapy/middleware.py
@@ -1,8 +1,13 @@
from collections import defaultdict, deque
import logging
import pprint
from collections import defaultdict, deque
from typing import Callable, Deque, Dict

from twisted.internet.defer import Deferred

from scrapy import Spider
from scrapy.exceptions import NotConfigured
from scrapy.settings import Settings
from scrapy.utils.misc import create_instance, load_object
from scrapy.utils.defer import process_parallel, process_chain, process_chain_both

@@ -16,16 +21,16 @@ class MiddlewareManager:

def __init__(self, *middlewares):
self.middlewares = middlewares
self.methods = defaultdict(deque)
self.methods: Dict[str, Deque[Callable]] = defaultdict(deque)
for mw in middlewares:
self._add_middleware(mw)

@classmethod
def _get_mwlist_from_settings(cls, settings):
def _get_mwlist_from_settings(cls, settings: Settings) -> list:
raise NotImplementedError

@classmethod
def from_settings(cls, settings, crawler=None):
def from_settings(cls, settings: Settings, crawler=None):
mwlist = cls._get_mwlist_from_settings(settings)
middlewares = []
enabled = []
@@ -52,24 +57,24 @@ def from_settings(cls, settings, crawler=None):
def from_crawler(cls, crawler):
return cls.from_settings(crawler.settings, crawler)

def _add_middleware(self, mw):
def _add_middleware(self, mw) -> None:
if hasattr(mw, 'open_spider'):
self.methods['open_spider'].append(mw.open_spider)
if hasattr(mw, 'close_spider'):
self.methods['close_spider'].appendleft(mw.close_spider)

def _process_parallel(self, methodname, obj, *args):
def _process_parallel(self, methodname: str, obj, *args) -> Deferred:
return process_parallel(self.methods[methodname], obj, *args)

def _process_chain(self, methodname, obj, *args):
def _process_chain(self, methodname: str, obj, *args) -> Deferred:
return process_chain(self.methods[methodname], obj, *args)

def _process_chain_both(self, cb_methodname, eb_methodname, obj, *args):
def _process_chain_both(self, cb_methodname: str, eb_methodname: str, obj, *args) -> Deferred:
return process_chain_both(self.methods[cb_methodname],
self.methods[eb_methodname], obj, *args)

def open_spider(self, spider):
def open_spider(self, spider: Spider) -> Deferred:
return self._process_parallel('open_spider', spider)

def close_spider(self, spider):
def close_spider(self, spider: Spider) -> Deferred:
return self._process_parallel('close_spider', spider)
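
The `methods` registry is now annotated as `Dict[str, Deque[Callable]]`. A standalone sketch (with invented middleware classes, not Scrapy code) shows why a deque is used: `append` keeps `open_spider` handlers in registration order, while `appendleft` reverses the order of `close_spider` handlers, matching the calls in `_add_middleware` above.

```python
from collections import defaultdict, deque
from typing import Callable, Deque, Dict

methods: Dict[str, Deque[Callable]] = defaultdict(deque)


class MiddlewareA:
    def open_spider(self, spider): print('A open')
    def close_spider(self, spider): print('A close')


class MiddlewareB:
    def open_spider(self, spider): print('B open')
    def close_spider(self, spider): print('B close')


for mw in (MiddlewareA(), MiddlewareB()):
    methods['open_spider'].append(mw.open_spider)        # append: called first-to-last
    methods['close_spider'].appendleft(mw.close_spider)  # appendleft: called last-to-first

for method in methods['open_spider']:
    method(spider=None)   # A open, B open
for method in methods['close_spider']:
    method(spider=None)   # B close, A close
```
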
5 changes: 4 additions & 1 deletion scrapy/utils/asyncgen.py
@@ -1,4 +1,7 @@
async def collect_asyncgen(result):
from collections.abc import AsyncIterable


async def collect_asyncgen(result: AsyncIterable):
results = []
async for x in result:
results.append(x)
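
A brief usage sketch for the newly annotated `collect_asyncgen`: it consumes any `AsyncIterable` and returns the collected items as a list (the async generator below is made up for illustration).

```python
import asyncio

from scrapy.utils.asyncgen import collect_asyncgen


async def numbers():
    for i in range(3):
        yield i


async def main():
    items = await collect_asyncgen(numbers())
    print(items)  # [0, 1, 2]


asyncio.run(main())
```
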
