Commit

Scan callbacks/errbacks for return statements with values different than None

elacuesta committed Aug 26, 2019
1 parent 3abe7e6 commit 8d6c2ec
Showing 6 changed files with 218 additions and 16 deletions.
20 changes: 12 additions & 8 deletions scrapy/core/scraper.py
@@ -9,7 +9,7 @@

from scrapy.utils.defer import defer_result, defer_succeed, parallel, iter_errback
from scrapy.utils.spider import iterate_spider_output
from scrapy.utils.misc import load_object
from scrapy.utils.misc import load_object, warn_on_generator_with_return_value
from scrapy.utils.log import logformatter_adapter, failure_to_exc_info
from scrapy.exceptions import CloseSpider, DropItem, IgnoreRequest
from scrapy import signals
@@ -18,6 +18,7 @@
from scrapy.core.spidermw import SpiderMiddlewareManager
from scrapy.utils.request import referer_str


logger = logging.getLogger(__name__)


@@ -99,11 +100,13 @@ def _check_if_closing(self, spider, slot):
def enqueue_scrape(self, response, request, spider):
slot = self.slot
dfd = slot.add_response_request(response, request)

def finish_scraping(_):
slot.finish_response(response, request)
self._check_if_closing(spider, slot)
self._scrape_next(spider, slot)
return _

dfd.addBoth(finish_scraping)
dfd.addErrback(
lambda f: logger.error('Scraper bug processing %(request)s',
@@ -123,7 +126,7 @@ def _scrape(self, response, request, spider):
callback/errback"""
assert isinstance(response, (Response, Failure))

dfd = self._scrape2(response, request, spider) # returns spiders processed output
dfd = self._scrape2(response, request, spider) # returns spiders processed output
dfd.addErrback(self.handle_spider_error, request, response, spider)
dfd.addCallback(self.handle_spider_output, request, response, spider)
return dfd
@@ -142,7 +145,10 @@ def _scrape2(self, request_result, request, spider):
def call_spider(self, result, request, spider):
result.request = request
dfd = defer_result(result)
dfd.addCallbacks(callback=request.callback or spider.parse,
callback = request.callback or spider.parse
warn_on_generator_with_return_value(spider, callback)
warn_on_generator_with_return_value(spider, request.errback)
dfd.addCallbacks(callback=callback,
errback=request.errback,
callbackKeywords=request.cb_kwargs)
return dfd.addCallback(iterate_spider_output)
@@ -172,8 +178,8 @@ def handle_spider_output(self, result, request, response, spider):
if not result:
return defer_succeed(None)
it = iter_errback(result, self.handle_spider_error, request, response, spider)
dfd = parallel(it, self.concurrent_items,
self._process_spidermw_output, request, response, spider)
dfd = parallel(it, self.concurrent_items, self._process_spidermw_output,
request, response, spider)
return dfd

def _process_spidermw_output(self, output, request, response, spider):
@@ -200,8 +206,7 @@ def _log_download_errors(self, spider_failure, download_failure, request, spider
"""Log and silence errors that come from the engine (typically download
errors that got propagated thru here)
"""
if (isinstance(download_failure, Failure) and
not download_failure.check(IgnoreRequest)):
if isinstance(download_failure, Failure) and not download_failure.check(IgnoreRequest):
if download_failure.frames:
logger.error('Error downloading %(request)s',
{'request': request},
@@ -242,4 +247,3 @@ def _itemproc_finished(self, output, item, response, spider):
return self.signals.send_catch_log_deferred(
signal=signals.item_scraped, item=output, response=response,
spider=spider)
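
For context on the call_spider change above: in a generator function, a "return <value>" statement does not deliver that value to whoever iterates the generator; it only becomes the value attached to the resulting StopIteration, so an item "returned" from a generator callback is silently dropped. A minimal sketch of the pitfall (illustrative only, not part of this commit):

# Why "return <value>" inside a generator callback is suspicious:
# the returned value never reaches the code that consumes the generator.
def parse():
    yield {'url': 'https://example.org/a'}
    return {'url': 'https://example.org/b'}  # only becomes StopIteration.value

items = list(parse())
print(items)  # [{'url': 'https://example.org/a'}] -- the second dict is lost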

36 changes: 33 additions & 3 deletions scrapy/utils/datatypes.py
@@ -8,6 +8,7 @@
import copy
import collections
import warnings
import weakref

import six

@@ -307,19 +308,48 @@ class LocalCache(collections.OrderedDict):
"""Dictionary with a finite number of keys.
Older items expires first.
"""

def __init__(self, limit=None):
super(LocalCache, self).__init__()
self.limit = limit

def __setitem__(self, key, value):
while len(self) >= self.limit:
self.popitem(last=False)
if self.limit:
while len(self) >= self.limit:
self.popitem(last=False)
super(LocalCache, self).__setitem__(key, value)


class LocalWeakReferencedCache(weakref.WeakKeyDictionary):
"""
A weakref.WeakKeyDictionary implementation that uses LocalCache as its
underlying data structure, making it ordered and capable of being size-limited.
Useful for memoization, while avoiding keeping received
arguments in memory only because of the cached references.
Note: like LocalCache and unlike weakref.WeakKeyDictionary,
it cannot be instantiated with an initial dictionary.
"""

def __init__(self, limit=None):
super(LocalWeakReferencedCache, self).__init__()
self.data = LocalCache(limit=limit)

def __setitem__(self, key, value):
try:
super(LocalWeakReferencedCache, self).__setitem__(key, value)
except TypeError:
pass # key is not weak-referenceable, skip caching

def __getitem__(self, key):
try:
return super(LocalWeakReferencedCache, self).__getitem__(key)
except TypeError:
return None # key is not weak-referenceable, it's not cached


class SequenceExclude(object):
"""Object to test if an item is NOT within some sequence."""

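For illustration, a minimal sketch (not part of the diff) of how the new LocalWeakReferencedCache is meant to behave; the Key class is a hypothetical stand-in, since built-in types such as int are not weak-referenceable:

import gc
from scrapy.utils.datatypes import LocalWeakReferencedCache

class Key(object):  # plain user-defined objects can be weakly referenced
    pass

cache = LocalWeakReferencedCache(limit=2)
k1, k2 = Key(), Key()
cache[k1] = 'a'
cache[k2] = 'b'
cache[1] = 'c'        # int keys raise TypeError in weakref.ref and are skipped
print(cache[1])       # None: non-weak-referenceable keys are never cached
print(len(cache))     # 2
del k1                # dropping the last strong reference to a key...
gc.collect()
print(len(cache))     # ...removes its entry from the cache: prints 1
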
46 changes: 45 additions & 1 deletion scrapy/utils/misc.py
@@ -1,14 +1,19 @@
"""Helper functions which don't fit anywhere else"""
import ast
import inspect
import os
import re
import hashlib
import warnings
from contextlib import contextmanager
from importlib import import_module
from pkgutil import iter_modules
from textwrap import dedent

import six
from w3lib.html import replace_entities

from scrapy.utils.datatypes import LocalWeakReferencedCache
from scrapy.utils.python import flatten, to_unicode
from scrapy.item import BaseItem

@@ -42,7 +47,7 @@ def load_object(path):
except ValueError:
raise ValueError("Error loading object '%s': not a full path" % path)

module, name = path[:dot], path[dot+1:]
module, name = path[:dot], path[dot + 1:]
mod = import_module(module)

try:
@@ -162,3 +167,42 @@ def set_environ(**kwargs):
del os.environ[k]
else:
os.environ[k] = v


_generator_callbacks_cache = LocalWeakReferencedCache(limit=128)

def is_generator_with_return_value(callable):
"""
Returns True if a callable is a generator function which includes a
'return' statement with a value different than None, False otherwise
"""
if callable in _generator_callbacks_cache:
return _generator_callbacks_cache[callable]

def returns_none(return_node):
value = return_node.value
return value is None or isinstance(value, ast.NameConstant) and value.value is None

if inspect.isgeneratorfunction(callable):
tree = ast.parse(dedent(inspect.getsource(callable)))
for node in ast.walk(tree):
if isinstance(node, ast.Return) and not returns_none(node):
_generator_callbacks_cache[callable] = True
return _generator_callbacks_cache[callable]

_generator_callbacks_cache[callable] = False
return _generator_callbacks_cache[callable]

def warn_on_generator_with_return_value(spider, callable):
"""
Logs a warning if a callable is a generator function and includes
a 'return' statement with a value different than None
"""
if is_generator_with_return_value(callable):
warnings.warn(
'The "{}.{}" method is a generator and includes a "return" statement with a '
'value different than None. This could lead to unexpected behaviour. Please see '
'https://docs.python.org/3/reference/simple_stmts.html#the-return-statement '
'for details about the semantics of the "return" statement within generators'
.format(spider.__class__.__name__, callable.__name__), stacklevel=2,
)
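
A small usage sketch (illustrative, not part of the commit, Python 3 only) showing the warning the new helper emits; ExampleSpider is a hypothetical spider defined only for this demonstration:

import warnings
from scrapy import Spider
from scrapy.utils.misc import warn_on_generator_with_return_value

class ExampleSpider(Spider):
    name = 'example'

    def parse(self, response):
        yield {'url': response.url}
        return 'done'  # non-None return value inside a generator callback

spider = ExampleSpider()
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter('always')
    warn_on_generator_with_return_value(spider, spider.parse)
print(caught[0].message)
# The "ExampleSpider.parse" method is a generator and includes a "return"
# statement with a value different than None. ...
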
84 changes: 82 additions & 2 deletions tests/test_utils_datatypes.py
@@ -7,7 +7,8 @@
else:
from collections.abc import Mapping, MutableMapping

from scrapy.utils.datatypes import CaselessDict, SequenceExclude
from scrapy.http import Request
from scrapy.utils.datatypes import CaselessDict, SequenceExclude, LocalCache, LocalWeakReferencedCache


__doctests__ = ['scrapy.utils.datatypes']
@@ -242,6 +243,85 @@ def test_set(self):
for v in [-3, "test", 1.1]:
self.assertNotIn(v, d)


class LocalCacheTest(unittest.TestCase):

def test_cache_with_limit(self):
cache = LocalCache(limit=2)
cache['a'] = 1
cache['b'] = 2
cache['c'] = 3
self.assertEqual(len(cache), 2)
self.assertNotIn('a', cache)
self.assertIn('b', cache)
self.assertIn('c', cache)
self.assertEqual(cache['b'], 2)
self.assertEqual(cache['c'], 3)

def test_cache_without_limit(self):
max = 10**4
cache = LocalCache()
for x in range(max):
cache[str(x)] = x
self.assertEqual(len(cache), max)
for x in range(max):
self.assertIn(str(x), cache)
self.assertEqual(cache[str(x)], x)


class LocalWeakReferencedCacheTest(unittest.TestCase):

def test_cache_with_limit(self):
cache = LocalWeakReferencedCache(limit=2)
r1 = Request('https://example.org')
r2 = Request('https://example.com')
r3 = Request('https://example.net')
cache[r1] = 1
cache[r2] = 2
cache[r3] = 3
self.assertEqual(len(cache), 2)
self.assertNotIn(r1, cache)
self.assertIn(r2, cache)
self.assertIn(r3, cache)
self.assertEqual(cache[r2], 2)
self.assertEqual(cache[r3], 3)
del r2
self.assertEqual(len(cache), 1)

def test_cache_non_weak_referenceable_objects(self):
cache = LocalWeakReferencedCache()
k1 = None
k2 = 1
k3 = [1, 2, 3]
cache[k1] = 1
cache[k2] = 2
cache[k3] = 3
self.assertNotIn(k1, cache)
self.assertNotIn(k2, cache)
self.assertNotIn(k3, cache)
self.assertEqual(len(cache), 0)

def test_cache_without_limit(self):
max = 10**4
cache = LocalWeakReferencedCache()
refs = []
for x in range(max):
refs.append(Request('https://example.org/{}'.format(x)))
cache[refs[-1]] = x
self.assertEqual(len(cache), max)
for i, r in enumerate(refs):
self.assertIn(r, cache)
self.assertEqual(cache[r], i)
del r # delete reference to the last object in the list

# delete half of the objects, make sure that is reflected in the cache
for _ in range(max//2):
refs.pop()
self.assertEqual(len(cache), max//2)
for i, r in enumerate(refs):
self.assertIn(r, cache)
self.assertEqual(cache[r], i)


if __name__ == "__main__":
unittest.main()

4 changes: 2 additions & 2 deletions tests/test_utils_misc/__init__.py
@@ -4,9 +4,9 @@

from scrapy.item import Item, Field
from scrapy.utils.misc import arg_to_iter, create_instance, load_object, set_environ, walk_modules

from tests import mock


__doctests__ = ['scrapy.utils.misc']


@@ -74,7 +74,7 @@ class TestItem(Item):
self.assertEqual(list(arg_to_iter(100)), [100])
self.assertEqual(list(arg_to_iter(l for l in 'abc')), ['a', 'b', 'c'])
self.assertEqual(list(arg_to_iter([1, 2, 3])), [1, 2, 3])
self.assertEqual(list(arg_to_iter({'a':1})), [{'a': 1}])
self.assertEqual(list(arg_to_iter({'a': 1})), [{'a': 1}])
self.assertEqual(list(arg_to_iter(TestItem(name="john"))), [TestItem(name="john")])

def test_create_instance(self):
44 changes: 44 additions & 0 deletions (new file)
@@ -0,0 +1,44 @@
import unittest

from scrapy.utils.misc import is_generator_with_return_value


class UtilsMiscPy3TestCase(unittest.TestCase):

def test_generators_with_return_statements(self):
"""
'return with argument inside generator' is a SyntaxError before Python 3.3
"""
def f():
yield 1
return 2

def g():
yield 1
return 'asdf'

def h():
yield 1
return None

def i():
yield 1
return

def j():
yield 1

def k():
yield 1
yield from g()

assert is_generator_with_return_value(f)
assert is_generator_with_return_value(g)
assert not is_generator_with_return_value(h)
assert not is_generator_with_return_value(i)
assert not is_generator_with_return_value(j)
assert not is_generator_with_return_value(k) # not inspecting recursively


if __name__ == '__main__':
unittest.main()
