Deprecate scrapy.item.BaseItem #4534

Merged · 12 commits · May 15, 2020
4 changes: 2 additions & 2 deletions docs/faq.rst

@@ -342,14 +342,14 @@ method for this purpose. For example::

     from copy import deepcopy

-    from scrapy.item import BaseItem
+    from scrapy.item import Item


     class MultiplyItemsMiddleware:

         def process_spider_output(self, response, result, spider):
             for item in result:
-                if isinstance(item, (BaseItem, dict)):
+                if isinstance(item, (Item, dict)):
                     for _ in range(item['multiply_by']):
                         yield deepcopy(item)

2 changes: 0 additions & 2 deletions docs/topics/items.rst

@@ -257,6 +257,4 @@ Field objects
 Other classes related to Item
 =============================

-.. autoclass:: BaseItem
-
 .. autoclass:: ItemMeta

4 changes: 2 additions & 2 deletions pytest.ini

@@ -153,7 +153,7 @@ flake8-ignore =
     scrapy/exceptions.py E501
     scrapy/exporters.py E501
     scrapy/interfaces.py E501
-    scrapy/item.py E501 E128
+    scrapy/item.py E501
     scrapy/link.py E501
     scrapy/logformatter.py E501
     scrapy/mail.py E402 E128 E501

@@ -204,7 +204,7 @@ flake8-ignore =
     tests/test_http_headers.py E501
     tests/test_http_request.py E402 E501 E128 E128
     tests/test_http_response.py E501 E128
-    tests/test_item.py E128 F841
+    tests/test_item.py E128 F841 E501
     tests/test_link.py E501
     tests/test_linkextractors.py E501 E128
     tests/test_loader.py E501 E741 E128

4 changes: 2 additions & 2 deletions scrapy/commands/parse.py

@@ -5,7 +5,7 @@

 from scrapy.commands import ScrapyCommand
 from scrapy.http import Request
-from scrapy.item import BaseItem
+from scrapy.item import _BaseItem
 from scrapy.utils import display
 from scrapy.utils.conf import arglist_to_dict
 from scrapy.utils.spider import iterate_spider_output, spidercls_for_request

@@ -117,7 +117,7 @@ def run_callback(self, response, callback, cb_kwargs=None):
         items, requests = [], []

         for x in iterate_spider_output(callback(response, **cb_kwargs)):
-            if isinstance(x, (BaseItem, dict)):
+            if isinstance(x, (_BaseItem, dict)):
                 items.append(x)
             elif isinstance(x, Request):
                 requests.append(x)

8 changes: 4 additions & 4 deletions scrapy/contracts/default.py

@@ -1,6 +1,6 @@
 import json

-from scrapy.item import BaseItem
+from scrapy.item import _BaseItem
 from scrapy.http import Request
 from scrapy.exceptions import ContractFail

@@ -51,8 +51,8 @@ class ReturnsContract(Contract):
     objects = {
         'request': Request,
         'requests': Request,
-        'item': (BaseItem, dict),
-        'items': (BaseItem, dict),
+        'item': (_BaseItem, dict),
+        'items': (_BaseItem, dict),
     }

     def __init__(self, *args, **kwargs):

@@ -103,7 +103,7 @@ class ScrapesContract(Contract):

     def post_process(self, output):
         for x in output:
-            if isinstance(x, (_BaseItem, dict)):
+            if isinstance(x, (_BaseItem, dict)):
                 missing = [arg for arg in self.args if arg not in x]
                 if missing:
                     raise ContractFail(

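The objects mapping above is what backs the built-in @returns contract, so item-returning callbacks keep validating against both Item subclasses and plain dicts after this change. A minimal sketch of a spider whose contracts would exercise these isinstance checks when run through `scrapy check` (the spider itself is hypothetical, the contract syntax is standard):

    import scrapy


    class ExampleSpider(scrapy.Spider):
        name = 'example'

        def parse(self, response):
            """Expect 1 to 10 items and no follow-up requests.

            @url http://www.example.com
            @returns items 1 10
            @returns requests 0 0
            """
            # A plain dict counts as an item for ReturnsContract,
            # per the (_BaseItem, dict) tuples above.
            yield {'url': response.url}
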
4 changes: 2 additions & 2 deletions scrapy/core/scraper.py

@@ -14,7 +14,7 @@

 from scrapy.exceptions import CloseSpider, DropItem, IgnoreRequest
 from scrapy import signals
 from scrapy.http import Request, Response
-from scrapy.item import BaseItem
+from scrapy.item import _BaseItem
 from scrapy.core.spidermw import SpiderMiddlewareManager

@@ -191,7 +191,7 @@ def _process_spidermw_output(self, output, request, response, spider):
         """
         if isinstance(output, Request):
             self.crawler.engine.crawl(request=output, spider=spider)
-        elif isinstance(output, (BaseItem, dict)):
+        elif isinstance(output, (_BaseItem, dict)):
             self.slot.itemproc_size += 1
             dfd = self.itemproc.process_item(output, spider)
             dfd.addBoth(self._itemproc_finished, output, response, spider)

4 changes: 2 additions & 2 deletions scrapy/exporters.py

@@ -12,7 +12,7 @@

 from scrapy.utils.serialize import ScrapyJSONEncoder
 from scrapy.utils.python import to_bytes, to_unicode, is_listlike
-from scrapy.item import BaseItem
+from scrapy.item import _BaseItem
 from scrapy.exceptions import ScrapyDeprecationWarning

@@ -312,7 +312,7 @@ def serialize_field(self, field, name, value):
         return serializer(value)

     def _serialize_value(self, value):
-        if isinstance(value, BaseItem):
+        if isinstance(value, _BaseItem):
             return self.export_item(value)
         if isinstance(value, dict):
             return dict(self._serialize_dict(value))

64 changes: 45 additions & 19 deletions scrapy/item.py

@@ -14,28 +14,39 @@
 from scrapy.utils.trackref import object_ref


-class BaseItem(object_ref):
-    """Base class for all scraped items.
-
-    In Scrapy, an object is considered an *item* if it is an instance of either
-    :class:`BaseItem` or :class:`dict`. For example, when the output of a
-    spider callback is evaluated, only instances of :class:`BaseItem` or
-    :class:`dict` are passed to :ref:`item pipelines <topics-item-pipeline>`.
-
-    If you need instances of a custom class to be considered items by Scrapy,
-    you must inherit from either :class:`BaseItem` or :class:`dict`.
+class _BaseItem(object_ref):
+    """
+    Temporary class used internally to avoid the deprecation
+    warning raised by isinstance checks using BaseItem.
+    """
+    pass


+class _BaseItemMeta(ABCMeta):

Member commented:

It looks like we are trying to reinvent the create_deprecated_class util from scrapy/utils/deprecate.py:

def create_deprecated_class(
        name,
        new_class,
        clsdict=None,
        warn_category=ScrapyDeprecationWarning,
        warn_once=True,
        old_class_path=None,
        new_class_path=None,
        subclass_warn_message="{cls} inherits from deprecated class {old}, please inherit from {new}.",
        instance_warn_message="{cls} is deprecated, instantiate {new} instead."):
    """
    Return a "deprecated" class that causes its subclasses to issue a warning.
    Subclasses of ``new_class`` are considered subclasses of this class.
    It also warns when the deprecated class is instantiated, but do not when
    its subclasses are instantiated.

    It can be used to rename a base class in a library. For example, if we
    have

        class OldName(SomeClass):
            # ...

    and we want to rename it to NewName, we can do the following::

        class NewName(SomeClass):
            # ...

        OldName = create_deprecated_class('OldName', NewName)

    Then, if user class inherits from OldName, warning is issued. Also, if
    some code uses ``issubclass(sub, OldName)`` or ``isinstance(sub(), OldName)``
    checks they'll still return True if sub is a subclass of NewName instead of
    OldName.
    """

    class DeprecatedClass(new_class.__class__):

        deprecated_class = None
        warned_on_subclass = False

        def __new__(metacls, name, bases, clsdict_):
            cls = super(DeprecatedClass, metacls).__new__(metacls, name, bases, clsdict_)
            if metacls.deprecated_class is None:
                metacls.deprecated_class = cls
            return cls

        def __init__(cls, name, bases, clsdict_):
            meta = cls.__class__
            old = meta.deprecated_class
            if old in bases and not (warn_once and meta.warned_on_subclass):
                meta.warned_on_subclass = True
                msg = subclass_warn_message.format(cls=_clspath(cls),
                                                   old=_clspath(old, old_class_path),
                                                   new=_clspath(new_class, new_class_path))
                if warn_once:
                    msg += ' (warning only on first subclass, there may be others)'
                warnings.warn(msg, warn_category, stacklevel=2)
            super(DeprecatedClass, cls).__init__(name, bases, clsdict_)

        # see https://www.python.org/dev/peps/pep-3119/#overloading-isinstance-and-issubclass
        # and https://docs.python.org/reference/datamodel.html#customizing-instance-and-subclass-checks
        # for implementation details
        def __instancecheck__(cls, inst):
            return any(cls.__subclasscheck__(c)
                       for c in {type(inst), inst.__class__})

        def __subclasscheck__(cls, sub):
            if cls is not DeprecatedClass.deprecated_class:
                # we should do the magic only if second `issubclass` argument
                # is the deprecated class itself - subclasses of the
                # deprecated class should not use custom `__subclasscheck__`
                # method.
                return super(DeprecatedClass, cls).__subclasscheck__(sub)

            if not inspect.isclass(sub):
                raise TypeError("issubclass() arg 1 must be a class")

            mro = getattr(sub, '__mro__', ())
            return any(c in {cls, new_class} for c in mro)

        def __call__(cls, *args, **kwargs):
            old = DeprecatedClass.deprecated_class
            if cls is old:
                msg = instance_warn_message.format(cls=_clspath(cls, old_class_path),
                                                   new=_clspath(new_class, new_class_path))
                warnings.warn(msg, warn_category, stacklevel=2)
            return super(DeprecatedClass, cls).__call__(*args, **kwargs)

    deprecated_cls = DeprecatedClass(name, (new_class,), clsdict or {})

    try:
        frm = inspect.stack()[1]
        parent_module = inspect.getmodule(frm[0])
        if parent_module is not None:
            deprecated_cls.__module__ = parent_module.__name__
    except Exception as e:
        # Sometimes inspect.stack() fails (e.g. when the first import of
        # deprecated class is in jinja2 template). __module__ attribute is not
        # important enough to raise an exception as users may be unable
        # to fix inspect.stack() errors.
        warnings.warn("Error detecting parent module: %r" % e)

    return deprecated_cls

Member Author replied:

I looked (briefly) at that function before doing these changes; for some reason it didn't appear appropriate for this case. But I can't actually remember what that reason was, so I'll check again 👍

Member Author replied:

Right, I tried this and remembered why I didn't use it. If I understand correctly, what you suggest is to do BaseItem = create_deprecated_class('BaseItem', Item). The problem I see is that we don't want the deprecated BaseItem to have the functionality of Item, i.e. the field capabilities provided by the ItemMeta metaclass:

In [1]: from scrapy.item import Item, Field
   ...: from scrapy.utils.deprecate import create_deprecated_class
   ...:
   ...: BaseItem = create_deprecated_class('BaseItem', Item, old_class_path="scrapy.item.BaseItem")
   ...:
   ...: class TestItem(BaseItem):
   ...:     foo = Field()
   ...:
   ...: print(TestItem(foo="bar"))
/.../scrapy/venv-scrapy/bin/ipython:6: ScrapyDeprecationWarning: __main__.TestItem inherits from deprecated class scrapy.item.BaseItem, please inherit from scrapy.item.Item. (warning only on first subclass, there may be others)

{'foo': 'bar'}

@dangra Please correct me if I'm wrong. Thanks!

Member Author replied:

Ping @dangra 😄

+    def __instancecheck__(cls, instance):
+        if cls is BaseItem:
+            warn('scrapy.item.BaseItem is deprecated, please use scrapy.item.Item instead',
+                 ScrapyDeprecationWarning, stacklevel=2)
+        return super().__instancecheck__(instance)

-    Unlike instances of :class:`dict`, instances of :class:`BaseItem` may be
-    :ref:`tracked <topics-leaks-trackrefs>` to debug memory leaks.
-    """
-    pass
+
+class BaseItem(_BaseItem, metaclass=_BaseItemMeta):
+    """
+    Deprecated, please use :class:`scrapy.item.Item` instead
+    """
+
+    def __new__(cls, *args, **kwargs):
+        if issubclass(cls, BaseItem) and not (issubclass(cls, Item) or issubclass(cls, DictItem)):
+            warn('scrapy.item.BaseItem is deprecated, please use scrapy.item.Item instead',
+                 ScrapyDeprecationWarning, stacklevel=2)
+        return super(BaseItem, cls).__new__(cls, *args, **kwargs)


 class Field(dict):
     """Container of field metadata"""


-class ItemMeta(ABCMeta):
+class ItemMeta(_BaseItemMeta):
     """Metaclass_ of :class:`Item` that handles field definitions.

     .. _metaclass: https://realpython.com/python-metaclasses

@@ -68,8 +79,7 @@ class DictItem(MutableMapping, BaseItem):

     def __new__(cls, *args, **kwargs):
         if issubclass(cls, DictItem) and not issubclass(cls, Item):
-            warn('scrapy.item.DictItem is deprecated, please use '
-                 'scrapy.item.Item instead',
+            warn('scrapy.item.DictItem is deprecated, please use scrapy.item.Item instead',
                  ScrapyDeprecationWarning, stacklevel=2)
         return super(DictItem, cls).__new__(cls, *args, **kwargs)

@@ -86,8 +96,7 @@ def __setitem__(self, key, value):
         if key in self.fields:
             self._values[key] = value
         else:
-            raise KeyError("%s does not support field: %s" %
-                           (self.__class__.__name__, key))
+            raise KeyError("%s does not support field: %s" % (self.__class__.__name__, key))

     def __delitem__(self, key):
         del self._values[key]

@@ -99,8 +108,7 @@ def __getattr__(self, name):

     def __setattr__(self, name, value):
         if not name.startswith('_'):
-            raise AttributeError("Use item[%r] = %r to set field value" %
-                                 (name, value))
+            raise AttributeError("Use item[%r] = %r to set field value" % (name, value))
         super(DictItem, self).__setattr__(name, value)

     def __len__(self):

@@ -127,4 +135,22 @@ def deepcopy(self):


 class Item(DictItem, metaclass=ItemMeta):
-    pass
+    """
+    Base class for scraped items.
+
+    In Scrapy, an object is considered an *item* if it is an instance of either
+    :class:`Item` or :class:`dict`. For example, when the output of a
+    spider callback is evaluated, only instances of :class:`Item` or
+    :class:`dict` are passed to :ref:`item pipelines <topics-item-pipeline>`.
+
+    If you need instances of a custom class to be considered items by Scrapy,
+    you must inherit from either :class:`Item` or :class:`dict`.
+
+    Items offer the ability to declare :class:`Field` attributes, which can be
+    used to define metadata and control the way data is processed internally.
+    Please refer to the :ref:`documentation about fields <topics-items-fields>`
+    for additional information.
+
+    Unlike instances of :class:`dict`, instances of :class:`Item` may be
+    :ref:`tracked <topics-leaks-trackrefs>` to debug memory leaks.
+    """
5 changes: 2 additions & 3 deletions scrapy/shell.py

@@ -13,7 +13,7 @@

 from scrapy.crawler import Crawler
 from scrapy.exceptions import IgnoreRequest
 from scrapy.http import Request, Response
-from scrapy.item import BaseItem
+from scrapy.item import _BaseItem
 from scrapy.settings import Settings
 from scrapy.spiders import Spider
 from scrapy.utils.console import start_python_console

@@ -26,8 +26,7 @@

 class Shell:

-    relevant_classes = (Crawler, Spider, Request, Response, BaseItem,
-                        Settings)
+    relevant_classes = (Crawler, Spider, Request, Response, _BaseItem, Settings)

     def __init__(self, crawler, update_vars=None, code=None):
         self.crawler = crawler

2 changes: 1 addition & 1 deletion scrapy/spiders/feed.py

@@ -52,7 +52,7 @@ def parse_nodes(self, response, nodes):
         """This method is called for the nodes matching the provided tag name
         (itertag). Receives the response and an Selector for each node.
         Overriding this method is mandatory. Otherwise, you spider won't work.
-        This method must return either a BaseItem, a Request, or a list
+        This method must return either an item, a request, or a list
         containing any of them.
         """

4 changes: 2 additions & 2 deletions scrapy/utils/misc.py

@@ -14,10 +14,10 @@

 from scrapy.utils.datatypes import LocalWeakReferencedCache
 from scrapy.utils.python import flatten, to_unicode
-from scrapy.item import BaseItem
+from scrapy.item import _BaseItem


-_ITERABLE_SINGLE_VALUES = dict, BaseItem, str, bytes
+_ITERABLE_SINGLE_VALUES = dict, _BaseItem, str, bytes


 def arg_to_iter(arg):

4 changes: 2 additions & 2 deletions scrapy/utils/serialize.py

@@ -5,7 +5,7 @@

 from twisted.internet import defer

 from scrapy.http import Request, Response
-from scrapy.item import BaseItem
+from scrapy.item import _BaseItem


 class ScrapyJSONEncoder(json.JSONEncoder):

@@ -26,7 +26,7 @@ def default(self, o):
             return str(o)
         elif isinstance(o, defer.Deferred):
             return str(o)
-        elif isinstance(o, BaseItem):
+        elif isinstance(o, _BaseItem):
             return dict(o)
         elif isinstance(o, Request):
             return "<%s %s %s>" % (type(o).__name__, o.method, o.url)