Provide complete API documentation coverage of scrapy.linkextractors #4045

Merged
merged 9 commits into from
Dec 19, 2019
4 changes: 4 additions & 0 deletions docs/conf.py
@@ -269,6 +269,10 @@

# Never documented before, and deprecated now.
r'^scrapy\.item\.DictItem$',
r'^scrapy\.linkextractors\.FilteringLinkExtractor$',

# Implementation detail of LxmlLinkExtractor
r'^scrapy\.linkextractors\.lxmlhtml\.LxmlParserLinkExtractor',
]


45 changes: 17 additions & 28 deletions docs/topics/link-extractors.rst
@@ -4,46 +4,33 @@
Link Extractors
===============

Link extractors are objects whose only purpose is to extract links from web
pages (:class:`scrapy.http.Response` objects) which will be eventually
followed.
A link extractor is an object that extracts links from responses.

There is ``scrapy.linkextractors.LinkExtractor`` available
in Scrapy, but you can create your own custom Link Extractors to suit your
needs by implementing a simple interface.

The only public method that every link extractor has is ``extract_links``,
which receives a :class:`~scrapy.http.Response` object and returns a list
of :class:`scrapy.link.Link` objects. Link extractors are meant to be
instantiated once and their ``extract_links`` method called several times
with different responses to extract links to follow.

Link extractors are used in the :class:`~scrapy.spiders.CrawlSpider`
class (available in Scrapy), through a set of rules, but you can also use
them in your spiders, even if you don't subclass from
:class:`~scrapy.spiders.CrawlSpider`, as their purpose is very simple: to
extract links.
The ``__init__`` method of
:class:`~scrapy.linkextractors.lxmlhtml.LxmlLinkExtractor` takes settings that
determine which links may be extracted. :class:`LxmlLinkExtractor.extract_links
<scrapy.linkextractors.lxmlhtml.LxmlLinkExtractor.extract_links>` returns a
list of matching :class:`scrapy.link.Link` objects from a
:class:`~scrapy.http.Response` object.

Link extractors are used in :class:`~scrapy.spiders.CrawlSpider` spiders
through a set of :class:`~scrapy.spiders.Rule` objects. You can also use link
extractors in regular spiders.

.. _topics-link-extractors-ref:

Built-in link extractors reference
==================================
Link extractor reference
========================

.. module:: scrapy.linkextractors
:synopsis: Link extractors classes

Link extractor classes bundled with Scrapy are provided in the
:mod:`scrapy.linkextractors` module.

The default link extractor is ``LinkExtractor``, which is the same as
:class:`~.LxmlLinkExtractor`::
The link extractor class is
:class:`scrapy.linkextractors.lxmlhtml.LxmlLinkExtractor`. For convenience it
can also be imported as ``scrapy.linkextractors.LinkExtractor``::

from scrapy.linkextractors import LinkExtractor

There used to be other link extractor classes in previous Scrapy versions,
but they are deprecated now.

LxmlLinkExtractor
-----------------

@@ -152,4 +139,6 @@ LxmlLinkExtractor
from elements or attributes which allow leading/trailing whitespaces).
:type strip: boolean

.. automethod:: extract_links

.. _scrapy.linkextractors: https://github.com/scrapy/scrapy/blob/master/scrapy/linkextractors/__init__.py
2 changes: 1 addition & 1 deletion pytest.ini
@@ -88,7 +88,7 @@ flake8-ignore =
scrapy/http/response/__init__.py E501 E128 W293 W291
scrapy/http/response/text.py E501 W293 E128 E124
# scrapy/linkextractors
scrapy/linkextractors/__init__.py E731 E501 E402
scrapy/linkextractors/__init__.py E731 E501 E402 W504
scrapy/linkextractors/lxmlhtml.py E501 E731 E226
# scrapy/loader
scrapy/loader/__init__.py E501 E128
11 changes: 11 additions & 0 deletions scrapy/linkextractors/__init__.py
@@ -7,10 +7,12 @@
"""
import re
from urllib.parse import urlparse
from warnings import warn

from parsel.csstranslator import HTMLTranslator
from w3lib.url import canonicalize_url

from scrapy.utils.deprecate import ScrapyDeprecationWarning
from scrapy.utils.misc import arg_to_iter
from scrapy.utils.url import (
url_is_from_any_domain, url_has_any_extension,
@@ -51,6 +53,15 @@ class FilteringLinkExtractor(object):

_csstranslator = HTMLTranslator()

def __new__(cls, *args, **kwargs):
from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor
if (issubclass(cls, FilteringLinkExtractor) and
not issubclass(cls, LxmlLinkExtractor)):
warn('scrapy.linkextractors.FilteringLinkExtractor is deprecated, '
'please use scrapy.linkextractors.LinkExtractor instead',
ScrapyDeprecationWarning, stacklevel=2)
return super(FilteringLinkExtractor, cls).__new__(cls)

def __init__(self, link_extractor, allow, deny, allow_domains, deny_domains,
restrict_xpaths, canonicalize, deny_extensions, restrict_css, restrict_text):

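The ``__new__`` hook above warns only when the deprecated base class is
instantiated directly or through a subclass that does not also inherit from
the replacement class. The same pattern can be illustrated standalone, with
``DeprecatedExtractor`` and ``SupportedExtractor`` as invented stand-ins for
``FilteringLinkExtractor`` and ``LxmlLinkExtractor``:

```python
import warnings


class DeprecatedExtractor:
    """Stand-in for FilteringLinkExtractor."""

    def __new__(cls, *args, **kwargs):
        # Warn unless cls is (a subclass of) the supported replacement,
        # which keeps the deprecated class as a base for compatibility.
        if not issubclass(cls, SupportedExtractor):
            warnings.warn('%s is deprecated' % cls.__name__,
                          DeprecationWarning, stacklevel=2)
        return super().__new__(cls)


class SupportedExtractor(DeprecatedExtractor):
    """Stand-in for LxmlLinkExtractor."""
```

Instantiating ``SupportedExtractor`` (or its subclasses) is silent, while
``DeprecatedExtractor`` itself triggers the warning.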
8 changes: 8 additions & 0 deletions scrapy/linkextractors/lxmlhtml.py
@@ -116,6 +116,14 @@ def __init__(self, allow=(), deny=(), allow_domains=(), deny_domains=(), restric
restrict_text=restrict_text)

def extract_links(self, response):
"""Returns a list of :class:`~scrapy.link.Link` objects from the
specified :class:`response <scrapy.http.Response>`.

Only links that match the settings passed to the ``__init__`` method of
the link extractor are returned.

Duplicate links are omitted.
"""
base_url = get_base_url(response)
if self.restrict_xpaths:
docs = [subdoc
32 changes: 32 additions & 0 deletions tests/test_linkextractors.py
@@ -1,10 +1,13 @@
import re
import unittest
from warnings import catch_warnings

import pytest

from scrapy.exceptions import ScrapyDeprecationWarning
from scrapy.http import HtmlResponse, XmlResponse
from scrapy.link import Link
from scrapy.linkextractors import FilteringLinkExtractor
from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor
from tests import get_testdata

Expand Down Expand Up @@ -506,3 +509,32 @@ def test_link_restrict_text(self):
@pytest.mark.xfail
def test_restrict_xpaths_with_html_entities(self):
super(LxmlLinkExtractorTestCase, self).test_restrict_xpaths_with_html_entities()

def test_filteringlinkextractor_deprecation_warning(self):
"""Make sure the FilteringLinkExtractor deprecation warning is not
issued for LxmlLinkExtractor"""
with catch_warnings(record=True) as warnings:
LxmlLinkExtractor()
self.assertEqual(len(warnings), 0)

class SubclassedItem(LxmlLinkExtractor):
pass

SubclassedItem()
self.assertEqual(len(warnings), 0)


class FilteringLinkExtractorTest(unittest.TestCase):

def test_deprecation_warning(self):
args = [None] * 10
with catch_warnings(record=True) as warnings:
FilteringLinkExtractor(*args)
self.assertEqual(len(warnings), 1)
self.assertEqual(warnings[0].category, ScrapyDeprecationWarning)
with catch_warnings(record=True) as warnings:
class SubclassedFilteringLinkExtractor(FilteringLinkExtractor):
pass
SubclassedFilteringLinkExtractor(*args)
self.assertEqual(len(warnings), 1)
self.assertEqual(warnings[0].category, ScrapyDeprecationWarning)