Skip to content

Commit

Permalink
Bandit: allow-list lxml usages (#6265)
Browse files — browse the repository at this point in the history
  • Branch information:
Gallaecio committed Mar 1, 2024
1 parent aa1bf69 commit bf14935
Show file tree
Hide file tree
Showing 8 changed files with 15 additions and 21 deletions.
2 changes: 0 additions & 2 deletions .bandit.yml
Original file line number Diff line number Diff line change
@@ -1,9 +1,7 @@
skips:
- B101 # assert_used, needed for mypy
- B320 # xml_bad_etree
- B321 # ftplib, https://github.com/scrapy/scrapy/issues/4180
- B402 # import_ftplib, https://github.com/scrapy/scrapy/issues/4180
- B410 # import_lxml
- B411 # import_xmlrpclib, https://github.com/PyCQA/bandit/issues/1082
- B503 # ssl_with_bad_defaults
exclude_dirs: ['tests']
17 changes: 6 additions & 11 deletions scrapy/http/request/form.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,21 +10,16 @@
from typing import TYPE_CHECKING, Any, Iterable, List, Optional, Tuple, Union, cast
from urllib.parse import urlencode, urljoin, urlsplit, urlunsplit

from lxml.html import (
FormElement,
HTMLParser,
InputElement,
MultipleSelectOptions,
SelectElement,
TextareaElement,
)
from parsel.selector import create_root_node
from lxml.html import FormElement # nosec
from lxml.html import InputElement # nosec
from lxml.html import MultipleSelectOptions # nosec
from lxml.html import SelectElement # nosec
from lxml.html import TextareaElement # nosec
from w3lib.html import strip_html5_whitespace

from scrapy.http.request import Request
from scrapy.http.response.text import TextResponse
from scrapy.utils.python import is_listlike, to_bytes
from scrapy.utils.response import get_base_url

if TYPE_CHECKING:
# typing.Self requires Python 3.11
Expand Down Expand Up @@ -120,7 +115,7 @@ def _get_form(
formxpath: Optional[str],
) -> FormElement:
"""Find the wanted form element within the given response."""
root = create_root_node(response.text, HTMLParser, base_url=get_base_url(response))
root = response.selector.root
forms = root.xpath("//form")
if not forms:
raise ValueError(f"No <form> element found in {response}")
Expand Down
2 changes: 1 addition & 1 deletion scrapy/linkextractors/lxmlhtml.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from functools import partial
from urllib.parse import urljoin, urlparse

from lxml import etree
from lxml import etree # nosec
from parsel.csstranslator import HTMLTranslator
from w3lib.html import strip_html5_whitespace
from w3lib.url import canonicalize_url, safe_url_string
Expand Down
3 changes: 2 additions & 1 deletion scrapy/selector/unified.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

from scrapy.http import HtmlResponse, TextResponse, XmlResponse
from scrapy.utils.python import to_bytes
from scrapy.utils.response import get_base_url
from scrapy.utils.trackref import object_ref

__all__ = ["Selector", "SelectorList"]
Expand Down Expand Up @@ -88,7 +89,7 @@ def __init__(

if response is not None:
text = response.text
kwargs.setdefault("base_url", response.url)
kwargs.setdefault("base_url", get_base_url(response))

self.response = response

Expand Down
1 change: 0 additions & 1 deletion scrapy/utils/_compression.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@
try:
brotli.Decompressor.process
except AttributeError:

warn(
(
"You have brotlipy installed, and Scrapy will use it, but "
Expand Down
5 changes: 3 additions & 2 deletions scrapy/utils/iterators.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,15 +18,15 @@
)
from warnings import warn

from lxml import etree
from lxml import etree # nosec

from scrapy.exceptions import ScrapyDeprecationWarning
from scrapy.http import Response, TextResponse
from scrapy.selector import Selector
from scrapy.utils.python import re_rsearch, to_unicode

if TYPE_CHECKING:
from lxml._types import SupportsReadClose
from lxml._types import SupportsReadClose # nosec

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -101,6 +101,7 @@ def xmliter_lxml(
cast("SupportsReadClose[bytes]", reader),
encoding=reader.encoding,
events=("end", "start-ns"),
resolve_entities=False,
huge_tree=True,
)
selxpath = "//" + (f"{prefix}:{nodename}" if namespace else nodename)
Expand Down
4 changes: 2 additions & 2 deletions scrapy/utils/sitemap.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from typing import Any, Dict, Generator, Iterator, Optional
from urllib.parse import urljoin

import lxml.etree
import lxml.etree # nosec


class Sitemap:
Expand All @@ -19,7 +19,7 @@ def __init__(self, xmltext: str):
xmlp = lxml.etree.XMLParser(
recover=True, remove_comments=True, resolve_entities=False
)
self._root = lxml.etree.fromstring(xmltext, parser=xmlp)
self._root = lxml.etree.fromstring(xmltext, parser=xmlp) # nosec
rt = self._root.tag
self.type = self._root.tag.split("}", 1)[1] if "}" in rt else rt

Expand Down
2 changes: 1 addition & 1 deletion scrapy/utils/versions.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

import cryptography
import cssselect
import lxml.etree
import lxml.etree # nosec
import parsel
import twisted
import w3lib
Expand Down

0 comments on commit bf14935

Please sign in to comment.