Skip to content

Commit

Permalink
Bandit: allow-list lxml usages (#6265)
Browse files — browse the repository at this point in the history
  • Branch information:
Gallaecio committed Mar 1, 2024
1 parent aa1bf69 commit bf14935
Show file tree
Hide file tree
Showing 8 changed files with 15 additions and 21 deletions.
2 changes: 0 additions & 2 deletions .bandit.yml
Original file line number Diff line number Diff line change
@@ -1,9 +1,7 @@
skips:
- B101 # assert_used, needed for mypy
- B320 # xml_bad_etree
- B321 # ftplib, https://github.com/scrapy/scrapy/issues/4180
- B402 # import_ftplib, https://github.com/scrapy/scrapy/issues/4180
- B410 # import_lxml
- B411 # import_xmlrpclib, https://github.com/PyCQA/bandit/issues/1082
- B503 # ssl_with_bad_defaults
exclude_dirs: ['tests']
17 changes: 6 additions & 11 deletions scrapy/http/request/form.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,21 +10,16 @@
from typing import TYPE_CHECKING, Any, Iterable, List, Optional, Tuple, Union, cast
from urllib.parse import urlencode, urljoin, urlsplit, urlunsplit

from lxml.html import (
FormElement,
HTMLParser,
InputElement,
MultipleSelectOptions,
SelectElement,
TextareaElement,
)
from parsel.selector import create_root_node
from lxml.html import FormElement # nosec
from lxml.html import InputElement # nosec
from lxml.html import MultipleSelectOptions # nosec
from lxml.html import SelectElement # nosec
from lxml.html import TextareaElement # nosec
from w3lib.html import strip_html5_whitespace

from scrapy.http.request import Request
from scrapy.http.response.text import TextResponse
from scrapy.utils.python import is_listlike, to_bytes
from scrapy.utils.response import get_base_url

if TYPE_CHECKING:
# typing.Self requires Python 3.11
Expand Down Expand Up @@ -120,7 +115,7 @@ def _get_form(
formxpath: Optional[str],
) -> FormElement:
"""Find the wanted form element within the given response."""
root = create_root_node(response.text, HTMLParser, base_url=get_base_url(response))
root = response.selector.root
forms = root.xpath("//form")
if not forms:
raise ValueError(f"No <form> element found in {response}")
Expand Down
2 changes: 1 addition & 1 deletion scrapy/linkextractors/lxmlhtml.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from functools import partial
from urllib.parse import urljoin, urlparse

from lxml import etree
from lxml import etree # nosec
from parsel.csstranslator import HTMLTranslator
from w3lib.html import strip_html5_whitespace
from w3lib.url import canonicalize_url, safe_url_string
Expand Down
3 changes: 2 additions & 1 deletion scrapy/selector/unified.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

from scrapy.http import HtmlResponse, TextResponse, XmlResponse
from scrapy.utils.python import to_bytes
from scrapy.utils.response import get_base_url
from scrapy.utils.trackref import object_ref

__all__ = ["Selector", "SelectorList"]
Expand Down Expand Up @@ -88,7 +89,7 @@ def __init__(

if response is not None:
text = response.text
kwargs.setdefault("base_url", response.url)
kwargs.setdefault("base_url", get_base_url(response))

self.response = response

Expand Down
1 change: 0 additions & 1 deletion scrapy/utils/_compression.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@
try:
brotli.Decompressor.process
except AttributeError:

warn(
(
"You have brotlipy installed, and Scrapy will use it, but "
Expand Down
5 changes: 3 additions & 2 deletions scrapy/utils/iterators.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,15 +18,15 @@
)
from warnings import warn

from lxml import etree
from lxml import etree # nosec

from scrapy.exceptions import ScrapyDeprecationWarning
from scrapy.http import Response, TextResponse
from scrapy.selector import Selector
from scrapy.utils.python import re_rsearch, to_unicode

if TYPE_CHECKING:
from lxml._types import SupportsReadClose
from lxml._types import SupportsReadClose # nosec

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -101,6 +101,7 @@ def xmliter_lxml(
cast("SupportsReadClose[bytes]", reader),
encoding=reader.encoding,
events=("end", "start-ns"),
resolve_entities=False,
huge_tree=True,
)
selxpath = "//" + (f"{prefix}:{nodename}" if namespace else nodename)
Expand Down
4 changes: 2 additions & 2 deletions scrapy/utils/sitemap.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from typing import Any, Dict, Generator, Iterator, Optional
from urllib.parse import urljoin

import lxml.etree
import lxml.etree # nosec


class Sitemap:
Expand All @@ -19,7 +19,7 @@ def __init__(self, xmltext: str):
xmlp = lxml.etree.XMLParser(
recover=True, remove_comments=True, resolve_entities=False
)
self._root = lxml.etree.fromstring(xmltext, parser=xmlp)
self._root = lxml.etree.fromstring(xmltext, parser=xmlp) # nosec
rt = self._root.tag
self.type = self._root.tag.split("}", 1)[1] if "}" in rt else rt

Expand Down
2 changes: 1 addition & 1 deletion scrapy/utils/versions.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

import cryptography
import cssselect
import lxml.etree
import lxml.etree # nosec
import parsel
import twisted
import w3lib
Expand Down

0 comments on commit bf14935

Please sign in to comment.