Mark hashlib usages as not intended for security (#6264)
Gallaecio committed Mar 1, 2024
1 parent 4cd94aa commit aa1bf69
Showing 5 changed files with 36 additions and 14 deletions.
1 change: 0 additions & 1 deletion .bandit.yml
@@ -2,7 +2,6 @@ skips:
   - B101 # assert_used, needed for mypy
   - B320 # xml_bad_etree
   - B321 # ftplib, https://github.com/scrapy/scrapy/issues/4180
-  - B324 # hashlib "Use of weak SHA1 hash for security"
   - B402 # import_ftplib, https://github.com/scrapy/scrapy/issues/4180
   - B410 # import_lxml
   - B411 # import_xmlrpclib, https://github.com/PyCQA/bandit/issues/1082
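
With the blanket B324 skip gone from .bandit.yml, Bandit now flags every weak-hash call site individually, so each intentional non-security use carries a per-line annotation. A minimal sketch of two ways to mark such a use (the second assumes Python 3.9+; whether Bandit accepts it without "# nosec" depends on the Bandit version):

    import hashlib

    # "# nosec" suppresses Bandit's B324 finding for this line only; this is
    # the approach the commit takes across the Scrapy code base.
    key = hashlib.md5(b"https://example.com").hexdigest()  # nosec

    # Python 3.9+ can state the same intent in code instead of in a comment.
    key2 = hashlib.md5(b"https://example.com", usedforsecurity=False).hexdigest()
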
28 changes: 22 additions & 6 deletions scrapy/pipelines/files.py
@@ -16,7 +16,7 @@
 from io import BytesIO
 from os import PathLike
 from pathlib import Path
-from typing import DefaultDict, Optional, Set, Union
+from typing import IO, DefaultDict, Optional, Set, Union
 from urllib.parse import urlparse
 
 from itemadapter import ItemAdapter
@@ -31,7 +31,6 @@
 from scrapy.utils.datatypes import CaseInsensitiveDict
 from scrapy.utils.ftp import ftp_store_file
 from scrapy.utils.log import failure_to_exc_info
-from scrapy.utils.misc import md5sum
 from scrapy.utils.python import to_bytes
 from scrapy.utils.request import referer_str
 
@@ -42,6 +41,23 @@ def _to_string(path: Union[str, PathLike]) -> str:
     return str(path)  # convert a Path object to string
 
 
+def _md5sum(file: IO) -> str:
+    """Calculate the md5 checksum of a file-like object without reading its
+    whole content in memory.
+
+    >>> from io import BytesIO
+    >>> _md5sum(BytesIO(b'file content to hash'))
+    '784406af91dd5a54fbb9c84c2236595a'
+    """
+    m = hashlib.md5()  # nosec
+    while True:
+        d = file.read(8096)
+        if not d:
+            break
+        m.update(d)
+    return m.hexdigest()
+
+
 class FileException(Exception):
     """General media error exception"""
@@ -70,7 +86,7 @@ def stat_file(self, path: Union[str, PathLike], info):
             return {}
 
         with absolute_path.open("rb") as f:
-            checksum = md5sum(f)
+            checksum = _md5sum(f)
 
         return {"last_modified": last_modified, "checksum": checksum}
 
@@ -299,7 +315,7 @@ def _stat_file(path):
                     ftp.set_pasv(False)
                 file_path = f"{self.basedir}/{path}"
                 last_modified = float(ftp.voidcmd(f"MDTM {file_path}")[4:].strip())
-                m = hashlib.md5()
+                m = hashlib.md5()  # nosec
                 ftp.retrbinary(f"RETR {file_path}", m.update)
                 return {"last_modified": last_modified, "checksum": m.hexdigest()}
             # The file doesn't exist
@@ -531,7 +547,7 @@ def get_media_requests(self, item, info):
     def file_downloaded(self, response, request, info, *, item=None):
         path = self.file_path(request, response=response, info=info, item=item)
         buf = BytesIO(response.body)
-        checksum = md5sum(buf)
+        checksum = _md5sum(buf)
         buf.seek(0)
         self.store.persist_file(path, buf, info)
         return checksum
@@ -542,7 +558,7 @@ def item_completed(self, results, item, info):
             return item
 
     def file_path(self, request, response=None, info=None, *, item=None):
-        media_guid = hashlib.sha1(to_bytes(request.url)).hexdigest()
+        media_guid = hashlib.sha1(to_bytes(request.url)).hexdigest()  # nosec
         media_ext = Path(request.url).suffix
         # Handles empty and wild extensions by trying to guess the
         # mime type then extension or default to empty string otherwise
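
The new private _md5sum helper takes over from the deprecated scrapy.utils.misc.md5sum for internal use; the leading underscore signals that third-party code should not depend on it. A quick sanity check based on its doctest (a sketch, not part of the commit):

    from io import BytesIO

    from scrapy.pipelines.files import _md5sum

    buf = BytesIO(b"file content to hash")
    assert _md5sum(buf) == "784406af91dd5a54fbb9c84c2236595a"
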
9 changes: 4 additions & 5 deletions scrapy/pipelines/images.py
@@ -17,11 +17,10 @@
 from scrapy.exceptions import DropItem, NotConfigured, ScrapyDeprecationWarning
 from scrapy.http import Request
 from scrapy.http.request import NO_CALLBACK
-from scrapy.pipelines.files import FileException, FilesPipeline
+from scrapy.pipelines.files import FileException, FilesPipeline, _md5sum
 
 # TODO: from scrapy.pipelines.media import MediaPipeline
 from scrapy.settings import Settings
-from scrapy.utils.misc import md5sum
 from scrapy.utils.python import get_func_args, to_bytes


@@ -128,7 +127,7 @@ def image_downloaded(self, response, request, info, *, item=None):
         for path, image, buf in self.get_images(response, request, info, item=item):
             if checksum is None:
                 buf.seek(0)
-                checksum = md5sum(buf)
+                checksum = _md5sum(buf)
             width, height = image.size
             self.store.persist_file(
                 path,
@@ -228,9 +227,9 @@ def item_completed(self, results, item, info):
             return item
 
     def file_path(self, request, response=None, info=None, *, item=None):
-        image_guid = hashlib.sha1(to_bytes(request.url)).hexdigest()
+        image_guid = hashlib.sha1(to_bytes(request.url)).hexdigest()  # nosec
         return f"full/{image_guid}.jpg"
 
     def thumb_path(self, request, thumb_id, response=None, info=None, *, item=None):
-        thumb_guid = hashlib.sha1(to_bytes(request.url)).hexdigest()
+        thumb_guid = hashlib.sha1(to_bytes(request.url)).hexdigest()  # nosec
         return f"thumbs/{thumb_id}/{thumb_guid}.jpg"
10 changes: 9 additions & 1 deletion scrapy/utils/misc.py
@@ -113,7 +113,15 @@ def md5sum(file: IO) -> str:
     >>> md5sum(BytesIO(b'file content to hash'))
     '784406af91dd5a54fbb9c84c2236595a'
     """
-    m = hashlib.md5()
+    warnings.warn(
+        (
+            "The scrapy.utils.misc.md5sum function is deprecated, and will be "
+            "removed in a future version of Scrapy."
+        ),
+        ScrapyDeprecationWarning,
+        stacklevel=2,
+    )
+    m = hashlib.md5()  # nosec
     while True:
         d = file.read(8096)
         if not d:
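
Callers of the public md5sum helper now receive a ScrapyDeprecationWarning; stacklevel=2 makes the warning point at the calling code rather than at scrapy/utils/misc.py. A sketch of what a caller observes after this change (it assumes the checksum behavior itself is unchanged, as the diff shows):

    import warnings
    from io import BytesIO

    from scrapy.utils.misc import md5sum  # deprecated by this commit

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        checksum = md5sum(BytesIO(b"file content to hash"))

    assert checksum == "784406af91dd5a54fbb9c84c2236595a"
    assert any("deprecated" in str(w.message) for w in caught)
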
2 changes: 1 addition & 1 deletion scrapy/utils/request.py
@@ -111,7 +111,7 @@ def fingerprint(
             "headers": headers,
         }
         fingerprint_json = json.dumps(fingerprint_data, sort_keys=True)
-        cache[cache_key] = hashlib.sha1(fingerprint_json.encode()).digest()
+        cache[cache_key] = hashlib.sha1(fingerprint_json.encode()).digest()  # nosec
     return cache[cache_key]
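
fingerprint() hashes a canonical JSON rendering of the request to produce a deduplication key; SHA-1 serves here as a fast, stable digest rather than a security primitive, which is what the "# nosec" records. A usage sketch, assuming Scrapy's public fingerprint API:

    from scrapy import Request
    from scrapy.utils.request import fingerprint

    request = Request("https://example.com")
    fp = fingerprint(request)  # 20-byte SHA-1 digest, cached per request object
    print(fp.hex())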


