Mark hashlib usages as not intended for security (#6264)
Gallaecio committed Mar 1, 2024
1 parent 4cd94aa commit aa1bf69
Showing 5 changed files with 36 additions and 14 deletions.
1 change: 0 additions & 1 deletion .bandit.yml
@@ -2,7 +2,6 @@ skips:
   - B101 # assert_used, needed for mypy
   - B320 # xml_bad_etree
   - B321 # ftplib, https://github.com/scrapy/scrapy/issues/4180
-  - B324 # hashlib "Use of weak SHA1 hash for security"
   - B402 # import_ftplib, https://github.com/scrapy/scrapy/issues/4180
   - B410 # import_lxml
   - B411 # import_xmlrpclib, https://github.com/PyCQA/bandit/issues/1082
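
With the blanket B324 skip gone from .bandit.yml, Bandit now flags every weak-hash call site individually, so each intentional non-security use carries a per-line annotation. A minimal sketch of two ways to mark such a use (the second assumes Python 3.9+; whether Bandit accepts it without "# nosec" depends on the Bandit version):

    import hashlib

    # "# nosec" suppresses Bandit's B324 finding for this line only; this is
    # the approach the commit takes across the Scrapy code base.
    key = hashlib.md5(b"https://example.com").hexdigest()  # nosec

    # Python 3.9+ can state the same intent in code instead of in a comment.
    key2 = hashlib.md5(b"https://example.com", usedforsecurity=False).hexdigest()
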
28 changes: 22 additions & 6 deletions scrapy/pipelines/files.py
@@ -16,7 +16,7 @@
 from io import BytesIO
 from os import PathLike
 from pathlib import Path
-from typing import DefaultDict, Optional, Set, Union
+from typing import IO, DefaultDict, Optional, Set, Union
 from urllib.parse import urlparse
 
 from itemadapter import ItemAdapter
@@ -31,7 +31,6 @@
 from scrapy.utils.datatypes import CaseInsensitiveDict
 from scrapy.utils.ftp import ftp_store_file
 from scrapy.utils.log import failure_to_exc_info
-from scrapy.utils.misc import md5sum
 from scrapy.utils.python import to_bytes
 from scrapy.utils.request import referer_str
 
@@ -42,6 +41,23 @@ def _to_string(path: Union[str, PathLike]) -> str:
     return str(path)  # convert a Path object to string
 
 
+def _md5sum(file: IO) -> str:
+    """Calculate the md5 checksum of a file-like object without reading its
+    whole content in memory.
+
+    >>> from io import BytesIO
+    >>> _md5sum(BytesIO(b'file content to hash'))
+    '784406af91dd5a54fbb9c84c2236595a'
+    """
+    m = hashlib.md5()  # nosec
+    while True:
+        d = file.read(8096)
+        if not d:
+            break
+        m.update(d)
+    return m.hexdigest()
+
+
 class FileException(Exception):
     """General media error exception"""
@@ -70,7 +86,7 @@ def stat_file(self, path: Union[str, PathLike], info):
             return {}
 
         with absolute_path.open("rb") as f:
-            checksum = md5sum(f)
+            checksum = _md5sum(f)
 
         return {"last_modified": last_modified, "checksum": checksum}
 
@@ -299,7 +315,7 @@ def _stat_file(path):
                     ftp.set_pasv(False)
                 file_path = f"{self.basedir}/{path}"
                 last_modified = float(ftp.voidcmd(f"MDTM {file_path}")[4:].strip())
-                m = hashlib.md5()
+                m = hashlib.md5()  # nosec
                 ftp.retrbinary(f"RETR {file_path}", m.update)
                 return {"last_modified": last_modified, "checksum": m.hexdigest()}
             # The file doesn't exist
@@ -531,7 +547,7 @@ def get_media_requests(self, item, info):
     def file_downloaded(self, response, request, info, *, item=None):
         path = self.file_path(request, response=response, info=info, item=item)
         buf = BytesIO(response.body)
-        checksum = md5sum(buf)
+        checksum = _md5sum(buf)
         buf.seek(0)
         self.store.persist_file(path, buf, info)
         return checksum
@@ -542,7 +558,7 @@ def item_completed(self, results, item, info):
             return item
 
     def file_path(self, request, response=None, info=None, *, item=None):
-        media_guid = hashlib.sha1(to_bytes(request.url)).hexdigest()
+        media_guid = hashlib.sha1(to_bytes(request.url)).hexdigest()  # nosec
         media_ext = Path(request.url).suffix
         # Handles empty and wild extensions by trying to guess the
         # mime type then extension or default to empty string otherwise
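
The new private _md5sum helper takes over from the deprecated scrapy.utils.misc.md5sum for internal use; the leading underscore signals that third-party code should not depend on it. A quick sanity check based on its doctest (a sketch, not part of the commit):

    from io import BytesIO

    from scrapy.pipelines.files import _md5sum

    buf = BytesIO(b"file content to hash")
    assert _md5sum(buf) == "784406af91dd5a54fbb9c84c2236595a"
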
9 changes: 4 additions & 5 deletions scrapy/pipelines/images.py
@@ -17,11 +17,10 @@
 from scrapy.exceptions import DropItem, NotConfigured, ScrapyDeprecationWarning
 from scrapy.http import Request
 from scrapy.http.request import NO_CALLBACK
-from scrapy.pipelines.files import FileException, FilesPipeline
+from scrapy.pipelines.files import FileException, FilesPipeline, _md5sum
 
 # TODO: from scrapy.pipelines.media import MediaPipeline
 from scrapy.settings import Settings
-from scrapy.utils.misc import md5sum
 from scrapy.utils.python import get_func_args, to_bytes


@@ -128,7 +127,7 @@ def image_downloaded(self, response, request, info, *, item=None):
         for path, image, buf in self.get_images(response, request, info, item=item):
             if checksum is None:
                 buf.seek(0)
-                checksum = md5sum(buf)
+                checksum = _md5sum(buf)
             width, height = image.size
             self.store.persist_file(
                 path,
@@ -228,9 +227,9 @@ def item_completed(self, results, item, info):
             return item
 
     def file_path(self, request, response=None, info=None, *, item=None):
-        image_guid = hashlib.sha1(to_bytes(request.url)).hexdigest()
+        image_guid = hashlib.sha1(to_bytes(request.url)).hexdigest()  # nosec
         return f"full/{image_guid}.jpg"
 
     def thumb_path(self, request, thumb_id, response=None, info=None, *, item=None):
-        thumb_guid = hashlib.sha1(to_bytes(request.url)).hexdigest()
+        thumb_guid = hashlib.sha1(to_bytes(request.url)).hexdigest()  # nosec
         return f"thumbs/{thumb_id}/{thumb_guid}.jpg"
10 changes: 9 additions & 1 deletion scrapy/utils/misc.py
@@ -113,7 +113,15 @@ def md5sum(file: IO) -> str:
     >>> md5sum(BytesIO(b'file content to hash'))
     '784406af91dd5a54fbb9c84c2236595a'
     """
-    m = hashlib.md5()
+    warnings.warn(
+        (
+            "The scrapy.utils.misc.md5sum function is deprecated, and will be "
+            "removed in a future version of Scrapy."
+        ),
+        ScrapyDeprecationWarning,
+        stacklevel=2,
+    )
+    m = hashlib.md5()  # nosec
     while True:
         d = file.read(8096)
         if not d:
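
Callers of the public md5sum helper now receive a ScrapyDeprecationWarning; stacklevel=2 makes the warning point at the calling code rather than at scrapy/utils/misc.py. A sketch of what a caller observes after this change (it assumes the checksum behavior itself is unchanged, as the diff shows):

    import warnings
    from io import BytesIO

    from scrapy.utils.misc import md5sum  # deprecated by this commit

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        checksum = md5sum(BytesIO(b"file content to hash"))

    assert checksum == "784406af91dd5a54fbb9c84c2236595a"
    assert any("deprecated" in str(w.message) for w in caught)
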
2 changes: 1 addition & 1 deletion scrapy/utils/request.py
@@ -111,7 +111,7 @@ def fingerprint(
             "headers": headers,
         }
         fingerprint_json = json.dumps(fingerprint_data, sort_keys=True)
-        cache[cache_key] = hashlib.sha1(fingerprint_json.encode()).digest()
+        cache[cache_key] = hashlib.sha1(fingerprint_json.encode()).digest()  # nosec
     return cache[cache_key]
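
fingerprint() hashes a canonical JSON rendering of the request to produce a deduplication key; SHA-1 serves here as a fast, stable digest rather than a security primitive, which is what the "# nosec" records. A usage sketch, assuming Scrapy's public fingerprint API:

    from scrapy import Request
    from scrapy.utils.request import fingerprint

    request = Request("https://example.com")
    fp = fingerprint(request)  # 20-byte SHA-1 digest, cached per request object
    print(fp.hex())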


