Skip to content

Commit

Permalink
Use crc32 checksums in place of sha256
Browse files Browse the repository at this point in the history
crc32 is much faster to calculate when hardware acceleration for sha256
isn't present

closes #4447
  • Loading branch information
dralley committed Oct 17, 2023
1 parent d80a5c3 commit 2c3ede7
Show file tree
Hide file tree
Showing 4 changed files with 39 additions and 14 deletions.
1 change: 1 addition & 0 deletions CHANGES/4447.feature
@@ -0,0 +1 @@
Use CRC32 checksums instead of SHA256 to improve import/export performance. Cryptographic checksums aren't required as we are only verifying the integrity of the files.
12 changes: 7 additions & 5 deletions pulpcore/app/tasks/export.py
@@ -1,4 +1,3 @@
import hashlib
import json
import logging
import os
Expand Down Expand Up @@ -28,7 +27,7 @@
from pulpcore.app.models.content import Artifact, ContentArtifact
from pulpcore.app.serializers import PulpExportSerializer

from pulpcore.app.util import compute_file_hash, get_version_from_model
from pulpcore.app.util import compute_file_hash, get_version_from_model, Crc32Hasher
from pulpcore.app.importexport import (
export_versions,
export_artifacts,
Expand Down Expand Up @@ -386,6 +385,8 @@ def pulp_export(exporter_pk, params):
the_export.validated_start_versions = serializer.validated_data.get("start_versions", None)
the_export.validated_chunk_size = serializer.validated_data.get("chunk_size", None)

hasher = Crc32Hasher
checksum_type = "crc32"
try:
the_export.task = Task.current()

Expand Down Expand Up @@ -425,10 +426,10 @@ def pulp_export(exporter_pk, params):
os.remove(pathname)
raise
# compute the hashes
global_hash = hashlib.sha256()
global_hash = hasher()
paths = sorted([str(Path(p)) for p in glob(tarfile_fp + ".*")])
for a_file in paths:
a_hash = compute_file_hash(a_file, cumulative_hash=global_hash)
a_hash = compute_file_hash(a_file, hasher=hasher(), cumulative_hash=global_hash)
rslts[a_file] = a_hash
tarfile_hash = global_hash.hexdigest()

Expand All @@ -444,7 +445,7 @@ def pulp_export(exporter_pk, params):
os.remove(tarfile_fp)
raise
# compute the hash
tarfile_hash = compute_file_hash(tarfile_fp)
tarfile_hash = compute_file_hash(tarfile_fp, hasher=hasher())
rslts[tarfile_fp] = tarfile_hash

# store the outputfile/hash info
Expand All @@ -462,6 +463,7 @@ def pulp_export(exporter_pk, params):
"chunk_size": chunk_size,
"file": os.path.basename(tarfile_fp),
"global_hash": tarfile_hash,
"checksum_type": checksum_type,
},
"files": {},
}
Expand Down
23 changes: 14 additions & 9 deletions pulpcore/app/tasks/importer.py
Expand Up @@ -35,7 +35,7 @@
ContentArtifactResource,
RepositoryResource,
)
from pulpcore.app.util import compute_file_hash
from pulpcore.app.util import compute_file_hash, Crc32Hasher
from pulpcore.constants import TASK_STATES
from pulpcore.tasking.tasks import dispatch

Expand Down Expand Up @@ -357,18 +357,23 @@ def validate_toc(toc_filename):
)

errs = []
# validate the sha256 of the toc-entries

def verify_chunk_hash(chunk_path, expected_digest):
actual_digest = compute_file_hash(chunk_path, hasher=Crc32Hasher())
if actual_digest != expected_digest:
err_str = "File {} expected checksum : {}, computed checksum : {}".format(
chunk, expected_digest, actual_digest
)
errs.append(err_str)

# validate the checksum of the toc-entries
# gather errors for reporting at the end
chunks = sorted(the_toc["files"].keys())
data = dict(message="Validating Chunks", code="validate.chunks", total=len(chunks))
with ProgressReport(**data) as pb:
for chunk in pb.iter(chunks):
a_hash = compute_file_hash(os.path.join(base_dir, chunk))
if not a_hash == the_toc["files"][chunk]:
err_str = "File {} expected checksum : {}, computed checksum : {}".format(
chunk, the_toc["files"][chunk], a_hash
)
errs.append(err_str)
chunk_path = os.path.join(base_dir, chunk)
verify_chunk_hash(chunk_path, the_toc["files"][chunk])

# if there are any errors, report and fail
if errs:
Expand Down Expand Up @@ -417,7 +422,7 @@ def reassemble(the_toc, toc_dir, result_file):
exc_info=True,
)

combined_hash = compute_file_hash(result_file)
combined_hash = compute_file_hash(result_file, hasher=Crc32Hasher())
if combined_hash != the_toc["meta"]["global_hash"]:
raise ValidationError(
_("Mismatch between combined .tar checksum [{}] and originating [{}]).").format(
Expand Down
17 changes: 17 additions & 0 deletions pulpcore/app/util.py
@@ -1,4 +1,5 @@
import hashlib
import zlib
from functools import lru_cache
from gettext import gettext as _
import os
Expand Down Expand Up @@ -309,6 +310,22 @@ def compute_file_hash(filename, hasher=None, cumulative_hash=None, blocksize=819
return hasher.hexdigest()


class Crc32Hasher:
    """Wrapper to make the CRC32 implementation act like a standard hashlib hasher.

    Supports incremental hashing via repeated update() calls, like hashlib objects.
    CRC32 is not cryptographic; it is used here only for integrity checking.
    """

    # hashlib hashers expose an algorithm name; keep parity for callers that log it.
    name = "crc32"

    def __init__(self):
        # zlib.crc32 uses 0 as the initial running value.
        self.hashval = 0

    def update(self, data):
        """Fold more bytes into the running checksum."""
        self.hashval = zlib.crc32(data, self.hashval)

    def digest(self):
        """Return the checksum as 4 bytes (big-endian), matching the hashlib contract.

        NOTE(review): previously returned a decimal string; hashlib's digest()
        returns bytes, and the wrapper claims hashlib compatibility.
        """
        # zlib.crc32 returns an unsigned 32-bit value, so 4 bytes always suffice.
        return self.hashval.to_bytes(4, "big")

    def hexdigest(self):
        """Return the checksum as a fixed-width 8-character lowercase hex string.

        hex(...)[2:] would drop leading zeros (e.g. "0" for empty input), unlike
        hashlib hexdigests which are always fixed width; zero-pad to 8 chars.
        """
        return format(self.hashval, "08x")


def configure_analytics():
task_name = "pulpcore.app.tasks.analytics.post_analytics"
dispatch_interval = timedelta(days=1)
Expand Down

0 comments on commit 2c3ede7

Please sign in to comment.