Skip to content

Commit

Permalink
Migrate serialize_v0 to new API (as part of serialization layer) (#190
Browse files Browse the repository at this point in the history
)

* Migrate `serialize_v0` to new API.

This is the middle layer of the API design work (#172). We add a manifest abstract class to represent various manifests (#111 #112) and also ways to serialize a model directory into manifests and ways to verify the manifests.

For now, this only does what was formerly known as `serialize_v0`. The v1 and the manifest versions will come soon.

Note: This has a lot of inspiration from #112, but makes the API work with all the use cases we need to consider right now.

Signed-off-by: Mihai Maruseac <mihaimaruseac@google.com>

* Clarify some comments

Signed-off-by: Mihai Maruseac <mihaimaruseac@google.com>

* Encode name with base64

Signed-off-by: Mihai Maruseac <mihaimaruseac@google.com>

* Add another test case

Signed-off-by: Mihai Maruseac <mihaimaruseac@google.com>

* Empty commit to retrigger DCO check.

See dcoapp/app#211 (comment)

Signed-off-by: Mihai Maruseac <mihaimaruseac@google.com>

---------

Signed-off-by: Mihai Maruseac <mihaimaruseac@google.com>
  • Loading branch information
mihaimaruseac committed Jun 5, 2024
1 parent 8fc2f24 commit e9071f1
Show file tree
Hide file tree
Showing 7 changed files with 520 additions and 3 deletions.
5 changes: 2 additions & 3 deletions model_signing/hashing/file.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
```python
>>> with open("/tmp/file", "w") as f:
... f.write("0123abcd")
>>> hasher = ShardedFileHasher("/tmo/file", SHA256(), start=4, end=8)
>>> hasher = ShardedFileHasher("/tmp/file", SHA256(), start=4, end=8)
>>> digest = hasher.compute()
>>> digest.digest_hex
'88d4266fd4e6338d13b845fcf289579d209c897823b9217da3e161936f031589'
Expand Down Expand Up @@ -144,8 +144,7 @@ def __init__(
Args:
file: The file to hash. Use `set_file` to reset it.
content_hasher: A `hashing.HashEngine` instance used to compute the
digest of the file. This instance must not be used outside of this
instance. However, it may be pre-initialized with a header.
digest of the file.
start: The file offset to start reading from. Must be valid. Reset
with `set_shard`.
        end: The file offset to stop reading at. Must be strictly greater
Expand Down
13 changes: 13 additions & 0 deletions model_signing/manifest/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Copyright 2024 The Sigstore Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
39 changes: 39 additions & 0 deletions model_signing/manifest/manifest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# Copyright 2024 The Sigstore Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Machinery for representing a serialized representation of an ML model.
Currently, we only support a manifest that wraps around a digest. But, to
support incremental updates and partial signature verification, we need a
manifest that lists files and their digests. That will come in a future change,
soon.
"""

from abc import ABCMeta
from dataclasses import dataclass

from model_signing.hashing import hashing


class Manifest(metaclass=ABCMeta):
    """Generic manifest to represent a serialized model.

    Concrete subclasses define specific manifest formats (e.g., a single
    digest covering the whole model).
    """


@dataclass
class DigestManifest(Manifest):
    """A manifest that is just a hash."""

    # The single digest computed over the entire serialized model.
    digest: hashing.Digest
13 changes: 13 additions & 0 deletions model_signing/serializing/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Copyright 2024 The Sigstore Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
120 changes: 120 additions & 0 deletions model_signing/serializing/dfs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
# Copyright 2024 The Sigstore Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Model serializers that build a single hash out of a DFS traversal."""

import base64
import pathlib
from typing import Callable
from typing_extensions import override

from model_signing.hashing import file
from model_signing.hashing import hashing
from model_signing.manifest import manifest
from model_signing.serializing import serializing


def _check_file_or_directory(path: pathlib.Path) -> bool:
"""Checks that the given path is either a file or a directory.
There is no support for sockets, pipes, or any other operating system
concept abstracted as a file.
Furthermore, this would return False if the path is a broken symlink, if it
doesn't exists or if there are permission errors.
"""
return path.is_file() or path.is_dir()


def _build_header(*, entry_name: str, entry_type: str) -> bytes:
"""Builds a header to encode a path with given name and type.
Args:
entry_name: The name of the entry to build the header for.
entry_type: The type of the entry (file or directory).
"""
encoded_type = entry_type.encode("utf-8")
# Prevent confusion if name has a "." inside by encoding to base64.
encoded_name = base64.b64encode(entry_name.encode("utf-8"))
# Note: make sure to end with a ".".
return b".".join([encoded_type, encoded_name, b""])


class DFSSerializer(serializing.Serializer):
    """Serializer for a model that performs a traversal of the model directory.

    This serializer produces a single hash for the entire model. If the model
    is a file, the hash is the digest of the file. If the model is a
    directory, we perform a depth-first traversal of the directory, hash each
    individual file and aggregate the hashes together.
    """

    def __init__(
        self,
        file_hasher: file.FileHasher,
        merge_hasher_factory: Callable[[], hashing.StreamingHashEngine],
    ):
        """Initializes an instance to serialize a model into a single digest.

        Args:
            file_hasher: The hash engine used to hash the individual files.
            merge_hasher_factory: A callable that returns a
                `hashing.StreamingHashEngine` instance used to merge
                individual file digests to compute an aggregate digest.
        """
        self._file_hasher = file_hasher
        self._merge_hasher_factory = merge_hasher_factory

    @override
    def serialize(self, model_path: pathlib.Path) -> manifest.Manifest:
        """Serializes the model at `model_path` into a digest manifest.

        Args:
            model_path: The path to the model; either a file or a directory.

        Returns:
            A `manifest.DigestManifest` wrapping one digest for the whole
            model.

        Raises:
            ValueError: The path is not a regular file or directory (e.g., a
                special file, a missing path, or a permission problem).
        """
        # TODO(mihaimaruseac): Add checks to exclude symlinks if desired
        if not _check_file_or_directory(model_path):
            raise ValueError(
                f"Cannot use '{model_path}' as file or directory. It could be a"
                " special file, it could be missing, or there might be a"
                " permission issue."
            )

        # A single-file model is hashed directly, with no aggregation step.
        if model_path.is_file():
            self._file_hasher.set_file(model_path)
            return manifest.DigestManifest(self._file_hasher.compute())

        return manifest.DigestManifest(self._dfs(model_path))

    def _dfs(self, directory: pathlib.Path) -> hashing.Digest:
        """Computes the aggregate digest of `directory`, recursing into subdirs.

        Children are visited in sorted order so the digest is deterministic
        regardless of filesystem iteration order.
        """
        # TODO(mihaimaruseac): Add support for excluded files
        children = sorted(directory.iterdir())

        hasher = self._merge_hasher_factory()
        for child in children:
            if not _check_file_or_directory(child):
                raise ValueError(
                    f"Cannot use '{child}' as file or directory. It could be a"
                    " special file, it could be missing, or there might be a"
                    " permission issue."
                )

            if child.is_file():
                # Each entry is fed as a (header, digest) pair so files and
                # directories with the same name cannot collide.
                header = _build_header(entry_name=child.name, entry_type="file")
                hasher.update(header)
                self._file_hasher.set_file(child)
                digest = self._file_hasher.compute()
                hasher.update(digest.digest_value)
            else:
                header = _build_header(entry_name=child.name, entry_type="dir")
                hasher.update(header)
                digest = self._dfs(child)
                hasher.update(digest.digest_value)

        return hasher.compute()
Loading

0 comments on commit e9071f1

Please sign in to comment.