From 72ce63f40262bf22888ff9b0118670e247579a94 Mon Sep 17 00:00:00 2001 From: Mark Goddard Date: Mon, 10 Jul 2023 10:41:22 +0100 Subject: [PATCH 1/3] Support byte shuffle filter Both Zarr and HDF5 provide a byte shuffle filter which can be used to improve compression ratios of certain data patterns. There does not appear to be a Rust implementation of the byte shuffle filter, so this change includes one. It also adds a "filters" parameter to the API request data which accepts a list of filter definitions. Each filter definition is a dict with an "id" field and optional parameter fields specific to each filter. The shuffle filter has one parameter, "element_size". --- README.md | 6 +- scripts/client.py | 19 ++++- scripts/requirements.txt | 1 + scripts/upload_sample_data.py | 28 ++++--- src/filter_pipeline.rs | 73 ++++++++++++++-- src/filters.rs | 36 ++++++++ src/filters/shuffle.rs | 154 ++++++++++++++++++++++++++++++++++ src/lib.rs | 1 + src/models.rs | 45 +++++++++- src/test_utils.rs | 2 + 10 files changed, 340 insertions(+), 25 deletions(-) create mode 100644 src/filters.rs create mode 100644 src/filters/shuffle.rs diff --git a/README.md b/README.md index 6536638..0a035c7 100644 --- a/README.md +++ b/README.md @@ -77,7 +77,11 @@ with a JSON payload of the form: // Algorithm used to compress the data // - optional, defaults to no compression - "compression": "gzip|zlib" + "compression": {"id": "gzip|zlib"}, + + // List of algorithms used to filter the data + // - optional, defaults to no filters + "filters": [{"id": "shuffle", "element_size": 4}], } ``` diff --git a/scripts/client.py b/scripts/client.py index f4e2692..bbc1833 100644 --- a/scripts/client.py +++ b/scripts/client.py @@ -37,7 +37,8 @@ def get_args() -> argparse.Namespace: parser.add_argument("--order", default="C") #, choices=["C", "F"]) allow invalid for testing parser.add_argument("--selection", type=str) parser.add_argument("--compression", type=str) - parser.add_argument("--show-response-headers", 
action=argparse.BooleanOptionalAction) + parser.add_argument("--shuffle", action=argparse.BooleanOptionalAction) + parser.add_argument("--verbose", action=argparse.BooleanOptionalAction) return parser.parse_args() @@ -56,6 +57,14 @@ def build_request_data(args: argparse.Namespace) -> dict: request_data["shape"] = json.loads(args.shape) if args.selection: request_data["selection"] = json.loads(args.selection) + if args.compression: + request_data["compression"] = {"id": args.compression} + filters = [] + if args.shuffle: + element_size = 4 if "32" in args.dtype else 8 + filters.append({"id": "shuffle", "element_size": element_size}) + if filters: + request_data["filters"] = filters return {k: v for k, v in request_data.items() if v is not None} @@ -68,13 +77,13 @@ def request(url: str, username: str, password: str, request_data: dict): return response -def display(response, show_headers=False): +def display(response, verbose=False): #print(response.content) dtype = response.headers['x-activestorage-dtype'] shape = json.loads(response.headers['x-activestorage-shape']) result = np.frombuffer(response.content, dtype=dtype) result = result.reshape(shape) - if show_headers: + if verbose: print("\nResponse headers:", response.headers) print("\nResult:", result) else: @@ -92,10 +101,12 @@ def display_error(response): def main(): args = get_args() request_data = build_request_data(args) + if args.verbose: + print("\nRequest data:", request_data) url = f'{args.server}/v1/{args.operation}/' response = request(url, args.username, args.password, request_data) if response.ok: - display(response, show_headers=args.show_response_headers) + display(response, verbose=args.verbose) else: display_error(response) sys.exit(1) diff --git a/scripts/requirements.txt b/scripts/requirements.txt index efc3ea4..7b421ce 100644 --- a/scripts/requirements.txt +++ b/scripts/requirements.txt @@ -1,3 +1,4 @@ +numcodecs numpy requests s3fs diff --git a/scripts/upload_sample_data.py 
b/scripts/upload_sample_data.py index f8f8621..6273074 100644 --- a/scripts/upload_sample_data.py +++ b/scripts/upload_sample_data.py @@ -1,5 +1,6 @@ from enum import Enum import gzip +import numcodecs import numpy as np import pathlib import s3fs @@ -8,6 +9,7 @@ NUM_ITEMS = 10 OBJECT_PREFIX = "data" COMPRESSION_ALGS = [None, "gzip", "zlib"] +FILTER_ALGS = [None, "shuffle"] #Use enum which also subclasses string type so that # auto-generated OpenAPI schema can determine allowed dtypes @@ -38,14 +40,18 @@ def n_bytes(self): # Create numpy arrays and upload to S3 as bytes for compression in COMPRESSION_ALGS: compression_suffix = f"-{compression}" if compression else "" - for d in AllowedDatatypes.__members__.keys(): - obj_name = f'{OBJECT_PREFIX}-{d}{compression_suffix}.dat' - with s3_fs.open(bucket / obj_name, 'wb') as s3_file: - data = np.arange(NUM_ITEMS, dtype=d).tobytes() - if compression == "gzip": - data = gzip.compress(data) - elif compression == "zlib": - data = zlib.compress(data) - s3_file.write(data) - -print("Data upload successful. \nBucket contents:\n", s3_fs.ls(bucket)) + for filter in FILTER_ALGS: + filter_suffix = f"-{filter}" if filter else "" + for d in AllowedDatatypes: + obj_name = f'{OBJECT_PREFIX}-{d}{compression_suffix}{filter_suffix}.dat' + with s3_fs.open(bucket / obj_name, 'wb') as s3_file: + data = np.arange(NUM_ITEMS, dtype=d).tobytes() + if filter == "shuffle": + data = numcodecs.Shuffle(d.n_bytes()).encode(data) + if compression == "gzip": + data = gzip.compress(data) + elif compression == "zlib": + data = zlib.compress(data) + s3_file.write(data) + +print("Data upload successful. 
\nBucket contents:\n", "\n".join(s3_fs.ls(bucket))) diff --git a/src/filter_pipeline.rs b/src/filter_pipeline.rs index 2337d3d..63e3f01 100644 --- a/src/filter_pipeline.rs +++ b/src/filter_pipeline.rs @@ -2,6 +2,7 @@ use crate::compression; use crate::error::ActiveStorageError; +use crate::filters; use crate::models; use axum::body::Bytes; @@ -13,24 +14,31 @@ use axum::body::Bytes; /// # Arguments /// /// * `request_data`: RequestData object for the request -/// * `data`: Data to apply filter pipeline to. +/// * `data`: Data [Bytes](axum::body::Bytes) to apply the pipeline to. pub fn filter_pipeline( request_data: &models::RequestData, data: &Bytes, ) -> Result<Bytes, ActiveStorageError> { + // Make a mutable shallow copy of the data. + let mut data = data.clone(); + // First decompress. if let Some(compression) = request_data.compression { - compression::decompress(compression, data) - } else { - Ok(data.clone()) - } - // TODO: Defilter + data = compression::decompress(compression, &data)? + }; + // Then decode the filters in reverse order. + if let Some(filters) = &request_data.filters { + for filter in filters.iter().rev() { + data = filters::decode(filter, &data)? + } + }; + Ok(data) } #[cfg(test)] mod tests { use super::*; use crate::test_utils; - use flate2::read::GzEncoder; + use flate2::read::{GzEncoder, ZlibEncoder}; use flate2::Compression; use std::io::Read; @@ -42,6 +50,14 @@ mod tests { result.into() } + fn compress_zlib(data: &[u8]) -> Bytes { + // Adapted from flate2 documentation.
+ let mut result = Vec::<u8>::new(); + let mut deflater = ZlibEncoder::new(data, Compression::fast()); + deflater.read_to_end(&mut result).unwrap(); + result.into() + } + #[test] fn test_filter_pipeline_noop() { let data = [1, 2, 3, 4]; @@ -60,4 +76,47 @@ mod tests { let result = filter_pipeline(&request_data, &bytes).unwrap(); assert_eq!(data.as_ref(), result); } + + #[test] + fn test_filter_pipeline_shuffle() { + let data = [1, 2, 3, 4, 5, 6, 7, 8]; + let bytes = Bytes::copy_from_slice(&data); + let shuffled = filters::shuffle::test_utils::shuffle(&bytes, 4); + let mut request_data = test_utils::get_test_request_data(); + request_data.filters = Some(vec![models::Filter::Shuffle { element_size: 4 }]); + let result = filter_pipeline(&request_data, &shuffled).unwrap(); + assert_eq!(data.as_ref(), result); + } + + #[test] + fn test_filter_pipeline_shuffle_zlib() { + let data: [u8; 8] = [1, 2, 3, 4, 5, 6, 7, 8]; + let bytes = Bytes::copy_from_slice(&data); + let shuffled = filters::shuffle::test_utils::shuffle(&bytes, 4); + let bytes = compress_zlib(shuffled.as_ref()); + let mut request_data = test_utils::get_test_request_data(); + request_data.compression = Some(models::Compression::Zlib); + request_data.filters = Some(vec![models::Filter::Shuffle { element_size: 4 }]); + let result = filter_pipeline(&request_data, &bytes).unwrap(); + assert_eq!(data.as_ref(), result.as_ref()); + } + + #[test] + fn test_filter_pipeline_shuffle_x2_zlib() { + // Test multiple filters. + // Currently we only have shuffle, so run it twice with different element sizes.
+ let data: [u8; 8] = [1, 2, 3, 4, 5, 6, 7, 8]; + let bytes = Bytes::copy_from_slice(&data); + let shuffled = filters::shuffle::test_utils::shuffle(&bytes, 4); + let reshuffled = filters::shuffle::test_utils::shuffle(&shuffled, 2); + let bytes = compress_zlib(reshuffled.as_ref()); + let mut request_data = test_utils::get_test_request_data(); + request_data.compression = Some(models::Compression::Zlib); + request_data.filters = Some(vec![ + models::Filter::Shuffle { element_size: 4 }, + models::Filter::Shuffle { element_size: 2 }, + ]); + let result = filter_pipeline(&request_data, &bytes).unwrap(); + assert_eq!(data.as_ref(), result.as_ref()); + } } diff --git a/src/filters.rs b/src/filters.rs new file mode 100644 index 0000000..a10585c --- /dev/null +++ b/src/filters.rs @@ -0,0 +1,36 @@ +//! Filter implementations. + +pub mod shuffle; + +use crate::error::ActiveStorageError; +use crate::models; + +use axum::body::Bytes; + +/// Decodes some bytes using the specified filter and returns the result. +/// +/// # Arguments +/// +/// * `filter`: Filter algorithm +/// * `data`: Filtered data [Bytes](axum::body::Bytes) +pub fn decode(filter: &models::Filter, data: &Bytes) -> Result<Bytes, ActiveStorageError> { + match filter { + models::Filter::Shuffle { element_size } => Ok(shuffle::deshuffle(data, *element_size)), + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::filters; + + #[test] + fn test_decode_shuffle() { + let data = [1, 2, 3, 4, 5, 6, 7, 8]; + let bytes = Bytes::copy_from_slice(&data); + let shuffled = filters::shuffle::test_utils::shuffle(&bytes, 4); + let filter = models::Filter::Shuffle { element_size: 4 }; + let result = decode(&filter, &shuffled).unwrap(); + assert_eq!(data.as_ref(), result); + } +} diff --git a/src/filters/shuffle.rs b/src/filters/shuffle.rs new file mode 100644 index 0000000..11f69b3 --- /dev/null +++ b/src/filters/shuffle.rs @@ -0,0 +1,154 @@ +//! Byte shuffle filter + +use axum::body::Bytes; + +/// Decode the byte shuffle filter.
+/// +/// The byte shuffle filter encodes data by reordering bytes with the aim of improving compression +/// ratio. For an array of N elements where each element is M bytes, the filter writes the 0th byte +/// of each element first, followed by the 1st byte of each element, and so on. This function +/// inverts the shuffle filter. +/// +/// This implementation was inspired by the HDF5 and Zarr shuffle filter implementations. +/// +/// # Arguments +/// +/// * `data`: `Bytes` to deshuffle. +/// * `element_size`: Size of each element in bytes. +// Benchmarking showed that the "slow" vector initialisation was faster for the non-unrolled case. +#[allow(clippy::slow_vector_initialization)] +pub fn deshuffle(data: &Bytes, element_size: usize) -> Bytes { + assert_eq!(data.len() % element_size, 0); + let mut result = Vec::with_capacity(data.len()); + // Convert the Vec to a mutable u8 slice to allow indexing. + // This was benchmarked in benches/shuffle.rs and provides ~50-100% improvement in wall clock + // time. + result.resize(data.len(), 0); + let m = result.as_mut_slice(); + let num_elements = data.len() / element_size; + // Unroll the inner loop when element size is 4 or 8. + // This was benchmarked in benches/shuffle.rs and provides ~50% improvement in wall clock time. 
+ let mut dest_index = 0; + if element_size == 4 { + for i in 0..num_elements { + let mut src_index = i; + m[dest_index] = data[src_index]; + src_index += num_elements; + dest_index += 1; + m[dest_index] = data[src_index]; + src_index += num_elements; + dest_index += 1; + m[dest_index] = data[src_index]; + src_index += num_elements; + dest_index += 1; + m[dest_index] = data[src_index]; + dest_index += 1; + } + } else if element_size == 8 { + for i in 0..num_elements { + let mut src_index = i; + m[dest_index] = data[src_index]; + src_index += num_elements; + dest_index += 1; + m[dest_index] = data[src_index]; + src_index += num_elements; + dest_index += 1; + m[dest_index] = data[src_index]; + src_index += num_elements; + dest_index += 1; + m[dest_index] = data[src_index]; + src_index += num_elements; + dest_index += 1; + m[dest_index] = data[src_index]; + src_index += num_elements; + dest_index += 1; + m[dest_index] = data[src_index]; + src_index += num_elements; + dest_index += 1; + m[dest_index] = data[src_index]; + src_index += num_elements; + dest_index += 1; + m[dest_index] = data[src_index]; + dest_index += 1; + } + } else { + for i in 0..num_elements { + let mut src_index = i; + for _ in 0..element_size { + m[dest_index] = data[src_index]; + src_index += num_elements; + dest_index += 1; + } + } + } + result.into() +} + +#[cfg(test)] +pub(crate) mod tests { + use super::*; + + #[test] + fn test_deshuffle_2() { + let shuffled = [0, 2, 4, 6, 1, 3, 5, 7]; + let bytes = Bytes::copy_from_slice(&shuffled); + let result = deshuffle(&bytes, 2); + let expected = [0, 1, 2, 3, 4, 5, 6, 7]; + assert_eq!(expected.as_ref(), result); + } + + #[test] + fn test_deshuffle_4() { + let shuffled = [0, 4, 1, 5, 2, 6, 3, 7]; + let bytes = Bytes::copy_from_slice(&shuffled); + let result = deshuffle(&bytes, 4); + let expected = [0, 1, 2, 3, 4, 5, 6, 7]; + assert_eq!(expected.as_ref(), result); + } + + #[test] + fn test_deshuffle_8() { + let shuffled = [0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 
5, 13, 6, 14, 7, 15]; + let bytes = Bytes::copy_from_slice(&shuffled); + let result = deshuffle(&bytes, 8); + let expected = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]; + assert_eq!(expected.as_ref(), result); + } +} + +#[cfg(test)] +pub(crate) mod test_utils { + use super::*; + + // Shuffle isn't required for the server, but is useful for testing. + pub(crate) fn shuffle(data: &Bytes, element_size: usize) -> Bytes { + assert_eq!(data.len() % element_size, 0); + let mut result = Vec::with_capacity(data.len()); + for i in 0..element_size { + let mut src_index = i; + for _ in 0..data.len() / element_size { + result.push(data[src_index]); + src_index += element_size; + } + } + result.into() + } + + #[test] + fn test_shuffle_4() { + let data = [0, 1, 2, 3, 4, 5, 6, 7]; + let bytes = Bytes::copy_from_slice(&data); + let result = shuffle(&bytes, 4); + let expected = [0, 4, 1, 5, 2, 6, 3, 7]; + assert_eq!(expected.as_ref(), result); + } + + #[test] + fn test_shuffle_8() { + let data = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]; + let bytes = Bytes::copy_from_slice(&data); + let result = shuffle(&bytes, 8); + let expected = [0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15]; + assert_eq!(expected.as_ref(), result); + } +} diff --git a/src/lib.rs b/src/lib.rs index ad7c936..31fb98b 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -28,6 +28,7 @@ pub mod cli; pub mod compression; pub mod error; pub mod filter_pipeline; +pub mod filters; pub mod metrics; pub mod models; pub mod operation; diff --git a/src/models.rs b/src/models.rs index 9e1cfbf..0a3023d 100644 --- a/src/models.rs +++ b/src/models.rs @@ -96,6 +96,15 @@ pub enum Compression { Zlib, } +/// Filter algorithm +#[derive(Clone, Copy, Debug, Deserialize, PartialEq)] +#[serde(rename_all = "lowercase")] +#[serde(tag = "id")] +pub enum Filter { + /// Byte shuffle + Shuffle { element_size: usize }, +} + /// Request data for operations #[derive(Debug, Deserialize, PartialEq, Validate)] 
#[serde(deny_unknown_fields)] @@ -131,6 +140,8 @@ pub struct RequestData { pub selection: Option<Vec<Slice>>, /// Compression filter name pub compression: Option<Compression>, + /// List of filter algorithms + pub filters: Option<Vec<Filter>>, } /// Validate an array shape @@ -338,6 +349,16 @@ mod tests { Token::Str("id"), Token::Str("gzip"), Token::MapEnd, + Token::Str("filters"), + Token::Some, + Token::Seq { len: Some(1) }, + Token::Map { len: None }, + Token::Str("id"), + Token::Str("shuffle"), + Token::Str("element_size"), + Token::U32(4), + Token::MapEnd, + Token::SeqEnd, Token::StructEnd, ], ); @@ -623,6 +644,26 @@ mod tests { ) } + #[test] + fn test_invalid_filter() { + assert_de_tokens_error::<RequestData>( + &[ + Token::Struct { + name: "RequestData", + len: 2, + }, + Token::Str("filters"), + Token::Some, + Token::Seq { len: Some(1) }, + Token::Map { len: None }, + Token::Str("id"), + Token::Str("foo"), + Token::MapEnd, + ], + "unknown variant `foo`, expected `shuffle`", + ) + } + #[test] fn test_unknown_field() { assert_de_tokens_error::<RequestData>(&[ @@ -630,7 +671,7 @@ mod tests { Token::Str("foo"), Token::StructEnd ], - "unknown field `foo`, expected one of `source`, `bucket`, `object`, `dtype`, `offset`, `size`, `shape`, `order`, `selection`, `compression`" + "unknown field `foo`, expected one of `source`, `bucket`, `object`, `dtype`, `offset`, `size`, `shape`, `order`, `selection`, `compression`, `filters`" ) } @@ -645,7 +686,7 @@ mod tests { #[test] fn test_json_optional_fields() { - let json = r#"{"source": "http://example.com", "bucket": "bar", "object": "baz", "dtype": "int32", "offset": 4, "size": 8, "shape": [2, 5], "order": "C", "selection": [[1, 2, 3], [4, 5, 6]], "compression": {"id": "gzip"}}"#; + let json = r#"{"source": "http://example.com", "bucket": "bar", "object": "baz", "dtype": "int32", "offset": 4, "size": 8, "shape": [2, 5], "order": "C", "selection": [[1, 2, 3], [4, 5, 6]], "compression": {"id": "gzip"}, "filters": [{"id": "shuffle", "element_size": 4}]}"#; let request_data =
serde_json::from_str::<RequestData>(json).unwrap(); assert_eq!(request_data, test_utils::get_test_request_data_optional()); } diff --git a/src/test_utils.rs b/src/test_utils.rs index 541287e..d2e83e0 100644 --- a/src/test_utils.rs +++ b/src/test_utils.rs @@ -15,6 +15,7 @@ pub(crate) fn get_test_request_data() -> RequestData { order: None, selection: None, compression: None, + filters: None, } } @@ -31,5 +32,6 @@ pub(crate) fn get_test_request_data_optional() -> RequestData { order: Some(Order::C), selection: Some(vec![Slice::new(1, 2, 3), Slice::new(4, 5, 6)]), compression: Some(Compression::Gzip), + filters: Some(vec![Filter::Shuffle { element_size: 4 }]), } } From 0eaa66e44055ba36626a228d4a07b696ac85cc9a Mon Sep 17 00:00:00 2001 From: Mark Goddard Date: Thu, 13 Jul 2023 17:07:15 +0100 Subject: [PATCH 2/3] Add shuffle benchmark Benchmarks the byte shuffle filter algorithm. --- Cargo.lock | 282 ++++++++++++++++++++++++++++++++++++++++++++- Cargo.toml | 5 + benches/shuffle.rs | 24 ++++ 3 files changed, 306 insertions(+), 5 deletions(-) create mode 100644 benches/shuffle.rs diff --git a/Cargo.lock b/Cargo.lock index abf8aa4..ec26fd8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -26,6 +26,12 @@ dependencies = [ "memchr", ] +[[package]] +name = "anes" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299" + [[package]] name = "anstream" version = "0.3.2" @@ -104,6 +110,17 @@ dependencies = [ "syn 2.0.26", ] +[[package]] +name = "atty" +version = "0.2.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8" +dependencies = [ + "hermit-abi 0.1.19", + "libc", + "winapi", +] + [[package]] name = "autocfg" version = "1.1.0" @@ -547,6 +564,12 @@ dependencies = [ "either", ] +[[package]] +name = "cast" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" + [[package]] name = "cc" version = "1.0.79" @@ -559,6 +582,45 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" +[[package]] +name = "ciborium" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "effd91f6c78e5a4ace8a5d3c0b6bfaec9e2baaef55f3efc00e45fb2e477ee926" +dependencies = [ + "ciborium-io", + "ciborium-ll", + "serde", +] + +[[package]] +name = "ciborium-io" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cdf919175532b369853f5d5e20b26b43112613fd6fe7aee757e35f7a44642656" + +[[package]] +name = "ciborium-ll" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "defaa24ecc093c77630e6c15e17c51f5e187bf35ee514f4e2d67baaa96dae22b" +dependencies = [ + "ciborium-io", + "half", +] + +[[package]] +name = "clap" +version = "3.2.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4ea181bf566f71cb9a5d17a59e1871af638180a18fb0035c92ae62b705207123" +dependencies = [ + "bitflags 1.3.2", + "clap_lex 0.2.4", + "indexmap", + "textwrap", +] + [[package]] name = "clap" version = "4.3.16" @@ -578,7 +640,7 @@ checksum = "5ae467cbb0111869b765e13882a1dbbd6cb52f58203d8b80c44f667d4dd19843" dependencies = [ "anstream", "anstyle", - "clap_lex", + "clap_lex 0.5.0", "strsim", ] @@ -594,6 +656,15 @@ dependencies = [ "syn 2.0.26", ] +[[package]] +name = "clap_lex" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2850f2f5a82cbf437dd5af4d49848fbdfc27c157c3d010345776f952765261c5" +dependencies = [ + "os_str_bytes", +] + [[package]] name = "clap_lex" version = "0.5.0" @@ -655,6 +726,76 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "criterion" +version = "0.4.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "e7c76e09c1aae2bc52b3d2f29e13c6572553b30c4aa1b8a49fd70de6412654cb" +dependencies = [ + "anes", + "atty", + "cast", + "ciborium", + "clap 3.2.25", + "criterion-plot", + "itertools", + "lazy_static", + "num-traits", + "oorandom", + "plotters", + "rayon", + "regex", + "serde", + "serde_derive", + "serde_json", + "tinytemplate", + "walkdir", +] + +[[package]] +name = "criterion-plot" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6b50826342786a51a89e2da3a28f1c32b06e387201bc2d19791f622c673706b1" +dependencies = [ + "cast", + "itertools", +] + +[[package]] +name = "crossbeam-channel" +version = "0.5.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a33c2bf77f2df06183c3aa30d1e96c0695a313d4f9c453cc3762a6db39f99200" +dependencies = [ + "cfg-if", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-deque" +version = "0.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ce6fd6f855243022dcecf8702fef0c297d4338e226845fe067f6341ad9fa0cef" +dependencies = [ + "cfg-if", + "crossbeam-epoch", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae211234986c545741a7dc064309f67ee1e5ad243d0e48335adc0484d960bcc7" +dependencies = [ + "autocfg", + "cfg-if", + "crossbeam-utils", + "memoffset", + "scopeguard", +] + [[package]] name = "crossbeam-utils" version = "0.8.16" @@ -864,6 +1005,12 @@ dependencies = [ "tracing", ] +[[package]] +name = "half" +version = "1.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eabb4a44450da02c90444cf74558da904edde8fb4e9035a9a6a4e15445af0bd7" + [[package]] name = "hashbrown" version = "0.12.3" @@ -901,6 +1048,15 @@ version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" +[[package]] +name = "hermit-abi" +version = "0.1.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "62b467343b94ba476dcb2500d242dadbb39557df889310ac77c5d99100aaac33" +dependencies = [ + "libc", +] + [[package]] name = "hermit-abi" version = "0.3.2" @@ -1042,7 +1198,7 @@ version = "1.0.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "eae7b9aee968036d54dce06cebaefd919e4472e753296daccd6d344e3e2df0c2" dependencies = [ - "hermit-abi", + "hermit-abi 0.3.2", "libc", "windows-sys 0.48.0", ] @@ -1053,7 +1209,7 @@ version = "0.4.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cb0889898416213fab133e1d33a0e5858a48177452750691bde3666d0fdbaf8b" dependencies = [ - "hermit-abi", + "hermit-abi 0.3.2", "rustix 0.38.4", "windows-sys 0.48.0", ] @@ -1168,6 +1324,15 @@ version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d" +[[package]] +name = "memoffset" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a634b1c61a95585bd15607c6ab0c4e5b226e695ff2800ba0cdccddf208c406c" +dependencies = [ + "autocfg", +] + [[package]] name = "mime" version = "0.3.17" @@ -1275,7 +1440,7 @@ version = "1.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4161fcb6d602d4d2081af7c3a45852d875a03dd337a6bfdd6e06407b61342a43" dependencies = [ - "hermit-abi", + "hermit-abi 0.3.2", "libc", ] @@ -1294,12 +1459,24 @@ version = "1.18.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dd8b5dd2ae5ed71462c540258bedcb51965123ad7e7ccf4b9a8cafaa4a63576d" +[[package]] +name = "oorandom" +version = "11.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ab1bc2a289d34bd04a330323ac98a1b4bc82c9d9fcb1e66b63caa84da26b575" + 
[[package]] name = "openssl-probe" version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" +[[package]] +name = "os_str_bytes" +version = "6.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4d5d9eb14b174ee9aa2ef96dc2b94637a2d4b6e7cb873c7e171f0c20c6cf3eac" + [[package]] name = "outref" version = "0.5.1" @@ -1373,6 +1550,34 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" +[[package]] +name = "plotters" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d2c224ba00d7cadd4d5c660deaf2098e5e80e07846537c51f9cfa4be50c1fd45" +dependencies = [ + "num-traits", + "plotters-backend", + "plotters-svg", + "wasm-bindgen", + "web-sys", +] + +[[package]] +name = "plotters-backend" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9e76628b4d3a7581389a35d5b6e2139607ad7c75b17aed325f210aa91f4a9609" + +[[package]] +name = "plotters-svg" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38f6d39893cca0701371e3c27294f09797214b86f1fb951b89ade8ec04e2abab" +dependencies = [ + "plotters-backend", +] + [[package]] name = "ppv-lite86" version = "0.2.17" @@ -1503,6 +1708,28 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "60a357793950651c4ed0f3f52338f53b2f809f32d83a07f72909fa13e4c6c1e3" +[[package]] +name = "rayon" +version = "1.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d2df5196e37bcc87abebc0053e20787d73847bb33134a69841207dd0a47f03b" +dependencies = [ + "either", + "rayon-core", +] + +[[package]] +name = "rayon-core" +version = "1.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"4b8f95bd6966f5c87776639160a66bd8ab9895d9d4ab01ddba9fc60661aebe8d" +dependencies = [ + "crossbeam-channel", + "crossbeam-deque", + "crossbeam-utils", + "num_cpus", +] + [[package]] name = "redox_syscall" version = "0.1.57" @@ -1699,7 +1926,8 @@ dependencies = [ "aws-types", "axum", "axum-server", - "clap", + "clap 4.3.16", + "criterion", "expanduser", "flate2", "http", @@ -1728,6 +1956,15 @@ dependencies = [ "zerocopy", ] +[[package]] +name = "same-file" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" +dependencies = [ + "winapi-util", +] + [[package]] name = "schannel" version = "0.1.22" @@ -1968,6 +2205,12 @@ version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2047c6ded9c721764247e62cd3b03c09ffc529b2ba5b10ec482ae507a4a70160" +[[package]] +name = "textwrap" +version = "0.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "222a222a5bfe1bba4a77b45ec488a741b3cb8872e5e499451fd7d0129c9c7c3d" + [[package]] name = "thiserror" version = "1.0.43" @@ -2024,6 +2267,16 @@ dependencies = [ "time-core", ] +[[package]] +name = "tinytemplate" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be4d6b5f19ff7664e8c98d03e2139cb510db9b0a60b55f8e8709b689d939b6bc" +dependencies = [ + "serde", + "serde_json", +] + [[package]] name = "tinyvec" version = "1.6.0" @@ -2334,6 +2587,16 @@ version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5c3082ca00d5a5ef149bb8b555a72ae84c9c59f7250f013ac822ac2e49b19c64" +[[package]] +name = "walkdir" +version = "2.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "36df944cda56c7d8d8b7496af378e6b16de9284591917d307c9b4d313c44e698" +dependencies = [ + "same-file", + "winapi-util", +] + [[package]] name = "want" version = "0.3.1" @@ -2445,6 +2708,15 @@ 
version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" +[[package]] +name = "winapi-util" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70ec6ce85bb158151cae5e5c87f95a8e97d2c0c4b001223f33a334e3ce5de178" +dependencies = [ + "winapi", +] + [[package]] name = "winapi-x86_64-pc-windows-gnu" version = "0.4.0" diff --git a/Cargo.toml b/Cargo.toml index a44df91..aba4137 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -43,5 +43,10 @@ validator = { version = "0.16", features = ["derive"] } zerocopy = { version = "0.6.1", features = ["alloc", "simd"] } [dev-dependencies] +criterion = { version = "0.4", features = ["html_reports"] } regex = "1" serde_test = "1.0" + +[[bench]] +name = "shuffle" +harness = false diff --git a/benches/shuffle.rs b/benches/shuffle.rs new file mode 100644 index 0000000..3ab6951 --- /dev/null +++ b/benches/shuffle.rs @@ -0,0 +1,24 @@ +/// Benchmarks for the byte shuffle filter implementation. +use criterion::{black_box, criterion_group, criterion_main, Criterion}; +use s3_active_storage::filters::shuffle; + +fn criterion_benchmark(c: &mut Criterion) { + for size_k in [64, 256, 1024] { + let size = size_k * 1024; + let data: Vec<u8> = (0_u32..size) + .map(|i| u8::try_from(i % 256).unwrap()) + .collect::<Vec<u8>>(); + let bytes = data.into(); + for element_size in [2, 4, 8] { + let name = format!("deshuffle({}, {})", size, element_size); + c.bench_function(&name, |b| { + b.iter(|| { + shuffle::deshuffle(black_box(&bytes), element_size); + }) + }); + } + } +} + +criterion_group!(benches, criterion_benchmark); +criterion_main!(benches); From 70a74e4b33007c7797148aad93221b4183090fdc Mon Sep 17 00:00:00 2001 From: Mark Goddard Date: Wed, 19 Jul 2023 14:13:27 +0100 Subject: [PATCH 3/3] Allow GHSA-g98v-hv3f-hcfr: atty potential unaligned read This only affects Windows.
--- .github/workflows/pull-request.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/pull-request.yml b/.github/workflows/pull-request.yml index 4a5ae95..2dd3670 100644 --- a/.github/workflows/pull-request.yml +++ b/.github/workflows/pull-request.yml @@ -100,3 +100,6 @@ jobs: - name: Dependency Review uses: actions/dependency-review-action@v3 + with: + # https://github.com/advisories/GHSA-g98v-hv3f-hcfr atty potential unaligned read on Windows + allow-ghsas: GHSA-g98v-hv3f-hcfr