From 25ed63d9fba2886df7a825c7833d237676041d13 Mon Sep 17 00:00:00 2001 From: Nico Burns Date: Mon, 8 Sep 2025 15:36:58 +0100 Subject: [PATCH 1/5] Import tendril crate Signed-off-by: Nico Burns --- .github/workflows/main.yml | 4 + Cargo.toml | 23 +- tendril/Cargo.toml | 27 + tendril/LICENSE-APACHE | 201 +++ tendril/LICENSE-MIT | 25 + tendril/README.md | 96 ++ tendril/examples/fuzz.rs | 146 +++ tendril/src/bench.rs | 159 +++ tendril/src/buf32.rs | 120 ++ tendril/src/fmt.rs | 519 ++++++++ tendril/src/futf.rs | 565 ++++++++ tendril/src/lib.rs | 37 + tendril/src/stream.rs | 752 +++++++++++ tendril/src/tendril.rs | 2473 ++++++++++++++++++++++++++++++++++++ tendril/src/utf8_decode.rs | 98 ++ tendril/src/util.rs | 45 + 16 files changed, 5280 insertions(+), 10 deletions(-) create mode 100644 tendril/Cargo.toml create mode 100644 tendril/LICENSE-APACHE create mode 100644 tendril/LICENSE-MIT create mode 100644 tendril/README.md create mode 100644 tendril/examples/fuzz.rs create mode 100644 tendril/src/bench.rs create mode 100644 tendril/src/buf32.rs create mode 100644 tendril/src/fmt.rs create mode 100644 tendril/src/futf.rs create mode 100644 tendril/src/lib.rs create mode 100644 tendril/src/stream.rs create mode 100644 tendril/src/tendril.rs create mode 100644 tendril/src/utf8_decode.rs create mode 100644 tendril/src/util.rs diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 4ebf60c1..a6480d29 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -34,6 +34,10 @@ jobs: if: matrix.version != 'nightly' run: cargo test --all + - name: Test tendril w/encoding feature + if: matrix.version != 'nightly' + run: cargo test -p tendril --features 'encoding encoding_rs' + - name: Cargo doc if: matrix.version == 'nightly' run: cargo doc diff --git a/Cargo.toml b/Cargo.toml index 78a583ef..390a6ec9 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -6,6 +6,7 @@ members = [ "html5ever", "rcdom", "xml5ever", + "tendril", ] [workspace.package] @@ -18,29 +19,31 @@ rust-version = "1.70.0" [workspace.dependencies] # Repo dependencies +tendril = { version = "0.4.3", path = "tendril" } web_atoms = { version = "0.1", path = "web_atoms" } markup5ever = { version = "0.35.0", path = "markup5ever" } xml5ever = { version = "0.35.0", path = "xml5ever" } html5ever = { version = "0.35.0", path = "html5ever" } # External dependencies -syn = { version = "2", features = ["full"] } -quote = "1" -proc-macro2 = "1" +encoding = "0.2" +encoding_rs = "0.8.12" log = "0.4" mac = "0.1" -tendril = "0.4" -string_cache = "0.9.0" -string_cache_codegen = "0.6.1" +new_debug_unreachable = "1.0.2" phf = "0.13" phf_codegen = "0.13" +proc-macro2 = "1" +quote = "1" +syn = { version = "2", features = ["full"] } +string_cache = "0.9.0" +string_cache_codegen = "0.6.1" +utf-8 = "0.7" # Dev dependencies criterion = "0.6" +env_logger = "0.10" libtest-mimic = "0.8.1" +rand = "0.4" serde_json = "1.0" -env_logger = "0.10" typed-arena = "2.0.2" - - - diff --git a/tendril/Cargo.toml b/tendril/Cargo.toml new file mode 100644 index 00000000..c424ff56 --- /dev/null +++ b/tendril/Cargo.toml @@ -0,0 +1,27 @@ +[package] +name = "tendril" +version = "0.4.3" +description = "Compact buffer/string type for zero-copy parsing" +authors = [ + "Keegan McAllister ", + "Simon Sapin ", + "Chris Morgan " +] +license.workspace = true +repository.workspace = true +rust-version.workspace = true +readme = "README.md" +edition = "2015" + +[dependencies] +encoding = { workspace = true, optional = true} +encoding_rs = { workspace = true, optional = 
true} +mac = { workspace = true } +new_debug_unreachable = { workspace = true } +utf-8 = { workspace = true } + +[dev-dependencies] +rand = { workspace = true } + +[features] +bench = [] diff --git a/tendril/LICENSE-APACHE b/tendril/LICENSE-APACHE new file mode 100644 index 00000000..16fe87b0 --- /dev/null +++ b/tendril/LICENSE-APACHE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + +TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + +1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." 
+ + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + +2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + +3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + +4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + +5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + +6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + +7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + +8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + +9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + +END OF TERMS AND CONDITIONS + +APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. 
We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + +Copyright [yyyy] [name of copyright owner] + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. diff --git a/tendril/LICENSE-MIT b/tendril/LICENSE-MIT new file mode 100644 index 00000000..2e0fee10 --- /dev/null +++ b/tendril/LICENSE-MIT @@ -0,0 +1,25 @@ +Copyright (c) 2015 Keegan McAllister + +Permission is hereby granted, free of charge, to any +person obtaining a copy of this software and associated +documentation files (the "Software"), to deal in the +Software without restriction, including without +limitation the rights to use, copy, modify, merge, +publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software +is furnished to do so, subject to the following +conditions: + +The above copyright notice and this permission notice +shall be included in all copies or substantial portions +of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF +ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED +TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A +PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT +SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. diff --git a/tendril/README.md b/tendril/README.md new file mode 100644 index 00000000..700e5952 --- /dev/null +++ b/tendril/README.md @@ -0,0 +1,96 @@ +# tendril + +**Warning**: This library is at a very early stage of development, and it +contains a substantial amount of `unsafe` code. Use at your own risk! + +[![Build Status](https://github.com/servo/html5ever/workflows/Tendril%20CI/badge.svg)](https://github.com/servo/tendril/actions) + +[API Documentation](https://doc.servo.org/tendril/index.html) + +## Introduction + +`Tendril` is a compact string/buffer type, optimized for zero-copy parsing. +Tendrils have the semantics of owned strings, but are sometimes views into +shared buffers. When you mutate a tendril, an owned copy is made if necessary. +Further mutations occur in-place until the string becomes shared, e.g. with +`clone()` or `subtendril()`. + +Buffer sharing is accomplished through thread-local (non-atomic) reference +counting, which has very low overhead. The Rust type system will prevent +you at compile time from sending a tendril between threads. (See below +for thoughts on relaxing this restriction.) + +Whereas `String` allocates in the heap for any non-empty string, `Tendril` can +store small strings (up to 8 bytes) in-line, without a heap allocation. +`Tendril` is also smaller than `String` on 64-bit platforms — 16 bytes versus +24. `Option` is the same size as `Tendril`, thanks to +[`NonZero`][NonZero]. + +The maximum length of a tendril is 4 GB. 
The library will panic if you attempt
+to go over the limit.
+
+## Formats and encoding
+
+`Tendril` uses
+[phantom types](https://doc.rust-lang.org/stable/rust-by-example/generics/phantom.html)
+to track a buffer's format. This determines at compile time which
+operations are available on a given tendril. For example, `Tendril<fmt::UTF8>` and
+`Tendril<fmt::Bytes>` can be borrowed as `&str` and `&[u8]` respectively.
+
+`Tendril` also integrates with
+[rust-encoding](https://github.com/lifthrasiir/rust-encoding) and has
+preliminary support for [WTF-8][] buffers.
+
+## Plans for the future
+
+### Ropes
+
+[html5ever][] will use `Tendril` as a zero-copy text representation. It would
+be good to preserve this all the way through to Servo's DOM. This would reduce
+memory consumption, and possibly speed up text shaping and painting. However,
+DOM text may conceivably be larger than 4 GB, and will anyway not be contiguous
+in memory around e.g. a character entity reference.
+
+*Solution:* Build a **[rope][] on top of these strings** and use that as
+Servo's representation of DOM text. We can perhaps do text shaping and/or
+painting in parallel for different chunks of a rope. html5ever can additionally
+use this rope type as a replacement for `BufferQueue`.
+
+Because the underlying buffers are reference-counted, the bulk of this rope
+is already a [persistent data structure][]. Consider what happens when
+appending two ropes to get a "new" rope. A vector-backed rope would copy a
+vector of small structs, one for each chunk, and would bump the corresponding
+refcounts. But it would not copy any of the string data.
+
+If we want more sharing, then a [2-3 finger tree][] could be a good choice.
+We would probably stick with `VecDeque` for ropes under a certain size.
+
+### UTF-16 compatibility
+
+SpiderMonkey expects text to be in UCS-2 format for the most part. The
+semantics of JavaScript strings are difficult to implement on UTF-8. This also
+applies to HTML parsing via `document.write`. Also, passing SpiderMonkey a
+string that isn't contiguous in memory will incur additional overhead and
+complexity, if not a full copy.
+
+*Solution:* Use **WTF-8 in parsing** and in the DOM. Servo will **convert to
+contiguous UTF-16 when necessary**. The conversion can easily be parallelized,
+if we find a practical need to convert huge chunks of text all at once.
+
+### Source span information
+
+Some html5ever API consumers want to know the originating location in the HTML
+source file(s) of each token or parse error. An example application would be a
+command-line HTML validator with diagnostic output similar to `rustc`'s.
+
+*Solution:* Accept **some metadata along with each input string**. The type of
+metadata is chosen by the API consumer; it defaults to `()`, which has size
+zero. For any non-inline string, we can provide the associated metadata as well
+as a byte offset.
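A minimal sketch of the sharing behaviour described above, using only calls that also appear in this crate's fuzz example and benches (`SliceExt::to_tendril`, `subtendril`, `push_slice`, `len`); the string contents and offsets are invented for illustration:

```rust
use tendril::{SliceExt, StrTendril};

fn main() {
    // Copy a &str into an owned tendril.
    let mut whole: StrTendril = "Hello, zero-copy world!".to_tendril();

    // Take a view of bytes 7..17. This shares the underlying buffer and
    // only bumps a thread-local (non-atomic) reference count.
    let view = whole.subtendril(7, 10);
    assert_eq!(&*view, "zero-copy ");

    // The buffer is now shared, so this mutation first copies `whole`
    // into its own storage; `view` is unaffected.
    whole.push_slice("!!");
    assert_eq!(&*view, "zero-copy ");
    assert_eq!(whole.len(), 25);
}
```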
+ +[NonZero]: https://doc.rust-lang.org/core/nonzero/struct.NonZero.html +[html5ever]: https://github.com/servo/html5ever +[WTF-8]: https://simonsapin.github.io/wtf-8/ +[rope]: https://en.wikipedia.org/wiki/Rope_%28data_structure%29 +[persistent data structure]: https://en.wikipedia.org/wiki/Persistent_data_structure +[2-3 finger tree]: https://www.staff.city.ac.uk/~ross/papers/FingerTree.html diff --git a/tendril/examples/fuzz.rs b/tendril/examples/fuzz.rs new file mode 100644 index 00000000..37daf560 --- /dev/null +++ b/tendril/examples/fuzz.rs @@ -0,0 +1,146 @@ +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +//! A simple fuzz tester for the library. + +#![deny(warnings)] + +extern crate rand; +extern crate tendril; + +use std::borrow::ToOwned; + +use rand::distributions::{IndependentSample, Range}; +use rand::Rng; +use tendril::StrTendril; + +fn fuzz() { + let mut rng = rand::thread_rng(); + let capacity = Range::new(0u32, 1 << 14).ind_sample(&mut rng); + let mut buf_string = String::with_capacity(capacity as usize); + let mut buf_tendril = StrTendril::with_capacity(capacity); + let mut string_slices = vec![]; + let mut tendril_slices = vec![]; + + for _ in 1..100_000 { + if buf_string.len() > (1 << 30) { + buf_string.truncate(0); + buf_tendril.clear(); + } + + let dist_action = Range::new(0, 100); + match dist_action.ind_sample(&mut rng) { + 0..=15 => { + let (start, end) = random_slice(&mut rng, TEXT); + let snip = &TEXT[start..end]; + buf_string.push_str(snip); + buf_tendril.push_slice(snip); + assert_eq!(&*buf_string, &*buf_tendril); + } + + 16..=31 => { + let (start, end) = random_slice(&mut rng, &buf_string); + let snip = &buf_string[start..end].to_owned(); + buf_string.push_str(&snip); + buf_tendril.push_slice(&snip); + assert_eq!(&*buf_string, &*buf_tendril); + } + + 32..=47 => { + let lenstr = format!("[length = {}]", buf_tendril.len()); + buf_string.push_str(&lenstr); + buf_tendril.push_slice(&lenstr); + assert_eq!(&*buf_string, &*buf_tendril); + } + + 48..=63 => { + let n = random_boundary(&mut rng, &buf_string); + buf_tendril.pop_front(n as u32); + buf_string = buf_string[n..].to_owned(); + assert_eq!(&*buf_string, &*buf_tendril); + } + + 64..=79 => { + let new_len = random_boundary(&mut rng, &buf_string); + let n = buf_string.len() - new_len; + buf_string.truncate(new_len); + buf_tendril.pop_back(n as u32); + assert_eq!(&*buf_string, &*buf_tendril); + } + + 80..=90 => { + let (start, end) = random_slice(&mut rng, &buf_string); + buf_string = buf_string[start..end].to_owned(); + buf_tendril = buf_tendril.subtendril(start as u32, (end - start) as u32); + assert_eq!(&*buf_string, &*buf_tendril); + } + + 91..=96 => { + let c = rng.gen(); + buf_string.push(c); + assert!(buf_tendril.try_push_char(c).is_ok()); + assert_eq!(&*buf_string, &*buf_tendril); + } + + 97 => { + buf_string.truncate(0); + buf_tendril.clear(); + assert_eq!(&*buf_string, &*buf_tendril); + } + + _ => { + let (start, end) = random_slice(&mut rng, &buf_string); + string_slices.push(buf_string[start..end].to_owned()); + tendril_slices.push(buf_tendril.subtendril(start as u32, (end - start) as u32)); + assert_eq!(string_slices.len(), tendril_slices.len()); + assert!(string_slices + .iter() + .zip(tendril_slices.iter()) + .all(|(s, t)| **s == **t)); + } + } + } +} + +fn random_boundary(rng: &mut R, text: &str) -> usize { + loop { + let i = Range::new(0, text.len() + 
1).ind_sample(rng); + if text.is_char_boundary(i) { + return i; + } + } +} + +fn random_slice(rng: &mut R, text: &str) -> (usize, usize) { + loop { + let start = Range::new(0, text.len() + 1).ind_sample(rng); + let end = Range::new(start, text.len() + 1).ind_sample(rng); + if !text.is_char_boundary(start) { + continue; + } + if end < text.len() && !text.is_char_boundary(end) { + continue; + } + return (start, end); + } +} + +static TEXT: &'static str = + "It was from the artists and poets that the pertinent answers came, and I \ + know that panic would have broken loose had they been able to compare notes. \ + As it was, lacking their original letters, I half suspected the compiler of \ + having asked leading questions, or of having edited the correspondence in \ + corroboration of what he had latently resolved to see.\ +\ + ˙ǝǝs oʇ pǝʌʃosǝɹ ʎʃʇuǝʇɐʃ pɐɥ ǝɥ ʇɐɥʍ ɟo uoıʇɐɹoqoɹɹoɔ uı ǝɔuǝpuodsǝɹɹoɔ ǝɥʇ \ + pǝʇıpǝ ƃuıʌɐɥ ɟo ɹo 'suoıʇsǝnb ƃuıpɐǝʃ pǝʞsɐ ƃuıʌɐɥ ɟo ɹǝʃıdɯoɔ ǝɥʇ pǝʇɔǝdsns \ + ɟʃɐɥ I 'sɹǝʇʇǝʃ ʃɐuıƃıɹo ɹıǝɥʇ ƃuıʞɔɐʃ 'sɐʍ ʇı s∀ ˙sǝʇou ǝɹɐdɯoɔ oʇ ǝʃqɐ uǝǝq \ + ʎǝɥʇ pɐɥ ǝsooʃ uǝʞoɹq ǝʌɐɥ pʃnoʍ ɔıuɐd ʇɐɥʇ ʍouʞ I puɐ 'ǝɯɐɔ sɹǝʍsuɐ ʇuǝuıʇɹǝd \ + ǝɥʇ ʇɐɥʇ sʇǝod puɐ sʇsıʇɹɐ ǝɥʇ ɯoɹɟ sɐʍ ʇI"; + +fn main() { + fuzz(); +} diff --git a/tendril/src/bench.rs b/tendril/src/bench.rs new file mode 100644 index 00000000..a9d2c30a --- /dev/null +++ b/tendril/src/bench.rs @@ -0,0 +1,159 @@ +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +use std::borrow::ToOwned; +use std::collections::hash_map::{Entry, HashMap}; + +use tendril::StrTendril; + +fn index_words_string(input: &String) -> HashMap> { + let mut index = HashMap::new(); + for word in input.split(|c| c == ' ') { + if word.len() == 0 { + continue; + } + let word = word.to_owned(); + match index.entry(word.chars().next().unwrap()) { + Entry::Occupied(mut e) => { + let x: &mut Vec = e.get_mut(); + x.push(word); + } + Entry::Vacant(e) => { + e.insert(vec![word]); + } + } + } + index +} + +fn index_words_tendril(input: &StrTendril) -> HashMap> { + let mut index = HashMap::new(); + let mut t = input.clone(); + loop { + match t.pop_front_char_run(|c| c != ' ') { + None => return index, + Some((_, false)) => (), + Some((word, true)) => match index.entry(word.chars().next().unwrap()) { + Entry::Occupied(mut e) => { + e.get_mut().push(word); + } + Entry::Vacant(e) => { + e.insert(vec![word]); + } + }, + } + } +} + +static EN_1: &'static str = "Days turn to nights turn to paper into rocks into plastic"; + +static EN_2: &'static str = + "Here the notes in my laboratory journal cease. I was able to write the last \ + words only with great effort. By now it was already clear to me that LSD had \ + been the cause of the remarkable experience of the previous Friday, for the \ + altered perceptions were of the same type as before, only much more intense. I \ + had to struggle to speak intelligibly. I asked my laboratory assistant, who was \ + informed of the self-experiment, to escort me home. We went by bicycle, no \ + automobile being available because of wartime restrictions on their use. On the \ + way home, my condition began to assume threatening forms. Everything in my \ + field of vision wavered and was distorted as if seen in a curved mirror. I also \ + had the sensation of being unable to move from the spot. Nevertheless, my \ + assistant later told me that we had traveled very rapidly. 
Finally, we arrived \ + at home safe and sound, and I was just barely capable of asking my companion to \ + summon our family doctor and request milk from the neighbors.\n\n\ + In spite of my delirious, bewildered condition, I had brief periods of clear \ + and effective thinking—and chose milk as a nonspecific antidote for poisoning."; + +static KR_1: &'static str = + "러스트(Rust)는 모질라(mozilla.org)에서 개발하고 있는, 메모리-안전하고 병렬 \ + 프로그래밍이 쉬운 차세대 프로그래밍 언어입니다. 아직 \ + 개발 단계이며 많은 기능이 구현 중으로, MIT/Apache2 라이선스로 배포됩니다."; + +static HTML_KR_1: &'static str = + "

러스트(Rust)는 모질라(mozilla.org)에서 개발하고 있는, \ + 메모리-안전하고 병렬 프로그래밍이 쉬운 차세대 프로그래밍 언어입니다. \ + 아직 개발 단계이며 많은 기능이 구현 중으로, MIT/Apache2 라이선스로 배포됩니다.

"; + +mod index_words { + macro_rules! bench { + ($txt:ident) => { + #[allow(non_snake_case)] + mod $txt { + const SMALL_SIZE: usize = 65536; + const LARGE_SIZE: usize = (1 << 20); + + #[bench] + fn index_words_string(b: &mut ::test::Bencher) { + let mut s = String::new(); + while s.len() < SMALL_SIZE { + s.push_str(::tendril::bench::$txt); + } + b.iter(|| ::tendril::bench::index_words_string(&s)); + } + + #[bench] + fn index_words_tendril(b: &mut ::test::Bencher) { + let mut t = ::tendril::StrTendril::new(); + while t.len() < SMALL_SIZE { + t.push_slice(::tendril::bench::$txt); + } + b.iter(|| ::tendril::bench::index_words_tendril(&t)); + } + + #[bench] + fn index_words_big_string(b: &mut ::test::Bencher) { + let mut s = String::new(); + while s.len() < LARGE_SIZE { + s.push_str(::tendril::bench::$txt); + } + b.iter(|| ::tendril::bench::index_words_string(&s)); + } + + #[bench] + fn index_words_big_tendril(b: &mut ::test::Bencher) { + let mut t = ::tendril::StrTendril::new(); + while t.len() < LARGE_SIZE { + t.push_slice(::tendril::bench::$txt); + } + b.iter(|| ::tendril::bench::index_words_tendril(&t)); + } + + #[test] + fn correctness() { + use std::borrow::ToOwned; + use tendril::bench::{index_words_string, index_words_tendril}; + use tendril::SliceExt; + + let txt = ::tendril::bench::$txt; + let input_string = txt.to_owned(); + let count_s = index_words_string(&input_string); + let mut keys: Vec = count_s.keys().cloned().collect(); + keys.sort(); + + let input_tendril = txt.to_tendril(); + let count_t = index_words_tendril(&input_tendril); + let mut keys_t: Vec = count_t.keys().cloned().collect(); + keys_t.sort(); + + assert_eq!(keys, keys_t); + + for k in &keys { + let vs = &count_s[k]; + let vt = &count_t[k]; + assert_eq!(vs.len(), vt.len()); + assert!(vs.iter().zip(vt.iter()).all(|(s, t)| **s == **t)); + } + } + } + }; + } + + bench!(EN_1); + bench!(EN_2); + bench!(KR_1); + bench!(HTML_KR_1); +} diff --git a/tendril/src/buf32.rs b/tendril/src/buf32.rs new file mode 100644 index 00000000..d60a277a --- /dev/null +++ b/tendril/src/buf32.rs @@ -0,0 +1,120 @@ +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +//! Provides an unsafe owned buffer type, used in implementing `Tendril`. + +use std::{mem, ptr, slice, u32}; + +use OFLOW; + +pub const MIN_CAP: u32 = 16; + +pub const MAX_LEN: usize = u32::MAX as usize; + +/// A buffer points to a header of type `H`, which is followed by `MIN_CAP` or more +/// bytes of storage. 
+pub struct Buf32 { + pub ptr: *mut H, + pub len: u32, + pub cap: u32, +} + +#[inline(always)] +fn bytes_to_vec_capacity(x: u32) -> usize { + let header = mem::size_of::(); + debug_assert!(header > 0); + let x = (x as usize).checked_add(header).expect(OFLOW); + // Integer ceil https://stackoverflow.com/a/2745086/1162888 + 1 + ((x - 1) / header) +} + +impl Buf32 { + #[inline] + pub unsafe fn with_capacity(mut cap: u32, h: H) -> Buf32 { + if cap < MIN_CAP { + cap = MIN_CAP; + } + + let mut vec = Vec::::with_capacity(bytes_to_vec_capacity::(cap)); + let ptr = vec.as_mut_ptr(); + mem::forget(vec); + ptr::write(ptr, h); + + Buf32 { + ptr: ptr, + len: 0, + cap: cap, + } + } + + #[inline] + pub unsafe fn destroy(self) { + mem::drop(Vec::from_raw_parts( + self.ptr, + 1, + bytes_to_vec_capacity::(self.cap), + )); + } + + #[inline(always)] + pub unsafe fn data_ptr(&self) -> *mut u8 { + (self.ptr as *mut u8).offset(mem::size_of::() as isize) + } + + #[inline(always)] + pub unsafe fn data(&self) -> &[u8] { + slice::from_raw_parts(self.data_ptr(), self.len as usize) + } + + #[inline(always)] + pub unsafe fn data_mut(&mut self) -> &mut [u8] { + slice::from_raw_parts_mut(self.data_ptr(), self.len as usize) + } + + /// Grow the capacity to at least `new_cap`. + /// + /// This will panic if the capacity calculation overflows `u32`. + #[inline] + pub unsafe fn grow(&mut self, new_cap: u32) { + if new_cap <= self.cap { + return; + } + + let new_cap = new_cap.checked_next_power_of_two().expect(OFLOW); + let mut vec = Vec::from_raw_parts(self.ptr, 0, bytes_to_vec_capacity::(self.cap)); + vec.reserve_exact(bytes_to_vec_capacity::(new_cap)); + self.ptr = vec.as_mut_ptr(); + self.cap = new_cap; + mem::forget(vec); + } +} + +#[cfg(test)] +mod test { + use super::Buf32; + use std::ptr; + + #[test] + fn smoke_test() { + unsafe { + let mut b = Buf32::with_capacity(0, 0u8); + assert_eq!(b"", b.data()); + + b.grow(5); + ptr::copy_nonoverlapping(b"Hello".as_ptr(), b.data_ptr(), 5); + + assert_eq!(b"", b.data()); + b.len = 5; + assert_eq!(b"Hello", b.data()); + + b.grow(1337); + assert!(b.cap >= 1337); + assert_eq!(b"Hello", b.data()); + + b.destroy(); + } + } +} diff --git a/tendril/src/fmt.rs b/tendril/src/fmt.rs new file mode 100644 index 00000000..2ff04bbc --- /dev/null +++ b/tendril/src/fmt.rs @@ -0,0 +1,519 @@ +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +//! Marker types for formats. +//! +//! This module defines the types and traits used to mark a `Tendril` +//! with the format of data it contains. It includes those formats +//! for which `Tendril` supports at least some operations without +//! conversion. +//! +//! To convert a string tendril to/from a byte tendril in an arbitrary +//! character encoding, see the `encode` and `decode` methods on +//! `Tendril`. +//! +//! `Tendril` operations may become memory-unsafe if data invalid for +//! the format sneaks in. For that reason, these traits require +//! `unsafe impl`. + +use std::default::Default; +use std::{char, mem, str}; + +use futf::{self, Codepoint, Meaning}; + +/// Implementation details. +/// +/// You don't need these unless you are implementing +/// a new format. +pub mod imp { + use std::default::Default; + use std::{iter, mem, slice}; + + /// Describes how to fix up encodings when concatenating. + /// + /// We can drop characters on either side of the splice, + /// and insert up to 4 bytes in the middle. 
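+    ///
+    /// (For example, `WTF8::fixup` below uses this to replace a lead/trail
+    /// surrogate pair that meets at a concatenation boundary with the single
+    /// code point that the pair encodes.)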
+ pub struct Fixup { + pub drop_left: u32, + pub drop_right: u32, + pub insert_len: u32, + pub insert_bytes: [u8; 4], + } + + impl Default for Fixup { + #[inline(always)] + fn default() -> Fixup { + Fixup { + drop_left: 0, + drop_right: 0, + insert_len: 0, + insert_bytes: [0; 4], + } + } + } + + #[inline(always)] + unsafe fn from_u32_unchecked(n: u32) -> char { + mem::transmute(n) + } + + pub struct SingleByteCharIndices<'a> { + inner: iter::Enumerate>, + } + + impl<'a> Iterator for SingleByteCharIndices<'a> { + type Item = (usize, char); + + #[inline] + fn next(&mut self) -> Option<(usize, char)> { + self.inner + .next() + .map(|(i, &b)| unsafe { (i, from_u32_unchecked(b as u32)) }) + } + } + + impl<'a> SingleByteCharIndices<'a> { + #[inline] + pub fn new(buf: &'a [u8]) -> SingleByteCharIndices<'a> { + SingleByteCharIndices { + inner: buf.iter().enumerate(), + } + } + } +} + +/// Trait for format marker types. +/// +/// The type implementing this trait is usually not instantiated. +/// It's used with a phantom type parameter of `Tendril`. +pub unsafe trait Format { + /// Check whether the buffer is valid for this format. + fn validate(buf: &[u8]) -> bool; + + /// Check whether the buffer is valid for this format. + /// + /// You may assume the buffer is a prefix of a valid buffer. + #[inline] + fn validate_prefix(buf: &[u8]) -> bool { + ::validate(buf) + } + + /// Check whether the buffer is valid for this format. + /// + /// You may assume the buffer is a suffix of a valid buffer. + #[inline] + fn validate_suffix(buf: &[u8]) -> bool { + ::validate(buf) + } + + /// Check whether the buffer is valid for this format. + /// + /// You may assume the buffer is a contiguous subsequence + /// of a valid buffer, but not necessarily a prefix or + /// a suffix. + #[inline] + fn validate_subseq(buf: &[u8]) -> bool { + ::validate(buf) + } + + /// Compute any fixup needed when concatenating buffers. + /// + /// The default is to do nothing. + /// + /// The function is `unsafe` because it may assume the input + /// buffers are already valid for the format. Also, no + /// bounds-checking is performed on the return value! + #[inline(always)] + unsafe fn fixup(_lhs: &[u8], _rhs: &[u8]) -> imp::Fixup { + Default::default() + } +} + +/// Indicates that one format is a subset of another. +/// +/// The subset format can be converted to the superset format +/// for free. +pub unsafe trait SubsetOf: Format +where + Super: Format, +{ + /// Validate the *other* direction of conversion; check if + /// this buffer from the superset format conforms to the + /// subset format. + /// + /// The default calls `Self::validate`, but some conversions + /// may implement a check which is cheaper than validating + /// from scratch. + fn revalidate_subset(x: &[u8]) -> bool { + Self::validate(x) + } +} + +/// Indicates a format which corresponds to a Rust slice type, +/// representing exactly the same invariants. +pub unsafe trait SliceFormat: Format + Sized { + type Slice: ?Sized + Slice; +} + +/// Indicates a format which contains characters from Unicode +/// (all of it, or some proper subset). +pub unsafe trait CharFormat<'a>: Format { + /// Iterator for characters and their byte indices. + type Iter: Iterator; + + /// Iterate over the characters of the string and their byte + /// indices. + /// + /// You may assume the buffer is *already validated* for `Format`. + unsafe fn char_indices(buf: &'a [u8]) -> Self::Iter; + + /// Encode the character as bytes and pass them to a continuation. 
+ /// + /// Returns `Err(())` iff the character cannot be represented. + fn encode_char(ch: char, cont: F) -> Result<(), ()> + where + F: FnOnce(&[u8]); +} + +/// Indicates a Rust slice type that is represented in memory as bytes. +pub unsafe trait Slice { + /// Access the raw bytes of the slice. + fn as_bytes(&self) -> &[u8]; + + /// Convert a byte slice to this kind of slice. + /// + /// You may assume the buffer is *already validated* + /// for `Format`. + unsafe fn from_bytes(x: &[u8]) -> &Self; + + /// Convert a byte slice to this kind of slice. + /// + /// You may assume the buffer is *already validated* + /// for `Format`. + unsafe fn from_mut_bytes(x: &mut [u8]) -> &mut Self; +} + +/// Marker type for uninterpreted bytes. +/// +/// Validation will never fail for this format. +#[derive(Copy, Clone, Default, Debug)] +pub struct Bytes; + +unsafe impl Format for Bytes { + #[inline(always)] + fn validate(_: &[u8]) -> bool { + true + } +} + +unsafe impl SliceFormat for Bytes { + type Slice = [u8]; +} + +unsafe impl Slice for [u8] { + #[inline(always)] + fn as_bytes(&self) -> &[u8] { + self + } + + #[inline(always)] + unsafe fn from_bytes(x: &[u8]) -> &[u8] { + x + } + + #[inline(always)] + unsafe fn from_mut_bytes(x: &mut [u8]) -> &mut [u8] { + x + } +} + +/// Marker type for ASCII text. +#[derive(Copy, Clone, Default, Debug)] +pub struct ASCII; + +unsafe impl Format for ASCII { + #[inline] + fn validate(buf: &[u8]) -> bool { + buf.iter().all(|&n| n <= 127) + } + + #[inline(always)] + fn validate_prefix(_: &[u8]) -> bool { + true + } + + #[inline(always)] + fn validate_suffix(_: &[u8]) -> bool { + true + } + + #[inline(always)] + fn validate_subseq(_: &[u8]) -> bool { + true + } +} + +unsafe impl SubsetOf for ASCII {} +unsafe impl SubsetOf for ASCII {} + +unsafe impl<'a> CharFormat<'a> for ASCII { + type Iter = imp::SingleByteCharIndices<'a>; + + #[inline] + unsafe fn char_indices(buf: &'a [u8]) -> imp::SingleByteCharIndices<'a> { + imp::SingleByteCharIndices::new(buf) + } + + #[inline] + fn encode_char(ch: char, cont: F) -> Result<(), ()> + where + F: FnOnce(&[u8]), + { + let n = ch as u32; + if n > 0x7F { + return Err(()); + } + cont(&[n as u8]); + Ok(()) + } +} + +/// Marker type for UTF-8 text. +#[derive(Copy, Clone, Default, Debug)] +pub struct UTF8; + +unsafe impl Format for UTF8 { + #[inline] + fn validate(buf: &[u8]) -> bool { + str::from_utf8(buf).is_ok() + } + + #[inline] + fn validate_prefix(buf: &[u8]) -> bool { + if buf.len() == 0 { + return true; + } + match futf::classify(buf, buf.len() - 1) { + Some(Codepoint { + meaning: Meaning::Whole(_), + .. + }) => true, + _ => false, + } + } + + #[inline] + fn validate_suffix(buf: &[u8]) -> bool { + if buf.len() == 0 { + return true; + } + match futf::classify(buf, 0) { + Some(Codepoint { + meaning: Meaning::Whole(_), + .. 
+ }) => true, + _ => false, + } + } + + #[inline] + fn validate_subseq(buf: &[u8]) -> bool { + ::validate_prefix(buf) && ::validate_suffix(buf) + } +} + +unsafe impl SubsetOf for UTF8 {} + +unsafe impl SliceFormat for UTF8 { + type Slice = str; +} + +unsafe impl Slice for str { + #[inline(always)] + fn as_bytes(&self) -> &[u8] { + str::as_bytes(self) + } + + #[inline(always)] + unsafe fn from_bytes(x: &[u8]) -> &str { + str::from_utf8_unchecked(x) + } + + #[inline(always)] + unsafe fn from_mut_bytes(x: &mut [u8]) -> &mut str { + mem::transmute(x) + } +} + +unsafe impl<'a> CharFormat<'a> for UTF8 { + type Iter = str::CharIndices<'a>; + + #[inline] + unsafe fn char_indices(buf: &'a [u8]) -> str::CharIndices<'a> { + str::from_utf8_unchecked(buf).char_indices() + } + + #[inline] + fn encode_char(ch: char, cont: F) -> Result<(), ()> + where + F: FnOnce(&[u8]), + { + cont(ch.encode_utf8(&mut [0_u8; 4]).as_bytes()); + Ok(()) + } +} + +/// Marker type for WTF-8 text. +/// +/// See the [WTF-8 spec](https://simonsapin.github.io/wtf-8/). +#[derive(Copy, Clone, Default, Debug)] +pub struct WTF8; + +#[inline] +fn wtf8_meaningful(m: Meaning) -> bool { + match m { + Meaning::Whole(_) | Meaning::LeadSurrogate(_) | Meaning::TrailSurrogate(_) => true, + _ => false, + } +} + +unsafe impl Format for WTF8 { + #[inline] + fn validate(buf: &[u8]) -> bool { + let mut i = 0; + let mut prev_lead = false; + while i < buf.len() { + let codept = unwrap_or_return!(futf::classify(buf, i), false); + if !wtf8_meaningful(codept.meaning) { + return false; + } + i += codept.bytes.len(); + prev_lead = match codept.meaning { + Meaning::TrailSurrogate(_) if prev_lead => return false, + Meaning::LeadSurrogate(_) => true, + _ => false, + }; + } + + true + } + + #[inline] + fn validate_prefix(buf: &[u8]) -> bool { + if buf.len() == 0 { + return true; + } + match futf::classify(buf, buf.len() - 1) { + Some(c) => wtf8_meaningful(c.meaning), + _ => false, + } + } + + #[inline] + fn validate_suffix(buf: &[u8]) -> bool { + if buf.len() == 0 { + return true; + } + match futf::classify(buf, 0) { + Some(c) => wtf8_meaningful(c.meaning), + _ => false, + } + } + + #[inline] + fn validate_subseq(buf: &[u8]) -> bool { + ::validate_prefix(buf) && ::validate_suffix(buf) + } + + #[inline] + unsafe fn fixup(lhs: &[u8], rhs: &[u8]) -> imp::Fixup { + const ERR: &'static str = "WTF8: internal error"; + + if lhs.len() >= 3 && rhs.len() >= 3 { + if let ( + Some(Codepoint { + meaning: Meaning::LeadSurrogate(hi), + .. + }), + Some(Codepoint { + meaning: Meaning::TrailSurrogate(lo), + .. + }), + ) = (futf::classify(lhs, lhs.len() - 1), futf::classify(rhs, 0)) + { + let mut fixup = imp::Fixup { + drop_left: 3, + drop_right: 3, + insert_len: 0, + insert_bytes: [0_u8; 4], + }; + + let n = 0x10000 + ((hi as u32) << 10) + (lo as u32); + + let ch = char::from_u32(n).expect(ERR); + fixup.insert_len = ch.encode_utf8(&mut fixup.insert_bytes).len() as u32; + + return fixup; + } + } + + Default::default() + } +} + +/// Marker type for the single-byte encoding of the first 256 Unicode codepoints. +/// +/// This is IANA's "ISO-8859-1". It's ISO's "ISO 8859-1" with the addition of the +/// C0 and C1 control characters from ECMA-48 / ISO 6429. +/// +/// Not to be confused with WHATWG's "latin1" or "iso8859-1" labels (or the +/// many other aliases), which actually stand for Windows-1252. 
+#[derive(Copy, Clone, Default, Debug)] +pub struct Latin1; + +unsafe impl Format for Latin1 { + #[inline(always)] + fn validate(_: &[u8]) -> bool { + true + } + + #[inline(always)] + fn validate_prefix(_: &[u8]) -> bool { + true + } + + #[inline(always)] + fn validate_suffix(_: &[u8]) -> bool { + true + } + + #[inline(always)] + fn validate_subseq(_: &[u8]) -> bool { + true + } +} + +unsafe impl<'a> CharFormat<'a> for Latin1 { + type Iter = imp::SingleByteCharIndices<'a>; + + #[inline] + unsafe fn char_indices(buf: &'a [u8]) -> imp::SingleByteCharIndices<'a> { + imp::SingleByteCharIndices::new(buf) + } + + #[inline] + fn encode_char(ch: char, cont: F) -> Result<(), ()> + where + F: FnOnce(&[u8]), + { + let n = ch as u32; + if n > 0xFF { + return Err(()); + } + cont(&[n as u8]); + Ok(()) + } +} diff --git a/tendril/src/futf.rs b/tendril/src/futf.rs new file mode 100644 index 00000000..93a1c21e --- /dev/null +++ b/tendril/src/futf.rs @@ -0,0 +1,565 @@ +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +use std::{char, slice}; + +/// Meaning of a complete or partial UTF-8 codepoint. +/// +/// Not all checking is performed eagerly. That is, a codepoint `Prefix` or +/// `Suffix` may in reality have no valid completion. +#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Debug, Hash)] +pub enum Meaning { + /// We found a whole codepoint. + Whole(char), + + /// We found something that isn't a valid Unicode codepoint, but + /// it *would* correspond to a UTF-16 leading surrogate code unit, + /// i.e. a value in the range `U+D800` - `U+DBFF`. + /// + /// The argument is the code unit's 10-bit index within that range. + /// + /// These are found in UTF-8 variants such as CESU-8 and WTF-8. + LeadSurrogate(u16), + + /// We found something that isn't a valid Unicode codepoint, but + /// it *would* correspond to a UTF-16 trailing surrogate code unit, + /// i.e. a value in the range `U+DC00` - `U+DFFF`. + /// + /// The argument is the code unit's 10-bit index within that range. + /// + /// These are found in UTF-8 variants such as CESU-8 and WTF-8. + TrailSurrogate(u16), + + /// We found only a prefix of a codepoint before the buffer ended. + /// + /// Includes the number of additional bytes needed. + Prefix(usize), + + /// We found only a suffix of a codepoint before running off the + /// start of the buffer. + /// + /// Up to 3 more bytes may be needed. + Suffix, +} + +/// Represents a complete or partial UTF-8 codepoint. +#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Debug, Hash)] +pub struct Codepoint<'a> { + /// The bytes that make up the partial or full codepoint. + /// + /// For a `Suffix` this depends on `idx`. We don't scan forward + /// for additional continuation bytes after the reverse scan + /// failed to locate a multibyte sequence start. + pub bytes: &'a [u8], + + /// Start of the codepoint in the buffer, expressed as an offset + /// back from `idx`. + pub rewind: usize, + + /// Meaning of the partial or full codepoint. 
+ pub meaning: Meaning, +} + +#[derive(Debug, PartialEq, Eq)] +enum Byte { + Ascii, + Start(usize), + Cont, +} + +impl Byte { + #[inline(always)] + fn classify(x: u8) -> Option { + match x & 0xC0 { + 0xC0 => match x { + x if x & 0b11111_000 == 0b11110_000 => Some(Byte::Start(4)), + x if x & 0b1111_0000 == 0b1110_0000 => Some(Byte::Start(3)), + x if x & 0b111_00000 == 0b110_00000 => Some(Byte::Start(2)), + _ => None, + }, + 0x80 => Some(Byte::Cont), + _ => Some(Byte::Ascii), + } + } +} + +#[inline(always)] +fn all_cont(buf: &[u8]) -> bool { + buf.iter() + .all(|&b| matches!(Byte::classify(b), Some(Byte::Cont))) +} + +// NOTE: Assumes the buffer is a syntactically valid multi-byte UTF-8 sequence: +// a starting byte followed by the correct number of continuation bytes. +#[inline(always)] +unsafe fn decode(buf: &[u8]) -> Option { + debug_assert!(buf.len() >= 2); + debug_assert!(buf.len() <= 4); + let n; + match buf.len() { + 2 => { + n = ((*buf.get_unchecked(0) & 0b11111) as u32) << 6 + | ((*buf.get_unchecked(1) & 0x3F) as u32); + if n < 0x80 { + return None; + } // Overlong + } + 3 => { + n = ((*buf.get_unchecked(0) & 0b1111) as u32) << 12 + | ((*buf.get_unchecked(1) & 0x3F) as u32) << 6 + | ((*buf.get_unchecked(2) & 0x3F) as u32); + match n { + 0x0000..=0x07FF => return None, // Overlong + 0xD800..=0xDBFF => return Some(Meaning::LeadSurrogate(n as u16 - 0xD800)), + 0xDC00..=0xDFFF => return Some(Meaning::TrailSurrogate(n as u16 - 0xDC00)), + _ => {} + } + } + 4 => { + n = ((*buf.get_unchecked(0) & 0b111) as u32) << 18 + | ((*buf.get_unchecked(1) & 0x3F) as u32) << 12 + | ((*buf.get_unchecked(2) & 0x3F) as u32) << 6 + | ((*buf.get_unchecked(3) & 0x3F) as u32); + if n < 0x1_0000 { + return None; + } // Overlong + } + _ => debug_unreachable!(), + } + + char::from_u32(n).map(Meaning::Whole) +} + +#[inline(always)] +unsafe fn unsafe_slice<'a>(buf: &'a [u8], start: usize, new_len: usize) -> &'a [u8] { + debug_assert!(start <= buf.len()); + debug_assert!(new_len <= (buf.len() - start)); + slice::from_raw_parts(buf.as_ptr().offset(start as isize), new_len) +} + +macro_rules! otry { + ($x:expr) => { + unwrap_or_return!($x, None) + }; +} + +/// Describes the UTF-8 codepoint containing the byte at index `idx` within +/// `buf`. +/// +/// Returns `None` if `idx` is out of range, or if `buf` contains invalid UTF-8 +/// in the vicinity of `idx`. +#[inline] +pub fn classify<'a>(buf: &'a [u8], idx: usize) -> Option> { + if idx >= buf.len() { + return None; + } + + unsafe { + let x = *buf.get_unchecked(idx); + match otry!(Byte::classify(x)) { + Byte::Ascii => Some(Codepoint { + bytes: unsafe_slice(buf, idx, 1), + rewind: 0, + meaning: Meaning::Whole(x as char), + }), + Byte::Start(n) => { + let avail = buf.len() - idx; + if avail >= n { + let bytes = unsafe_slice(buf, idx, n); + if !all_cont(unsafe_slice(bytes, 1, n - 1)) { + return None; + } + let meaning = otry!(decode(bytes)); + Some(Codepoint { + bytes: bytes, + rewind: 0, + meaning: meaning, + }) + } else { + Some(Codepoint { + bytes: unsafe_slice(buf, idx, avail), + rewind: 0, + meaning: Meaning::Prefix(n - avail), + }) + } + } + Byte::Cont => { + let mut start = idx; + let mut checked = 0; + loop { + if start == 0 { + // Whoops, fell off the beginning. 
+ return Some(Codepoint { + bytes: unsafe_slice(buf, 0, idx + 1), + rewind: idx, + meaning: Meaning::Suffix, + }); + } + + start -= 1; + checked += 1; + match otry!(Byte::classify(*buf.get_unchecked(start))) { + Byte::Cont => (), + Byte::Start(n) => { + let avail = buf.len() - start; + if avail >= n { + let bytes = unsafe_slice(buf, start, n); + if checked < n { + if !all_cont(unsafe_slice(bytes, checked, n - checked)) { + return None; + } + } + let meaning = otry!(decode(bytes)); + return Some(Codepoint { + bytes: bytes, + rewind: idx - start, + meaning: meaning, + }); + } else { + return Some(Codepoint { + bytes: unsafe_slice(buf, start, avail), + rewind: idx - start, + meaning: Meaning::Prefix(n - avail), + }); + } + } + _ => return None, + } + + if idx - start >= 3 { + // We looked at 3 bytes before a continuation byte + // and didn't find a start byte. + return None; + } + } + } + } + } +} + +#[cfg(all(test, feature = "bench"))] +mod tests { + use super::{all_cont, classify, decode, Byte, Meaning}; + use std::borrow::ToOwned; + use std::io::Write; + use test::Bencher; + + #[test] + fn classify_all_bytes() { + for n in 0x00..0x80 { + assert_eq!(Byte::classify(n), Some(Byte::Ascii)); + } + for n in 0x80..0xC0 { + assert_eq!(Byte::classify(n), Some(Byte::Cont)); + } + for n in 0xC0..0xE0 { + assert_eq!(Byte::classify(n), Some(Byte::Start(2))); + } + for n in 0xE0..0xF0 { + assert_eq!(Byte::classify(n), Some(Byte::Start(3))); + } + for n in 0xF0..0xF8 { + assert_eq!(Byte::classify(n), Some(Byte::Start(4))); + } + for n in 0xF8..0xFF { + assert_eq!(Byte::classify(n), None); + } + assert_eq!(Byte::classify(0xFF), None); + } + + #[test] + fn test_all_cont() { + assert!(all_cont(b"")); + assert!(all_cont(b"\x80")); + assert!(all_cont(b"\xBF")); + assert!(all_cont(b"\x80\xBF\x80\xBF")); + + assert!(!all_cont(b"z")); + assert!(!all_cont(b"\xC0\xBF")); + assert!(!all_cont(b"\xFF")); + assert!(!all_cont(b"\x80\xBFz\x80\xBF")); + assert!(!all_cont(b"\x80\xBF\xC0\x80\xBF")); + assert!(!all_cont(b"\x80\xBF\xFF\x80\xBF")); + assert!(!all_cont(b"\x80\xBF\x80\xBFz")); + assert!(!all_cont(b"\x80\xBF\x80\xBF\xC0")); + assert!(!all_cont(b"z\x80\xBF\x80\xBF")); + assert!(!all_cont(b"\xC0\x80\xBF\x80\xBF")); + } + + #[test] + fn test_decode() { + unsafe { + assert_eq!(Some(Meaning::Whole('ő')), decode(b"\xC5\x91")); + assert_eq!(Some(Meaning::Whole('\u{a66e}')), decode(b"\xEA\x99\xAE")); + assert_eq!( + Some(Meaning::Whole('\u{1f4a9}')), + decode(b"\xF0\x9F\x92\xA9") + ); + assert_eq!( + Some(Meaning::Whole('\u{10ffff}')), + decode(b"\xF4\x8F\xBF\xBF") + ); + + assert_eq!( + Some(Meaning::LeadSurrogate(0x0000)), + decode(b"\xED\xA0\x80") + ); + assert_eq!( + Some(Meaning::LeadSurrogate(0x0001)), + decode(b"\xED\xA0\x81") + ); + assert_eq!( + Some(Meaning::LeadSurrogate(0x03FE)), + decode(b"\xED\xAF\xBE") + ); + assert_eq!( + Some(Meaning::LeadSurrogate(0x03FF)), + decode(b"\xED\xAF\xBF") + ); + + assert_eq!( + Some(Meaning::TrailSurrogate(0x0000)), + decode(b"\xED\xB0\x80") + ); + assert_eq!( + Some(Meaning::TrailSurrogate(0x0001)), + decode(b"\xED\xB0\x81") + ); + assert_eq!( + Some(Meaning::TrailSurrogate(0x03FE)), + decode(b"\xED\xBF\xBE") + ); + assert_eq!( + Some(Meaning::TrailSurrogate(0x03FF)), + decode(b"\xED\xBF\xBF") + ); + + // The last 4-byte UTF-8 sequence. This would be U+1FFFFF, which is out of + // range. 
+ assert_eq!(None, decode(b"\xF7\xBF\xBF\xBF")); + + // First otherwise-valid sequence (would be U+110000) that is out of range + assert_eq!(None, decode(b"\xF4\x90\x80\x80")); + + // Overlong sequences + assert_eq!(None, decode(b"\xC0\x80")); + assert_eq!(None, decode(b"\xC1\xBF")); + assert_eq!(None, decode(b"\xE0\x80\x80")); + assert_eq!(None, decode(b"\xE0\x9F\xBF")); + assert_eq!(None, decode(b"\xF0\x80\x80\x80")); + assert_eq!(None, decode(b"\xF0\x8F\xBF\xBF")); + + // For not-overlong sequence for each sequence length + assert_eq!(Some(Meaning::Whole('\u{80}')), decode(b"\xC2\x80")); + assert_eq!(Some(Meaning::Whole('\u{800}')), decode(b"\xE0\xA0\x80")); + assert_eq!( + Some(Meaning::Whole('\u{10000}')), + decode(b"\xF0\x90\x80\x80") + ); + } + } + + static JUNK: &'static [u8] = b"\ + \xf8\x0d\x07\x25\xa6\x7b\x95\xeb\x47\x01\x7f\xee\ + \x3b\x00\x60\x57\x1d\x9e\x5d\x0a\x0b\x0a\x7c\x75\ + \x13\xa1\x82\x46\x27\x34\xe9\x52\x61\x0d\xec\x10\ + \x54\x49\x6e\x54\xdf\x7b\xe1\x31\x8c\x06\x21\x83\ + \x0f\xb5\x1f\x4c\x6a\x71\x52\x42\x74\xe7\x7b\x50\ + \x59\x1f\x6a\xd4\xff\x06\x92\x33\xc4\x34\x97\xff\ + \xcc\xb5\xc4\x00\x7b\xc3\x4a\x7f\x7e\x63\x96\x58\ + \x51\x63\x21\x54\x53\x2f\x03\x8a\x7d\x41\x79\x98\ + \x5b\xcb\xb8\x94\x6b\x73\xf3\x0c\x5a\xd7\xc4\x12\ + \x7a\x2b\x9a\x2e\x67\x62\x2a\x00\x45\x2c\xfe\x7d\ + \x8d\xd6\x51\x4e\x59\x36\x72\x1b\xae\xaa\x06\xe8\ + \x71\x1b\x85\xd3\x35\xb5\xbe\x9e\x16\x96\x72\xd8\ + \x1a\x48\xba\x4d\x55\x4f\x1b\xa2\x77\xfa\x8f\x71\ + \x58\x7d\x03\x93\xa2\x3a\x76\x51\xda\x48\xe2\x3f\ + \xeb\x8d\xda\x89\xae\xf7\xbd\x3d\xb6\x37\x97\xca\ + \x99\xcc\x4a\x8d\x62\x89\x97\xe3\xc0\xd1\x8d\xc1\ + \x26\x11\xbb\x8d\x53\x61\x4f\x76\x03\x00\x30\xd3\ + \x5f\x86\x19\x52\x9c\x3e\x99\x8c\xb7\x21\x48\x1c\ + \x85\xae\xad\xd5\x74\x00\x6c\x3e\xd0\x17\xff\x76\ + \x5c\x32\xc3\xfb\x24\x99\xd4\x4c\xa4\x1f\x66\x46\ + \xe7\x2d\x44\x56\x7d\x14\xd9\x76\x91\x37\x2f\xb7\ + \xcc\x1b\xd3\xc2"; + + #[test] + fn classify_whole() { + assert_eq!(JUNK.len(), 256); + + for &c in &[ + '\0', + '\x01', + 'o', + 'z', + 'ő', + '\u{2764}', + '\u{a66e}', + '\u{1f4a9}', + '\u{1f685}', + ] { + for idx in 0..JUNK.len() - 3 { + let mut buf = JUNK.to_owned(); + let ch = format!("{}", c).into_bytes(); + (&mut buf[idx..]).write_all(&ch).unwrap(); + + for j in 0..ch.len() { + let class = classify(&buf, idx + j).unwrap(); + assert_eq!(class.bytes, &*ch); + assert_eq!(class.rewind, j); + assert_eq!(class.meaning, Meaning::Whole(c)); + } + } + } + } + + #[test] + fn classify_surrogates() { + for &(s, b) in &[ + (Meaning::LeadSurrogate(0x0000), b"\xED\xA0\x80"), + (Meaning::LeadSurrogate(0x0001), b"\xED\xA0\x81"), + (Meaning::LeadSurrogate(0x03FE), b"\xED\xAF\xBE"), + (Meaning::LeadSurrogate(0x03FF), b"\xED\xAF\xBF"), + (Meaning::TrailSurrogate(0x0000), b"\xED\xB0\x80"), + (Meaning::TrailSurrogate(0x0001), b"\xED\xB0\x81"), + (Meaning::TrailSurrogate(0x03FE), b"\xED\xBF\xBE"), + (Meaning::TrailSurrogate(0x03FF), b"\xED\xBF\xBF"), + ] { + for idx in 0..JUNK.len() - 2 { + let mut buf = JUNK.to_owned(); + (&mut buf[idx..]).write_all(b).unwrap(); + + let class = classify(&buf, idx).unwrap(); + assert_eq!(class.bytes, b); + assert_eq!(class.rewind, 0); + assert_eq!(class.meaning, s); + } + } + } + + #[test] + fn classify_prefix_suffix() { + for &c in &['ő', '\u{a66e}', '\u{1f4a9}'] { + let ch = format!("{}", c).into_bytes(); + for pfx in 1..ch.len() - 1 { + let mut buf = JUNK.to_owned(); + let buflen = buf.len(); + (&mut buf[buflen - pfx..buflen]) + .write_all(&ch[..pfx]) + .unwrap(); + for j in 0..pfx { + let idx = buflen - 1 - j; 
+ let class = classify(&buf, idx).unwrap(); + assert_eq!(class.bytes, &ch[..pfx]); + assert_eq!(class.rewind, pfx - 1 - j); + assert_eq!(class.meaning, Meaning::Prefix(ch.len() - pfx)); + } + } + for sfx in 1..ch.len() - 1 { + let ch_bytes = &ch[ch.len() - sfx..]; + let mut buf = JUNK.to_owned(); + (&mut *buf).write_all(ch_bytes).unwrap(); + for j in 0..sfx { + let class = classify(&buf, j).unwrap(); + assert!(ch_bytes.starts_with(class.bytes)); + assert_eq!(class.rewind, j); + assert_eq!(class.meaning, Meaning::Suffix); + } + } + } + } + + #[test] + fn out_of_bounds() { + assert!(classify(b"", 0).is_none()); + assert!(classify(b"", 7).is_none()); + assert!(classify(b"aaaaaaa", 7).is_none()); + } + + #[test] + fn malformed() { + assert_eq!(None, classify(b"\xFF", 0)); + assert_eq!(None, classify(b"\xC5\xC5", 0)); + assert_eq!(None, classify(b"x\x91", 1)); + assert_eq!(None, classify(b"\x91\x91\x91\x91", 3)); + assert_eq!(None, classify(b"\x91\x91\x91\x91\x91", 4)); + assert_eq!(None, classify(b"\xEA\x91\xFF", 1)); + assert_eq!(None, classify(b"\xF0\x90\x90\xF0", 0)); + assert_eq!(None, classify(b"\xF0\x90\x90\xF0", 1)); + assert_eq!(None, classify(b"\xF0\x90\x90\xF0", 2)); + + for i in 0..4 { + // out of range: U+110000 + assert_eq!(None, classify(b"\xF4\x90\x80\x80", i)); + + // out of range: U+1FFFFF + assert_eq!(None, classify(b"\xF7\xBF\xBF\xBF", i)); + + // Overlong sequences + assert_eq!(None, classify(b"\xC0\x80", i)); + assert_eq!(None, classify(b"\xC1\xBF", i)); + assert_eq!(None, classify(b"\xE0\x80\x80", i)); + assert_eq!(None, classify(b"\xE0\x9F\xBF", i)); + assert_eq!(None, classify(b"\xF0\x80\x80\x80", i)); + assert_eq!(None, classify(b"\xF0\x8F\xBF\xBF", i)); + } + } + + static TEXT: &'static str = " + All human beings are born free and equal in dignity and rights. + They are endowed with reason and conscience and should act + towards one another in a spirit of brotherhood. + + Minden emberi lény szabadon születik és egyenlő méltósága és + joga van. Az emberek, ésszel és lelkiismerettel bírván, + egymással szemben testvéri szellemben kell hogy viseltessenek. + + เราทุกคนเกิดมาอย่างอิสระ เราทุกคนมีความคิดและความเข้าใจเป็นของเราเอง + เราทุกคนควรได้รับการปฏิบัติในทางเดียวกัน. + + 모든 인간은 태어날 때부터 자유로우며 그 존엄과 권리에 있어 + 동등하다. 인간은 천부적으로 이성과 양심을 부여받았으며 서로 + 형제애의 정신으로 행동하여야 한다. + + ro remna cu se jinzi co zifre je simdu'i be le ry. nilselsi'a + .e lei ry. selcru .i ry. se menli gi'e se sezmarde .i .ei + jeseki'ubo ry. simyzu'e ta'i le tunba + + ᏂᎦᏓ ᎠᏂᏴᏫ ᏂᎨᎫᏓᎸᎾ ᎠᎴ ᎤᏂᏠᏱ ᎤᎾᏕᎿ ᏚᏳᎧᏛ ᎨᏒᎢ. ᎨᏥᏁᎳ ᎤᎾᏓᏅᏖᏗ ᎠᎴ ᎤᏃᏟᏍᏗ + ᎠᎴ ᏌᏊ ᎨᏒ ᏧᏂᎸᏫᏍᏓᏁᏗ ᎠᎾᏟᏅᏢ ᎠᏓᏅᏙ ᎬᏗ."; + + // random + static IXES: &'static [usize] = &[ + 778, 156, 87, 604, 1216, 365, 884, 311, 469, 515, 709, 162, 871, 206, 634, 442, + ]; + + static BOUNDARY: &'static [bool] = &[ + false, true, true, false, false, true, true, true, true, false, false, true, true, true, + false, false, + ]; + + #[bench] + fn std_utf8_check(b: &mut Bencher) { + b.iter(|| { + assert!(IXES + .iter() + .zip(BOUNDARY.iter()) + .all(|(&ix, &expect)| { expect == TEXT.is_char_boundary(ix) })); + }); + } + + // We don't expect to be as fast as is_char_boundary, because we provide more + // information. But we shouldn't be tremendously slower, either. A factor of + // 5-10 is expected on this text. 
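+    // futf_check below answers the same boundary question via classify(): an index
+    // is a char boundary exactly when the returned classification's rewind is 0.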
+ #[bench] + fn futf_check(b: &mut Bencher) { + b.iter(|| { + assert!(IXES.iter().zip(BOUNDARY.iter()).all(|(&ix, &expect)| { + expect == (classify(TEXT.as_bytes(), ix).unwrap().rewind == 0) + })); + }); + } +} diff --git a/tendril/src/lib.rs b/tendril/src/lib.rs new file mode 100644 index 00000000..fadc2cab --- /dev/null +++ b/tendril/src/lib.rs @@ -0,0 +1,37 @@ +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +#![cfg_attr(all(test, feature = "bench"), feature(test))] +//#![cfg_attr(test, deny(warnings))] + +#[macro_use] +extern crate debug_unreachable; +#[cfg(feature = "encoding")] +pub extern crate encoding; +#[cfg(feature = "encoding_rs")] +pub extern crate encoding_rs; +#[cfg(all(test, feature = "bench"))] +extern crate test; +#[macro_use] +extern crate mac; +extern crate utf8; + +pub use fmt::Format; +pub use stream::TendrilSink; +pub use tendril::{Atomic, Atomicity, NonAtomic, SendTendril}; +pub use tendril::{ByteTendril, ReadExt, SliceExt, StrTendril, SubtendrilError, Tendril}; +pub use utf8_decode::IncompleteUtf8; + +pub mod fmt; +pub mod stream; + +mod buf32; +mod futf; +mod tendril; +mod utf8_decode; +mod util; + +static OFLOW: &'static str = "tendril: overflow in buffer arithmetic"; diff --git a/tendril/src/stream.rs b/tendril/src/stream.rs new file mode 100644 index 00000000..469d58c9 --- /dev/null +++ b/tendril/src/stream.rs @@ -0,0 +1,752 @@ +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +//! Streams of tendrils. + +use fmt; +use tendril::{Atomicity, NonAtomic, Tendril}; + +use std::borrow::Cow; +use std::fs::File; +use std::io; +use std::marker::PhantomData; +use std::path::Path; + +#[cfg(feature = "encoding")] +use encoding; +#[cfg(feature = "encoding_rs")] +use encoding_rs::{self, DecoderResult}; +use utf8; + +/// Trait for types that can process a tendril. +/// +/// This is a "push" interface, unlike the "pull" interface of +/// `Iterator>`. The push interface matches +/// [html5ever][] and other incremental parsers with a similar +/// architecture. +/// +/// [html5ever]: https://github.com/servo/html5ever +pub trait TendrilSink +where + F: fmt::Format, + A: Atomicity, +{ + /// Process this tendril. + fn process(&mut self, t: Tendril); + + /// Indicates that an error has occurred. + fn error(&mut self, desc: Cow<'static, str>); + + /// What the overall result of processing is. + type Output; + + /// Indicates the end of the stream. + fn finish(self) -> Self::Output; + + /// Process one tendril and finish. + fn one(mut self, t: T) -> Self::Output + where + Self: Sized, + T: Into>, + { + self.process(t.into()); + self.finish() + } + + /// Consume an iterator of tendrils, processing each item, then finish. + fn from_iter(mut self, i: I) -> Self::Output + where + Self: Sized, + I: IntoIterator, + I::Item: Into>, + { + for t in i { + self.process(t.into()) + } + self.finish() + } + + /// Read from the given stream of bytes until exhaustion and process incrementally, + /// then finish. Return `Err` at the first I/O error. 
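+    ///
+    /// A minimal usage sketch (illustrative only; `sink` is any byte-oriented
+    /// `TendrilSink` and the file name is made up):
+    ///
+    /// ```ignore
+    /// let mut file = std::fs::File::open("input.bin")?;
+    /// let output = sink.read_from(&mut file)?; // `process` per chunk, then `finish`
+    /// ```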
+ fn read_from(mut self, r: &mut R) -> io::Result + where + Self: Sized, + R: io::Read, + F: fmt::SliceFormat, + { + const BUFFER_SIZE: u32 = 4 * 1024; + loop { + let mut tendril = Tendril::::new(); + // FIXME: this exposes uninitialized bytes to a generic R type + // this is fine for R=File which never reads these bytes, + // but user-defined types might. + // The standard library pushes zeros to `Vec` for that reason. + unsafe { + tendril.push_uninitialized(BUFFER_SIZE); + } + loop { + match r.read(&mut tendril) { + Ok(0) => return Ok(self.finish()), + Ok(n) => { + tendril.pop_back(BUFFER_SIZE - n as u32); + self.process(tendril); + break; + } + Err(ref e) if e.kind() == io::ErrorKind::Interrupted => {} + Err(e) => return Err(e), + } + } + } + } + + /// Read from the file at the given path and process incrementally, + /// then finish. Return `Err` at the first I/O error. + fn from_file
<P>
(self, path: P) -> io::Result + where + Self: Sized, + P: AsRef, + F: fmt::SliceFormat, + { + self.read_from(&mut File::open(path)?) + } +} + +/// A `TendrilSink` adaptor that takes bytes, decodes them as UTF-8, +/// lossily replace ill-formed byte sequences with U+FFFD replacement characters, +/// and emits Unicode (`StrTendril`). +/// +/// This does not allocate memory: the output is either subtendrils on the input, +/// on inline tendrils for a single code point. +pub struct Utf8LossyDecoder +where + Sink: TendrilSink, + A: Atomicity, +{ + pub inner_sink: Sink, + incomplete: Option, + marker: PhantomData, +} + +impl Utf8LossyDecoder +where + Sink: TendrilSink, + A: Atomicity, +{ + /// Create a new incremental UTF-8 decoder. + #[inline] + pub fn new(inner_sink: Sink) -> Self { + Utf8LossyDecoder { + inner_sink: inner_sink, + incomplete: None, + marker: PhantomData, + } + } +} + +impl TendrilSink for Utf8LossyDecoder +where + Sink: TendrilSink, + A: Atomicity, +{ + #[inline] + fn process(&mut self, mut t: Tendril) { + // FIXME: remove take() and map() when non-lexical borrows are stable. + if let Some(mut incomplete) = self.incomplete.take() { + let resume_at = incomplete.try_complete(&t).map(|(result, rest)| { + match result { + Ok(s) => self.inner_sink.process(Tendril::from_slice(s)), + Err(_) => { + self.inner_sink.error("invalid byte sequence".into()); + self.inner_sink + .process(Tendril::from_slice(utf8::REPLACEMENT_CHARACTER)); + } + } + t.len() - rest.len() + }); + match resume_at { + None => { + self.incomplete = Some(incomplete); + return; + } + Some(resume_at) => t.pop_front(resume_at as u32), + } + } + while !t.is_empty() { + let unborrowed_result = match utf8::decode(&t) { + Ok(s) => { + debug_assert!(s.as_ptr() == t.as_ptr()); + debug_assert!(s.len() == t.len()); + Ok(()) + } + Err(utf8::DecodeError::Invalid { + valid_prefix, + invalid_sequence, + .. 
+ }) => { + debug_assert!(valid_prefix.as_ptr() == t.as_ptr()); + debug_assert!(valid_prefix.len() <= t.len()); + Err(( + valid_prefix.len(), + Err(valid_prefix.len() + invalid_sequence.len()), + )) + } + Err(utf8::DecodeError::Incomplete { + valid_prefix, + incomplete_suffix, + }) => { + debug_assert!(valid_prefix.as_ptr() == t.as_ptr()); + debug_assert!(valid_prefix.len() <= t.len()); + Err((valid_prefix.len(), Ok(incomplete_suffix))) + } + }; + match unborrowed_result { + Ok(()) => { + unsafe { self.inner_sink.process(t.reinterpret_without_validating()) } + return; + } + Err((valid_len, and_then)) => { + if valid_len > 0 { + let subtendril = t.subtendril(0, valid_len as u32); + unsafe { + self.inner_sink + .process(subtendril.reinterpret_without_validating()) + } + } + match and_then { + Ok(incomplete) => { + self.incomplete = Some(incomplete); + return; + } + Err(offset) => { + self.inner_sink.error("invalid byte sequence".into()); + self.inner_sink + .process(Tendril::from_slice(utf8::REPLACEMENT_CHARACTER)); + t.pop_front(offset as u32); + } + } + } + } + } + } + + #[inline] + fn error(&mut self, desc: Cow<'static, str>) { + self.inner_sink.error(desc); + } + + type Output = Sink::Output; + + #[inline] + fn finish(mut self) -> Sink::Output { + if self.incomplete.is_some() { + self.inner_sink + .error("incomplete byte sequence at end of stream".into()); + self.inner_sink + .process(Tendril::from_slice(utf8::REPLACEMENT_CHARACTER)); + } + self.inner_sink.finish() + } +} + +/// A `TendrilSink` adaptor that takes bytes, decodes them as the given character encoding, +/// lossily replace ill-formed byte sequences with U+FFFD replacement characters, +/// and emits Unicode (`StrTendril`). +/// +/// This allocates new tendrils for encodings other than UTF-8. +#[cfg(any(feature = "encoding", feature = "encoding_rs"))] +pub struct LossyDecoder +where + Sink: TendrilSink, + A: Atomicity, +{ + inner: LossyDecoderInner, +} + +#[cfg(any(feature = "encoding", feature = "encoding_rs"))] +enum LossyDecoderInner +where + Sink: TendrilSink, + A: Atomicity, +{ + Utf8(Utf8LossyDecoder), + #[cfg(feature = "encoding")] + Encoding(Box, Sink), + #[cfg(feature = "encoding_rs")] + EncodingRs(encoding_rs::Decoder, Sink), +} + +#[cfg(any(feature = "encoding", feature = "encoding_rs"))] +impl LossyDecoder +where + Sink: TendrilSink, + A: Atomicity, +{ + /// Create a new incremental decoder using the encoding crate. + #[cfg(feature = "encoding")] + #[inline] + pub fn new(encoding: encoding::EncodingRef, sink: Sink) -> Self { + if encoding.name() == "utf-8" { + LossyDecoder::utf8(sink) + } else { + LossyDecoder { + inner: LossyDecoderInner::Encoding(encoding.raw_decoder(), sink), + } + } + } + + /// Create a new incremental decoder using the encoding_rs crate. + #[cfg(feature = "encoding_rs")] + #[inline] + pub fn new_encoding_rs(encoding: &'static encoding_rs::Encoding, sink: Sink) -> Self { + if encoding == encoding_rs::UTF_8 { + return Self::utf8(sink); + } + Self { + inner: LossyDecoderInner::EncodingRs(encoding.new_decoder(), sink), + } + } + + /// Create a new incremental decoder for the UTF-8 encoding. + /// + /// This is useful for content that is known at run-time to be UTF-8 + /// (whereas `Utf8LossyDecoder` requires knowning at compile-time.) + #[inline] + pub fn utf8(sink: Sink) -> LossyDecoder { + LossyDecoder { + inner: LossyDecoderInner::Utf8(Utf8LossyDecoder::new(sink)), + } + } + + /// Give a reference to the inner sink. 
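+    ///
+    /// (The decoder still owns the sink; calling `finish` consumes the decoder and
+    /// returns the inner sink's own output.)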
+ pub fn inner_sink(&self) -> &Sink { + match self.inner { + LossyDecoderInner::Utf8(ref utf8) => &utf8.inner_sink, + #[cfg(feature = "encoding")] + LossyDecoderInner::Encoding(_, ref inner_sink) => inner_sink, + #[cfg(feature = "encoding_rs")] + LossyDecoderInner::EncodingRs(_, ref inner_sink) => inner_sink, + } + } + + /// Give a mutable reference to the inner sink. + pub fn inner_sink_mut(&mut self) -> &mut Sink { + match self.inner { + LossyDecoderInner::Utf8(ref mut utf8) => &mut utf8.inner_sink, + #[cfg(feature = "encoding")] + LossyDecoderInner::Encoding(_, ref mut inner_sink) => inner_sink, + #[cfg(feature = "encoding_rs")] + LossyDecoderInner::EncodingRs(_, ref mut inner_sink) => inner_sink, + } + } +} + +#[cfg(any(feature = "encoding", feature = "encoding_rs"))] +impl TendrilSink for LossyDecoder +where + Sink: TendrilSink, + A: Atomicity, +{ + #[inline] + fn process(&mut self, t: Tendril) { + match self.inner { + LossyDecoderInner::Utf8(ref mut utf8) => return utf8.process(t), + #[cfg(feature = "encoding")] + LossyDecoderInner::Encoding(ref mut decoder, ref mut sink) => { + let mut out = Tendril::new(); + let mut t = t; + loop { + match decoder.raw_feed(&*t, &mut out) { + (_, Some(err)) => { + out.push_char('\u{fffd}'); + sink.error(err.cause); + debug_assert!(err.upto >= 0); + t.pop_front(err.upto as u32); + // continue loop and process remainder of t + } + (_, None) => break, + } + } + if out.len() > 0 { + sink.process(out); + } + } + #[cfg(feature = "encoding_rs")] + LossyDecoderInner::EncodingRs(ref mut decoder, ref mut sink) => { + if t.is_empty() { + return; + } + decode_to_sink(t, decoder, sink, false); + } + } + } + + #[inline] + fn error(&mut self, desc: Cow<'static, str>) { + match self.inner { + LossyDecoderInner::Utf8(ref mut utf8) => utf8.error(desc), + #[cfg(feature = "encoding")] + LossyDecoderInner::Encoding(_, ref mut sink) => sink.error(desc), + #[cfg(feature = "encoding_rs")] + LossyDecoderInner::EncodingRs(_, ref mut sink) => sink.error(desc), + } + } + + type Output = Sink::Output; + + #[inline] + fn finish(self) -> Sink::Output { + match self.inner { + LossyDecoderInner::Utf8(utf8) => return utf8.finish(), + #[cfg(feature = "encoding")] + LossyDecoderInner::Encoding(mut decoder, mut sink) => { + let mut out = Tendril::new(); + if let Some(err) = decoder.raw_finish(&mut out) { + out.push_char('\u{fffd}'); + sink.error(err.cause); + } + if out.len() > 0 { + sink.process(out); + } + sink.finish() + } + #[cfg(feature = "encoding_rs")] + LossyDecoderInner::EncodingRs(mut decoder, mut sink) => { + decode_to_sink(Tendril::new(), &mut decoder, &mut sink, true); + sink.finish() + } + } + } +} + +#[cfg(feature = "encoding_rs")] +fn decode_to_sink( + mut t: Tendril, + decoder: &mut encoding_rs::Decoder, + sink: &mut Sink, + last: bool, +) where + Sink: TendrilSink, + A: Atomicity, +{ + loop { + let mut out = >::new(); + let max_len = decoder + .max_utf8_buffer_length_without_replacement(t.len()) + .unwrap_or(8192); + unsafe { + out.push_uninitialized(std::cmp::min(max_len as u32, 8192)); + } + let (result, bytes_read, bytes_written) = + decoder.decode_to_utf8_without_replacement(&t, &mut out, last); + if bytes_written > 0 { + sink.process(unsafe { + out.subtendril(0, bytes_written as u32) + .reinterpret_without_validating() + }); + } + match result { + DecoderResult::InputEmpty => return, + DecoderResult::OutputFull => {} + DecoderResult::Malformed(_, _) => { + sink.error(Cow::Borrowed("invalid sequence")); + sink.process("\u{FFFD}".into()); + } + } + 
t.pop_front(bytes_read as u32); + if t.is_empty() { + return; + } + } +} + +#[cfg(test)] +mod test { + use super::{TendrilSink, Utf8LossyDecoder}; + use fmt; + use std::borrow::Cow; + use tendril::{Atomicity, NonAtomic, Tendril}; + + #[cfg(any(feature = "encoding", feature = "encoding_rs"))] + use super::LossyDecoder; + #[cfg(any(feature = "encoding", feature = "encoding_rs"))] + use tendril::SliceExt; + + #[cfg(feature = "encoding")] + use encoding::all as enc; + #[cfg(feature = "encoding_rs")] + use encoding_rs as enc_rs; + + struct Accumulate + where + A: Atomicity, + { + tendrils: Vec>, + errors: Vec, + } + + impl Accumulate + where + A: Atomicity, + { + fn new() -> Accumulate { + Accumulate { + tendrils: vec![], + errors: vec![], + } + } + } + + impl TendrilSink for Accumulate + where + A: Atomicity, + { + fn process(&mut self, t: Tendril) { + self.tendrils.push(t); + } + + fn error(&mut self, desc: Cow<'static, str>) { + self.errors.push(desc.into_owned()); + } + + type Output = (Vec>, Vec); + + fn finish(self) -> Self::Output { + (self.tendrils, self.errors) + } + } + + fn check_utf8(input: &[&[u8]], expected: &[&str], errs: usize) { + let decoder = Utf8LossyDecoder::new(Accumulate::::new()); + let (tendrils, errors) = decoder.from_iter(input.iter().cloned()); + assert_eq!( + expected, + &*tendrils.iter().map(|t| &**t).collect::>() + ); + assert_eq!(errs, errors.len()); + } + + #[test] + fn utf8() { + check_utf8(&[], &[], 0); + check_utf8(&[b""], &[], 0); + check_utf8(&[b"xyz"], &["xyz"], 0); + check_utf8(&[b"x", b"y", b"z"], &["x", "y", "z"], 0); + + check_utf8(&[b"xy\xEA\x99\xAEzw"], &["xy\u{a66e}zw"], 0); + check_utf8(&[b"xy\xEA", b"\x99\xAEzw"], &["xy", "\u{a66e}z", "w"], 0); + check_utf8(&[b"xy\xEA\x99", b"\xAEzw"], &["xy", "\u{a66e}z", "w"], 0); + check_utf8( + &[b"xy\xEA", b"\x99", b"\xAEzw"], + &["xy", "\u{a66e}z", "w"], + 0, + ); + check_utf8(&[b"\xEA", b"", b"\x99", b"", b"\xAE"], &["\u{a66e}"], 0); + check_utf8( + &[b"", b"\xEA", b"", b"\x99", b"", b"\xAE", b""], + &["\u{a66e}"], + 0, + ); + + check_utf8( + &[b"xy\xEA", b"\xFF", b"\x99\xAEz"], + &["xy", "\u{fffd}", "\u{fffd}", "\u{fffd}", "\u{fffd}", "z"], + 4, + ); + check_utf8( + &[b"xy\xEA\x99", b"\xFFz"], + &["xy", "\u{fffd}", "\u{fffd}", "z"], + 2, + ); + + check_utf8(&[b"\xC5\x91\xC5\x91\xC5\x91"], &["őőő"], 0); + check_utf8( + &[b"\xC5\x91", b"\xC5\x91", b"\xC5\x91"], + &["ő", "ő", "ő"], + 0, + ); + check_utf8( + &[b"\xC5", b"\x91\xC5", b"\x91\xC5", b"\x91"], + &["ő", "ő", "ő"], + 0, + ); + check_utf8( + &[b"\xC5", b"\x91\xff", b"\x91\xC5", b"\x91"], + &["ő", "\u{fffd}", "\u{fffd}", "ő"], + 2, + ); + + // incomplete char at end of input + check_utf8(&[b"\xC0"], &["\u{fffd}"], 1); + check_utf8(&[b"\xEA\x99"], &["\u{fffd}"], 1); + } + + #[cfg(any(feature = "encoding", feature = "encoding_rs"))] + fn check_decode( + mut decoder: LossyDecoder>, + input: &[&[u8]], + expected: &str, + errs: usize, + ) { + for x in input { + decoder.process(x.to_tendril()); + } + let (tendrils, errors) = decoder.finish(); + let mut tendril: Tendril = Tendril::new(); + for t in tendrils { + tendril.push_tendril(&t); + } + assert_eq!(expected, &*tendril); + assert_eq!(errs, errors.len()); + } + + #[cfg(any(feature = "encoding", feature = "encoding_rs"))] + pub type Tests = &'static [(&'static [&'static [u8]], &'static str, usize)]; + + #[cfg(any(feature = "encoding"))] + const ASCII: Tests = &[ + (&[], "", 0), + (&[b""], "", 0), + (&[b"xyz"], "xyz", 0), + (&[b"xy", b"", b"", b"z"], "xyz", 0), + (&[b"x", b"y", b"z"], "xyz", 0), + 
(&[b"\xFF"], "\u{fffd}", 1), + (&[b"x\xC0yz"], "x\u{fffd}yz", 1), + (&[b"x", b"\xC0y", b"z"], "x\u{fffd}yz", 1), + (&[b"x\xC0yz\xFF\xFFw"], "x\u{fffd}yz\u{fffd}\u{fffd}w", 3), + ]; + + #[cfg(feature = "encoding")] + #[test] + fn decode_ascii() { + for &(input, expected, errs) in ASCII { + let decoder = LossyDecoder::new(enc::ASCII, Accumulate::new()); + check_decode(decoder, input, expected, errs); + } + } + + #[cfg(any(feature = "encoding", feature = "encoding_rs"))] + const UTF_8: Tests = &[ + (&[], "", 0), + (&[b""], "", 0), + (&[b"xyz"], "xyz", 0), + (&[b"x", b"y", b"z"], "xyz", 0), + (&[b"\xEA\x99\xAE"], "\u{a66e}", 0), + (&[b"\xEA", b"\x99\xAE"], "\u{a66e}", 0), + (&[b"\xEA\x99", b"\xAE"], "\u{a66e}", 0), + (&[b"\xEA", b"\x99", b"\xAE"], "\u{a66e}", 0), + (&[b"\xEA", b"", b"\x99", b"", b"\xAE"], "\u{a66e}", 0), + ( + &[b"", b"\xEA", b"", b"\x99", b"", b"\xAE", b""], + "\u{a66e}", + 0, + ), + (&[b"xy\xEA", b"\x99\xAEz"], "xy\u{a66e}z", 0), + ( + &[b"xy\xEA", b"\xFF", b"\x99\xAEz"], + "xy\u{fffd}\u{fffd}\u{fffd}\u{fffd}z", + 4, + ), + (&[b"xy\xEA\x99", b"\xFFz"], "xy\u{fffd}\u{fffd}z", 2), + // incomplete char at end of input + (&[b"\xC0"], "\u{fffd}", 1), + (&[b"\xEA\x99"], "\u{fffd}", 1), + ]; + + #[cfg(feature = "encoding")] + #[test] + fn decode_utf8() { + for &(input, expected, errs) in UTF_8 { + let decoder = LossyDecoder::new(enc::UTF_8, Accumulate::new()); + check_decode(decoder, input, expected, errs); + } + } + + #[cfg(feature = "encoding_rs")] + #[test] + fn decode_utf8_encoding_rs() { + for &(input, expected, errs) in UTF_8 { + let decoder = LossyDecoder::new_encoding_rs(enc_rs::UTF_8, Accumulate::new()); + check_decode(decoder, input, expected, errs); + } + } + + #[cfg(any(feature = "encoding", feature = "encoding_rs"))] + const KOI8_U: Tests = &[ + (&[b"\xfc\xce\xc5\xd2\xc7\xc9\xd1"], "Энергия", 0), + (&[b"\xfc\xce", b"\xc5\xd2\xc7\xc9\xd1"], "Энергия", 0), + (&[b"\xfc\xce", b"\xc5\xd2\xc7", b"\xc9\xd1"], "Энергия", 0), + ( + &[b"\xfc\xce", b"", b"\xc5\xd2\xc7", b"\xc9\xd1", b""], + "Энергия", + 0, + ), + ]; + + #[cfg(feature = "encoding")] + #[test] + fn decode_koi8_u() { + for &(input, expected, errs) in KOI8_U { + let decoder = LossyDecoder::new(enc::KOI8_U, Accumulate::new()); + check_decode(decoder, input, expected, errs); + } + } + + #[cfg(feature = "encoding_rs")] + #[test] + fn decode_koi8_u_encoding_rs() { + for &(input, expected, errs) in KOI8_U { + let decoder = LossyDecoder::new_encoding_rs(enc_rs::KOI8_U, Accumulate::new()); + check_decode(decoder, input, expected, errs); + } + } + + #[cfg(any(feature = "encoding", feature = "encoding_rs"))] + const WINDOWS_949: Tests = &[ + (&[], "", 0), + (&[b""], "", 0), + (&[b"\xbe\xc8\xb3\xe7"], "안녕", 0), + (&[b"\xbe", b"\xc8\xb3\xe7"], "안녕", 0), + (&[b"\xbe", b"", b"\xc8\xb3\xe7"], "안녕", 0), + ( + &[b"\xbe\xc8\xb3\xe7\xc7\xcf\xbc\xbc\xbf\xe4"], + "안녕하세요", + 0, + ), + (&[b"\xbe\xc8\xb3\xe7\xc7"], "안녕\u{fffd}", 1), + (&[b"\xbe", b"", b"\xc8\xb3"], "안\u{fffd}", 1), + (&[b"\xbe\x28\xb3\xe7"], "\u{fffd}(녕", 1), + ]; + + #[cfg(feature = "encoding")] + #[test] + fn decode_windows_949() { + for &(input, expected, errs) in WINDOWS_949 { + let decoder = LossyDecoder::new(enc::WINDOWS_949, Accumulate::new()); + check_decode(decoder, input, expected, errs); + } + } + + #[cfg(feature = "encoding_rs")] + #[test] + fn decode_windows_949_encoding_rs() { + for &(input, expected, errs) in WINDOWS_949 { + let decoder = LossyDecoder::new_encoding_rs(enc_rs::EUC_KR, Accumulate::new()); + check_decode(decoder, input, expected, errs); + } + } 
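+    // Note: the Encoding Standard folds the windows-949 repertoire into its EUC-KR
+    // definition, which is why the encoding_rs test above decodes the WINDOWS_949
+    // table through `enc_rs::EUC_KR`.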
+ + #[test] + fn read_from() { + let decoder = Utf8LossyDecoder::new(Accumulate::::new()); + let mut bytes: &[u8] = b"foo\xffbar"; + let (tendrils, errors) = decoder.read_from(&mut bytes).unwrap(); + assert_eq!( + &*tendrils.iter().map(|t| &**t).collect::>(), + &["foo", "\u{FFFD}", "bar"] + ); + assert_eq!(errors, &["invalid byte sequence"]); + } +} diff --git a/tendril/src/tendril.rs b/tendril/src/tendril.rs new file mode 100644 index 00000000..0a33d827 --- /dev/null +++ b/tendril/src/tendril.rs @@ -0,0 +1,2473 @@ +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +use std::borrow::Borrow; +use std::cell::{Cell, UnsafeCell}; +use std::cmp::Ordering; +use std::default::Default; +use std::fmt as strfmt; +use std::iter::FromIterator; +use std::marker::PhantomData; +use std::num::NonZeroUsize; +use std::ops::{Deref, DerefMut}; +use std::sync::atomic::Ordering as AtomicOrdering; +use std::sync::atomic::{self, AtomicUsize}; +use std::{hash, io, mem, ptr, str, u32}; + +#[cfg(feature = "encoding")] +use encoding::{self, DecoderTrap, EncoderTrap, EncodingRef}; + +use buf32::{self, Buf32}; +use fmt::imp::Fixup; +use fmt::{self, Slice}; +use util::{copy_and_advance, copy_lifetime, copy_lifetime_mut, unsafe_slice, unsafe_slice_mut}; +use OFLOW; + +const MAX_INLINE_LEN: usize = 8; +const MAX_INLINE_TAG: usize = 0xF; +const EMPTY_TAG: usize = 0xF; + +#[inline(always)] +fn inline_tag(len: u32) -> NonZeroUsize { + debug_assert!(len <= MAX_INLINE_LEN as u32); + unsafe { NonZeroUsize::new_unchecked(if len == 0 { EMPTY_TAG } else { len as usize }) } +} + +/// The multithreadedness of a tendril. +/// +/// Exactly two types implement this trait: +/// +/// - `Atomic`: use this in your tendril and you will have a `Send` tendril which works +/// across threads; this is akin to `Arc`. +/// +/// - `NonAtomic`: use this in your tendril and you will have a tendril which is neither +/// `Send` nor `Sync` but should be a tad faster; this is akin to `Rc`. +/// +/// The layout of this trait is also mandated to be that of a `usize`, +/// for it is used for reference counting. +pub unsafe trait Atomicity: 'static { + #[doc(hidden)] + fn new() -> Self; + + #[doc(hidden)] + fn increment(&self) -> usize; + + #[doc(hidden)] + fn decrement(&self) -> usize; + + #[doc(hidden)] + fn fence_acquire(); +} + +/// A marker of a non-atomic tendril. +/// +/// This is the default for the second type parameter of a `Tendril` +/// and so doesn't typically need to be written. +/// +/// This is akin to using `Rc` for reference counting. +#[repr(C)] +pub struct NonAtomic(Cell); + +unsafe impl Atomicity for NonAtomic { + #[inline] + fn new() -> Self { + NonAtomic(Cell::new(1)) + } + + #[inline] + fn increment(&self) -> usize { + let value = self.0.get(); + self.0.set(value.checked_add(1).expect(OFLOW)); + value + } + + #[inline] + fn decrement(&self) -> usize { + let value = self.0.get(); + self.0.set(value - 1); + value + } + + #[inline] + fn fence_acquire() {} +} + +/// A marker of an atomic (and hence concurrent) tendril. +/// +/// This is used as the second, optional type parameter of a `Tendril`; +/// `Tendril` thus implements`Send`. +/// +/// This is akin to using `Arc` for reference counting. 
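+///
+/// A sketch of the intended use (illustrative only):
+///
+/// ```ignore
+/// let t: Tendril<fmt::UTF8, Atomic> = Tendril::from_slice("shared across threads");
+/// std::thread::spawn(move || println!("{}", t));
+/// ```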
+pub struct Atomic(AtomicUsize); + +unsafe impl Atomicity for Atomic { + #[inline] + fn new() -> Self { + Atomic(AtomicUsize::new(1)) + } + + #[inline] + fn increment(&self) -> usize { + // Relaxed is OK because we have a reference already. + self.0.fetch_add(1, AtomicOrdering::Relaxed) + } + + #[inline] + fn decrement(&self) -> usize { + self.0.fetch_sub(1, AtomicOrdering::Release) + } + + #[inline] + fn fence_acquire() { + atomic::fence(AtomicOrdering::Acquire); + } +} + +#[repr(C)] // Preserve field order for cross-atomicity transmutes +struct Header { + refcount: A, + cap: u32, +} + +impl Header +where + A: Atomicity, +{ + #[inline(always)] + unsafe fn new() -> Header { + Header { + refcount: A::new(), + cap: 0, + } + } +} + +/// Errors that can occur when slicing a `Tendril`. +#[derive(Copy, Clone, Hash, Debug, PartialEq, Eq)] +pub enum SubtendrilError { + OutOfBounds, + ValidationFailed, +} + +/// Compact string type for zero-copy parsing. +/// +/// `Tendril`s have the semantics of owned strings, but are sometimes views +/// into shared buffers. When you mutate a `Tendril`, an owned copy is made +/// if necessary. Further mutations occur in-place until the string becomes +/// shared, e.g. with `clone()` or `subtendril()`. +/// +/// Buffer sharing is accomplished through thread-local (non-atomic) reference +/// counting, which has very low overhead. The Rust type system will prevent +/// you at compile time from sending a `Tendril` between threads. We plan to +/// relax this restriction in the future; see `README.md`. +/// +/// Whereas `String` allocates in the heap for any non-empty string, `Tendril` +/// can store small strings (up to 8 bytes) in-line, without a heap allocation. +/// `Tendril` is also smaller than `String` on 64-bit platforms — 16 bytes +/// versus 24. +/// +/// The type parameter `F` specifies the format of the tendril, for example +/// UTF-8 text or uninterpreted bytes. The parameter will be instantiated +/// with one of the marker types from `tendril::fmt`. See the `StrTendril` +/// and `ByteTendril` type aliases for two examples. +/// +/// The type parameter `A` indicates the atomicity of the tendril; it is by +/// default `NonAtomic`, but can be specified as `Atomic` to get a tendril +/// which implements `Send` (viz. a thread-safe tendril). +/// +/// The maximum length of a `Tendril` is 4 GB. The library will panic if +/// you attempt to go over the limit. +#[repr(C)] +pub struct Tendril +where + F: fmt::Format, + A: Atomicity, +{ + ptr: Cell, + buf: UnsafeCell, + marker: PhantomData<*mut F>, + refcount_marker: PhantomData, +} + +#[repr(C)] +union Buffer { + heap: Heap, + inline: [u8; 8], +} + +#[derive(Copy, Clone)] +#[repr(C)] +struct Heap { + len: u32, + aux: u32, +} + +unsafe impl Send for Tendril +where + F: fmt::Format, + A: Atomicity + Sync, +{ +} + +/// `Tendril` for storing native Rust strings. +pub type StrTendril = Tendril; + +/// `Tendril` for storing binary data. 
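+/// (i.e. a `Tendril` in the `fmt::Bytes` format; `StrTendril` above uses `fmt::UTF8`.)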
+pub type ByteTendril = Tendril; + +impl Clone for Tendril +where + F: fmt::Format, + A: Atomicity, +{ + #[inline] + fn clone(&self) -> Tendril { + unsafe { + if self.ptr.get().get() > MAX_INLINE_TAG { + self.make_buf_shared(); + self.incref(); + } + + ptr::read(self) + } + } +} + +impl Drop for Tendril +where + F: fmt::Format, + A: Atomicity, +{ + #[inline] + fn drop(&mut self) { + unsafe { + let p = self.ptr.get().get(); + if p <= MAX_INLINE_TAG { + return; + } + + let (buf, shared, _) = self.assume_buf(); + if shared { + let header = self.header(); + if (*header).refcount.decrement() == 1 { + A::fence_acquire(); + buf.destroy(); + } + } else { + buf.destroy(); + } + } + } +} + +macro_rules! from_iter_method { + ($ty:ty) => { + #[inline] + fn from_iter(iterable: I) -> Self + where + I: IntoIterator, + { + let mut output = Self::new(); + output.extend(iterable); + output + } + }; +} + +impl Extend for Tendril +where + A: Atomicity, +{ + #[inline] + fn extend(&mut self, iterable: I) + where + I: IntoIterator, + { + let iterator = iterable.into_iter(); + self.force_reserve(iterator.size_hint().0 as u32); + for c in iterator { + self.push_char(c); + } + } +} + +impl FromIterator for Tendril +where + A: Atomicity, +{ + from_iter_method!(char); +} + +impl Extend for Tendril +where + A: Atomicity, +{ + #[inline] + fn extend(&mut self, iterable: I) + where + I: IntoIterator, + { + let iterator = iterable.into_iter(); + self.force_reserve(iterator.size_hint().0 as u32); + for b in iterator { + self.push_slice(&[b]); + } + } +} + +impl FromIterator for Tendril +where + A: Atomicity, +{ + from_iter_method!(u8); +} + +impl<'a, A> Extend<&'a u8> for Tendril +where + A: Atomicity, +{ + #[inline] + fn extend(&mut self, iterable: I) + where + I: IntoIterator, + { + let iterator = iterable.into_iter(); + self.force_reserve(iterator.size_hint().0 as u32); + for &b in iterator { + self.push_slice(&[b]); + } + } +} + +impl<'a, A> FromIterator<&'a u8> for Tendril +where + A: Atomicity, +{ + from_iter_method!(&'a u8); +} + +impl<'a, A> Extend<&'a str> for Tendril +where + A: Atomicity, +{ + #[inline] + fn extend(&mut self, iterable: I) + where + I: IntoIterator, + { + for s in iterable { + self.push_slice(s); + } + } +} + +impl<'a, A> FromIterator<&'a str> for Tendril +where + A: Atomicity, +{ + from_iter_method!(&'a str); +} + +impl<'a, A> Extend<&'a [u8]> for Tendril +where + A: Atomicity, +{ + #[inline] + fn extend(&mut self, iterable: I) + where + I: IntoIterator, + { + for s in iterable { + self.push_slice(s); + } + } +} + +impl<'a, A> FromIterator<&'a [u8]> for Tendril +where + A: Atomicity, +{ + from_iter_method!(&'a [u8]); +} + +impl<'a, F, A> Extend<&'a Tendril> for Tendril +where + F: fmt::Format + 'a, + A: Atomicity, +{ + #[inline] + fn extend(&mut self, iterable: I) + where + I: IntoIterator>, + { + for t in iterable { + self.push_tendril(t); + } + } +} + +impl<'a, F, A> FromIterator<&'a Tendril> for Tendril +where + F: fmt::Format + 'a, + A: Atomicity, +{ + from_iter_method!(&'a Tendril); +} + +impl Deref for Tendril +where + F: fmt::SliceFormat, + A: Atomicity, +{ + type Target = F::Slice; + + #[inline] + fn deref(&self) -> &F::Slice { + unsafe { F::Slice::from_bytes(self.as_byte_slice()) } + } +} + +impl DerefMut for Tendril +where + F: fmt::SliceFormat, + A: Atomicity, +{ + #[inline] + fn deref_mut(&mut self) -> &mut F::Slice { + unsafe { F::Slice::from_mut_bytes(self.as_mut_byte_slice()) } + } +} + +impl Borrow<[u8]> for Tendril +where + F: fmt::SliceFormat, + A: Atomicity, +{ + fn 
borrow(&self) -> &[u8] { + self.as_byte_slice() + } +} + +// Why not impl Borrow for Tendril? str and [u8] hash differently, +// and so a HashMap would silently break if we indexed by str. Ick. +// https://github.com/rust-lang/rust/issues/27108 + +impl PartialEq for Tendril +where + F: fmt::Format, + A: Atomicity, +{ + #[inline] + fn eq(&self, other: &Self) -> bool { + self.as_byte_slice() == other.as_byte_slice() + } + + #[inline] + fn ne(&self, other: &Self) -> bool { + self.as_byte_slice() != other.as_byte_slice() + } +} + +impl Eq for Tendril +where + F: fmt::Format, + A: Atomicity, +{ +} + +impl PartialOrd for Tendril +where + F: fmt::SliceFormat, + ::Slice: PartialOrd, + A: Atomicity, +{ + #[inline] + fn partial_cmp(&self, other: &Self) -> Option { + PartialOrd::partial_cmp(&**self, &**other) + } +} + +impl Ord for Tendril +where + F: fmt::SliceFormat, + ::Slice: Ord, + A: Atomicity, +{ + #[inline] + fn cmp(&self, other: &Self) -> Ordering { + Ord::cmp(&**self, &**other) + } +} + +impl Default for Tendril +where + F: fmt::Format, + A: Atomicity, +{ + #[inline(always)] + fn default() -> Tendril { + Tendril::new() + } +} + +impl strfmt::Debug for Tendril +where + F: fmt::SliceFormat + Default + strfmt::Debug, + ::Slice: strfmt::Debug, + A: Atomicity, +{ + #[inline] + fn fmt(&self, f: &mut strfmt::Formatter) -> strfmt::Result { + let kind = match self.ptr.get().get() { + p if p <= MAX_INLINE_TAG => "inline", + p if p & 1 == 1 => "shared", + _ => "owned", + }; + + write!(f, "Tendril<{:?}>({}: ", ::default(), kind)?; + <::Slice as strfmt::Debug>::fmt(&**self, f)?; + write!(f, ")") + } +} + +impl hash::Hash for Tendril +where + F: fmt::Format, + A: Atomicity, +{ + #[inline] + fn hash(&self, hasher: &mut H) { + self.as_byte_slice().hash(hasher) + } +} + +impl Tendril +where + F: fmt::Format, + A: Atomicity, +{ + /// Create a new, empty `Tendril` in any format. + #[inline(always)] + pub fn new() -> Tendril { + unsafe { Tendril::inline(&[]) } + } + + /// Create a new, empty `Tendril` with a specified capacity. + #[inline] + pub fn with_capacity(capacity: u32) -> Tendril { + let mut t: Tendril = Tendril::new(); + if capacity > MAX_INLINE_LEN as u32 { + unsafe { + t.make_owned_with_capacity(capacity); + } + } + t + } + + /// Reserve space for additional bytes. + /// + /// This is only a suggestion. There are cases where `Tendril` will + /// decline to allocate until the buffer is actually modified. + #[inline] + pub fn reserve(&mut self, additional: u32) { + if !self.is_shared() { + // Don't grow a shared tendril because we'd have to copy + // right away. + self.force_reserve(additional); + } + } + + /// Reserve space for additional bytes, even for shared buffers. + #[inline] + fn force_reserve(&mut self, additional: u32) { + let new_len = self.len32().checked_add(additional).expect(OFLOW); + if new_len > MAX_INLINE_LEN as u32 { + unsafe { + self.make_owned_with_capacity(new_len); + } + } + } + + /// Get the length of the `Tendril`. + /// + /// This is named not to conflict with `len()` on the underlying + /// slice, if any. + #[inline(always)] + pub fn len32(&self) -> u32 { + match self.ptr.get().get() { + EMPTY_TAG => 0, + n if n <= MAX_INLINE_LEN => n as u32, + _ => unsafe { self.raw_len() }, + } + } + + /// Is the backing buffer shared? + #[inline] + pub fn is_shared(&self) -> bool { + let n = self.ptr.get().get(); + + (n > MAX_INLINE_TAG) && ((n & 1) == 1) + } + + /// Is the backing buffer shared with this other `Tendril`? 
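+    ///
+    /// (A sufficiently long `subtendril` of a heap-allocated tendril shares its
+    /// buffer, so both tendrils would report `true` here; inline tendrils never share.)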
+ #[inline] + pub fn is_shared_with(&self, other: &Tendril) -> bool { + let n = self.ptr.get().get(); + + (n > MAX_INLINE_TAG) && (n == other.ptr.get().get()) + } + + /// Truncate to length 0 without discarding any owned storage. + #[inline] + pub fn clear(&mut self) { + if self.ptr.get().get() <= MAX_INLINE_TAG { + self.ptr + .set(unsafe { NonZeroUsize::new_unchecked(EMPTY_TAG) }); + } else { + let (_, shared, _) = unsafe { self.assume_buf() }; + if shared { + // No need to keep a reference alive for a 0-size slice. + *self = Tendril::new(); + } else { + unsafe { self.set_len(0) }; + } + } + } + + /// Build a `Tendril` by copying a byte slice, if it conforms to the format. + #[inline] + pub fn try_from_byte_slice(x: &[u8]) -> Result, ()> { + match F::validate(x) { + true => Ok(unsafe { Tendril::from_byte_slice_without_validating(x) }), + false => Err(()), + } + } + + /// View as uninterpreted bytes. + #[inline(always)] + pub fn as_bytes(&self) -> &Tendril { + unsafe { mem::transmute(self) } + } + + /// Convert into uninterpreted bytes. + #[inline(always)] + pub fn into_bytes(self) -> Tendril { + unsafe { mem::transmute(self) } + } + + /// Convert `self` into a type which is `Send`. + /// + /// If the tendril is owned or inline, this is free, + /// but if it's shared this will entail a copy of the contents. + #[inline] + pub fn into_send(mut self) -> SendTendril { + self.make_owned(); + SendTendril { + // This changes the header.refcount from A to NonAtomic, but that's + // OK because we have defined the format of A as a usize. + tendril: unsafe { mem::transmute(self) }, + } + } + + /// View as a superset format, for free. + #[inline(always)] + pub fn as_superset(&self) -> &Tendril + where + F: fmt::SubsetOf, + Super: fmt::Format, + { + unsafe { mem::transmute(self) } + } + + /// Convert into a superset format, for free. + #[inline(always)] + pub fn into_superset(self) -> Tendril + where + F: fmt::SubsetOf, + Super: fmt::Format, + { + unsafe { mem::transmute(self) } + } + + /// View as a subset format, if the `Tendril` conforms to that subset. + #[inline] + pub fn try_as_subset(&self) -> Result<&Tendril, ()> + where + Sub: fmt::SubsetOf, + { + match Sub::revalidate_subset(self.as_byte_slice()) { + true => Ok(unsafe { mem::transmute(self) }), + false => Err(()), + } + } + + /// Convert into a subset format, if the `Tendril` conforms to that subset. + #[inline] + pub fn try_into_subset(self) -> Result, Self> + where + Sub: fmt::SubsetOf, + { + match Sub::revalidate_subset(self.as_byte_slice()) { + true => Ok(unsafe { mem::transmute(self) }), + false => Err(self), + } + } + + /// View as another format, if the bytes of the `Tendril` are valid for + /// that format. + #[inline] + pub fn try_reinterpret_view(&self) -> Result<&Tendril, ()> + where + Other: fmt::Format, + { + match Other::validate(self.as_byte_slice()) { + true => Ok(unsafe { mem::transmute(self) }), + false => Err(()), + } + } + + /// Convert into another format, if the `Tendril` conforms to that format. + /// + /// This only re-validates the existing bytes under the new format. It + /// will *not* change the byte content of the tendril! + /// + /// See the `encode` and `decode` methods for character encoding conversion. + #[inline] + pub fn try_reinterpret(self) -> Result, Self> + where + Other: fmt::Format, + { + match Other::validate(self.as_byte_slice()) { + true => Ok(unsafe { mem::transmute(self) }), + false => Err(self), + } + } + + /// Push some bytes onto the end of the `Tendril`, if they conform to the + /// format. 
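+    ///
+    /// On failure the tendril is left unchanged; for example, pushing the lone
+    /// continuation byte `0x80` onto a UTF-8 tendril would return `Err(())`.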
+ #[inline] + pub fn try_push_bytes(&mut self, buf: &[u8]) -> Result<(), ()> { + match F::validate(buf) { + true => unsafe { + self.push_bytes_without_validating(buf); + Ok(()) + }, + false => Err(()), + } + } + + /// Push another `Tendril` onto the end of this one. + #[inline] + pub fn push_tendril(&mut self, other: &Tendril) { + let new_len = self.len32().checked_add(other.len32()).expect(OFLOW); + + unsafe { + if (self.ptr.get().get() > MAX_INLINE_TAG) && (other.ptr.get().get() > MAX_INLINE_TAG) { + let (self_buf, self_shared, _) = self.assume_buf(); + let (other_buf, other_shared, _) = other.assume_buf(); + + if self_shared + && other_shared + && (self_buf.data_ptr() == other_buf.data_ptr()) + && other.aux() == self.aux() + self.raw_len() + { + self.set_len(new_len); + return; + } + } + + self.push_bytes_without_validating(other.as_byte_slice()) + } + } + + /// Attempt to slice this `Tendril` as a new `Tendril`. + /// + /// This will share the buffer when possible. Mutating a shared buffer + /// will copy the contents. + /// + /// The offset and length are in bytes. The function will return + /// `Err` if these are out of bounds, or if the resulting slice + /// does not conform to the format. + #[inline] + pub fn try_subtendril( + &self, + offset: u32, + length: u32, + ) -> Result, SubtendrilError> { + let self_len = self.len32(); + if offset > self_len || length > (self_len - offset) { + return Err(SubtendrilError::OutOfBounds); + } + + unsafe { + let byte_slice = unsafe_slice(self.as_byte_slice(), offset as usize, length as usize); + if !F::validate_subseq(byte_slice) { + return Err(SubtendrilError::ValidationFailed); + } + + Ok(self.unsafe_subtendril(offset, length)) + } + } + + /// Slice this `Tendril` as a new `Tendril`. + /// + /// Panics on bounds or validity check failure. + #[inline] + pub fn subtendril(&self, offset: u32, length: u32) -> Tendril { + self.try_subtendril(offset, length).unwrap() + } + + /// Try to drop `n` bytes from the front. + /// + /// Returns `Err` if the bytes are not available, or the suffix fails + /// validation. + #[inline] + pub fn try_pop_front(&mut self, n: u32) -> Result<(), SubtendrilError> { + if n == 0 { + return Ok(()); + } + let old_len = self.len32(); + if n > old_len { + return Err(SubtendrilError::OutOfBounds); + } + let new_len = old_len - n; + + unsafe { + if !F::validate_suffix(unsafe_slice( + self.as_byte_slice(), + n as usize, + new_len as usize, + )) { + return Err(SubtendrilError::ValidationFailed); + } + + self.unsafe_pop_front(n); + Ok(()) + } + } + + /// Drop `n` bytes from the front. + /// + /// Panics if the bytes are not available, or the suffix fails + /// validation. + #[inline] + pub fn pop_front(&mut self, n: u32) { + self.try_pop_front(n).unwrap() + } + + /// Drop `n` bytes from the back. + /// + /// Returns `Err` if the bytes are not available, or the prefix fails + /// validation. + #[inline] + pub fn try_pop_back(&mut self, n: u32) -> Result<(), SubtendrilError> { + if n == 0 { + return Ok(()); + } + let old_len = self.len32(); + if n > old_len { + return Err(SubtendrilError::OutOfBounds); + } + let new_len = old_len - n; + + unsafe { + if !F::validate_prefix(unsafe_slice(self.as_byte_slice(), 0, new_len as usize)) { + return Err(SubtendrilError::ValidationFailed); + } + + self.unsafe_pop_back(n); + Ok(()) + } + } + + /// Drop `n` bytes from the back. + /// + /// Panics if the bytes are not available, or the prefix fails + /// validation. 
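+    ///
+    /// For example, popping a single byte off a `StrTendril` that ends in a
+    /// multi-byte character would split that character, so it panics.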
+ #[inline] + pub fn pop_back(&mut self, n: u32) { + self.try_pop_back(n).unwrap() + } + + /// View as another format, without validating. + #[inline(always)] + pub unsafe fn reinterpret_view_without_validating(&self) -> &Tendril + where + Other: fmt::Format, + { + mem::transmute(self) + } + + /// Convert into another format, without validating. + #[inline(always)] + pub unsafe fn reinterpret_without_validating(self) -> Tendril + where + Other: fmt::Format, + { + mem::transmute(self) + } + + /// Build a `Tendril` by copying a byte slice, without validating. + #[inline] + pub unsafe fn from_byte_slice_without_validating(x: &[u8]) -> Tendril { + assert!(x.len() <= buf32::MAX_LEN); + if x.len() <= MAX_INLINE_LEN { + Tendril::inline(x) + } else { + Tendril::owned_copy(x) + } + } + + /// Push some bytes onto the end of the `Tendril`, without validating. + #[inline] + pub unsafe fn push_bytes_without_validating(&mut self, buf: &[u8]) { + assert!(buf.len() <= buf32::MAX_LEN); + + let Fixup { + drop_left, + drop_right, + insert_len, + insert_bytes, + } = F::fixup(self.as_byte_slice(), buf); + + // FIXME: think more about overflow + let adj_len = self.len32() + insert_len - drop_left; + + let new_len = adj_len.checked_add(buf.len() as u32).expect(OFLOW) - drop_right; + + let drop_left = drop_left as usize; + let drop_right = drop_right as usize; + + if new_len <= MAX_INLINE_LEN as u32 { + let mut tmp = [0_u8; MAX_INLINE_LEN]; + { + let old = self.as_byte_slice(); + let mut dest = tmp.as_mut_ptr(); + copy_and_advance(&mut dest, unsafe_slice(old, 0, old.len() - drop_left)); + copy_and_advance( + &mut dest, + unsafe_slice(&insert_bytes, 0, insert_len as usize), + ); + copy_and_advance( + &mut dest, + unsafe_slice(buf, drop_right, buf.len() - drop_right), + ); + } + *self = Tendril::inline(&tmp[..new_len as usize]); + } else { + self.make_owned_with_capacity(new_len); + let (owned, _, _) = self.assume_buf(); + let mut dest = owned + .data_ptr() + .offset((owned.len as usize - drop_left) as isize); + copy_and_advance( + &mut dest, + unsafe_slice(&insert_bytes, 0, insert_len as usize), + ); + copy_and_advance( + &mut dest, + unsafe_slice(buf, drop_right, buf.len() - drop_right), + ); + self.set_len(new_len); + } + } + + /// Slice this `Tendril` as a new `Tendril`. + /// + /// Does not check validity or bounds! + #[inline] + pub unsafe fn unsafe_subtendril(&self, offset: u32, length: u32) -> Tendril { + if length <= MAX_INLINE_LEN as u32 { + Tendril::inline(unsafe_slice( + self.as_byte_slice(), + offset as usize, + length as usize, + )) + } else { + self.make_buf_shared(); + self.incref(); + let (buf, _, _) = self.assume_buf(); + Tendril::shared(buf, self.aux() + offset, length) + } + } + + /// Drop `n` bytes from the front. + /// + /// Does not check validity or bounds! + #[inline] + pub unsafe fn unsafe_pop_front(&mut self, n: u32) { + let new_len = self.len32() - n; + if new_len <= MAX_INLINE_LEN as u32 { + *self = Tendril::inline(unsafe_slice( + self.as_byte_slice(), + n as usize, + new_len as usize, + )); + } else { + self.make_buf_shared(); + self.set_aux(self.aux() + n); + let len = self.raw_len(); + self.set_len(len - n); + } + } + + /// Drop `n` bytes from the back. + /// + /// Does not check validity or bounds! 
+ #[inline] + pub unsafe fn unsafe_pop_back(&mut self, n: u32) { + let new_len = self.len32() - n; + if new_len <= MAX_INLINE_LEN as u32 { + *self = Tendril::inline(unsafe_slice(self.as_byte_slice(), 0, new_len as usize)); + } else { + self.make_buf_shared(); + let len = self.raw_len(); + self.set_len(len - n); + } + } + + #[inline] + unsafe fn incref(&self) { + (*self.header()).refcount.increment(); + } + + #[inline] + unsafe fn make_buf_shared(&self) { + let p = self.ptr.get().get(); + if p & 1 == 0 { + let header = p as *mut Header; + (*header).cap = self.aux(); + + self.ptr.set(NonZeroUsize::new_unchecked(p | 1)); + self.set_aux(0); + } + } + + // This is not public as it is of no practical value to users. + // By and large they shouldn't need to worry about the distinction at all, + // and going out of your way to make it owned is pointless. + #[inline] + fn make_owned(&mut self) { + unsafe { + let ptr = self.ptr.get().get(); + if ptr <= MAX_INLINE_TAG || (ptr & 1) == 1 { + *self = Tendril::owned_copy(self.as_byte_slice()); + } + } + } + + #[inline] + unsafe fn make_owned_with_capacity(&mut self, cap: u32) { + self.make_owned(); + let mut buf = self.assume_buf().0; + buf.grow(cap); + self.ptr.set(NonZeroUsize::new_unchecked(buf.ptr as usize)); + self.set_aux(buf.cap); + } + + #[inline(always)] + unsafe fn header(&self) -> *mut Header { + (self.ptr.get().get() & !1) as *mut Header + } + + #[inline] + unsafe fn assume_buf(&self) -> (Buf32>, bool, u32) { + let ptr = self.ptr.get().get(); + let header = self.header(); + let shared = (ptr & 1) == 1; + let (cap, offset) = match shared { + true => ((*header).cap, self.aux()), + false => (self.aux(), 0), + }; + + ( + Buf32 { + ptr: header, + len: offset + self.len32(), + cap: cap, + }, + shared, + offset, + ) + } + + #[inline] + unsafe fn inline(x: &[u8]) -> Tendril { + let len = x.len(); + let t = Tendril { + ptr: Cell::new(inline_tag(len as u32)), + buf: UnsafeCell::new(Buffer { inline: [0; 8] }), + marker: PhantomData, + refcount_marker: PhantomData, + }; + ptr::copy_nonoverlapping(x.as_ptr(), (*t.buf.get()).inline.as_mut_ptr(), len); + t + } + + #[inline] + unsafe fn owned(x: Buf32>) -> Tendril { + Tendril { + ptr: Cell::new(NonZeroUsize::new_unchecked(x.ptr as usize)), + buf: UnsafeCell::new(Buffer { + heap: Heap { + len: x.len, + aux: x.cap, + }, + }), + marker: PhantomData, + refcount_marker: PhantomData, + } + } + + #[inline] + unsafe fn owned_copy(x: &[u8]) -> Tendril { + let len32 = x.len() as u32; + let mut b = Buf32::with_capacity(len32, Header::new()); + ptr::copy_nonoverlapping(x.as_ptr(), b.data_ptr(), x.len()); + b.len = len32; + Tendril::owned(b) + } + + #[inline] + unsafe fn shared(buf: Buf32>, off: u32, len: u32) -> Tendril { + Tendril { + ptr: Cell::new(NonZeroUsize::new_unchecked((buf.ptr as usize) | 1)), + buf: UnsafeCell::new(Buffer { + heap: Heap { len, aux: off }, + }), + marker: PhantomData, + refcount_marker: PhantomData, + } + } + + #[inline] + fn as_byte_slice<'a>(&'a self) -> &'a [u8] { + unsafe { + match self.ptr.get().get() { + EMPTY_TAG => &[], + n if n <= MAX_INLINE_LEN => (*self.buf.get()).inline.get_unchecked(..n), + _ => { + let (buf, _, offset) = self.assume_buf(); + copy_lifetime( + self, + unsafe_slice(buf.data(), offset as usize, self.len32() as usize), + ) + } + } + } + } + + // There's no need to worry about locking on an atomic Tendril, because it makes it unique as + // soon as you do that. 
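+    // (Concretely: the heap branch below calls `make_owned` before handing out a
+    // mutable slice, so a shared buffer is copied rather than mutated in place.)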
+ #[inline] + fn as_mut_byte_slice<'a>(&'a mut self) -> &'a mut [u8] { + unsafe { + match self.ptr.get().get() { + EMPTY_TAG => &mut [], + n if n <= MAX_INLINE_LEN => (*self.buf.get()).inline.get_unchecked_mut(..n), + _ => { + self.make_owned(); + let (mut buf, _, offset) = self.assume_buf(); + let len = self.len32() as usize; + copy_lifetime_mut(self, unsafe_slice_mut(buf.data_mut(), offset as usize, len)) + } + } + } + } + + unsafe fn raw_len(&self) -> u32 { + (*self.buf.get()).heap.len + } + + unsafe fn set_len(&mut self, len: u32) { + (*self.buf.get()).heap.len = len; + } + + unsafe fn aux(&self) -> u32 { + (*self.buf.get()).heap.aux + } + + unsafe fn set_aux(&self, aux: u32) { + (*self.buf.get()).heap.aux = aux; + } +} + +impl Tendril +where + F: fmt::SliceFormat, + A: Atomicity, +{ + /// Build a `Tendril` by copying a slice. + #[inline] + pub fn from_slice(x: &F::Slice) -> Tendril { + unsafe { Tendril::from_byte_slice_without_validating(x.as_bytes()) } + } + + /// Push a slice onto the end of the `Tendril`. + #[inline] + pub fn push_slice(&mut self, x: &F::Slice) { + unsafe { self.push_bytes_without_validating(x.as_bytes()) } + } +} + +/// A simple wrapper to make `Tendril` `Send`. +/// +/// Although there is a certain subset of the operations on a `Tendril` that a `SendTendril` could +/// reasonably implement, in order to clearly separate concerns this type is deliberately +/// minimalist, acting as a safe encapsulation around the invariants which permit `Send`ness and +/// behaving as an opaque object. +/// +/// A `SendTendril` may be produced by `Tendril.into_send()` or `SendTendril::from(tendril)`, +/// and may be returned to a `Tendril` by `Tendril::from(self)`. +#[derive(Clone)] +pub struct SendTendril +where + F: fmt::Format, +{ + tendril: Tendril, +} + +unsafe impl Send for SendTendril where F: fmt::Format {} + +impl From> for SendTendril +where + F: fmt::Format, + A: Atomicity, +{ + #[inline] + fn from(tendril: Tendril) -> SendTendril { + tendril.into_send() + } +} + +impl From> for Tendril +where + F: fmt::Format, + A: Atomicity, +{ + #[inline] + fn from(send: SendTendril) -> Tendril { + unsafe { mem::transmute(send.tendril) } + // header.refcount may have been initialised as an Atomic or a NonAtomic, but the value + // will be the same (1) regardless, because the layout is defined. + // Thus we don't need to fiddle about resetting it or anything like that. + } +} + +/// `Tendril`-related methods for Rust slices. +pub trait SliceExt: fmt::Slice +where + F: fmt::SliceFormat, +{ + /// Make a `Tendril` from this slice. + #[inline] + fn to_tendril(&self) -> Tendril { + // It should be done thusly, but at the time of writing the defaults don't help inference: + //fn to_tendril(&self) -> Tendril + // where A: Atomicity, + //{ + Tendril::from_slice(self) + } +} + +impl SliceExt for str {} +impl SliceExt for [u8] {} + +impl Tendril +where + F: for<'a> fmt::CharFormat<'a>, + A: Atomicity, +{ + /// Remove and return the first character, if any. + #[inline] + pub fn pop_front_char<'a>(&'a mut self) -> Option { + unsafe { + let next_char; // first char in iterator + let mut skip = 0; // number of bytes to skip, or 0 to clear + + { + // <--+ + // | Creating an iterator borrows self, so introduce a + // +- scope to contain the borrow (that way we can mutate + // self below, after this scope exits). 
+ + let mut iter = F::char_indices(self.as_byte_slice()); + match iter.next() { + Some((_, c)) => { + next_char = Some(c); + if let Some((n, _)) = iter.next() { + skip = n as u32; + } + } + None => { + next_char = None; + } + } + } + + if skip != 0 { + self.unsafe_pop_front(skip); + } else { + self.clear(); + } + + next_char + } + } + + /// Remove and return a run of characters at the front of the `Tendril` + /// which are classified the same according to the function `classify`. + /// + /// Returns `None` on an empty string. + #[inline] + pub fn pop_front_char_run<'a, C, R>(&'a mut self, mut classify: C) -> Option<(Tendril, R)> + where + C: FnMut(char) -> R, + R: PartialEq, + { + let (class, first_mismatch); + { + let mut chars = unsafe { F::char_indices(self.as_byte_slice()) }; + let (_, first) = unwrap_or_return!(chars.next(), None); + class = classify(first); + first_mismatch = chars.find(|&(_, ch)| &classify(ch) != &class); + } + + match first_mismatch { + Some((idx, _)) => unsafe { + let t = self.unsafe_subtendril(0, idx as u32); + self.unsafe_pop_front(idx as u32); + Some((t, class)) + }, + None => { + let t = self.clone(); + self.clear(); + Some((t, class)) + } + } + } + + /// Push a character, if it can be represented in this format. + #[inline] + pub fn try_push_char(&mut self, c: char) -> Result<(), ()> { + F::encode_char(c, |b| unsafe { + self.push_bytes_without_validating(b); + }) + } +} + +/// Extension trait for `io::Read`. +pub trait ReadExt: io::Read { + fn read_to_tendril(&mut self, buf: &mut Tendril) -> io::Result + where + A: Atomicity; +} + +impl ReadExt for T +where + T: io::Read, +{ + /// Read all bytes until EOF. + fn read_to_tendril(&mut self, buf: &mut Tendril) -> io::Result + where + A: Atomicity, + { + // Adapted from libstd/io/mod.rs. + const DEFAULT_BUF_SIZE: u32 = 64 * 1024; + + let start_len = buf.len(); + let mut len = start_len; + let mut new_write_size = 16; + let ret; + loop { + if len == buf.len() { + if new_write_size < DEFAULT_BUF_SIZE { + new_write_size *= 2; + } + // FIXME: this exposes uninitialized bytes to a generic R type + // this is fine for R=File which never reads these bytes, + // but user-defined types might. + // The standard library pushes zeros to `Vec` for that reason. + unsafe { + buf.push_uninitialized(new_write_size); + } + } + + match self.read(&mut buf[len..]) { + Ok(0) => { + ret = Ok(len - start_len); + break; + } + Ok(n) => len += n, + Err(ref e) if e.kind() == io::ErrorKind::Interrupted => {} + Err(e) => { + ret = Err(e); + break; + } + } + } + + let buf_len = buf.len32(); + buf.pop_back(buf_len - (len as u32)); + ret + } +} + +impl io::Write for Tendril +where + A: Atomicity, +{ + #[inline] + fn write(&mut self, buf: &[u8]) -> io::Result { + self.push_slice(buf); + Ok(buf.len()) + } + + #[inline] + fn write_all(&mut self, buf: &[u8]) -> io::Result<()> { + self.push_slice(buf); + Ok(()) + } + + #[inline(always)] + fn flush(&mut self) -> io::Result<()> { + Ok(()) + } +} + +#[cfg(feature = "encoding")] +impl encoding::ByteWriter for Tendril +where + A: Atomicity, +{ + #[inline] + fn write_byte(&mut self, b: u8) { + self.push_slice(&[b]); + } + + #[inline] + fn write_bytes(&mut self, v: &[u8]) { + self.push_slice(v); + } + + #[inline] + fn writer_hint(&mut self, additional: usize) { + self.reserve(::std::cmp::min(u32::MAX as usize, additional) as u32); + } +} + +impl Tendril +where + A: Atomicity, + F: fmt::SliceFormat, +{ + /// Decode from some character encoding into UTF-8. 
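+    ///
+    /// A sketch (requires the `encoding` feature; `bytes` is an existing `ByteTendril`):
+    /// `bytes.decode(encoding::all::WINDOWS_949, DecoderTrap::Replace)` returns
+    /// `Ok(StrTendril)` on success and the decoder's error message otherwise.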
+ /// + /// See the [rust-encoding docs](https://lifthrasiir.github.io/rust-encoding/encoding/) + /// for more information. + #[inline] + #[cfg(feature = "encoding")] + pub fn decode( + &self, + encoding: EncodingRef, + trap: DecoderTrap, + ) -> Result, ::std::borrow::Cow<'static, str>> { + let mut ret = Tendril::new(); + encoding.decode_to(&*self, trap, &mut ret).map(|_| ret) + } + + /// Push "uninitialized bytes" onto the end. + /// + /// Really, this grows the tendril without writing anything to the new area. + /// It's only defined for byte tendrils because it's only useful if you + /// plan to then mutate the buffer. + #[inline] + pub unsafe fn push_uninitialized(&mut self, n: u32) { + let new_len = self.len32().checked_add(n).expect(OFLOW); + if new_len <= MAX_INLINE_LEN as u32 && self.ptr.get().get() <= MAX_INLINE_TAG { + self.ptr.set(inline_tag(new_len)) + } else { + self.make_owned_with_capacity(new_len); + self.set_len(new_len); + } + } +} + +impl strfmt::Display for Tendril +where + A: Atomicity, +{ + #[inline] + fn fmt(&self, f: &mut strfmt::Formatter) -> strfmt::Result { + ::fmt(&**self, f) + } +} + +impl str::FromStr for Tendril +where + A: Atomicity, +{ + type Err = (); + + #[inline] + fn from_str(s: &str) -> Result { + Ok(Tendril::from_slice(s)) + } +} + +impl strfmt::Write for Tendril +where + A: Atomicity, +{ + #[inline] + fn write_str(&mut self, s: &str) -> strfmt::Result { + self.push_slice(s); + Ok(()) + } +} + +#[cfg(feature = "encoding")] +impl encoding::StringWriter for Tendril +where + A: Atomicity, +{ + #[inline] + fn write_char(&mut self, c: char) { + self.push_char(c); + } + + #[inline] + fn write_str(&mut self, s: &str) { + self.push_slice(s); + } + + #[inline] + fn writer_hint(&mut self, additional: usize) { + self.reserve(::std::cmp::min(u32::MAX as usize, additional) as u32); + } +} + +impl Tendril +where + A: Atomicity, +{ + /// Encode from UTF-8 into some other character encoding. + /// + /// See the [rust-encoding docs](https://lifthrasiir.github.io/rust-encoding/encoding/) + /// for more information. + #[inline] + #[cfg(feature = "encoding")] + pub fn encode( + &self, + encoding: EncodingRef, + trap: EncoderTrap, + ) -> Result, ::std::borrow::Cow<'static, str>> { + let mut ret = Tendril::new(); + encoding.encode_to(&*self, trap, &mut ret).map(|_| ret) + } + + /// Push a character onto the end. + #[inline] + pub fn push_char(&mut self, c: char) { + unsafe { + self.push_bytes_without_validating(c.encode_utf8(&mut [0_u8; 4]).as_bytes()); + } + } + + /// Create a `Tendril` from a single character. + #[inline] + pub fn from_char(c: char) -> Tendril { + let mut t: Tendril = Tendril::new(); + t.push_char(c); + t + } + + /// Helper for the `format_tendril!` macro. + #[inline] + pub fn format(args: strfmt::Arguments) -> Tendril { + use std::fmt::Write; + let mut output: Tendril = Tendril::new(); + let _ = write!(&mut output, "{}", args); + output + } +} + +/// Create a `StrTendril` through string formatting. +/// +/// Works just like the standard `format!` macro. +#[macro_export] +macro_rules! 
format_tendril { + ($($arg:tt)*) => ($crate::StrTendril::format(format_args!($($arg)*))) +} + +impl<'a, F, A> From<&'a F::Slice> for Tendril +where + F: fmt::SliceFormat, + A: Atomicity, +{ + #[inline] + fn from(input: &F::Slice) -> Tendril { + Tendril::from_slice(input) + } +} + +impl From for Tendril +where + A: Atomicity, +{ + #[inline] + fn from(input: String) -> Tendril { + Tendril::from_slice(&*input) + } +} + +impl AsRef for Tendril +where + F: fmt::SliceFormat, + A: Atomicity, +{ + #[inline] + fn as_ref(&self) -> &F::Slice { + &**self + } +} + +impl From> for String +where + A: Atomicity, +{ + #[inline] + fn from(input: Tendril) -> String { + String::from(&*input) + } +} + +impl<'a, A> From<&'a Tendril> for String +where + A: Atomicity, +{ + #[inline] + fn from(input: &'a Tendril) -> String { + String::from(&**input) + } +} + +#[cfg(all(test, feature = "bench"))] +#[path = "bench.rs"] +mod bench; + +#[cfg(test)] +mod test { + use super::{ + Atomic, ByteTendril, Header, NonAtomic, ReadExt, SendTendril, SliceExt, StrTendril, Tendril, + }; + use fmt; + use std::iter; + use std::thread; + + fn assert_send() {} + + #[test] + fn smoke_test() { + assert_eq!("", &*"".to_tendril()); + assert_eq!("abc", &*"abc".to_tendril()); + assert_eq!("Hello, world!", &*"Hello, world!".to_tendril()); + + assert_eq!(b"", &*b"".to_tendril()); + assert_eq!(b"abc", &*b"abc".to_tendril()); + assert_eq!(b"Hello, world!", &*b"Hello, world!".to_tendril()); + } + + #[test] + fn assert_sizes() { + use std::mem; + struct EmptyWithDrop; + impl Drop for EmptyWithDrop { + fn drop(&mut self) {} + } + let compiler_uses_inline_drop_flags = mem::size_of::() > 0; + + let correct = mem::size_of::<*const ()>() + + 8 + + if compiler_uses_inline_drop_flags { + 1 + } else { + 0 + }; + + assert_eq!(correct, mem::size_of::()); + assert_eq!(correct, mem::size_of::()); + + // This is no longer true. 
See https://github.com/servo/tendril/issues/66 + // assert_eq!(correct, mem::size_of::>()); + // assert_eq!(correct, mem::size_of::>()); + + assert_eq!( + mem::size_of::<*const ()>() * 2, + mem::size_of::>(), + ); + assert_eq!( + mem::size_of::>(), + mem::size_of::>(), + ); + } + + #[test] + fn validate_utf8() { + assert!(ByteTendril::try_from_byte_slice(b"\xFF").is_ok()); + assert!(StrTendril::try_from_byte_slice(b"\xFF").is_err()); + assert!(StrTendril::try_from_byte_slice(b"\xEA\x99\xFF").is_err()); + assert!(StrTendril::try_from_byte_slice(b"\xEA\x99").is_err()); + assert!(StrTendril::try_from_byte_slice(b"\xEA\x99\xAE\xEA").is_err()); + assert_eq!( + "\u{a66e}", + &*StrTendril::try_from_byte_slice(b"\xEA\x99\xAE").unwrap() + ); + + let mut t = StrTendril::new(); + assert!(t.try_push_bytes(b"\xEA\x99").is_err()); + assert!(t.try_push_bytes(b"\xAE").is_err()); + assert!(t.try_push_bytes(b"\xEA\x99\xAE").is_ok()); + assert_eq!("\u{a66e}", &*t); + } + + #[test] + fn share_and_unshare() { + let s = b"foobarbaz".to_tendril(); + assert_eq!(b"foobarbaz", &*s); + assert!(!s.is_shared()); + + let mut t = s.clone(); + assert_eq!(s.as_ptr(), t.as_ptr()); + assert!(s.is_shared()); + assert!(t.is_shared()); + + t.push_slice(b"quux"); + assert_eq!(b"foobarbaz", &*s); + assert_eq!(b"foobarbazquux", &*t); + assert!(s.as_ptr() != t.as_ptr()); + assert!(!t.is_shared()); + } + + #[test] + fn format_display() { + assert_eq!("foobar", &*format!("{}", "foobar".to_tendril())); + + let mut s = "foo".to_tendril(); + assert_eq!("foo", &*format!("{}", s)); + + let t = s.clone(); + assert_eq!("foo", &*format!("{}", s)); + assert_eq!("foo", &*format!("{}", t)); + + s.push_slice("barbaz!"); + assert_eq!("foobarbaz!", &*format!("{}", s)); + assert_eq!("foo", &*format!("{}", t)); + } + + #[test] + fn format_debug() { + assert_eq!( + r#"Tendril(inline: "foobar")"#, + &*format!("{:?}", "foobar".to_tendril()) + ); + assert_eq!( + r#"Tendril(inline: [102, 111, 111, 98, 97, 114])"#, + &*format!("{:?}", b"foobar".to_tendril()) + ); + + let t = "anextralongstring".to_tendril(); + assert_eq!( + r#"Tendril(owned: "anextralongstring")"#, + &*format!("{:?}", t) + ); + let _ = t.clone(); + assert_eq!( + r#"Tendril(shared: "anextralongstring")"#, + &*format!("{:?}", t) + ); + } + + #[test] + fn subtendril() { + assert_eq!("foo".to_tendril(), "foo-bar".to_tendril().subtendril(0, 3)); + assert_eq!("bar".to_tendril(), "foo-bar".to_tendril().subtendril(4, 3)); + + let mut t = "foo-bar".to_tendril(); + t.pop_front(2); + assert_eq!("o-bar".to_tendril(), t); + t.pop_back(1); + assert_eq!("o-ba".to_tendril(), t); + + assert_eq!( + "foo".to_tendril(), + "foo-a-longer-string-bar-baz".to_tendril().subtendril(0, 3) + ); + assert_eq!( + "oo-a-".to_tendril(), + "foo-a-longer-string-bar-baz".to_tendril().subtendril(1, 5) + ); + assert_eq!( + "bar".to_tendril(), + "foo-a-longer-string-bar-baz".to_tendril().subtendril(20, 3) + ); + + let mut t = "another rather long string".to_tendril(); + t.pop_front(2); + assert!(t.starts_with("other rather")); + t.pop_back(1); + assert_eq!("other rather long strin".to_tendril(), t); + assert!(t.is_shared()); + } + + #[test] + fn subtendril_invalid() { + assert!("\u{a66e}".to_tendril().try_subtendril(0, 2).is_err()); + assert!("\u{a66e}".to_tendril().try_subtendril(1, 2).is_err()); + + assert!("\u{1f4a9}".to_tendril().try_subtendril(0, 3).is_err()); + assert!("\u{1f4a9}".to_tendril().try_subtendril(0, 2).is_err()); + assert!("\u{1f4a9}".to_tendril().try_subtendril(0, 1).is_err()); + 
assert!("\u{1f4a9}".to_tendril().try_subtendril(1, 3).is_err()); + assert!("\u{1f4a9}".to_tendril().try_subtendril(1, 2).is_err()); + assert!("\u{1f4a9}".to_tendril().try_subtendril(1, 1).is_err()); + assert!("\u{1f4a9}".to_tendril().try_subtendril(2, 2).is_err()); + assert!("\u{1f4a9}".to_tendril().try_subtendril(2, 1).is_err()); + assert!("\u{1f4a9}".to_tendril().try_subtendril(3, 1).is_err()); + + let mut t = "\u{1f4a9}zzzzzz".to_tendril(); + assert!(t.try_pop_front(1).is_err()); + assert!(t.try_pop_front(2).is_err()); + assert!(t.try_pop_front(3).is_err()); + assert!(t.try_pop_front(4).is_ok()); + assert_eq!("zzzzzz", &*t); + + let mut t = "zzzzzz\u{1f4a9}".to_tendril(); + assert!(t.try_pop_back(1).is_err()); + assert!(t.try_pop_back(2).is_err()); + assert!(t.try_pop_back(3).is_err()); + assert!(t.try_pop_back(4).is_ok()); + assert_eq!("zzzzzz", &*t); + } + + #[test] + fn conversion() { + assert_eq!( + &[0x66, 0x6F, 0x6F].to_tendril(), + "foo".to_tendril().as_bytes() + ); + assert_eq!( + [0x66, 0x6F, 0x6F].to_tendril(), + "foo".to_tendril().into_bytes() + ); + + let ascii: Tendril = b"hello".to_tendril().try_reinterpret().unwrap(); + assert_eq!(&"hello".to_tendril(), ascii.as_superset()); + assert_eq!("hello".to_tendril(), ascii.clone().into_superset()); + + assert!(b"\xFF" + .to_tendril() + .try_reinterpret::() + .is_err()); + + let t = "hello".to_tendril(); + let ascii: &Tendril = t.try_as_subset().unwrap(); + assert_eq!(b"hello", &**ascii.as_bytes()); + + assert!("ő" + .to_tendril() + .try_reinterpret_view::() + .is_err()); + assert!("ő".to_tendril().try_as_subset::().is_err()); + + let ascii: Tendril = "hello".to_tendril().try_into_subset().unwrap(); + assert_eq!(b"hello", &**ascii.as_bytes()); + + assert!("ő".to_tendril().try_reinterpret::().is_err()); + assert!("ő".to_tendril().try_into_subset::().is_err()); + } + + #[test] + fn clear() { + let mut t = "foo-".to_tendril(); + t.clear(); + assert_eq!(t.len(), 0); + assert_eq!(t.len32(), 0); + assert_eq!(&*t, ""); + + let mut t = "much longer".to_tendril(); + let s = t.clone(); + t.clear(); + assert_eq!(t.len(), 0); + assert_eq!(t.len32(), 0); + assert_eq!(&*t, ""); + assert_eq!(&*s, "much longer"); + } + + #[test] + fn push_tendril() { + let mut t = "abc".to_tendril(); + t.push_tendril(&"xyz".to_tendril()); + assert_eq!("abcxyz", &*t); + } + + #[test] + fn wtf8() { + assert!(Tendril::::try_from_byte_slice(b"\xED\xA0\xBD").is_ok()); + assert!(Tendril::::try_from_byte_slice(b"\xED\xB2\xA9").is_ok()); + assert!(Tendril::::try_from_byte_slice(b"\xED\xA0\xBD\xED\xB2\xA9").is_err()); + + let t: Tendril = + Tendril::try_from_byte_slice(b"\xED\xA0\xBD\xEA\x99\xAE").unwrap(); + assert!(b"\xED\xA0\xBD".to_tendril().try_reinterpret().unwrap() == t.subtendril(0, 3)); + assert!(b"\xEA\x99\xAE".to_tendril().try_reinterpret().unwrap() == t.subtendril(3, 3)); + assert!(t.try_reinterpret_view::().is_err()); + + assert!(t.try_subtendril(0, 1).is_err()); + assert!(t.try_subtendril(0, 2).is_err()); + assert!(t.try_subtendril(1, 1).is_err()); + + assert!(t.try_subtendril(3, 1).is_err()); + assert!(t.try_subtendril(3, 2).is_err()); + assert!(t.try_subtendril(4, 1).is_err()); + + // paired surrogates + let mut t: Tendril = Tendril::try_from_byte_slice(b"\xED\xA0\xBD").unwrap(); + assert!(t.try_push_bytes(b"\xED\xB2\xA9").is_ok()); + assert_eq!(b"\xF0\x9F\x92\xA9", t.as_byte_slice()); + assert!(t.try_reinterpret_view::().is_ok()); + + // unpaired surrogates + let mut t: Tendril = Tendril::try_from_byte_slice(b"\xED\xA0\xBB").unwrap(); + 
assert!(t.try_push_bytes(b"\xED\xA0").is_err()); + assert!(t.try_push_bytes(b"\xED").is_err()); + assert!(t.try_push_bytes(b"\xA0").is_err()); + assert!(t.try_push_bytes(b"\xED\xA0\xBD").is_ok()); + assert_eq!(b"\xED\xA0\xBB\xED\xA0\xBD", t.as_byte_slice()); + assert!(t.try_push_bytes(b"\xED\xB2\xA9").is_ok()); + assert_eq!(b"\xED\xA0\xBB\xF0\x9F\x92\xA9", t.as_byte_slice()); + assert!(t.try_reinterpret_view::().is_err()); + } + + #[test] + fn front_char() { + let mut t = "".to_tendril(); + assert_eq!(None, t.pop_front_char()); + assert_eq!(None, t.pop_front_char()); + + let mut t = "abc".to_tendril(); + assert_eq!(Some('a'), t.pop_front_char()); + assert_eq!(Some('b'), t.pop_front_char()); + assert_eq!(Some('c'), t.pop_front_char()); + assert_eq!(None, t.pop_front_char()); + assert_eq!(None, t.pop_front_char()); + + let mut t = "főo-a-longer-string-bar-baz".to_tendril(); + assert_eq!(28, t.len()); + assert_eq!(Some('f'), t.pop_front_char()); + assert_eq!(Some('ő'), t.pop_front_char()); + assert_eq!(Some('o'), t.pop_front_char()); + assert_eq!(Some('-'), t.pop_front_char()); + assert_eq!(23, t.len()); + } + + #[test] + fn char_run() { + for &(s, exp) in &[ + ("", None), + (" ", Some((" ", true))), + ("x", Some(("x", false))), + (" \t \n", Some((" \t \n", true))), + ("xyzzy", Some(("xyzzy", false))), + (" xyzzy", Some((" ", true))), + ("xyzzy ", Some(("xyzzy", false))), + (" xyzzy ", Some((" ", true))), + ("xyzzy hi", Some(("xyzzy", false))), + ("中 ", Some(("中", false))), + (" 中 ", Some((" ", true))), + (" 中 ", Some((" ", true))), + (" 中 ", Some((" ", true))), + ] { + let mut t = s.to_tendril(); + let res = t.pop_front_char_run(char::is_whitespace); + match exp { + None => assert!(res.is_none()), + Some((es, ec)) => { + let (rt, rc) = res.unwrap(); + assert_eq!(es, &*rt); + assert_eq!(ec, rc); + } + } + } + } + + #[test] + fn deref_mut_inline() { + let mut t = "xyő".to_tendril().into_bytes(); + t[3] = 0xff; + assert_eq!(b"xy\xC5\xFF", &*t); + assert!(t.try_reinterpret_view::().is_err()); + t[3] = 0x8b; + assert_eq!("xyŋ", &**t.try_reinterpret_view::().unwrap()); + + unsafe { + t.push_uninitialized(3); + t[4] = 0xEA; + t[5] = 0x99; + t[6] = 0xAE; + assert_eq!( + "xyŋ\u{a66e}", + &**t.try_reinterpret_view::().unwrap() + ); + t.push_uninitialized(20); + t.pop_back(20); + assert_eq!( + "xyŋ\u{a66e}", + &**t.try_reinterpret_view::().unwrap() + ); + } + } + + #[test] + fn deref_mut() { + let mut t = b"0123456789".to_tendril(); + let u = t.clone(); + assert!(t.is_shared()); + t[9] = 0xff; + assert!(!t.is_shared()); + assert_eq!(b"0123456789", &*u); + assert_eq!(b"012345678\xff", &*t); + } + + #[test] + fn push_char() { + let mut t = "xyz".to_tendril(); + t.push_char('o'); + assert_eq!("xyzo", &*t); + t.push_char('ő'); + assert_eq!("xyzoő", &*t); + t.push_char('\u{a66e}'); + assert_eq!("xyzoő\u{a66e}", &*t); + t.push_char('\u{1f4a9}'); + assert_eq!("xyzoő\u{a66e}\u{1f4a9}", &*t); + assert_eq!(t.len(), 13); + } + + #[test] + #[cfg(feature = "encoding")] + fn encode() { + use encoding::{all, EncoderTrap}; + + let t = "안녕하세요 러스트".to_tendril(); + assert_eq!( + b"\xbe\xc8\xb3\xe7\xc7\xcf\xbc\xbc\xbf\xe4\x20\xb7\xaf\xbd\xba\xc6\xae", + &*t.encode(all::WINDOWS_949, EncoderTrap::Strict).unwrap() + ); + + let t = "Энергия пробуждения ия-я-я! 
\u{a66e}".to_tendril(); + assert_eq!( + b"\xfc\xce\xc5\xd2\xc7\xc9\xd1 \xd0\xd2\xcf\xc2\xd5\xd6\xc4\xc5\xce\ + \xc9\xd1 \xc9\xd1\x2d\xd1\x2d\xd1\x21 ?", + &*t.encode(all::KOI8_U, EncoderTrap::Replace).unwrap() + ); + + let t = "\u{1f4a9}".to_tendril(); + assert!(t.encode(all::WINDOWS_1252, EncoderTrap::Strict).is_err()); + } + + #[test] + #[cfg(feature = "encoding")] + fn decode() { + use encoding::{all, DecoderTrap}; + + let t = b"\xbe\xc8\xb3\xe7\xc7\xcf\xbc\xbc\ + \xbf\xe4\x20\xb7\xaf\xbd\xba\xc6\xae" + .to_tendril(); + assert_eq!( + "안녕하세요 러스트", + &*t.decode(all::WINDOWS_949, DecoderTrap::Strict).unwrap() + ); + + let t = b"\xfc\xce\xc5\xd2\xc7\xc9\xd1 \xd0\xd2\xcf\xc2\xd5\xd6\xc4\xc5\xce\ + \xc9\xd1 \xc9\xd1\x2d\xd1\x2d\xd1\x21" + .to_tendril(); + assert_eq!( + "Энергия пробуждения ия-я-я!", + &*t.decode(all::KOI8_U, DecoderTrap::Replace).unwrap() + ); + + let t = b"x \xff y".to_tendril(); + assert!(t.decode(all::UTF_8, DecoderTrap::Strict).is_err()); + + let t = b"x \xff y".to_tendril(); + assert_eq!( + "x \u{fffd} y", + &*t.decode(all::UTF_8, DecoderTrap::Replace).unwrap() + ); + } + + #[test] + fn ascii() { + fn mk(x: &[u8]) -> Tendril { + x.to_tendril().try_reinterpret().unwrap() + } + + let mut t = mk(b"xyz"); + assert_eq!(Some('x'), t.pop_front_char()); + assert_eq!(Some('y'), t.pop_front_char()); + assert_eq!(Some('z'), t.pop_front_char()); + assert_eq!(None, t.pop_front_char()); + + let mut t = mk(b" \t xyz"); + assert!(Some((mk(b" \t "), true)) == t.pop_front_char_run(char::is_whitespace)); + assert!(Some((mk(b"xyz"), false)) == t.pop_front_char_run(char::is_whitespace)); + assert!(t.pop_front_char_run(char::is_whitespace).is_none()); + + let mut t = Tendril::::new(); + assert!(t.try_push_char('x').is_ok()); + assert!(t.try_push_char('\0').is_ok()); + assert!(t.try_push_char('\u{a0}').is_err()); + assert_eq!(b"x\0", t.as_byte_slice()); + } + + #[test] + fn latin1() { + fn mk(x: &[u8]) -> Tendril { + x.to_tendril().try_reinterpret().unwrap() + } + + let mut t = mk(b"\xd8_\xd8"); + assert_eq!(Some('Ø'), t.pop_front_char()); + assert_eq!(Some('_'), t.pop_front_char()); + assert_eq!(Some('Ø'), t.pop_front_char()); + assert_eq!(None, t.pop_front_char()); + + let mut t = mk(b" \t \xfe\xa7z"); + assert!(Some((mk(b" \t "), true)) == t.pop_front_char_run(char::is_whitespace)); + assert!(Some((mk(b"\xfe\xa7z"), false)) == t.pop_front_char_run(char::is_whitespace)); + assert!(t.pop_front_char_run(char::is_whitespace).is_none()); + + let mut t = Tendril::::new(); + assert!(t.try_push_char('x').is_ok()); + assert!(t.try_push_char('\0').is_ok()); + assert!(t.try_push_char('\u{a0}').is_ok()); + assert!(t.try_push_char('ő').is_err()); + assert!(t.try_push_char('я').is_err()); + assert!(t.try_push_char('\u{a66e}').is_err()); + assert!(t.try_push_char('\u{1f4a9}').is_err()); + assert_eq!(b"x\0\xa0", t.as_byte_slice()); + } + + #[test] + fn format() { + assert_eq!("", &*format_tendril!("")); + assert_eq!( + "two and two make 4", + &*format_tendril!("two and two make {}", 2 + 2) + ); + } + + #[test] + fn merge_shared() { + let t = "012345678901234567890123456789".to_tendril(); + let a = t.subtendril(10, 20); + assert!(a.is_shared()); + assert_eq!("01234567890123456789", &*a); + let mut b = t.subtendril(0, 10); + assert!(b.is_shared()); + assert_eq!("0123456789", &*b); + + b.push_tendril(&a); + assert!(b.is_shared()); + assert!(a.is_shared()); + assert!(a.is_shared_with(&b)); + assert!(b.is_shared_with(&a)); + assert_eq!("012345678901234567890123456789", &*b); + + assert!(t.is_shared()); + 
assert!(t.is_shared_with(&a)); + assert!(t.is_shared_with(&b)); + } + + #[test] + fn merge_cant_share() { + let t = "012345678901234567890123456789".to_tendril(); + let mut b = t.subtendril(0, 10); + assert!(b.is_shared()); + assert_eq!("0123456789", &*b); + + b.push_tendril(&"abcd".to_tendril()); + assert!(!b.is_shared()); + assert_eq!("0123456789abcd", &*b); + } + + #[test] + fn shared_doesnt_reserve() { + let mut t = "012345678901234567890123456789".to_tendril(); + let a = t.subtendril(1, 10); + + assert!(t.is_shared()); + t.reserve(10); + assert!(t.is_shared()); + + let _ = a; + } + + #[test] + fn out_of_bounds() { + assert!("".to_tendril().try_subtendril(0, 1).is_err()); + assert!("abc".to_tendril().try_subtendril(0, 4).is_err()); + assert!("abc".to_tendril().try_subtendril(3, 1).is_err()); + assert!("abc".to_tendril().try_subtendril(7, 1).is_err()); + + let mut t = "".to_tendril(); + assert!(t.try_pop_front(1).is_err()); + assert!(t.try_pop_front(5).is_err()); + assert!(t.try_pop_front(500).is_err()); + assert!(t.try_pop_back(1).is_err()); + assert!(t.try_pop_back(5).is_err()); + assert!(t.try_pop_back(500).is_err()); + + let mut t = "abcd".to_tendril(); + assert!(t.try_pop_front(1).is_ok()); + assert!(t.try_pop_front(4).is_err()); + assert!(t.try_pop_front(500).is_err()); + assert!(t.try_pop_back(1).is_ok()); + assert!(t.try_pop_back(3).is_err()); + assert!(t.try_pop_back(500).is_err()); + } + + #[test] + fn compare() { + for &a in &[ + "indiscretions", + "validity", + "hallucinogenics", + "timelessness", + "original", + "microcosms", + "boilers", + "mammoth", + ] { + for &b in &[ + "intrepidly", + "frigid", + "spa", + "cardigans", + "guileful", + "evaporated", + "unenthusiastic", + "legitimate", + ] { + let ta = a.to_tendril(); + let tb = b.to_tendril(); + + assert_eq!(a.eq(b), ta.eq(&tb)); + assert_eq!(a.ne(b), ta.ne(&tb)); + assert_eq!(a.lt(b), ta.lt(&tb)); + assert_eq!(a.le(b), ta.le(&tb)); + assert_eq!(a.gt(b), ta.gt(&tb)); + assert_eq!(a.ge(b), ta.ge(&tb)); + assert_eq!(a.partial_cmp(b), ta.partial_cmp(&tb)); + assert_eq!(a.cmp(b), ta.cmp(&tb)); + } + } + } + + #[test] + fn extend_and_from_iterator() { + // Testing Extend and FromIterator for the various Ts. 
+ + // Tendril + let mut t = "Hello".to_tendril(); + t.extend(None::<&Tendril<_>>.into_iter()); + assert_eq!("Hello", &*t); + t.extend(&[", ".to_tendril(), "world".to_tendril(), "!".to_tendril()]); + assert_eq!("Hello, world!", &*t); + assert_eq!( + "Hello, world!", + &*[ + "Hello".to_tendril(), + ", ".to_tendril(), + "world".to_tendril(), + "!".to_tendril() + ] + .iter() + .collect::() + ); + + // &str + let mut t = "Hello".to_tendril(); + t.extend(None::<&str>.into_iter()); + assert_eq!("Hello", &*t); + t.extend([", ", "world", "!"].iter().map(|&s| s)); + assert_eq!("Hello, world!", &*t); + assert_eq!( + "Hello, world!", + &*["Hello", ", ", "world", "!"] + .iter() + .map(|&s| s) + .collect::() + ); + + // &[u8] + let mut t = b"Hello".to_tendril(); + t.extend(None::<&[u8]>.into_iter()); + assert_eq!(b"Hello", &*t); + t.extend( + [b", ".as_ref(), b"world".as_ref(), b"!".as_ref()] + .iter() + .map(|&s| s), + ); + assert_eq!(b"Hello, world!", &*t); + assert_eq!( + b"Hello, world!", + &*[ + b"Hello".as_ref(), + b", ".as_ref(), + b"world".as_ref(), + b"!".as_ref() + ] + .iter() + .map(|&s| s) + .collect::() + ); + + let string = "the quick brown fox jumps over the lazy dog"; + let string_expected = string.to_tendril(); + let bytes = string.as_bytes(); + let bytes_expected = bytes.to_tendril(); + + // char + assert_eq!(string_expected, string.chars().collect()); + let mut tendril = StrTendril::new(); + tendril.extend(string.chars()); + assert_eq!(string_expected, tendril); + + // &u8 + assert_eq!(bytes_expected, bytes.iter().collect()); + let mut tendril = ByteTendril::new(); + tendril.extend(bytes); + assert_eq!(bytes_expected, tendril); + + // u8 + assert_eq!(bytes_expected, bytes.iter().map(|&b| b).collect()); + let mut tendril = ByteTendril::new(); + tendril.extend(bytes.iter().map(|&b| b)); + assert_eq!(bytes_expected, tendril); + } + + #[test] + fn from_str() { + use std::str::FromStr; + let t: Tendril<_> = FromStr::from_str("foo bar baz").unwrap(); + assert_eq!("foo bar baz", &*t); + } + + #[test] + fn from_char() { + assert_eq!("o", &*StrTendril::from_char('o')); + assert_eq!("ő", &*StrTendril::from_char('ő')); + assert_eq!("\u{a66e}", &*StrTendril::from_char('\u{a66e}')); + assert_eq!("\u{1f4a9}", &*StrTendril::from_char('\u{1f4a9}')); + } + + #[test] + #[cfg_attr(miri, ignore)] // slow + fn read() { + fn check(x: &[u8]) { + use std::io::Cursor; + let mut t = ByteTendril::new(); + assert_eq!(x.len(), Cursor::new(x).read_to_tendril(&mut t).unwrap()); + assert_eq!(x, &*t); + } + + check(b""); + check(b"abcd"); + + let long: Vec = iter::repeat(b'x').take(1_000_000).collect(); + check(&long); + } + + #[test] + fn hash_map_key() { + use std::collections::HashMap; + + // As noted with Borrow, indexing on HashMap is byte-based because of + // https://github.com/rust-lang/rust/issues/27108. 
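+        // Lookups go through `&[u8]` keys (e.g. `b"foo".as_ref()`), even when
+        // the map is keyed by `StrTendril`.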
+ let mut map = HashMap::new(); + map.insert("foo".to_tendril(), 1); + assert_eq!(map.get(b"foo".as_ref()), Some(&1)); + assert_eq!(map.get(b"bar".as_ref()), None); + + let mut map = HashMap::new(); + map.insert(b"foo".to_tendril(), 1); + assert_eq!(map.get(b"foo".as_ref()), Some(&1)); + assert_eq!(map.get(b"bar".as_ref()), None); + } + + #[test] + fn atomic() { + assert_send::>(); + let s: Tendril = Tendril::from_slice("this is a string"); + assert!(!s.is_shared()); + let mut t = s.clone(); + assert!(s.is_shared()); + let sp = s.as_ptr() as usize; + thread::spawn(move || { + assert!(t.is_shared()); + t.push_slice(" extended"); + assert_eq!("this is a string extended", &*t); + assert!(t.as_ptr() as usize != sp); + assert!(!t.is_shared()); + }) + .join() + .unwrap(); + assert!(s.is_shared()); + assert_eq!("this is a string", &*s); + } + + #[test] + fn send() { + assert_send::>(); + let s = "this is a string".to_tendril(); + let t = s.clone(); + let s2 = s.into_send(); + thread::spawn(move || { + let s = StrTendril::from(s2); + assert!(!s.is_shared()); + assert_eq!("this is a string", &*s); + }) + .join() + .unwrap(); + assert_eq!("this is a string", &*t); + } + + /// https://github.com/servo/tendril/issues/58 + #[test] + fn issue_58() { + let data = "

Hello!

, World!"; + let s: Tendril = data.into(); + assert_eq!(&*s, data); + let s: Tendril = s.into_send().into(); + assert_eq!(&*s, data); + } + + #[test] + fn inline_send() { + let s = "x".to_tendril(); + let t = s.clone(); + let s2 = s.into_send(); + thread::spawn(move || { + let s = StrTendril::from(s2); + assert!(!s.is_shared()); + assert_eq!("x", &*s); + }) + .join() + .unwrap(); + assert_eq!("x", &*t); + } +} diff --git a/tendril/src/utf8_decode.rs b/tendril/src/utf8_decode.rs new file mode 100644 index 00000000..b682d57a --- /dev/null +++ b/tendril/src/utf8_decode.rs @@ -0,0 +1,98 @@ +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +use fmt; +use tendril::{Atomicity, Tendril}; +use utf8; + +pub struct IncompleteUtf8(utf8::Incomplete); + +impl Tendril +where + A: Atomicity, +{ + pub fn decode_utf8_lossy(mut self, mut push_utf8: F) -> Option + where + F: FnMut(Tendril), + { + loop { + if self.is_empty() { + return None; + } + let unborrowed_result = match utf8::decode(&self) { + Ok(s) => { + debug_assert!(s.as_ptr() == self.as_ptr()); + debug_assert!(s.len() == self.len()); + Ok(()) + } + Err(utf8::DecodeError::Invalid { + valid_prefix, + invalid_sequence, + .. + }) => { + debug_assert!(valid_prefix.as_ptr() == self.as_ptr()); + debug_assert!(valid_prefix.len() <= self.len()); + Err(( + valid_prefix.len(), + Err(valid_prefix.len() + invalid_sequence.len()), + )) + } + Err(utf8::DecodeError::Incomplete { + valid_prefix, + incomplete_suffix, + }) => { + debug_assert!(valid_prefix.as_ptr() == self.as_ptr()); + debug_assert!(valid_prefix.len() <= self.len()); + Err((valid_prefix.len(), Ok(incomplete_suffix))) + } + }; + match unborrowed_result { + Ok(()) => { + unsafe { push_utf8(self.reinterpret_without_validating()) } + return None; + } + Err((valid_len, and_then)) => { + if valid_len > 0 { + let subtendril = self.subtendril(0, valid_len as u32); + unsafe { push_utf8(subtendril.reinterpret_without_validating()) } + } + match and_then { + Ok(incomplete) => return Some(IncompleteUtf8(incomplete)), + Err(offset) => { + push_utf8(Tendril::from_slice(utf8::REPLACEMENT_CHARACTER)); + self.pop_front(offset as u32) + } + } + } + } + } + } +} + +impl IncompleteUtf8 { + pub fn try_complete( + &mut self, + mut input: Tendril, + mut push_utf8: F, + ) -> Result, ()> + where + A: Atomicity, + F: FnMut(Tendril), + { + let resume_at; + match self.0.try_complete(&input) { + None => return Err(()), + Some((result, rest)) => { + push_utf8(Tendril::from_slice( + result.unwrap_or(utf8::REPLACEMENT_CHARACTER), + )); + resume_at = input.len() - rest.len(); + } + } + input.pop_front(resume_at as u32); + Ok(input) + } +} diff --git a/tendril/src/util.rs b/tendril/src/util.rs new file mode 100644 index 00000000..28c55c12 --- /dev/null +++ b/tendril/src/util.rs @@ -0,0 +1,45 @@ +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. 
+ +use std::mem; +use std::{ptr, slice}; + +#[inline(always)] +pub unsafe fn unsafe_slice<'a>(buf: &'a [u8], start: usize, new_len: usize) -> &'a [u8] { + debug_assert!(start <= buf.len()); + debug_assert!(new_len <= (buf.len() - start)); + slice::from_raw_parts(buf.as_ptr().offset(start as isize), new_len) +} + +#[inline(always)] +pub unsafe fn unsafe_slice_mut<'a>( + buf: &'a mut [u8], + start: usize, + new_len: usize, +) -> &'a mut [u8] { + debug_assert!(start <= buf.len()); + debug_assert!(new_len <= (buf.len() - start)); + slice::from_raw_parts_mut(buf.as_mut_ptr().offset(start as isize), new_len) +} + +#[inline(always)] +pub unsafe fn copy_and_advance(dest: &mut *mut u8, src: &[u8]) { + ptr::copy_nonoverlapping(src.as_ptr(), *dest, src.len()); + *dest = dest.offset(src.len() as isize) +} + +#[inline(always)] +pub unsafe fn copy_lifetime_mut<'a, S: ?Sized, T: ?Sized + 'a>( + _ptr: &'a mut S, + ptr: &mut T, +) -> &'a mut T { + mem::transmute(ptr) +} + +#[inline(always)] +pub unsafe fn copy_lifetime<'a, S: ?Sized, T: ?Sized + 'a>(_ptr: &'a S, ptr: &T) -> &'a T { + mem::transmute(ptr) +} From 204251a54a3105bf5792f2a2ebb9b8dba69567b7 Mon Sep 17 00:00:00 2001 From: Nico Burns Date: Mon, 8 Sep 2025 17:00:27 +0100 Subject: [PATCH 2/5] cargo fmt --- tendril/examples/fuzz.rs | 18 +++++++++--------- tendril/src/bench.rs | 8 ++++---- tendril/src/futf.rs | 14 +++++++------- tendril/src/stream.rs | 36 ++++++++++++++++++------------------ tendril/src/tendril.rs | 18 +++++++++--------- tendril/src/utf8_decode.rs | 14 +++++++------- 6 files changed, 54 insertions(+), 54 deletions(-) diff --git a/tendril/examples/fuzz.rs b/tendril/examples/fuzz.rs index 37daf560..13a44e01 100644 --- a/tendril/examples/fuzz.rs +++ b/tendril/examples/fuzz.rs @@ -39,7 +39,7 @@ fn fuzz() { buf_string.push_str(snip); buf_tendril.push_slice(snip); assert_eq!(&*buf_string, &*buf_tendril); - } + }, 16..=31 => { let (start, end) = random_slice(&mut rng, &buf_string); @@ -47,21 +47,21 @@ fn fuzz() { buf_string.push_str(&snip); buf_tendril.push_slice(&snip); assert_eq!(&*buf_string, &*buf_tendril); - } + }, 32..=47 => { let lenstr = format!("[length = {}]", buf_tendril.len()); buf_string.push_str(&lenstr); buf_tendril.push_slice(&lenstr); assert_eq!(&*buf_string, &*buf_tendril); - } + }, 48..=63 => { let n = random_boundary(&mut rng, &buf_string); buf_tendril.pop_front(n as u32); buf_string = buf_string[n..].to_owned(); assert_eq!(&*buf_string, &*buf_tendril); - } + }, 64..=79 => { let new_len = random_boundary(&mut rng, &buf_string); @@ -69,27 +69,27 @@ fn fuzz() { buf_string.truncate(new_len); buf_tendril.pop_back(n as u32); assert_eq!(&*buf_string, &*buf_tendril); - } + }, 80..=90 => { let (start, end) = random_slice(&mut rng, &buf_string); buf_string = buf_string[start..end].to_owned(); buf_tendril = buf_tendril.subtendril(start as u32, (end - start) as u32); assert_eq!(&*buf_string, &*buf_tendril); - } + }, 91..=96 => { let c = rng.gen(); buf_string.push(c); assert!(buf_tendril.try_push_char(c).is_ok()); assert_eq!(&*buf_string, &*buf_tendril); - } + }, 97 => { buf_string.truncate(0); buf_tendril.clear(); assert_eq!(&*buf_string, &*buf_tendril); - } + }, _ => { let (start, end) = random_slice(&mut rng, &buf_string); @@ -100,7 +100,7 @@ fn fuzz() { .iter() .zip(tendril_slices.iter()) .all(|(s, t)| **s == **t)); - } + }, } } } diff --git a/tendril/src/bench.rs b/tendril/src/bench.rs index a9d2c30a..ca2341ab 100644 --- a/tendril/src/bench.rs +++ b/tendril/src/bench.rs @@ -20,10 +20,10 @@ fn index_words_string(input: &String) 
-> HashMap> { Entry::Occupied(mut e) => { let x: &mut Vec = e.get_mut(); x.push(word); - } + }, Entry::Vacant(e) => { e.insert(vec![word]); - } + }, } } index @@ -39,10 +39,10 @@ fn index_words_tendril(input: &StrTendril) -> HashMap> { Some((word, true)) => match index.entry(word.chars().next().unwrap()) { Entry::Occupied(mut e) => { e.get_mut().push(word); - } + }, Entry::Vacant(e) => { e.insert(vec![word]); - } + }, }, } } diff --git a/tendril/src/futf.rs b/tendril/src/futf.rs index 93a1c21e..013e7ca6 100644 --- a/tendril/src/futf.rs +++ b/tendril/src/futf.rs @@ -106,7 +106,7 @@ unsafe fn decode(buf: &[u8]) -> Option { if n < 0x80 { return None; } // Overlong - } + }, 3 => { n = ((*buf.get_unchecked(0) & 0b1111) as u32) << 12 | ((*buf.get_unchecked(1) & 0x3F) as u32) << 6 @@ -115,9 +115,9 @@ unsafe fn decode(buf: &[u8]) -> Option { 0x0000..=0x07FF => return None, // Overlong 0xD800..=0xDBFF => return Some(Meaning::LeadSurrogate(n as u16 - 0xD800)), 0xDC00..=0xDFFF => return Some(Meaning::TrailSurrogate(n as u16 - 0xDC00)), - _ => {} + _ => {}, } - } + }, 4 => { n = ((*buf.get_unchecked(0) & 0b111) as u32) << 18 | ((*buf.get_unchecked(1) & 0x3F) as u32) << 12 @@ -126,7 +126,7 @@ unsafe fn decode(buf: &[u8]) -> Option { if n < 0x1_0000 { return None; } // Overlong - } + }, _ => debug_unreachable!(), } @@ -185,7 +185,7 @@ pub fn classify<'a>(buf: &'a [u8], idx: usize) -> Option> { meaning: Meaning::Prefix(n - avail), }) } - } + }, Byte::Cont => { let mut start = idx; let mut checked = 0; @@ -225,7 +225,7 @@ pub fn classify<'a>(buf: &'a [u8], idx: usize) -> Option> { meaning: Meaning::Prefix(n - avail), }); } - } + }, _ => return None, } @@ -235,7 +235,7 @@ pub fn classify<'a>(buf: &'a [u8], idx: usize) -> Option> { return None; } } - } + }, } } } diff --git a/tendril/src/stream.rs b/tendril/src/stream.rs index 469d58c9..afac7bc9 100644 --- a/tendril/src/stream.rs +++ b/tendril/src/stream.rs @@ -94,8 +94,8 @@ where tendril.pop_back(BUFFER_SIZE - n as u32); self.process(tendril); break; - } - Err(ref e) if e.kind() == io::ErrorKind::Interrupted => {} + }, + Err(ref e) if e.kind() == io::ErrorKind::Interrupted => {}, Err(e) => return Err(e), } } @@ -162,7 +162,7 @@ where self.inner_sink.error("invalid byte sequence".into()); self.inner_sink .process(Tendril::from_slice(utf8::REPLACEMENT_CHARACTER)); - } + }, } t.len() - rest.len() }); @@ -170,7 +170,7 @@ where None => { self.incomplete = Some(incomplete); return; - } + }, Some(resume_at) => t.pop_front(resume_at as u32), } } @@ -180,7 +180,7 @@ where debug_assert!(s.as_ptr() == t.as_ptr()); debug_assert!(s.len() == t.len()); Ok(()) - } + }, Err(utf8::DecodeError::Invalid { valid_prefix, invalid_sequence, @@ -192,7 +192,7 @@ where valid_prefix.len(), Err(valid_prefix.len() + invalid_sequence.len()), )) - } + }, Err(utf8::DecodeError::Incomplete { valid_prefix, incomplete_suffix, @@ -200,13 +200,13 @@ where debug_assert!(valid_prefix.as_ptr() == t.as_ptr()); debug_assert!(valid_prefix.len() <= t.len()); Err((valid_prefix.len(), Ok(incomplete_suffix))) - } + }, }; match unborrowed_result { Ok(()) => { unsafe { self.inner_sink.process(t.reinterpret_without_validating()) } return; - } + }, Err((valid_len, and_then)) => { if valid_len > 0 { let subtendril = t.subtendril(0, valid_len as u32); @@ -219,15 +219,15 @@ where Ok(incomplete) => { self.incomplete = Some(incomplete); return; - } + }, Err(offset) => { self.inner_sink.error("invalid byte sequence".into()); self.inner_sink .process(Tendril::from_slice(utf8::REPLACEMENT_CHARACTER)); 
t.pop_front(offset as u32); - } + }, } - } + }, } } } @@ -365,21 +365,21 @@ where debug_assert!(err.upto >= 0); t.pop_front(err.upto as u32); // continue loop and process remainder of t - } + }, (_, None) => break, } } if out.len() > 0 { sink.process(out); } - } + }, #[cfg(feature = "encoding_rs")] LossyDecoderInner::EncodingRs(ref mut decoder, ref mut sink) => { if t.is_empty() { return; } decode_to_sink(t, decoder, sink, false); - } + }, } } @@ -411,12 +411,12 @@ where sink.process(out); } sink.finish() - } + }, #[cfg(feature = "encoding_rs")] LossyDecoderInner::EncodingRs(mut decoder, mut sink) => { decode_to_sink(Tendril::new(), &mut decoder, &mut sink, true); sink.finish() - } + }, } } } @@ -449,11 +449,11 @@ fn decode_to_sink( } match result { DecoderResult::InputEmpty => return, - DecoderResult::OutputFull => {} + DecoderResult::OutputFull => {}, DecoderResult::Malformed(_, _) => { sink.error(Cow::Borrowed("invalid sequence")); sink.process("\u{FFFD}".into()); - } + }, } t.pop_front(bytes_read as u32); if t.is_empty() { diff --git a/tendril/src/tendril.rs b/tendril/src/tendril.rs index 0a33d827..d7561996 100644 --- a/tendril/src/tendril.rs +++ b/tendril/src/tendril.rs @@ -1155,7 +1155,7 @@ where self, unsafe_slice(buf.data(), offset as usize, self.len32() as usize), ) - } + }, } } } @@ -1173,7 +1173,7 @@ where let (mut buf, _, offset) = self.assume_buf(); let len = self.len32() as usize; copy_lifetime_mut(self, unsafe_slice_mut(buf.data_mut(), offset as usize, len)) - } + }, } } } @@ -1301,10 +1301,10 @@ where if let Some((n, _)) = iter.next() { skip = n as u32; } - } + }, None => { next_char = None; - } + }, } } @@ -1346,7 +1346,7 @@ where let t = self.clone(); self.clear(); Some((t, class)) - } + }, } } @@ -1400,13 +1400,13 @@ where Ok(0) => { ret = Ok(len - start_len); break; - } + }, Ok(n) => len += n, - Err(ref e) if e.kind() == io::ErrorKind::Interrupted => {} + Err(ref e) if e.kind() == io::ErrorKind::Interrupted => {}, Err(e) => { ret = Err(e); break; - } + }, } } @@ -1999,7 +1999,7 @@ mod test { let (rt, rc) = res.unwrap(); assert_eq!(es, &*rt); assert_eq!(ec, rc); - } + }, } } } diff --git a/tendril/src/utf8_decode.rs b/tendril/src/utf8_decode.rs index b682d57a..16d98802 100644 --- a/tendril/src/utf8_decode.rs +++ b/tendril/src/utf8_decode.rs @@ -27,7 +27,7 @@ where debug_assert!(s.as_ptr() == self.as_ptr()); debug_assert!(s.len() == self.len()); Ok(()) - } + }, Err(utf8::DecodeError::Invalid { valid_prefix, invalid_sequence, @@ -39,7 +39,7 @@ where valid_prefix.len(), Err(valid_prefix.len() + invalid_sequence.len()), )) - } + }, Err(utf8::DecodeError::Incomplete { valid_prefix, incomplete_suffix, @@ -47,13 +47,13 @@ where debug_assert!(valid_prefix.as_ptr() == self.as_ptr()); debug_assert!(valid_prefix.len() <= self.len()); Err((valid_prefix.len(), Ok(incomplete_suffix))) - } + }, }; match unborrowed_result { Ok(()) => { unsafe { push_utf8(self.reinterpret_without_validating()) } return None; - } + }, Err((valid_len, and_then)) => { if valid_len > 0 { let subtendril = self.subtendril(0, valid_len as u32); @@ -64,9 +64,9 @@ where Err(offset) => { push_utf8(Tendril::from_slice(utf8::REPLACEMENT_CHARACTER)); self.pop_front(offset as u32) - } + }, } - } + }, } } } @@ -90,7 +90,7 @@ impl IncompleteUtf8 { result.unwrap_or(utf8::REPLACEMENT_CHARACTER), )); resume_at = input.len() - rest.len(); - } + }, } input.pop_front(resume_at as u32); Ok(input) From 9acddfd5e500cd71463a611afd5d9069d535473d Mon Sep 17 00:00:00 2001 From: Nico Burns Date: Mon, 8 Sep 2025 17:05:24 +0100 
Subject: [PATCH 3/5] Ignore warnings and clippy lints Signed-off-by: Nico Burns --- tendril/examples/fuzz.rs | 3 +++ tendril/src/lib.rs | 27 +++++++++++++++++++++++++++ tendril/src/stream.rs | 2 +- 3 files changed, 31 insertions(+), 1 deletion(-) diff --git a/tendril/examples/fuzz.rs b/tendril/examples/fuzz.rs index 13a44e01..df14c453 100644 --- a/tendril/examples/fuzz.rs +++ b/tendril/examples/fuzz.rs @@ -7,6 +7,9 @@ //! A simple fuzz tester for the library. #![deny(warnings)] +#![allow(clippy::redundant_static_lifetimes)] +#![allow(clippy::needless_borrow)] +#![allow(clippy::borrow_deref_ref)] extern crate rand; extern crate tendril; diff --git a/tendril/src/lib.rs b/tendril/src/lib.rs index fadc2cab..a5c7a39b 100644 --- a/tendril/src/lib.rs +++ b/tendril/src/lib.rs @@ -6,6 +6,33 @@ #![cfg_attr(all(test, feature = "bench"), feature(test))] //#![cfg_attr(test, deny(warnings))] +#![allow(unnecessary_transmutes)] +#![allow(bare_trait_objects)] +#![allow(clippy::ptr_offset_with_cast)] +#![allow(clippy::needless_lifetimes)] +#![allow(clippy::needless_late_init)] +#![allow(clippy::explicit_auto_deref)] +#![allow(clippy::result_unit_err)] +#![allow(clippy::op_ref)] +#![allow(clippy::missing_safety_doc)] +#![allow(clippy::missing_transmute_annotations)] +#![allow(clippy::partialeq_ne_impl)] +#![allow(clippy::legacy_numeric_constants)] +#![allow(clippy::collapsible_if)] +#![allow(clippy::wrong_self_convention)] +#![allow(clippy::len_zero)] +#![allow(clippy::transmute_bytes_to_str)] +#![allow(clippy::match_like_matches_macro)] +#![allow(clippy::redundant_static_lifetimes)] +#![allow(clippy::redundant_field_names)] +#![allow(clippy::unusual_byte_groupings)] +#![allow(clippy::borrow_deref_ref)] +#![allow(clippy::needless_return)] +#![allow(clippy::while_let_loop)] +#![allow(clippy::mutable_key_type)] +#![allow(clippy::manual_repeat_n)] +#![allow(clippy::map_clone)] +#![allow(clippy::useless_conversion)] #[macro_use] extern crate debug_unreachable; diff --git a/tendril/src/stream.rs b/tendril/src/stream.rs index afac7bc9..45183c5d 100644 --- a/tendril/src/stream.rs +++ b/tendril/src/stream.rs @@ -605,7 +605,7 @@ mod test { #[cfg(any(feature = "encoding", feature = "encoding_rs"))] pub type Tests = &'static [(&'static [&'static [u8]], &'static str, usize)]; - #[cfg(any(feature = "encoding"))] + #[cfg(feature = "encoding")] const ASCII: Tests = &[ (&[], "", 0), (&[b""], "", 0), From e783cb181ead5a92861537800f60d6f7f85af811 Mon Sep 17 00:00:00 2001 From: Nico Burns Date: Mon, 8 Sep 2025 19:12:48 +0100 Subject: [PATCH 4/5] Port tendril benchmarks to criterion Signed-off-by: Nico Burns --- tendril/Cargo.toml | 13 ++- tendril/benches/futf.rs | 66 +++++++++++++++ tendril/benches/tendril.rs | 163 +++++++++++++++++++++++++++++++++++++ tendril/src/bench.rs | 159 ------------------------------------ tendril/src/futf.rs | 58 +------------ tendril/src/lib.rs | 8 +- tendril/src/tendril.rs | 4 - 7 files changed, 245 insertions(+), 226 deletions(-) create mode 100644 tendril/benches/futf.rs create mode 100644 tendril/benches/tendril.rs delete mode 100644 tendril/src/bench.rs diff --git a/tendril/Cargo.toml b/tendril/Cargo.toml index c424ff56..14dae0d1 100644 --- a/tendril/Cargo.toml +++ b/tendril/Cargo.toml @@ -22,6 +22,15 @@ utf-8 = { workspace = true } [dev-dependencies] rand = { workspace = true } +criterion = { workspace = true } +tendril = { workspace = true } + +[[bench]] +name = "futf" +harness = false + +[[bench]] +name = "tendril" +harness = false + -[features] -bench = [] diff --git 
a/tendril/benches/futf.rs b/tendril/benches/futf.rs new file mode 100644 index 00000000..312fee52 --- /dev/null +++ b/tendril/benches/futf.rs @@ -0,0 +1,66 @@ +extern crate criterion; +extern crate tendril; + +use criterion::{criterion_group, criterion_main, Bencher, Criterion}; +use tendril::futf::classify; + +static TEXT: &str = " + All human beings are born free and equal in dignity and rights. + They are endowed with reason and conscience and should act + towards one another in a spirit of brotherhood. + + Minden emberi lény szabadon születik és egyenlő méltósága és + joga van. Az emberek, ésszel és lelkiismerettel bírván, + egymással szemben testvéri szellemben kell hogy viseltessenek. + + เราทุกคนเกิดมาอย่างอิสระ เราทุกคนมีความคิดและความเข้าใจเป็นของเราเอง + เราทุกคนควรได้รับการปฏิบัติในทางเดียวกัน. + + 모든 인간은 태어날 때부터 자유로우며 그 존엄과 권리에 있어 + 동등하다. 인간은 천부적으로 이성과 양심을 부여받았으며 서로 + 형제애의 정신으로 행동하여야 한다. + + ro remna cu se jinzi co zifre je simdu'i be le ry. nilselsi'a + .e lei ry. selcru .i ry. se menli gi'e se sezmarde .i .ei + jeseki'ubo ry. simyzu'e ta'i le tunba + + ᏂᎦᏓ ᎠᏂᏴᏫ ᏂᎨᎫᏓᎸᎾ ᎠᎴ ᎤᏂᏠᏱ ᎤᎾᏕᎿ ᏚᏳᎧᏛ ᎨᏒᎢ. ᎨᏥᏁᎳ ᎤᎾᏓᏅᏖᏗ ᎠᎴ ᎤᏃᏟᏍᏗ + ᎠᎴ ᏌᏊ ᎨᏒ ᏧᏂᎸᏫᏍᏓᏁᏗ ᎠᎾᏟᏅᏢ ᎠᏓᏅᏙ ᎬᏗ."; + +// random +static IXES: &[usize] = &[ + 778, 156, 87, 604, 1216, 365, 884, 311, 469, 515, 709, 162, 871, 206, 634, 442, +]; + +static BOUNDARY: &[bool] = &[ + false, true, true, false, false, true, true, true, true, false, false, true, true, true, false, + false, +]; + +fn std_utf8_check(b: &mut Bencher) { + b.iter(|| { + assert!(IXES + .iter() + .zip(BOUNDARY.iter()) + .all(|(&ix, &expect)| { expect == TEXT.is_char_boundary(ix) })); + }); +} + +// We don't expect to be as fast as is_char_boundary, because we provide more +// information. But we shouldn't be tremendously slower, either. A factor of +// 5-10 is expected on this text. +fn futf_check(b: &mut Bencher) { + b.iter(|| { + assert!(IXES.iter().zip(BOUNDARY.iter()).all(|(&ix, &expect)| { + expect == (classify(TEXT.as_bytes(), ix).unwrap().rewind == 0) + })); + }); +} + +fn tendril_benchmarks(c: &mut Criterion) { + c.bench_function("std_utf8_check", std_utf8_check); + c.bench_function("futf_check", futf_check); +} + +criterion_group!(benches, tendril_benchmarks); +criterion_main!(benches); diff --git a/tendril/benches/tendril.rs b/tendril/benches/tendril.rs new file mode 100644 index 00000000..749ba5c6 --- /dev/null +++ b/tendril/benches/tendril.rs @@ -0,0 +1,163 @@ +// // Licensed under the Apache License, Version 2.0 or the MIT license +// // , at your +// // option. This file may not be copied, modified, or distributed +// // except according to those terms. + +// use std::borrow::ToOwned; +// use std::collections::hash_map::{Entry, HashMap}; + +#![allow(clippy::manual_pattern_char_comparison)] + +extern crate criterion; +extern crate tendril; +use std::collections::{hash_map::Entry, HashMap}; + +use criterion::{criterion_group, criterion_main, Bencher, Criterion}; +use tendril::StrTendril; + +static EN_1: &str = "Days turn to nights turn to paper into rocks into plastic"; + +static EN_2: &str = "Here the notes in my laboratory journal cease. I was able to write the last \ + words only with great effort. By now it was already clear to me that LSD had \ + been the cause of the remarkable experience of the previous Friday, for the \ + altered perceptions were of the same type as before, only much more intense. I \ + had to struggle to speak intelligibly. I asked my laboratory assistant, who was \ + informed of the self-experiment, to escort me home. 
We went by bicycle, no \ + automobile being available because of wartime restrictions on their use. On the \ + way home, my condition began to assume threatening forms. Everything in my \ + field of vision wavered and was distorted as if seen in a curved mirror. I also \ + had the sensation of being unable to move from the spot. Nevertheless, my \ + assistant later told me that we had traveled very rapidly. Finally, we arrived \ + at home safe and sound, and I was just barely capable of asking my companion to \ + summon our family doctor and request milk from the neighbors.\n\n\ + In spite of my delirious, bewildered condition, I had brief periods of clear \ + and effective thinking—and chose milk as a nonspecific antidote for poisoning."; + +static KR_1: &str = "러스트(Rust)는 모질라(mozilla.org)에서 개발하고 있는, 메모리-안전하고 병렬 \ + 프로그래밍이 쉬운 차세대 프로그래밍 언어입니다. 아직 \ + 개발 단계이며 많은 기능이 구현 중으로, MIT/Apache2 라이선스로 배포됩니다."; + +static HTML_KR_1: &str = "

러스트(Rust)는 모질라(mozilla.org)에서 개발하고 있는, \ + 메모리-안전하고 병렬 프로그래밍이 쉬운 차세대 프로그래밍 언어입니다. \ + 아직 개발 단계이며 많은 기능이 구현 중으로, MIT/Apache2 라이선스로 배포됩니다.

"; + +const SMALL_SIZE: usize = 65536; +const LARGE_SIZE: usize = 1 << 20; + +fn index_words_string(input: &str) -> HashMap> { + let mut index = HashMap::new(); + for word in input.split(|c| c == ' ') { + if word.is_empty() { + continue; + } + let word = word.to_owned(); + match index.entry(word.chars().next().unwrap()) { + Entry::Occupied(mut e) => { + let x: &mut Vec = e.get_mut(); + x.push(word); + }, + Entry::Vacant(e) => { + e.insert(vec![word]); + }, + } + } + index +} + +fn index_words_tendril(input: &StrTendril) -> HashMap> { + let mut index = HashMap::new(); + let mut t = input.clone(); + loop { + match t.pop_front_char_run(|c| c != ' ') { + None => return index, + Some((_, false)) => (), + Some((word, true)) => match index.entry(word.chars().next().unwrap()) { + Entry::Occupied(mut e) => { + e.get_mut().push(word); + }, + Entry::Vacant(e) => { + e.insert(vec![word]); + }, + }, + } + } +} + +fn test_correctness(txt: &str) { + use std::borrow::ToOwned; + use tendril::SliceExt; + + let input_string = txt.to_owned(); + let count_s = index_words_string(&input_string); + let mut keys: Vec = count_s.keys().cloned().collect(); + keys.sort(); + + let input_tendril = txt.to_tendril(); + let count_t = index_words_tendril(&input_tendril); + let mut keys_t: Vec = count_t.keys().cloned().collect(); + keys_t.sort(); + + assert_eq!(keys, keys_t); + + for k in &keys { + let vs = &count_s[k]; + let vt = &count_t[k]; + assert_eq!(vs.len(), vt.len()); + assert!(vs.iter().zip(vt.iter()).all(|(s, t)| **s == **t)); + } +} + +fn index_words_small_string(b: &mut Bencher, txt: &str) { + let mut s = String::new(); + while s.len() < SMALL_SIZE { + s.push_str(txt); + } + b.iter(|| index_words_string(&s)); +} + +fn index_words_small_tendril(b: &mut Bencher, txt: &str) { + let mut t = StrTendril::new(); + while t.len() < SMALL_SIZE { + t.push_slice(txt); + } + b.iter(|| index_words_tendril(&t)); +} + +fn index_words_big_string(b: &mut Bencher, txt: &str) { + let mut s = String::new(); + while s.len() < LARGE_SIZE { + s.push_str(txt); + } + b.iter(|| index_words_string(&s)); +} + +fn index_words_big_tendril(b: &mut Bencher, txt: &str) { + let mut t = StrTendril::new(); + while t.len() < LARGE_SIZE { + t.push_slice(txt); + } + b.iter(|| index_words_tendril(&t)); +} + +fn run_bench_group(c: &mut Criterion, group_name: &str, txt: &str) { + let mut group = c.benchmark_group(group_name); + + test_correctness(txt); + + group.bench_with_input("index_words_small_string", txt, index_words_small_string); + group.bench_with_input("index_words_small_tendril", txt, index_words_small_tendril); + group.bench_with_input("index_words_big_string", txt, index_words_big_string); + group.bench_with_input("index_words_big_tendril", txt, index_words_big_tendril); +} + +fn tendril_benchmarks(c: &mut Criterion) { + run_bench_group(c, "en_1", EN_1); + run_bench_group(c, "en_2", EN_2); + run_bench_group(c, "kr_1", KR_1); + run_bench_group(c, "html_kr_1", HTML_KR_1); +} + +criterion_group!(benches, tendril_benchmarks); +criterion_main!(benches); diff --git a/tendril/src/bench.rs b/tendril/src/bench.rs deleted file mode 100644 index ca2341ab..00000000 --- a/tendril/src/bench.rs +++ /dev/null @@ -1,159 +0,0 @@ -// Licensed under the Apache License, Version 2.0 or the MIT license -// , at your -// option. This file may not be copied, modified, or distributed -// except according to those terms. 
- -use std::borrow::ToOwned; -use std::collections::hash_map::{Entry, HashMap}; - -use tendril::StrTendril; - -fn index_words_string(input: &String) -> HashMap> { - let mut index = HashMap::new(); - for word in input.split(|c| c == ' ') { - if word.len() == 0 { - continue; - } - let word = word.to_owned(); - match index.entry(word.chars().next().unwrap()) { - Entry::Occupied(mut e) => { - let x: &mut Vec = e.get_mut(); - x.push(word); - }, - Entry::Vacant(e) => { - e.insert(vec![word]); - }, - } - } - index -} - -fn index_words_tendril(input: &StrTendril) -> HashMap> { - let mut index = HashMap::new(); - let mut t = input.clone(); - loop { - match t.pop_front_char_run(|c| c != ' ') { - None => return index, - Some((_, false)) => (), - Some((word, true)) => match index.entry(word.chars().next().unwrap()) { - Entry::Occupied(mut e) => { - e.get_mut().push(word); - }, - Entry::Vacant(e) => { - e.insert(vec![word]); - }, - }, - } - } -} - -static EN_1: &'static str = "Days turn to nights turn to paper into rocks into plastic"; - -static EN_2: &'static str = - "Here the notes in my laboratory journal cease. I was able to write the last \ - words only with great effort. By now it was already clear to me that LSD had \ - been the cause of the remarkable experience of the previous Friday, for the \ - altered perceptions were of the same type as before, only much more intense. I \ - had to struggle to speak intelligibly. I asked my laboratory assistant, who was \ - informed of the self-experiment, to escort me home. We went by bicycle, no \ - automobile being available because of wartime restrictions on their use. On the \ - way home, my condition began to assume threatening forms. Everything in my \ - field of vision wavered and was distorted as if seen in a curved mirror. I also \ - had the sensation of being unable to move from the spot. Nevertheless, my \ - assistant later told me that we had traveled very rapidly. Finally, we arrived \ - at home safe and sound, and I was just barely capable of asking my companion to \ - summon our family doctor and request milk from the neighbors.\n\n\ - In spite of my delirious, bewildered condition, I had brief periods of clear \ - and effective thinking—and chose milk as a nonspecific antidote for poisoning."; - -static KR_1: &'static str = - "러스트(Rust)는 모질라(mozilla.org)에서 개발하고 있는, 메모리-안전하고 병렬 \ - 프로그래밍이 쉬운 차세대 프로그래밍 언어입니다. 아직 \ - 개발 단계이며 많은 기능이 구현 중으로, MIT/Apache2 라이선스로 배포됩니다."; - -static HTML_KR_1: &'static str = - "

러스트(Rust)는 모질라(mozilla.org)에서 개발하고 있는, \ - 메모리-안전하고 병렬 프로그래밍이 쉬운 차세대 프로그래밍 언어입니다. \ - 아직 개발 단계이며 많은 기능이 구현 중으로, MIT/Apache2 라이선스로 배포됩니다.

"; - -mod index_words { - macro_rules! bench { - ($txt:ident) => { - #[allow(non_snake_case)] - mod $txt { - const SMALL_SIZE: usize = 65536; - const LARGE_SIZE: usize = (1 << 20); - - #[bench] - fn index_words_string(b: &mut ::test::Bencher) { - let mut s = String::new(); - while s.len() < SMALL_SIZE { - s.push_str(::tendril::bench::$txt); - } - b.iter(|| ::tendril::bench::index_words_string(&s)); - } - - #[bench] - fn index_words_tendril(b: &mut ::test::Bencher) { - let mut t = ::tendril::StrTendril::new(); - while t.len() < SMALL_SIZE { - t.push_slice(::tendril::bench::$txt); - } - b.iter(|| ::tendril::bench::index_words_tendril(&t)); - } - - #[bench] - fn index_words_big_string(b: &mut ::test::Bencher) { - let mut s = String::new(); - while s.len() < LARGE_SIZE { - s.push_str(::tendril::bench::$txt); - } - b.iter(|| ::tendril::bench::index_words_string(&s)); - } - - #[bench] - fn index_words_big_tendril(b: &mut ::test::Bencher) { - let mut t = ::tendril::StrTendril::new(); - while t.len() < LARGE_SIZE { - t.push_slice(::tendril::bench::$txt); - } - b.iter(|| ::tendril::bench::index_words_tendril(&t)); - } - - #[test] - fn correctness() { - use std::borrow::ToOwned; - use tendril::bench::{index_words_string, index_words_tendril}; - use tendril::SliceExt; - - let txt = ::tendril::bench::$txt; - let input_string = txt.to_owned(); - let count_s = index_words_string(&input_string); - let mut keys: Vec = count_s.keys().cloned().collect(); - keys.sort(); - - let input_tendril = txt.to_tendril(); - let count_t = index_words_tendril(&input_tendril); - let mut keys_t: Vec = count_t.keys().cloned().collect(); - keys_t.sort(); - - assert_eq!(keys, keys_t); - - for k in &keys { - let vs = &count_s[k]; - let vt = &count_t[k]; - assert_eq!(vs.len(), vt.len()); - assert!(vs.iter().zip(vt.iter()).all(|(s, t)| **s == **t)); - } - } - } - }; - } - - bench!(EN_1); - bench!(EN_2); - bench!(KR_1); - bench!(HTML_KR_1); -} diff --git a/tendril/src/futf.rs b/tendril/src/futf.rs index 013e7ca6..5fac52d5 100644 --- a/tendril/src/futf.rs +++ b/tendril/src/futf.rs @@ -240,12 +240,11 @@ pub fn classify<'a>(buf: &'a [u8], idx: usize) -> Option> { } } -#[cfg(all(test, feature = "bench"))] +#[cfg(test)] mod tests { use super::{all_cont, classify, decode, Byte, Meaning}; use std::borrow::ToOwned; use std::io::Write; - use test::Bencher; #[test] fn classify_all_bytes() { @@ -507,59 +506,4 @@ mod tests { assert_eq!(None, classify(b"\xF0\x8F\xBF\xBF", i)); } } - - static TEXT: &'static str = " - All human beings are born free and equal in dignity and rights. - They are endowed with reason and conscience and should act - towards one another in a spirit of brotherhood. - - Minden emberi lény szabadon születik és egyenlő méltósága és - joga van. Az emberek, ésszel és lelkiismerettel bírván, - egymással szemben testvéri szellemben kell hogy viseltessenek. - - เราทุกคนเกิดมาอย่างอิสระ เราทุกคนมีความคิดและความเข้าใจเป็นของเราเอง - เราทุกคนควรได้รับการปฏิบัติในทางเดียวกัน. - - 모든 인간은 태어날 때부터 자유로우며 그 존엄과 권리에 있어 - 동등하다. 인간은 천부적으로 이성과 양심을 부여받았으며 서로 - 형제애의 정신으로 행동하여야 한다. - - ro remna cu se jinzi co zifre je simdu'i be le ry. nilselsi'a - .e lei ry. selcru .i ry. se menli gi'e se sezmarde .i .ei - jeseki'ubo ry. simyzu'e ta'i le tunba - - ᏂᎦᏓ ᎠᏂᏴᏫ ᏂᎨᎫᏓᎸᎾ ᎠᎴ ᎤᏂᏠᏱ ᎤᎾᏕᎿ ᏚᏳᎧᏛ ᎨᏒᎢ. 
ᎨᏥᏁᎳ ᎤᎾᏓᏅᏖᏗ ᎠᎴ ᎤᏃᏟᏍᏗ - ᎠᎴ ᏌᏊ ᎨᏒ ᏧᏂᎸᏫᏍᏓᏁᏗ ᎠᎾᏟᏅᏢ ᎠᏓᏅᏙ ᎬᏗ."; - - // random - static IXES: &'static [usize] = &[ - 778, 156, 87, 604, 1216, 365, 884, 311, 469, 515, 709, 162, 871, 206, 634, 442, - ]; - - static BOUNDARY: &'static [bool] = &[ - false, true, true, false, false, true, true, true, true, false, false, true, true, true, - false, false, - ]; - - #[bench] - fn std_utf8_check(b: &mut Bencher) { - b.iter(|| { - assert!(IXES - .iter() - .zip(BOUNDARY.iter()) - .all(|(&ix, &expect)| { expect == TEXT.is_char_boundary(ix) })); - }); - } - - // We don't expect to be as fast as is_char_boundary, because we provide more - // information. But we shouldn't be tremendously slower, either. A factor of - // 5-10 is expected on this text. - #[bench] - fn futf_check(b: &mut Bencher) { - b.iter(|| { - assert!(IXES.iter().zip(BOUNDARY.iter()).all(|(&ix, &expect)| { - expect == (classify(TEXT.as_bytes(), ix).unwrap().rewind == 0) - })); - }); - } } diff --git a/tendril/src/lib.rs b/tendril/src/lib.rs index a5c7a39b..2d9f8d10 100644 --- a/tendril/src/lib.rs +++ b/tendril/src/lib.rs @@ -4,7 +4,6 @@ // option. This file may not be copied, modified, or distributed // except according to those terms. -#![cfg_attr(all(test, feature = "bench"), feature(test))] //#![cfg_attr(test, deny(warnings))] #![allow(unnecessary_transmutes)] #![allow(bare_trait_objects)] @@ -40,8 +39,6 @@ extern crate debug_unreachable; pub extern crate encoding; #[cfg(feature = "encoding_rs")] pub extern crate encoding_rs; -#[cfg(all(test, feature = "bench"))] -extern crate test; #[macro_use] extern crate mac; extern crate utf8; @@ -56,9 +53,12 @@ pub mod fmt; pub mod stream; mod buf32; -mod futf; mod tendril; mod utf8_decode; mod util; +// Exposed for benchmarking purposes only +#[doc(hidden)] +pub mod futf; + static OFLOW: &'static str = "tendril: overflow in buffer arithmetic"; diff --git a/tendril/src/tendril.rs b/tendril/src/tendril.rs index d7561996..d5fbd7d2 100644 --- a/tendril/src/tendril.rs +++ b/tendril/src/tendril.rs @@ -1655,10 +1655,6 @@ where } } -#[cfg(all(test, feature = "bench"))] -#[path = "bench.rs"] -mod bench; - #[cfg(test)] mod test { use super::{ From 3b6b60c7fe9831b867194b0b341cf7778b60257b Mon Sep 17 00:00:00 2001 From: Nico Burns Date: Mon, 8 Sep 2025 20:08:36 +0100 Subject: [PATCH 5/5] Add tendril to RELEASING.MD Signed-off-by: Nico Burns --- RELEASING.MD | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/RELEASING.MD b/RELEASING.MD index fa5940be..816b1595 100644 --- a/RELEASING.MD +++ b/RELEASING.MD @@ -11,6 +11,8 @@ published to crates.io. The **web_atoms** crate is on a separate cycle as it needs frequent releases but these rarely contain breaking changes. +The **tendril** crate is on a separate cycle as it is a utility crate that is rarely updated. + ## Making a release of **web_atoms**: - Bump the version in `web_atoms/Cargo.toml` @@ -20,6 +22,13 @@ The **web_atoms** crate is on a separate cycle as it needs frequent releases but - Publish the new version of **web_atoms** - Optionally: publish a new version of the other crates to match +## Making a release of **tendril**: + +- Bump the version in `tendril/Cargo.toml` +- Update the version **tendril** in the workspace `Cargo.toml`'s `[workspace.dependencies]` section to match +- Publish the new version of **tendril** +- Optionally: publish a new version of the other crates to match + ## Making a release of all other crates In the workspace `Cargo.toml`: