Skip to content

Commit

Permalink
Remove dependency on the utf-8 crate
Browse files Browse the repository at this point in the history
I’m archiving that repository, so vendor a copy of the relevant code instead.
  • Loading branch information
SimonSapin committed Jan 21, 2023
1 parent f882b60 commit 0cd5c72
Show file tree
Hide file tree
Showing 3 changed files with 136 additions and 2 deletions.
1 change: 0 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@ mac = "0.1"
encoding = {version = "0.2", optional = true}
encoding_rs = {version = "0.8.12", optional = true}
futf = "0.1.5"
utf-8 = "0.7"

[dev-dependencies]
rand = "0.4"
Expand Down
2 changes: 1 addition & 1 deletion src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@ extern crate test;
#[macro_use]
extern crate mac;
extern crate futf;
extern crate utf8;

pub use fmt::Format;
pub use stream::TendrilSink;
Expand All @@ -31,5 +30,6 @@ mod buf32;
mod tendril;
mod utf8_decode;
mod util;
mod utf8;

static OFLOW: &'static str = "tendril: overflow in buffer arithmetic";
135 changes: 135 additions & 0 deletions src/utf8.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
use std::cmp;
use std::str;

/// U+FFFD REPLACEMENT CHARACTER, as a string slice.
///
/// Lossy decoders emit it once for every decoding error they encounter.
pub const REPLACEMENT_CHARACTER: &str = "\u{FFFD}";

/// Reason why `decode()` could not hand back its whole input as a `&str`.
#[derive(Clone, Copy, Debug)]
pub enum DecodeError<'a> {
    /// The input contains bytes that are not valid UTF-8.
    ///
    /// For lossy decoding: emit `valid_prefix`, then `"\u{FFFD}"`,
    /// and then call `decode()` again on `remaining_input`.
    Invalid {
        /// Well-formed text found before the invalid bytes.
        valid_prefix: &'a str,
        /// The bytes that were rejected.
        invalid_sequence: &'a [u8],
        /// Everything after the rejected bytes; not yet decoded.
        remaining_input: &'a [u8],
    },

    /// The input stops in the middle of a byte sequence.
    ///
    /// When more input becomes available, pass it to `incomplete_suffix.try_complete`.
    /// When there is no more input, the suffix is an invalid byte sequence.
    Incomplete {
        /// Well-formed text found before the unfinished sequence.
        valid_prefix: &'a str,
        /// The unfinished trailing bytes, copied so they outlive the input.
        incomplete_suffix: Incomplete,
    },
}

/// The first bytes of a byte sequence whose remaining bytes have not arrived yet.
///
/// Feed further input to `try_complete` to finish (or reject) the sequence.
#[derive(Clone, Copy, Debug)]
pub struct Incomplete {
    /// Backing storage; only the first `buffer_len` bytes are meaningful.
    pub buffer: [u8; 4],
    /// Number of bytes of `buffer` currently in use.
    pub buffer_len: u8,
}

/// Decodes `input` as UTF-8 without copying.
///
/// Returns the whole input as a `&str` when it is entirely well-formed.
/// Otherwise returns a `DecodeError` holding the well-formed prefix together with
/// either the invalid bytes and the undecoded rest (`Invalid`), or a copy of the
/// unfinished trailing bytes (`Incomplete`).
pub fn decode(input: &[u8]) -> Result<&str, DecodeError> {
    let utf8_error = match str::from_utf8(input) {
        Ok(text) => return Ok(text),
        Err(utf8_error) => utf8_error,
    };

    // FIXME: move the error path below into its own function to guide inlining?
    let good_len = utf8_error.valid_up_to();
    // SAFETY: `valid_up_to()` is the length of the prefix of `input` that
    // `str::from_utf8` has just verified to be valid UTF-8.
    let valid_prefix = unsafe { str::from_utf8_unchecked(&input[..good_len]) };
    let tail = &input[good_len..];

    Err(match utf8_error.error_len() {
        // A definite error: `bad_len` bytes at the start of `tail` are invalid.
        Some(bad_len) => DecodeError::Invalid {
            valid_prefix,
            invalid_sequence: &tail[..bad_len],
            remaining_input: &tail[bad_len..],
        },
        // The input ended in the middle of a sequence; keep those bytes for later.
        None => DecodeError::Incomplete {
            valid_prefix,
            incomplete_suffix: Incomplete::new(tail),
        },
    })
}

impl Incomplete {
    /// Stores `bytes` as the pending start of a byte sequence.
    ///
    /// Panics if `bytes` does not fit in the four-byte buffer.
    fn new(bytes: &[u8]) -> Self {
        let count = bytes.len();
        let mut storage = [0u8; 4];
        storage[..count].copy_from_slice(bytes);
        Incomplete {
            buffer: storage,
            buffer_len: count as u8,
        }
    }

    /// Tries to finish the pending sequence using the start of `input`.
    ///
    /// * `None`: still incomplete; call `try_complete` again with more input.
    ///   If no more input is available, this is an invalid byte sequence.
    /// * `Some((result, remaining_input))`: this `Incomplete` is finished and its
    ///   buffer has been emptied. `result` is either the decoded text or the
    ///   rejected bytes. To keep decoding, pass `remaining_input` to `decode()`.
    pub fn try_complete<'input>(
        &mut self,
        input: &'input [u8],
    ) -> Option<(Result<&str, &[u8]>, &'input [u8])> {
        let (used, outcome) = self.try_complete_offsets(input);
        let verdict = match outcome {
            Some(verdict) => verdict,
            None => return None,
        };
        let leftover = &input[used..];
        let bytes = self.take_buffer();
        let decoded = if verdict.is_ok() {
            // SAFETY: on `Ok(())`, `try_complete_offsets` set `buffer_len` to a
            // length that `str::from_utf8` validated, and `take_buffer` returns
            // exactly that many bytes.
            Ok(unsafe { str::from_utf8_unchecked(bytes) })
        } else {
            Err(bytes)
        };
        Some((decoded, leftover))
    }

    /// Returns the bytes currently in use and marks the buffer as empty.
    fn take_buffer(&mut self) -> &[u8] {
        let filled = usize::from(self.buffer_len);
        self.buffer_len = 0;
        &self.buffer[..filled]
    }

    /// Tops the buffer up from `input` and classifies its contents.
    ///
    /// Returns `(consumed_from_input, outcome)`, where `outcome` is:
    ///
    /// * `None`: not enough input yet
    /// * `Some(Err(()))`: the buffer now holds the bytes of an error
    /// * `Some(Ok(()))`: the buffer now holds a UTF-8 string
    fn try_complete_offsets(&mut self, input: &[u8]) -> (usize, Option<Result<(), ()>>) {
        let pending = self.buffer_len as usize;

        // Copy as many input bytes as fit after the bytes already pending.
        let taken = {
            let free = &mut self.buffer[pending..];
            let n = cmp::min(free.len(), input.len());
            free[..n].copy_from_slice(&input[..n]);
            n
        };
        let filled = pending + taken;

        // Work out (bytes to keep in the buffer, bytes consumed from input, outcome).
        let (kept, consumed, outcome) = match str::from_utf8(&self.buffer[..filled]) {
            // Everything in the buffer decodes.
            Ok(_) => (filled, taken, Some(Ok(()))),
            Err(problem) => match (problem.valid_up_to(), problem.error_len()) {
                // Nothing decodable and no definite error: need more input.
                (0, None) => (filled, taken, None),
                // The buffer starts with a definite error of `bad_len` bytes.
                (0, Some(bad_len)) => {
                    (bad_len, bad_len.checked_sub(pending).unwrap(), Some(Err(())))
                }
                // A valid prefix exists: keep only it; the rest stays in `input`.
                (good_len, _) => {
                    (good_len, good_len.checked_sub(pending).unwrap(), Some(Ok(())))
                }
            },
        };
        self.buffer_len = kept as u8;
        (consumed, outcome)
    }
}

0 comments on commit 0cd5c72

Please sign in to comment.