Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,4 @@
* Impl `Display`, `Eq`, `PartialEq`, `Ord`, `PartialOrd`, and `Hash` for
the `Error` type.
* Switch to the 2021 edition.
* Add `ucs2_cstr!` macro.
109 changes: 71 additions & 38 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,13 @@
#![deny(missing_docs)]
#![deny(clippy::all)]

mod macros;

/// These need to be public for the `ucs2_cstr!` macro, but are not
/// intended to be called directly.
#[doc(hidden)]
pub use macros::{str_num_ucs2_chars, str_to_ucs2};

use bit_field::BitField;
use core::fmt::{self, Display, Formatter};

Expand All @@ -29,6 +36,66 @@ impl Display for Error {

type Result<T> = core::result::Result<T, Error>;

/// Value returned by `ucs2_from_utf8_at_offset`: one decoded character
/// together with the width of its UTF-8 encoding.
struct Ucs2CharFromUtf8 {
/// UCS-2 character.
val: u16,
/// Number of bytes needed to encode the character in UTF-8. Callers add
/// this to their input offset to step to the next character.
num_bytes: u8,
}

/// Get a UCS-2 character from a UTF-8 byte slice at the given offset.
///
/// # Safety
///
/// The input `bytes` must be valid UTF-8.
const unsafe fn ucs2_from_utf8_at_offset(bytes: &[u8], offset: usize) -> Result<Ucs2CharFromUtf8> {
let len = bytes.len();
let ch;
let ch_len;

if bytes[offset] & 0b1000_0000 == 0b0000_0000 {
ch = bytes[offset] as u16;
ch_len = 1;
} else if bytes[offset] & 0b1110_0000 == 0b1100_0000 {
// 2 byte codepoint
if offset + 1 >= len {
// safe: len is the length of bytes,
// and bytes is a direct view into the
// buffer of input, which in order to be a valid
// utf-8 string _must_ contain `i + 1`.
unsafe { core::hint::unreachable_unchecked() }
}

let a = (bytes[offset] & 0b0001_1111) as u16;
let b = (bytes[offset + 1] & 0b0011_1111) as u16;
ch = a << 6 | b;
ch_len = 2;
} else if bytes[offset] & 0b1111_0000 == 0b1110_0000 {
// 3 byte codepoint
if offset + 2 >= len || offset + 1 >= len {
// safe: impossible utf-8 string.
unsafe { core::hint::unreachable_unchecked() }
}

let a = (bytes[offset] & 0b0000_1111) as u16;
let b = (bytes[offset + 1] & 0b0011_1111) as u16;
let c = (bytes[offset + 2] & 0b0011_1111) as u16;
ch = a << 12 | b << 6 | c;
ch_len = 3;
} else if bytes[offset] & 0b1111_0000 == 0b1111_0000 {
return Err(Error::MultiByte); // UTF-16
} else {
// safe: impossible utf-8 string.
unsafe { core::hint::unreachable_unchecked() }
}

Ok(Ucs2CharFromUtf8 {
val: ch,
num_bytes: ch_len,
})
}

/// Encodes an input UTF-8 string into a UCS-2 string.
///
/// The returned `usize` represents the length of the returned buffer,
Expand Down Expand Up @@ -62,44 +129,10 @@ where
let mut i = 0;

while i < len {
let ch;

if bytes[i] & 0b1000_0000 == 0b0000_0000 {
ch = u16::from(bytes[i]);
i += 1;
} else if bytes[i] & 0b1110_0000 == 0b1100_0000 {
// 2 byte codepoint
if i + 1 >= len {
// safe: len is the length of bytes,
// and bytes is a direct view into the
// buffer of input, which in order to be a valid
// utf-8 string _must_ contain `i + 1`.
unsafe { core::hint::unreachable_unchecked() }
}

let a = u16::from(bytes[i] & 0b0001_1111);
let b = u16::from(bytes[i + 1] & 0b0011_1111);
ch = a << 6 | b;
i += 2;
} else if bytes[i] & 0b1111_0000 == 0b1110_0000 {
// 3 byte codepoint
if i + 2 >= len || i + 1 >= len {
// safe: impossible utf-8 string.
unsafe { core::hint::unreachable_unchecked() }
}

let a = u16::from(bytes[i] & 0b0000_1111);
let b = u16::from(bytes[i + 1] & 0b0011_1111);
let c = u16::from(bytes[i + 2] & 0b0011_1111);
ch = a << 12 | b << 6 | c;
i += 3;
} else if bytes[i] & 0b1111_0000 == 0b1111_0000 {
return Err(Error::MultiByte); // UTF-16
} else {
// safe: impossible utf-8 string.
unsafe { core::hint::unreachable_unchecked() }
}
output(ch)?;
// SAFETY: `bytes` is valid UTF-8.
let ch = unsafe { ucs2_from_utf8_at_offset(bytes, i) }?;
i += usize::from(ch.num_bytes);
output(ch.val)?;
}
Ok(())
}
Expand Down
126 changes: 126 additions & 0 deletions src/macros.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
use crate::{ucs2_from_utf8_at_offset, Error};

/// Count how many UCS-2 characters are needed to represent `s`.
///
/// # Errors
///
/// Passes through the error from `ucs2_from_utf8_at_offset` for the first
/// character of `s` that cannot be encoded in UCS-2.
pub const fn str_num_ucs2_chars(s: &str) -> Result<usize, Error> {
    let utf8 = s.as_bytes();

    let mut pos = 0;
    let mut count = 0;

    loop {
        if pos >= utf8.len() {
            return Ok(count);
        }

        // `?` is not available in a `const fn`, so propagate by hand.
        // SAFETY: `utf8` is the byte view of a `&str`, so it is valid UTF-8.
        let decoded = match unsafe { ucs2_from_utf8_at_offset(utf8, pos) } {
            Ok(decoded) => decoded,
            Err(err) => return Err(err),
        };

        pos += decoded.num_bytes as usize;
        count += 1;
    }
}

/// Convert a `str` into a null-terminated UCS-2 character array.
///
/// `N` must be exactly the number of UCS-2 characters in `s` plus one, the
/// extra element being the trailing null character.
///
/// # Errors
///
/// Passes through the error from `ucs2_from_utf8_at_offset` for the first
/// character of `s` that cannot be encoded in UCS-2.
///
/// # Panics
///
/// Panics with "interior null character" if `s` contains a null character,
/// and with "incorrect array length" if `N` does not match the converted
/// length plus one (whether too small or too large).
pub const fn str_to_ucs2<const N: usize>(s: &str) -> Result<[u16; N], Error> {
    let bytes = s.as_bytes();
    let len = bytes.len();

    // Zero-initialized, so the final element already is the null terminator.
    let mut output = [0; N];

    let mut output_offset = 0;
    let mut input_offset = 0;
    while input_offset < len {
        // `?` is not available in a `const fn`, so propagate by hand.
        // SAFETY: `bytes` is the byte view of a `&str`, so it is valid UTF-8.
        let ch = match unsafe { ucs2_from_utf8_at_offset(bytes, input_offset) } {
            Ok(ch) => ch,
            Err(err) => return Err(err),
        };

        if ch.val == 0 {
            panic!("interior null character");
        }

        // Report an undersized array with the same message as the final
        // length check, not a generic out-of-bounds index panic.
        if output_offset >= N {
            panic!("incorrect array length");
        }

        output[output_offset] = ch.val;
        output_offset += 1;
        input_offset += ch.num_bytes as usize;
    }

    // The output array must be one bigger than the converted string,
    // to leave room for the trailing null character.
    if output_offset + 1 != N {
        panic!("incorrect array length");
    }

    Ok(output)
}

/// Build a null-terminated UCS-2 array from a string literal at compile
/// time.
///
/// The macro expands to a `[u16; N]` value, where `N` is the number of
/// UCS-2 characters in the literal plus one for the trailing null. All of
/// the work happens in `const` items, so the result can initialize a
/// `const`. To avoid spelling out `N` there, borrow the array and store it
/// as `&[u16]`.
///
/// # Example
///
/// ```
/// use ucs2::ucs2_cstr;
///
/// const S: &[u16] = &ucs2_cstr!("abc");
/// assert_eq!(S, [97, 98, 99, 0]);
/// ```
#[macro_export]
macro_rules! ucs2_cstr {
    ($s:literal) => {{
        // Both steps are `const` items so that any failure surfaces while
        // compiling.

        // Array length: one element per UCS-2 character, plus the null.
        const LEN: usize = match $crate::str_num_ucs2_chars($s) {
            Err(_) => panic!("input contains a character which cannot be represented in UCS-2"),
            Ok(count) => count + 1,
        };

        const ENCODED: [u16; LEN] = match $crate::str_to_ucs2($s) {
            // `str_num_ucs2_chars` already accepted this string, so the
            // conversion cannot report an error here.
            Err(_) => unreachable!(),
            Ok(encoded) => encoded,
        };
        ENCODED
    }};
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Character counting for each UTF-8 sequence width.
    #[test]
    fn test_str_num_chars() {
        // Some of the strings here are from https://www.kermitproject.org/utf8.html.
        let cases: [(&str, Result<usize, Error>); 4] = [
            // One-byte chars.
            ("abc", Ok(3)),
            // Two-byte chars (plus ASCII spaces).
            ("Τη γλώσσα μου έδωσαν ελληνική", Ok(29)),
            // Three-byte chars (plus ASCII spaces).
            ("ვეპხის ტყაოსანი შოთა რუსთაველი", Ok(30)),
            // Four-byte chars are rejected.
            ("😎🔥", Err(Error::MultiByte)),
        ];

        for (input, expected) in cases {
            assert_eq!(str_num_ucs2_chars(input), expected);
        }
    }

    /// The macro output is the encoded text followed by a null.
    #[test]
    fn test_ucs2_cstr() {
        let expected: [u16; 4] = [97, 98, 99, 0];
        assert_eq!(ucs2_cstr!("abc"), expected);
    }
}
11 changes: 10 additions & 1 deletion tests/tests.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
use ucs2::{decode, decode_with, encode, Error};
use ucs2::{decode, decode_with, encode, ucs2_cstr, Error};

#[test]
fn encoding() {
Expand Down Expand Up @@ -64,3 +64,12 @@ fn decoding_with() {
assert_eq!(result.unwrap(), 9);
assert_eq!(core::str::from_utf8(&u8_buffer[0..9]), Ok("$¢ह한"));
}

/// `ucs2_cstr!` is usable in `const` items, both as a sized array and as a
/// borrowed slice.
#[test]
fn test_macro() {
    // The explicit `[u16; 4]` type also pins the array length the macro
    // produces: three characters plus the trailing null.
    const ASCII: [u16; 4] = ucs2_cstr!("abc");
    assert_eq!(ASCII, [97, 98, 99, 0]);

    // One-, two- and three-byte UTF-8 characters, stored without naming `N`.
    const MIXED: &[u16] = &ucs2_cstr!("$¢ह한");
    assert_eq!(MIXED, [36, 162, 2361, 54620, 0]);
}