rust-osdev · phip1611 · Apr 2, 2024 · Mar 24, 2024 · Mar 24, 2024
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -3,3 +3,4 @@
 * Impl `Display`, `Eq`, `PartialEq`, `Ord`, `PartialOrd`, and `Hash` for
   the `Error` type.
 * Switch to the 2021 edition.
+* Add `ucs2_cstr!` macro.
diff --git a/src/lib.rs b/src/lib.rs
@@ -4,6 +4,13 @@
 #![deny(missing_docs)]
 #![deny(clippy::all)]
 
+mod macros;
+
+/// These need to be public for the `ucs2_cstr!` macro, but are not
+/// intended to be called directly.
+#[doc(hidden)]
+pub use macros::{str_num_ucs2_chars, str_to_ucs2};
+
 use bit_field::BitField;
 use core::fmt::{self, Display, Formatter};
 
@@ -29,6 +36,66 @@ impl Display for Error {
 
 type Result<T> = core::result::Result<T, Error>;
 
+/// A single UCS-2 character decoded by `ucs2_from_utf8_at_offset`.
+struct Ucs2CharFromUtf8 {
+    /// The decoded UCS-2 character value.
+    val: u16,
+    /// Length in bytes of the character's UTF-8 encoding in the input.
+    num_bytes: u8,
+}
+
+/// Get a UCS-2 character from a UTF-8 byte slice at the given offset.
+///
+/// # Safety
+///
+/// `bytes` must be valid UTF-8 and `offset` must index the first byte of a character in it.
+const unsafe fn ucs2_from_utf8_at_offset(bytes: &[u8], offset: usize) -> Result<Ucs2CharFromUtf8> {
+    let len = bytes.len();
+    let ch;
+    let ch_len;
+
+    if bytes[offset] & 0b1000_0000 == 0b0000_0000 {
+        ch = bytes[offset] as u16;
+        ch_len = 1;
+    } else if bytes[offset] & 0b1110_0000 == 0b1100_0000 {
+        // Two-byte sequence: 110xxxxx 10xxxxxx.
+        if offset + 1 >= len {
+            // SAFETY: the caller guarantees that `bytes` is valid
+            // UTF-8 and that `offset` is the start of a character,
+            // so the continuation byte at `offset + 1` must exist
+            // and this branch can never be taken.
+            unsafe { core::hint::unreachable_unchecked() }
+        }
+
+        let a = (bytes[offset] & 0b0001_1111) as u16;
+        let b = (bytes[offset + 1] & 0b0011_1111) as u16;
+        ch = a << 6 | b; // 5 payload bits from the lead byte, 6 from the continuation byte
+        ch_len = 2;
+    } else if bytes[offset] & 0b1111_0000 == 0b1110_0000 {
+        // Three-byte sequence: 1110xxxx 10xxxxxx 10xxxxxx.
+        if offset + 2 >= len || offset + 1 >= len {
+            // SAFETY: valid UTF-8 always contains both continuation bytes.
+            unsafe { core::hint::unreachable_unchecked() }
+        }
+
+        let a = (bytes[offset] & 0b0000_1111) as u16;
+        let b = (bytes[offset + 1] & 0b0011_1111) as u16;
+        let c = (bytes[offset + 2] & 0b0011_1111) as u16;
+        ch = a << 12 | b << 6 | c; // 4 + 6 + 6 payload bits
+        ch_len = 3;
+    } else if bytes[offset] & 0b1111_0000 == 0b1111_0000 {
+        return Err(Error::MultiByte); // four-byte sequence: code point does not fit in UCS-2
+    } else {
+        // SAFETY: a continuation byte cannot start a character in valid UTF-8.
+        unsafe { core::hint::unreachable_unchecked() }
+    }
+
+    Ok(Ucs2CharFromUtf8 {
+        val: ch,
+        num_bytes: ch_len,
+    })
+}
+
 /// Encodes an input UTF-8 string into a UCS-2 string.
 ///
 /// The returned `usize` represents the length of the returned buffer,
@@ -62,44 +129,10 @@ where
     let mut i = 0;
 
     while i < len {
-        let ch;
-
-        if bytes[i] & 0b1000_0000 == 0b0000_0000 {
-            ch = u16::from(bytes[i]);
-            i += 1;
-        } else if bytes[i] & 0b1110_0000 == 0b1100_0000 {
-            // 2 byte codepoint
-            if i + 1 >= len {
-                // safe: len is the length of bytes,
-                // and bytes is a direct view into the
-                // buffer of input, which in order to be a valid
-                // utf-8 string _must_ contain `i + 1`.
-                unsafe { core::hint::unreachable_unchecked() }
-            }
-
-            let a = u16::from(bytes[i] & 0b0001_1111);
-            let b = u16::from(bytes[i + 1] & 0b0011_1111);
-            ch = a << 6 | b;
-            i += 2;
-        } else if bytes[i] & 0b1111_0000 == 0b1110_0000 {
-            // 3 byte codepoint
-            if i + 2 >= len || i + 1 >= len {
-                // safe: impossible utf-8 string.
-                unsafe { core::hint::unreachable_unchecked() }
-            }
-
-            let a = u16::from(bytes[i] & 0b0000_1111);
-            let b = u16::from(bytes[i + 1] & 0b0011_1111);
-            let c = u16::from(bytes[i + 2] & 0b0011_1111);
-            ch = a << 12 | b << 6 | c;
-            i += 3;
-        } else if bytes[i] & 0b1111_0000 == 0b1111_0000 {
-            return Err(Error::MultiByte); // UTF-16
-        } else {
-            // safe: impossible utf-8 string.
-            unsafe { core::hint::unreachable_unchecked() }
-        }
-        output(ch)?;
+        // SAFETY: `bytes` is valid UTF-8.
+        let ch = unsafe { ucs2_from_utf8_at_offset(bytes, i) }?;
+        i += usize::from(ch.num_bytes);
+        output(ch.val)?;
     }
     Ok(())
 }

diff --git a/src/macros.rs b/src/macros.rs
@@ -0,0 +1,126 @@
+use crate::{ucs2_from_utf8_at_offset, Error};
+
+/// Count the UCS-2 characters needed to encode `s` (no null terminator
+/// included). Returns an error if `s` cannot be encoded in UCS-2.
+pub const fn str_num_ucs2_chars(s: &str) -> Result<usize, Error> {
+    let bytes = s.as_bytes();
+    let len = bytes.len();
+
+    let mut offset = 0;
+    let mut num_ucs2_chars = 0;
+
+    while offset < len {
+        // SAFETY: `bytes` is from a `str`; `offset` is always on a char boundary.
+        match unsafe { ucs2_from_utf8_at_offset(bytes, offset) } {
+            Ok(ch) => {
+                offset += ch.num_bytes as usize;
+                num_ucs2_chars += 1;
+            }
+            Err(err) => {
+                return Err(err);
+            }
+        }
+    }
+
+    Ok(num_ucs2_chars)
+}
+
+/// Convert a `str` into a null-terminated UCS-2 array. Panics on interior nulls or a wrong `N`.
+pub const fn str_to_ucs2<const N: usize>(s: &str) -> Result<[u16; N], Error> {
+    let bytes = s.as_bytes();
+    let len = bytes.len();
+
+    let mut output = [0; N]; // zero-filled, so the final element already holds the terminator
+
+    let mut output_offset = 0;
+    let mut input_offset = 0;
+    while input_offset < len {
+        // SAFETY: `bytes` is from a `str`; `input_offset` is always on a char boundary.
+        match unsafe { ucs2_from_utf8_at_offset(bytes, input_offset) } {
+            Ok(ch) => {
+                if ch.val == 0 {
+                    panic!("interior null character");
+                } else {
+                    output[output_offset] = ch.val;
+                    output_offset += 1;
+                    input_offset += ch.num_bytes as usize;
+                }
+            }
+            Err(err) => {
+                return Err(err);
+            }
+        }
+    }
+
+    // The output array must be exactly one element longer than the
+    // converted string, leaving room for the trailing null character.
+    if output_offset + 1 != N {
+        panic!("incorrect array length");
+    }
+
+    Ok(output)
+}
+
+/// Encode a string as UCS-2 with a trailing null character.
+///
+/// Encoding happens at compile time, so the result can initialize a
+/// `const` item and invalid input is a compile error. The macro returns
+/// a `[u16; N]` array; to avoid spelling out `N` in a `const` item,
+/// take a reference and store it as `&[u16]`.
+///
+/// # Example
+///
+/// ```
+/// use ucs2::ucs2_cstr;
+///
+/// const S: &[u16] = &ucs2_cstr!("abc");
+/// assert_eq!(S, [97, 98, 99, 0]);
+/// ```
+#[macro_export]
+macro_rules! ucs2_cstr {
+    ($s:literal) => {{
+        // Use `const` items here so that any failure is reported at
+        // compile time rather than at run time.
+
+        const NUM_CHARS: usize = match $crate::str_num_ucs2_chars($s) {
+            // Add one for the trailing null character.
+            Ok(num) => num + 1,
+            Err(_) => panic!("input contains a character which cannot be represented in UCS-2"),
+        };
+
+        const VAL: [u16; NUM_CHARS] = match $crate::str_to_ucs2($s) {
+            Ok(val) => val,
+            // The string was already checked by `str_num_ucs2_chars`,
+            // so this error is unreachable.
+            Err(_) => {
+                unreachable!();
+            }
+        };
+        VAL
+    }};
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_str_num_chars() {
+        // Some of the strings here are from https://www.kermitproject.org/utf8.html.
+
+        // One-byte (ASCII) chars.
+        assert_eq!(str_num_ucs2_chars("abc"), Ok(3));
+        // Two-byte chars (Greek).
+        assert_eq!(str_num_ucs2_chars("Τη γλώσσα μου έδωσαν ελληνική"), Ok(29));
+        // Three-byte chars (Georgian).
+        assert_eq!(str_num_ucs2_chars("ვეპხის ტყაოსანი შოთა რუსთაველი"), Ok(30));
+        // Four-byte chars cannot be represented in UCS-2.
+        assert_eq!(str_num_ucs2_chars("😎🔥"), Err(Error::MultiByte));
+    }
+
+    #[test]
+    fn test_ucs2_cstr() {
+        let s = ucs2_cstr!("abc");
+        assert_eq!(s, [97, 98, 99, 0]); // "abc" followed by the null terminator
+    }
+}
diff --git a/tests/tests.rs b/tests/tests.rs
@@ -1,4 +1,4 @@
-use ucs2::{decode, decode_with, encode, Error};
+use ucs2::{decode, decode_with, encode, ucs2_cstr, Error};
 
 #[test]
 fn encoding() {
@@ -64,3 +64,12 @@ fn decoding_with() {
     assert_eq!(result.unwrap(), 9);
     assert_eq!(core::str::from_utf8(&u8_buffer[0..9]), Ok("$¢ह한"));
 }
+
+#[test]
+fn test_macro() {
+    const S1: [u16; 4] = ucs2_cstr!("abc"); // array form: the length counts the trailing null
+    const S2: &[u16] = &ucs2_cstr!("$¢ह한"); // slice form: no need to spell out the length
+
+    assert_eq!(S1, [97, 98, 99, 0]);
+    assert_eq!(S2, [36, 162, 2361, 54620, 0]);
+}