Skip to content

Commit

Permalink
Auto merge of rust-lang#118484 - blyxxyz:os-str-slice, r=BurntSushi
Browse files Browse the repository at this point in the history
Add substring API for `OsStr`

This adds a method for taking a substring of an `OsStr`, which in combination with [`OsStr::as_encoded_bytes()`](https://doc.rust-lang.org/std/ffi/struct.OsStr.html#method.as_encoded_bytes) makes it possible to implement most string operations in safe code.

API:
```rust
impl OsStr {
    pub fn slice_encoded_bytes<R: ops::RangeBounds<usize>>(&self, range: R) -> &Self;
}
```
Motivation, examples and research at rust-lang/libs-team#306.

Tracking issue: rust-lang#118485

cc `@epage`
r? libs-api
  • Loading branch information
bors committed Dec 2, 2023
2 parents 3f1e30a + 729851e commit d5fab33
Show file tree
Hide file tree
Showing 3 changed files with 131 additions and 2 deletions.
82 changes: 80 additions & 2 deletions library/std/src/ffi/os_str.rs
Expand Up @@ -6,9 +6,10 @@ use crate::cmp;
use crate::collections::TryReserveError;
use crate::fmt;
use crate::hash::{Hash, Hasher};
use crate::ops;
use crate::ops::{self, Range};
use crate::rc::Rc;
use crate::str::FromStr;
use crate::slice;
use crate::str::{from_utf8 as str_from_utf8, FromStr};
use crate::sync::Arc;

use crate::sys::os_str::{Buf, Slice};
Expand Down Expand Up @@ -963,6 +964,83 @@ impl OsStr {
self.inner.as_encoded_bytes()
}

/// Returns the part of this `OsStr` selected by `range`, where `range` is
/// expressed as byte offsets into the slice returned by
/// [`OsStr::as_encoded_bytes`].
///
/// Both ends of the range must fall on a valid `OsStr` boundary, which is
/// any of the following positions:
/// - The start of the string
/// - The end of the string
/// - Immediately before a valid non-empty UTF-8 substring
/// - Immediately after a valid non-empty UTF-8 substring
///
/// # Panics
///
/// Panics if either end of `range` is not on a valid `OsStr` boundary, or
/// if `range` reaches past the end of the string.
///
/// # Example
///
/// ```
/// #![feature(os_str_slice)]
///
/// use std::ffi::OsStr;
///
/// let os_str = OsStr::new("foo=bar");
/// let bytes = os_str.as_encoded_bytes();
/// if let Some(index) = bytes.iter().position(|b| *b == b'=') {
///     let key = os_str.slice_encoded_bytes(..index);
///     let value = os_str.slice_encoded_bytes(index + 1..);
///     assert_eq!(key, "foo");
///     assert_eq!(value, "bar");
/// }
/// ```
#[unstable(feature = "os_str_slice", issue = "118485")]
pub fn slice_encoded_bytes<R: ops::RangeBounds<usize>>(&self, range: R) -> &Self {
    /// Panics unless `index` is a valid `OsStr` boundary inside `bytes`.
    ///
    /// Callers must pass `index <= bytes.len()`.
    #[track_caller]
    fn check_valid_boundary(bytes: &[u8], index: usize) {
        // The two ends of the string are boundaries by definition.
        if index == 0 || index == bytes.len() {
            return;
        }

        // Fast path: an ASCII byte on either side is itself a valid
        // one-byte UTF-8 substring touching `index`.
        if bytes[index - 1].is_ascii() || bytes[index].is_ascii() {
            return;
        }

        let (head, tail) = bytes.split_at(index);

        // A UTF-8 codepoint occupies at most 4 bytes, so looking at the
        // first 4 bytes after `index` is enough to tell whether valid
        // UTF-8 starts here.
        let window = &tail[..tail.len().min(4)];
        let utf8_starts_here = match str_from_utf8(window) {
            Ok(_) => true,
            // A non-zero valid prefix means at least one whole codepoint
            // begins at `index`.
            Err(err) => err.valid_up_to() != 0,
        };
        if utf8_starts_here {
            return;
        }

        // Otherwise look for a codepoint that ends exactly at `index`.
        // Length 1 was already ruled out by the ASCII fast path.
        let longest = index.min(4);
        let utf8_ends_here =
            (2..=longest).any(|len| str_from_utf8(&head[index - len..]).is_ok());
        if utf8_ends_here {
            return;
        }

        panic!("byte index {index} is not an OsStr boundary");
    }

    let bytes = self.as_encoded_bytes();
    // `slice::range` panics on out-of-bounds or inverted ranges, so after
    // this call `start <= end <= bytes.len()`.
    let bounds: Range<usize> = slice::range(range, ..bytes.len());
    check_valid_boundary(bytes, bounds.start);
    check_valid_boundary(bytes, bounds.end);

    // SAFETY: `slice::range` ensures that `bounds` lies within `bytes`.
    let selected = unsafe { bytes.get_unchecked(bounds) };

    // SAFETY: `selected` is a sub-slice of `self`'s own encoded bytes and
    // both of its ends were validated as `OsStr` boundaries above.
    unsafe { Self::from_encoded_bytes_unchecked(selected) }
}

/// Converts this string to its ASCII lower case equivalent in-place.
///
/// ASCII letters 'A' to 'Z' are mapped to 'a' to 'z',
Expand Down
50 changes: 50 additions & 0 deletions library/std/src/ffi/os_str/tests.rs
Expand Up @@ -177,3 +177,53 @@ fn into_rc() {
assert_eq!(&*rc2, os_str);
assert_eq!(&*arc2, os_str);
}

/// Slicing on codepoint boundaries yields the expected substring for
/// encodings of every UTF-8 width (1 to 4 bytes).
#[test]
fn slice_encoded_bytes() {
    // Byte layout: "123" = 0..3, "θ" = 3..5, "გ" = 5..8, "🦀" = 8..12.
    let sample = OsStr::new("123θგ🦀");

    // ASCII
    assert_eq!(sample.slice_encoded_bytes(..3), "123");
    assert_eq!(sample.slice_encoded_bytes(2..3), "3");

    // 2-byte UTF-8
    assert_eq!(sample.slice_encoded_bytes(3..5), "θ");

    // 3-byte UTF-8
    assert_eq!(sample.slice_encoded_bytes(5..8), "გ");

    // 4-byte UTF-8
    assert_eq!(sample.slice_encoded_bytes(8..), "🦀");
}

/// Cutting a 4-byte codepoint in half must be rejected with a panic.
#[test]
#[should_panic(expected = "byte index 2 is not an OsStr boundary")]
fn slice_mid_char() {
    // Byte offset 2 falls in the middle of the crab's 4-byte encoding.
    let _ = OsStr::new("🦀").slice_encoded_bytes(..2);
}

/// On Windows, the position between two adjacent unpaired surrogates is
/// not a valid boundary, so slicing there must panic.
#[cfg(windows)]
#[test]
#[should_panic(expected = "byte index 3 is not an OsStr boundary")]
fn slice_between_surrogates() {
    use crate::os::windows::ffi::OsStringExt;

    let surrogates = OsString::from_wide(&[0xD800, 0xD800]);
    // Each unpaired surrogate is encoded as three bytes, so offset 3 sits
    // exactly between the two of them.
    assert_eq!(surrogates.as_encoded_bytes(), &[0xED, 0xA0, 0x80, 0xED, 0xA0, 0x80]);
    let _ = surrogates.slice_encoded_bytes(..3);
}

/// On Windows, the position between an unpaired surrogate and a following
/// valid UTF-8 codepoint is a valid boundary, and slicing on it recovers
/// both halves.
#[cfg(windows)]
#[test]
fn slice_surrogate_edge() {
    use crate::os::windows::ffi::OsStringExt;

    let lone_surrogate = OsString::from_wide(&[0xD800]);
    let mut combined = lone_surrogate.clone();
    combined.push("🦀");

    // The split point is at byte offset 3, with the crab starting there.
    assert_eq!(combined.slice_encoded_bytes(..3), lone_surrogate);
    assert_eq!(combined.slice_encoded_bytes(3..), "🦀");
}
1 change: 1 addition & 0 deletions library/std/src/lib.rs
Expand Up @@ -341,6 +341,7 @@
#![feature(round_ties_even)]
#![feature(slice_internals)]
#![feature(slice_ptr_get)]
#![feature(slice_range)]
#![feature(std_internals)]
#![feature(str_internals)]
#![feature(strict_provenance)]
Expand Down

0 comments on commit d5fab33

Please sign in to comment.