Skip to content

Commit

Permalink
auto merge of #15777 : SimonSapin/rust/pub-ascii-maps, r=alexcrichton
Browse files Browse the repository at this point in the history
When dealing with HTTP request or responses, many tokens are case-insensitive in the ASCII range but the bytes from the network are not necessarily valid UTF-8.

**[breaking-change]** Rather than adding new very similar traits, this re-uses the `std::ascii::OwnedStrAsciiExt` and `std::ascii::StrAsciiExt` traits, but rename to remove `Str` since that does not apply for bytes.

This PR also makes `std::ascii::ASCII_UPPER_MAP` and `std::ascii::ASCII_LOWER_MAP`, the lookup table all these methods are based on, public. In case there is something else related to ASCII case we haven’t thought of yet, that can be implemented outside of libstd without duplicating the tables.

Although this is a breaking change, I thought this could do without an RFC since the relevant traits are not in the prelude.

r? @alexcrichton
  • Loading branch information
bors committed Jul 30, 2014
2 parents 774d5eb + 235bb3f commit 692077b
Show file tree
Hide file tree
Showing 4 changed files with 98 additions and 61 deletions.
2 changes: 1 addition & 1 deletion src/liblog/directive.rs
Expand Up @@ -8,7 +8,7 @@
// option. This file may not be copied, modified, or distributed
// except according to those terms.

use std::ascii::StrAsciiExt;
use std::ascii::AsciiExt;
use std::cmp;

#[deriving(Show, Clone)]
Expand Down
2 changes: 1 addition & 1 deletion src/librustc/lint/mod.rs
Expand Up @@ -31,7 +31,7 @@
#![macro_escape]

use std::hash;
use std::ascii::StrAsciiExt;
use std::ascii::AsciiExt;
use syntax::codemap::Span;
use syntax::visit::FnKind;
use syntax::ast;
Expand Down
153 changes: 95 additions & 58 deletions src/libstd/ascii.rs
Expand Up @@ -20,11 +20,19 @@ use iter::Iterator;
use mem;
use option::{Option, Some, None};
use slice::{ImmutableVector, MutableVector, Vector};
use str::{Str, StrAllocating, StrSlice};
use str::{Str, StrSlice};
use str;
use string::String;
use to_string::IntoStr;
use vec::Vec;

#[deprecated="this trait has been renamed to `AsciiExt`"]
pub use StrAsciiExt = self::AsciiExt;

#[deprecated="this trait has been renamed to `OwnedAsciiExt`"]
pub use OwnedStrAsciiExt = self::OwnedAsciiExt;


/// Datatype to hold one ascii character. It wraps a `u8`, with the highest bit always zero.
#[deriving(Clone, PartialEq, PartialOrd, Ord, Eq, Hash)]
pub struct Ascii { chr: u8 }
Expand Down Expand Up @@ -366,108 +374,133 @@ impl IntoBytes for Vec<Ascii> {
}
}


/// Extension methods for ASCII-subset only operations on owned strings
pub trait OwnedStrAsciiExt {
pub trait OwnedAsciiExt {
/// Convert the string to ASCII upper case:
/// ASCII letters 'a' to 'z' are mapped to 'A' to 'Z',
/// but non-ASCII letters are unchanged.
fn into_ascii_upper(self) -> String;
fn into_ascii_upper(self) -> Self;

/// Convert the string to ASCII lower case:
/// ASCII letters 'A' to 'Z' are mapped to 'a' to 'z',
/// but non-ASCII letters are unchanged.
fn into_ascii_lower(self) -> String;
fn into_ascii_lower(self) -> Self;
}

/// Extension methods for ASCII-subset only operations on string slices
pub trait StrAsciiExt {
pub trait AsciiExt<T> {
/// Makes a copy of the string in ASCII upper case:
/// ASCII letters 'a' to 'z' are mapped to 'A' to 'Z',
/// but non-ASCII letters are unchanged.
fn to_ascii_upper(&self) -> String;
fn to_ascii_upper(&self) -> T;

/// Makes a copy of the string in ASCII lower case:
/// ASCII letters 'A' to 'Z' are mapped to 'a' to 'z',
/// but non-ASCII letters are unchanged.
fn to_ascii_lower(&self) -> String;
fn to_ascii_lower(&self) -> T;

/// Check that two strings are an ASCII case-insensitive match.
/// Same as `to_ascii_lower(a) == to_ascii_lower(b)`,
/// but without allocating and copying temporary strings.
fn eq_ignore_ascii_case(&self, other: &str) -> bool;
fn eq_ignore_ascii_case(&self, other: Self) -> bool;
}

impl<'a> StrAsciiExt for &'a str {
impl<'a> AsciiExt<String> for &'a str {
#[inline]
fn to_ascii_upper(&self) -> String {
unsafe { str_copy_map_bytes(*self, ASCII_UPPER_MAP) }
// Vec<u8>::to_ascii_upper() preserves the UTF-8 invariant.
unsafe { str::raw::from_utf8_owned(self.as_bytes().to_ascii_upper()) }
}

#[inline]
fn to_ascii_lower(&self) -> String {
unsafe { str_copy_map_bytes(*self, ASCII_LOWER_MAP) }
// Vec<u8>::to_ascii_lower() preserves the UTF-8 invariant.
unsafe { str::raw::from_utf8_owned(self.as_bytes().to_ascii_lower()) }
}

#[inline]
fn eq_ignore_ascii_case(&self, other: &str) -> bool {
self.len() == other.len() &&
self.as_bytes().iter().zip(other.as_bytes().iter()).all(
|(byte_self, byte_other)| {
ASCII_LOWER_MAP[*byte_self as uint] ==
ASCII_LOWER_MAP[*byte_other as uint]
})
self.as_bytes().eq_ignore_ascii_case(other.as_bytes())
}
}

impl OwnedStrAsciiExt for String {
impl OwnedAsciiExt for String {
#[inline]
fn into_ascii_upper(self) -> String {
unsafe { str_map_bytes(self, ASCII_UPPER_MAP) }
// Vec<u8>::into_ascii_upper() preserves the UTF-8 invariant.
unsafe { str::raw::from_utf8_owned(self.into_bytes().into_ascii_upper()) }
}

#[inline]
fn into_ascii_lower(self) -> String {
unsafe { str_map_bytes(self, ASCII_LOWER_MAP) }
// Vec<u8>::into_ascii_lower() preserves the UTF-8 invariant.
unsafe { str::raw::from_utf8_owned(self.into_bytes().into_ascii_lower()) }
}
}

#[inline]
unsafe fn str_map_bytes(string: String, map: &'static [u8]) -> String {
let mut bytes = string.into_bytes();
impl<'a> AsciiExt<Vec<u8>> for &'a [u8] {
#[inline]
fn to_ascii_upper(&self) -> Vec<u8> {
self.iter().map(|&byte| ASCII_UPPER_MAP[byte as uint]).collect()
}

for b in bytes.mut_iter() {
*b = map[*b as uint];
#[inline]
fn to_ascii_lower(&self) -> Vec<u8> {
self.iter().map(|&byte| ASCII_LOWER_MAP[byte as uint]).collect()
}

String::from_utf8(bytes).unwrap()
#[inline]
fn eq_ignore_ascii_case(&self, other: &[u8]) -> bool {
self.len() == other.len() &&
self.iter().zip(other.iter()).all(
|(byte_self, byte_other)| {
ASCII_LOWER_MAP[*byte_self as uint] ==
ASCII_LOWER_MAP[*byte_other as uint]
})
}
}

#[inline]
unsafe fn str_copy_map_bytes(string: &str, map: &'static [u8]) -> String {
let mut s = String::from_str(string);
for b in s.as_mut_bytes().mut_iter() {
*b = map[*b as uint];
impl OwnedAsciiExt for Vec<u8> {
#[inline]
fn into_ascii_upper(mut self) -> Vec<u8> {
for byte in self.mut_iter() {
*byte = ASCII_UPPER_MAP[*byte as uint];
}
self
}

#[inline]
fn into_ascii_lower(mut self) -> Vec<u8> {
for byte in self.mut_iter() {
*byte = ASCII_LOWER_MAP[*byte as uint];
}
self
}
s.into_string()
}

static ASCII_LOWER_MAP: &'static [u8] = &[

pub static ASCII_LOWER_MAP: [u8, ..256] = [
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f,
b' ', b'!', b'"', b'#', b'$', b'%', b'&', b'\'',
b'(', b')', b'*', b'+', b',', b'-', b'.', b'/',
b'0', b'1', b'2', b'3', b'4', b'5', b'6', b'7',
b'8', b'9', b':', b';', b'<', b'=', b'>', b'?',
b'@',

b'a', b'b', b'c', b'd', b'e', b'f', b'g',
b'h', b'i', b'j', b'k', b'l', b'm', b'n', b'o',
b'p', b'q', b'r', b's', b't', b'u', b'v', b'w',
b'x', b'y', b'z',

b'[', b'\\', b']', b'^', b'_',
b'`', b'a', b'b', b'c', b'd', b'e', b'f', b'g',
b'h', b'i', b'j', b'k', b'l', b'm', b'n', b'o',
b'p', b'q', b'r', b's', b't', b'u', b'v', b'w',
b'x', b'y', b'z', b'{', b'|', b'}', b'~', 0x7f,
0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
Expand All @@ -486,23 +519,27 @@ static ASCII_LOWER_MAP: &'static [u8] = &[
0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff,
];

static ASCII_UPPER_MAP: &'static [u8] = &[
pub static ASCII_UPPER_MAP: [u8, ..256] = [
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,
0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,
0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,
0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,
0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f,
b' ', b'!', b'"', b'#', b'$', b'%', b'&', b'\'',
b'(', b')', b'*', b'+', b',', b'-', b'.', b'/',
b'0', b'1', b'2', b'3', b'4', b'5', b'6', b'7',
b'8', b'9', b':', b';', b'<', b'=', b'>', b'?',
b'@', b'A', b'B', b'C', b'D', b'E', b'F', b'G',
b'H', b'I', b'J', b'K', b'L', b'M', b'N', b'O',
b'P', b'Q', b'R', b'S', b'T', b'U', b'V', b'W',
b'X', b'Y', b'Z', b'[', b'\\', b']', b'^', b'_',
b'`',

b'A', b'B', b'C', b'D', b'E', b'F', b'G',
b'H', b'I', b'J', b'K', b'L', b'M', b'N', b'O',
b'P', b'Q', b'R', b'S', b'T', b'U', b'V', b'W',
b'X', b'Y', b'Z',

b'{', b'|', b'}', b'~', 0x7f,
0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
Expand Down
2 changes: 1 addition & 1 deletion src/test/run-pass/issue-10683.rs
Expand Up @@ -8,7 +8,7 @@
// option. This file may not be copied, modified, or distributed
// except according to those terms.

use std::ascii::StrAsciiExt;
use std::ascii::AsciiExt;

static NAME: &'static str = "hello world";

Expand Down

0 comments on commit 692077b

Please sign in to comment.