From 4acc5e6480e6eada1ad8d867ea31de9b998216a8 Mon Sep 17 00:00:00 2001 From: Nicholas Nethercote Date: Mon, 11 Dec 2023 16:35:43 +1100 Subject: [PATCH 1/5] Don't rebuild raw strings when unescaping. Raw strings don't have escape sequences, so for them "unescaping" just means checking for invalid chars like bare CR. Which means there is no need to rebuild them one char or byte at a time while escaping, because the unescaped version will be the same. This commit removes that rebuilding. Also, the commit changes things so that "unescaping" is unconditional for raw strings and raw byte strings. That's simpler and they're rare enough that the perf effect is negligible. --- compiler/rustc_ast/src/util/literal.rs | 73 +++++++++++--------------- 1 file changed, 30 insertions(+), 43 deletions(-) diff --git a/compiler/rustc_ast/src/util/literal.rs b/compiler/rustc_ast/src/util/literal.rs index 50eb92125b945..76ddb6a39389f 100644 --- a/compiler/rustc_ast/src/util/literal.rs +++ b/compiler/rustc_ast/src/util/literal.rs @@ -77,6 +77,8 @@ impl LitKind { // new symbol because the string in the LitKind is different to the // string in the token. let s = symbol.as_str(); + // Vanilla strings are so common we optimize for the common case where no chars + // requiring special behaviour are present. let symbol = if s.contains(['\\', '\r']) { let mut buf = String::with_capacity(s.len()); let mut error = Ok(()); @@ -104,27 +106,20 @@ impl LitKind { LitKind::Str(symbol, ast::StrStyle::Cooked) } token::StrRaw(n) => { - // Ditto. - let s = symbol.as_str(); - let symbol = - if s.contains('\r') { - let mut buf = String::with_capacity(s.len()); - let mut error = Ok(()); - unescape_literal(s, Mode::RawStr, &mut |_, unescaped_char| { - match unescaped_char { - Ok(c) => buf.push(c), - Err(err) => { - if err.is_fatal() { - error = Err(LitError::LexerError); - } - } + // Raw strings have no escapes, so we only need to check for invalid chars, and we + // can reuse the symbol on success. + let mut error = Ok(()); + unescape_literal(symbol.as_str(), Mode::RawStr, &mut |_, unescaped_char| { + match unescaped_char { + Ok(_) => {} + Err(err) => { + if err.is_fatal() { + error = Err(LitError::LexerError); } - }); - error?; - Symbol::intern(&buf) - } else { - symbol - }; + } + } + }); + error?; LitKind::Str(symbol, ast::StrStyle::Raw(n)) } token::ByteStr => { @@ -143,25 +138,19 @@ impl LitKind { LitKind::ByteStr(buf.into(), StrStyle::Cooked) } token::ByteStrRaw(n) => { + // Raw strings have no escapes, so we only need to check for invalid chars, and we + // can convert the symbol directly to a `Lrc` on success. let s = symbol.as_str(); - let bytes = if s.contains('\r') { - let mut buf = Vec::with_capacity(s.len()); - let mut error = Ok(()); - unescape_literal(s, Mode::RawByteStr, &mut |_, c| match c { - Ok(c) => buf.push(byte_from_char(c)), - Err(err) => { - if err.is_fatal() { - error = Err(LitError::LexerError); - } + let mut error = Ok(()); + unescape_literal(s, Mode::RawByteStr, &mut |_, c| match c { + Ok(_) => {} + Err(err) => { + if err.is_fatal() { + error = Err(LitError::LexerError); } - }); - error?; - buf - } else { - symbol.to_string().into_bytes() - }; - - LitKind::ByteStr(bytes.into(), StrStyle::Raw(n)) + } + }); + LitKind::ByteStr(s.to_owned().into_bytes().into(), StrStyle::Raw(n)) } token::CStr => { let s = symbol.as_str(); @@ -187,18 +176,15 @@ impl LitKind { LitKind::CStr(buf.into(), StrStyle::Cooked) } token::CStrRaw(n) => { + // Raw strings have no escapes, so we only need to check for invalid chars, and we + // can convert the symbol directly to a `Lrc` on success. let s = symbol.as_str(); - let mut buf = Vec::with_capacity(s.len()); let mut error = Ok(()); unescape_c_string(s, Mode::RawCStr, &mut |span, c| match c { Ok(CStrUnit::Byte(0) | CStrUnit::Char('\0')) => { error = Err(LitError::NulInCStr(span)); } - Ok(CStrUnit::Byte(b)) => buf.push(b), - Ok(CStrUnit::Char(c)) if c.len_utf8() == 1 => buf.push(c as u8), - Ok(CStrUnit::Char(c)) => { - buf.extend_from_slice(c.encode_utf8(&mut [0; 4]).as_bytes()) - } + Ok(_) => {} Err(err) => { if err.is_fatal() { error = Err(LitError::LexerError); @@ -206,6 +192,7 @@ impl LitKind { } }); error?; + let mut buf = s.to_owned().into_bytes(); buf.push(0); LitKind::CStr(buf.into(), StrStyle::Raw(n)) } From 423bf4233d92dfaa4c97f04c4fc3a2368d880d24 Mon Sep 17 00:00:00 2001 From: Nicholas Nethercote Date: Mon, 11 Dec 2023 16:56:52 +1100 Subject: [PATCH 2/5] Rename the `span` args to `emit_unescape_error`. The `span` arg is described in a comment as "interior span of the literal, without quotes", which is incorrect. It's actually the span of the error part of the literal, corresponding to `range`. This commit renames `span` and `span_without_quotes` to make things clearer, and fixes the erroneous comment. --- .../src/lexer/unescape_error_reporting.rs | 75 +++++++++++-------- 1 file changed, 42 insertions(+), 33 deletions(-) diff --git a/compiler/rustc_parse/src/lexer/unescape_error_reporting.rs b/compiler/rustc_parse/src/lexer/unescape_error_reporting.rs index b659c40b2336f..2b4c2e3c2501f 100644 --- a/compiler/rustc_parse/src/lexer/unescape_error_reporting.rs +++ b/compiler/rustc_parse/src/lexer/unescape_error_reporting.rs @@ -11,12 +11,12 @@ use crate::errors::{MoreThanOneCharNote, MoreThanOneCharSugg, NoBraceUnicodeSub, pub(crate) fn emit_unescape_error( handler: &Handler, - // interior part of the literal, without quotes + // interior part of the literal, between quotes lit: &str, - // full span of the literal, including quotes - span_with_quotes: Span, - // interior span of the literal, without quotes - span: Span, + // full span of the literal, including quotes and any prefix + full_lit_span: Span, + // span of the error part of the literal + err_span: Span, mode: Mode, // range of the error inside `lit` range: Range, @@ -24,19 +24,21 @@ pub(crate) fn emit_unescape_error( ) { debug!( "emit_unescape_error: {:?}, {:?}, {:?}, {:?}, {:?}", - lit, span_with_quotes, mode, range, error + lit, full_lit_span, mode, range, error ); let last_char = || { let c = lit[range.clone()].chars().next_back().unwrap(); - let span = span.with_lo(span.hi() - BytePos(c.len_utf8() as u32)); + let span = err_span.with_lo(err_span.hi() - BytePos(c.len_utf8() as u32)); (c, span) }; match error { EscapeError::LoneSurrogateUnicodeEscape => { - handler.emit_err(UnescapeError::InvalidUnicodeEscape { span, surrogate: true }); + handler + .emit_err(UnescapeError::InvalidUnicodeEscape { span: err_span, surrogate: true }); } EscapeError::OutOfRangeUnicodeEscape => { - handler.emit_err(UnescapeError::InvalidUnicodeEscape { span, surrogate: false }); + handler + .emit_err(UnescapeError::InvalidUnicodeEscape { span: err_span, surrogate: false }); } EscapeError::MoreThanOneChar => { use unicode_normalization::{char::is_combining_mark, UnicodeNormalization}; @@ -49,12 +51,16 @@ pub(crate) fn emit_unescape_error( let normalized = lit.nfc().to_string(); if normalized.chars().count() == 1 { let ch = normalized.chars().next().unwrap().escape_default().to_string(); - sugg = Some(MoreThanOneCharSugg::NormalizedForm { span, ch, normalized }); + sugg = Some(MoreThanOneCharSugg::NormalizedForm { + span: err_span, + ch, + normalized, + }); } let escaped_marks = rest.iter().map(|c| c.escape_default().to_string()).collect::>(); note = Some(MoreThanOneCharNote::AllCombining { - span, + span: err_span, chr: format!("{first}"), len: escaped_marks.len(), escaped_marks: escaped_marks.join(""), @@ -69,10 +75,12 @@ pub(crate) fn emit_unescape_error( .collect(); if let &[ch] = printable.as_slice() { - sugg = - Some(MoreThanOneCharSugg::RemoveNonPrinting { span, ch: ch.to_string() }); + sugg = Some(MoreThanOneCharSugg::RemoveNonPrinting { + span: err_span, + ch: ch.to_string(), + }); note = Some(MoreThanOneCharNote::NonPrinting { - span, + span: err_span, escaped: lit.escape_default().to_string(), }); } @@ -91,13 +99,13 @@ pub(crate) fn emit_unescape_error( } let sugg = format!("{prefix}\"{escaped}\""); MoreThanOneCharSugg::Quotes { - span: span_with_quotes, + span: full_lit_span, is_byte: mode == Mode::Byte, sugg, } }); handler.emit_err(UnescapeError::MoreThanOneChar { - span: span_with_quotes, + span: full_lit_span, note, suggestion: sugg, }); @@ -105,7 +113,7 @@ pub(crate) fn emit_unescape_error( EscapeError::EscapeOnlyChar => { let (c, char_span) = last_char(); handler.emit_err(UnescapeError::EscapeOnlyChar { - span, + span: err_span, char_span, escaped_sugg: c.escape_default().to_string(), escaped_msg: escaped_char(c), @@ -114,11 +122,11 @@ pub(crate) fn emit_unescape_error( } EscapeError::BareCarriageReturn => { let double_quotes = mode.in_double_quotes(); - handler.emit_err(UnescapeError::BareCr { span, double_quotes }); + handler.emit_err(UnescapeError::BareCr { span: err_span, double_quotes }); } EscapeError::BareCarriageReturnInRawString => { assert!(mode.in_double_quotes()); - handler.emit_err(UnescapeError::BareCrRawString(span)); + handler.emit_err(UnescapeError::BareCrRawString(err_span)); } EscapeError::InvalidEscape => { let (c, span) = last_char(); @@ -143,7 +151,7 @@ pub(crate) fn emit_unescape_error( } else { if mode == Mode::Str || mode == Mode::Char { diag.span_suggestion( - span_with_quotes, + full_lit_span, "if you meant to write a literal backslash (perhaps escaping in a regular expression), consider a raw string literal", format!("r\"{lit}\""), Applicability::MaybeIncorrect, @@ -158,7 +166,7 @@ pub(crate) fn emit_unescape_error( diag.emit(); } EscapeError::TooShortHexEscape => { - handler.emit_err(UnescapeError::TooShortHexEscape(span)); + handler.emit_err(UnescapeError::TooShortHexEscape(err_span)); } EscapeError::InvalidCharInHexEscape | EscapeError::InvalidCharInUnicodeEscape => { let (c, span) = last_char(); @@ -210,7 +218,7 @@ pub(crate) fn emit_unescape_error( err.emit(); } EscapeError::OutOfRangeHexEscape => { - handler.emit_err(UnescapeError::OutOfRangeHexEscape(span)); + handler.emit_err(UnescapeError::OutOfRangeHexEscape(err_span)); } EscapeError::LeadingUnderscoreUnicodeEscape => { let (c, span) = last_char(); @@ -220,10 +228,11 @@ pub(crate) fn emit_unescape_error( }); } EscapeError::OverlongUnicodeEscape => { - handler.emit_err(UnescapeError::OverlongUnicodeEscape(span)); + handler.emit_err(UnescapeError::OverlongUnicodeEscape(err_span)); } EscapeError::UnclosedUnicodeEscape => { - handler.emit_err(UnescapeError::UnclosedUnicodeEscape(span, span.shrink_to_hi())); + handler + .emit_err(UnescapeError::UnclosedUnicodeEscape(err_span, err_span.shrink_to_hi())); } EscapeError::NoBraceInUnicodeEscape => { let mut suggestion = "\\u{".to_owned(); @@ -238,34 +247,34 @@ pub(crate) fn emit_unescape_error( let (label, sub) = if suggestion_len > 0 { suggestion.push('}'); let hi = char_span.lo() + BytePos(suggestion_len as u32); - (None, NoBraceUnicodeSub::Suggestion { span: span.with_hi(hi), suggestion }) + (None, NoBraceUnicodeSub::Suggestion { span: err_span.with_hi(hi), suggestion }) } else { - (Some(span), NoBraceUnicodeSub::Help) + (Some(err_span), NoBraceUnicodeSub::Help) }; - handler.emit_err(UnescapeError::NoBraceInUnicodeEscape { span, label, sub }); + handler.emit_err(UnescapeError::NoBraceInUnicodeEscape { span: err_span, label, sub }); } EscapeError::UnicodeEscapeInByte => { - handler.emit_err(UnescapeError::UnicodeEscapeInByte(span)); + handler.emit_err(UnescapeError::UnicodeEscapeInByte(err_span)); } EscapeError::EmptyUnicodeEscape => { - handler.emit_err(UnescapeError::EmptyUnicodeEscape(span)); + handler.emit_err(UnescapeError::EmptyUnicodeEscape(err_span)); } EscapeError::ZeroChars => { - handler.emit_err(UnescapeError::ZeroChars(span)); + handler.emit_err(UnescapeError::ZeroChars(err_span)); } EscapeError::LoneSlash => { - handler.emit_err(UnescapeError::LoneSlash(span)); + handler.emit_err(UnescapeError::LoneSlash(err_span)); } EscapeError::UnskippedWhitespaceWarning => { let (c, char_span) = last_char(); handler.emit_warning(UnescapeError::UnskippedWhitespace { - span, + span: err_span, ch: escaped_char(c), char_span, }); } EscapeError::MultipleSkippedLinesWarning => { - handler.emit_warning(UnescapeError::MultipleSkippedLinesWarning(span)); + handler.emit_warning(UnescapeError::MultipleSkippedLinesWarning(err_span)); } } } From 29c5158ef5772351475d650ffbee417b36885766 Mon Sep 17 00:00:00 2001 From: Nicholas Nethercote Date: Wed, 13 Dec 2023 09:29:59 +1100 Subject: [PATCH 3/5] Adjust `Mode::is_unicode_escape_disallowed`. Some cases are unreachable. --- compiler/rustc_lexer/src/unescape.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/compiler/rustc_lexer/src/unescape.rs b/compiler/rustc_lexer/src/unescape.rs index 249126a269e23..f6216504e9808 100644 --- a/compiler/rustc_lexer/src/unescape.rs +++ b/compiler/rustc_lexer/src/unescape.rs @@ -191,8 +191,9 @@ impl Mode { /// Byte literals do not allow unicode escape. fn is_unicode_escape_disallowed(self) -> bool { match self { - Byte | ByteStr | RawByteStr => true, - Char | Str | RawStr | CStr | RawCStr => false, + Byte | ByteStr => true, + Char | Str | CStr => false, + RawByteStr | RawStr | RawCStr => unreachable!(), } } From a50efe265309d0fee0661121508f77d23aecd82e Mon Sep 17 00:00:00 2001 From: Nicholas Nethercote Date: Wed, 13 Dec 2023 09:46:21 +1100 Subject: [PATCH 4/5] Unify single-char and multi-char `CStrUnit::Char` handling. The two cases are equivalent. C string literals aren't common so there is no performance need here. --- compiler/rustc_ast/src/util/literal.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/compiler/rustc_ast/src/util/literal.rs b/compiler/rustc_ast/src/util/literal.rs index 76ddb6a39389f..92b9adf1db751 100644 --- a/compiler/rustc_ast/src/util/literal.rs +++ b/compiler/rustc_ast/src/util/literal.rs @@ -161,7 +161,6 @@ impl LitKind { error = Err(LitError::NulInCStr(span)); } Ok(CStrUnit::Byte(b)) => buf.push(b), - Ok(CStrUnit::Char(c)) if c.len_utf8() == 1 => buf.push(c as u8), Ok(CStrUnit::Char(c)) => { buf.extend_from_slice(c.encode_utf8(&mut [0; 4]).as_bytes()) } From b900eb73173f416dba5619ebfc10e6e0438a4753 Mon Sep 17 00:00:00 2001 From: Nicholas Nethercote Date: Wed, 13 Dec 2023 12:50:39 +1100 Subject: [PATCH 5/5] Rename some unescaping functions. `unescape_raw_str_or_raw_byte_str` only does checking, no unescaping. And it also now handles C string literals. `unescape_raw_str` is used for all the non-raw strings. --- compiler/rustc_lexer/src/unescape.rs | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/compiler/rustc_lexer/src/unescape.rs b/compiler/rustc_lexer/src/unescape.rs index f6216504e9808..abec12f52a6e6 100644 --- a/compiler/rustc_lexer/src/unescape.rs +++ b/compiler/rustc_lexer/src/unescape.rs @@ -92,8 +92,8 @@ where let res = unescape_char_or_byte(&mut chars, mode); callback(0..(src.len() - chars.as_str().len()), res); } - Str | ByteStr => unescape_str_common(src, mode, callback), - RawStr | RawByteStr => unescape_raw_str_or_raw_byte_str(src, mode, callback), + Str | ByteStr => unescape_non_raw_common(src, mode, callback), + RawStr | RawByteStr => check_raw_common(src, mode, callback), CStr | RawCStr => unreachable!(), } } @@ -122,12 +122,10 @@ where { match mode { CStr => { - unescape_str_common(src, mode, callback); + unescape_non_raw_common(src, mode, callback); } RawCStr => { - unescape_raw_str_or_raw_byte_str(src, mode, &mut |r, result| { - callback(r, result.map(CStrUnit::Char)) - }); + check_raw_common(src, mode, &mut |r, result| callback(r, result.map(CStrUnit::Char))); } Char | Byte | Str | RawStr | ByteStr | RawByteStr => unreachable!(), } @@ -325,7 +323,7 @@ fn unescape_char_or_byte(chars: &mut Chars<'_>, mode: Mode) -> Result + From>(src: &str, mode: Mode, callback: &mut F) +fn unescape_non_raw_common + From>(src: &str, mode: Mode, callback: &mut F) where F: FnMut(Range, Result), { @@ -392,7 +390,7 @@ where /// sequence of characters or errors. /// NOTE: Raw strings do not perform any explicit character escaping, here we /// only produce errors on bare CR. -fn unescape_raw_str_or_raw_byte_str(src: &str, mode: Mode, callback: &mut F) +fn check_raw_common(src: &str, mode: Mode, callback: &mut F) where F: FnMut(Range, Result), { @@ -400,7 +398,7 @@ where let chars_should_be_ascii = mode.chars_should_be_ascii(); // get this outside the loop // The `start` and `end` computation here matches the one in - // `unescape_str_common` for consistency, even though this function + // `unescape_non_raw_common` for consistency, even though this function // doesn't have to worry about skipping any chars. while let Some(c) = chars.next() { let start = src.len() - chars.as_str().len() - c.len_utf8();