diff --git a/compiler/rustc_ast/src/util/literal.rs b/compiler/rustc_ast/src/util/literal.rs index 50eb92125b945..92b9adf1db751 100644 --- a/compiler/rustc_ast/src/util/literal.rs +++ b/compiler/rustc_ast/src/util/literal.rs @@ -77,6 +77,8 @@ impl LitKind { // new symbol because the string in the LitKind is different to the // string in the token. let s = symbol.as_str(); + // Vanilla strings are so common we optimize for the common case where no chars + // requiring special behaviour are present. let symbol = if s.contains(['\\', '\r']) { let mut buf = String::with_capacity(s.len()); let mut error = Ok(()); @@ -104,27 +106,20 @@ impl LitKind { LitKind::Str(symbol, ast::StrStyle::Cooked) } token::StrRaw(n) => { - // Ditto. - let s = symbol.as_str(); - let symbol = - if s.contains('\r') { - let mut buf = String::with_capacity(s.len()); - let mut error = Ok(()); - unescape_literal(s, Mode::RawStr, &mut |_, unescaped_char| { - match unescaped_char { - Ok(c) => buf.push(c), - Err(err) => { - if err.is_fatal() { - error = Err(LitError::LexerError); - } - } + // Raw strings have no escapes, so we only need to check for invalid chars, and we + // can reuse the symbol on success. + let mut error = Ok(()); + unescape_literal(symbol.as_str(), Mode::RawStr, &mut |_, unescaped_char| { + match unescaped_char { + Ok(_) => {} + Err(err) => { + if err.is_fatal() { + error = Err(LitError::LexerError); } - }); - error?; - Symbol::intern(&buf) - } else { - symbol - }; + } + } + }); + error?; LitKind::Str(symbol, ast::StrStyle::Raw(n)) } token::ByteStr => { @@ -143,25 +138,19 @@ impl LitKind { LitKind::ByteStr(buf.into(), StrStyle::Cooked) } token::ByteStrRaw(n) => { + // Raw strings have no escapes, so we only need to check for invalid chars, and we + // can convert the symbol directly to a `Lrc` on success. let s = symbol.as_str(); - let bytes = if s.contains('\r') { - let mut buf = Vec::with_capacity(s.len()); - let mut error = Ok(()); - unescape_literal(s, Mode::RawByteStr, &mut |_, c| match c { - Ok(c) => buf.push(byte_from_char(c)), - Err(err) => { - if err.is_fatal() { - error = Err(LitError::LexerError); - } + let mut error = Ok(()); + unescape_literal(s, Mode::RawByteStr, &mut |_, c| match c { + Ok(_) => {} + Err(err) => { + if err.is_fatal() { + error = Err(LitError::LexerError); } - }); - error?; - buf - } else { - symbol.to_string().into_bytes() - }; - - LitKind::ByteStr(bytes.into(), StrStyle::Raw(n)) + } + }); + LitKind::ByteStr(s.to_owned().into_bytes().into(), StrStyle::Raw(n)) } token::CStr => { let s = symbol.as_str(); @@ -172,7 +161,6 @@ impl LitKind { error = Err(LitError::NulInCStr(span)); } Ok(CStrUnit::Byte(b)) => buf.push(b), - Ok(CStrUnit::Char(c)) if c.len_utf8() == 1 => buf.push(c as u8), Ok(CStrUnit::Char(c)) => { buf.extend_from_slice(c.encode_utf8(&mut [0; 4]).as_bytes()) } @@ -187,18 +175,15 @@ impl LitKind { LitKind::CStr(buf.into(), StrStyle::Cooked) } token::CStrRaw(n) => { + // Raw strings have no escapes, so we only need to check for invalid chars, and we + // can convert the symbol directly to a `Lrc` on success. let s = symbol.as_str(); - let mut buf = Vec::with_capacity(s.len()); let mut error = Ok(()); unescape_c_string(s, Mode::RawCStr, &mut |span, c| match c { Ok(CStrUnit::Byte(0) | CStrUnit::Char('\0')) => { error = Err(LitError::NulInCStr(span)); } - Ok(CStrUnit::Byte(b)) => buf.push(b), - Ok(CStrUnit::Char(c)) if c.len_utf8() == 1 => buf.push(c as u8), - Ok(CStrUnit::Char(c)) => { - buf.extend_from_slice(c.encode_utf8(&mut [0; 4]).as_bytes()) - } + Ok(_) => {} Err(err) => { if err.is_fatal() { error = Err(LitError::LexerError); @@ -206,6 +191,7 @@ impl LitKind { } }); error?; + let mut buf = s.to_owned().into_bytes(); buf.push(0); LitKind::CStr(buf.into(), StrStyle::Raw(n)) } diff --git a/compiler/rustc_lexer/src/unescape.rs b/compiler/rustc_lexer/src/unescape.rs index 249126a269e23..abec12f52a6e6 100644 --- a/compiler/rustc_lexer/src/unescape.rs +++ b/compiler/rustc_lexer/src/unescape.rs @@ -92,8 +92,8 @@ where let res = unescape_char_or_byte(&mut chars, mode); callback(0..(src.len() - chars.as_str().len()), res); } - Str | ByteStr => unescape_str_common(src, mode, callback), - RawStr | RawByteStr => unescape_raw_str_or_raw_byte_str(src, mode, callback), + Str | ByteStr => unescape_non_raw_common(src, mode, callback), + RawStr | RawByteStr => check_raw_common(src, mode, callback), CStr | RawCStr => unreachable!(), } } @@ -122,12 +122,10 @@ where { match mode { CStr => { - unescape_str_common(src, mode, callback); + unescape_non_raw_common(src, mode, callback); } RawCStr => { - unescape_raw_str_or_raw_byte_str(src, mode, &mut |r, result| { - callback(r, result.map(CStrUnit::Char)) - }); + check_raw_common(src, mode, &mut |r, result| callback(r, result.map(CStrUnit::Char))); } Char | Byte | Str | RawStr | ByteStr | RawByteStr => unreachable!(), } @@ -191,8 +189,9 @@ impl Mode { /// Byte literals do not allow unicode escape. fn is_unicode_escape_disallowed(self) -> bool { match self { - Byte | ByteStr | RawByteStr => true, - Char | Str | RawStr | CStr | RawCStr => false, + Byte | ByteStr => true, + Char | Str | CStr => false, + RawByteStr | RawStr | RawCStr => unreachable!(), } } @@ -324,7 +323,7 @@ fn unescape_char_or_byte(chars: &mut Chars<'_>, mode: Mode) -> Result + From>(src: &str, mode: Mode, callback: &mut F) +fn unescape_non_raw_common + From>(src: &str, mode: Mode, callback: &mut F) where F: FnMut(Range, Result), { @@ -391,7 +390,7 @@ where /// sequence of characters or errors. /// NOTE: Raw strings do not perform any explicit character escaping, here we /// only produce errors on bare CR. -fn unescape_raw_str_or_raw_byte_str(src: &str, mode: Mode, callback: &mut F) +fn check_raw_common(src: &str, mode: Mode, callback: &mut F) where F: FnMut(Range, Result), { @@ -399,7 +398,7 @@ where let chars_should_be_ascii = mode.chars_should_be_ascii(); // get this outside the loop // The `start` and `end` computation here matches the one in - // `unescape_str_common` for consistency, even though this function + // `unescape_non_raw_common` for consistency, even though this function // doesn't have to worry about skipping any chars. while let Some(c) = chars.next() { let start = src.len() - chars.as_str().len() - c.len_utf8(); diff --git a/compiler/rustc_parse/src/lexer/unescape_error_reporting.rs b/compiler/rustc_parse/src/lexer/unescape_error_reporting.rs index b659c40b2336f..2b4c2e3c2501f 100644 --- a/compiler/rustc_parse/src/lexer/unescape_error_reporting.rs +++ b/compiler/rustc_parse/src/lexer/unescape_error_reporting.rs @@ -11,12 +11,12 @@ use crate::errors::{MoreThanOneCharNote, MoreThanOneCharSugg, NoBraceUnicodeSub, pub(crate) fn emit_unescape_error( handler: &Handler, - // interior part of the literal, without quotes + // interior part of the literal, between quotes lit: &str, - // full span of the literal, including quotes - span_with_quotes: Span, - // interior span of the literal, without quotes - span: Span, + // full span of the literal, including quotes and any prefix + full_lit_span: Span, + // span of the error part of the literal + err_span: Span, mode: Mode, // range of the error inside `lit` range: Range, @@ -24,19 +24,21 @@ pub(crate) fn emit_unescape_error( ) { debug!( "emit_unescape_error: {:?}, {:?}, {:?}, {:?}, {:?}", - lit, span_with_quotes, mode, range, error + lit, full_lit_span, mode, range, error ); let last_char = || { let c = lit[range.clone()].chars().next_back().unwrap(); - let span = span.with_lo(span.hi() - BytePos(c.len_utf8() as u32)); + let span = err_span.with_lo(err_span.hi() - BytePos(c.len_utf8() as u32)); (c, span) }; match error { EscapeError::LoneSurrogateUnicodeEscape => { - handler.emit_err(UnescapeError::InvalidUnicodeEscape { span, surrogate: true }); + handler + .emit_err(UnescapeError::InvalidUnicodeEscape { span: err_span, surrogate: true }); } EscapeError::OutOfRangeUnicodeEscape => { - handler.emit_err(UnescapeError::InvalidUnicodeEscape { span, surrogate: false }); + handler + .emit_err(UnescapeError::InvalidUnicodeEscape { span: err_span, surrogate: false }); } EscapeError::MoreThanOneChar => { use unicode_normalization::{char::is_combining_mark, UnicodeNormalization}; @@ -49,12 +51,16 @@ pub(crate) fn emit_unescape_error( let normalized = lit.nfc().to_string(); if normalized.chars().count() == 1 { let ch = normalized.chars().next().unwrap().escape_default().to_string(); - sugg = Some(MoreThanOneCharSugg::NormalizedForm { span, ch, normalized }); + sugg = Some(MoreThanOneCharSugg::NormalizedForm { + span: err_span, + ch, + normalized, + }); } let escaped_marks = rest.iter().map(|c| c.escape_default().to_string()).collect::>(); note = Some(MoreThanOneCharNote::AllCombining { - span, + span: err_span, chr: format!("{first}"), len: escaped_marks.len(), escaped_marks: escaped_marks.join(""), @@ -69,10 +75,12 @@ pub(crate) fn emit_unescape_error( .collect(); if let &[ch] = printable.as_slice() { - sugg = - Some(MoreThanOneCharSugg::RemoveNonPrinting { span, ch: ch.to_string() }); + sugg = Some(MoreThanOneCharSugg::RemoveNonPrinting { + span: err_span, + ch: ch.to_string(), + }); note = Some(MoreThanOneCharNote::NonPrinting { - span, + span: err_span, escaped: lit.escape_default().to_string(), }); } @@ -91,13 +99,13 @@ pub(crate) fn emit_unescape_error( } let sugg = format!("{prefix}\"{escaped}\""); MoreThanOneCharSugg::Quotes { - span: span_with_quotes, + span: full_lit_span, is_byte: mode == Mode::Byte, sugg, } }); handler.emit_err(UnescapeError::MoreThanOneChar { - span: span_with_quotes, + span: full_lit_span, note, suggestion: sugg, }); @@ -105,7 +113,7 @@ pub(crate) fn emit_unescape_error( EscapeError::EscapeOnlyChar => { let (c, char_span) = last_char(); handler.emit_err(UnescapeError::EscapeOnlyChar { - span, + span: err_span, char_span, escaped_sugg: c.escape_default().to_string(), escaped_msg: escaped_char(c), @@ -114,11 +122,11 @@ pub(crate) fn emit_unescape_error( } EscapeError::BareCarriageReturn => { let double_quotes = mode.in_double_quotes(); - handler.emit_err(UnescapeError::BareCr { span, double_quotes }); + handler.emit_err(UnescapeError::BareCr { span: err_span, double_quotes }); } EscapeError::BareCarriageReturnInRawString => { assert!(mode.in_double_quotes()); - handler.emit_err(UnescapeError::BareCrRawString(span)); + handler.emit_err(UnescapeError::BareCrRawString(err_span)); } EscapeError::InvalidEscape => { let (c, span) = last_char(); @@ -143,7 +151,7 @@ pub(crate) fn emit_unescape_error( } else { if mode == Mode::Str || mode == Mode::Char { diag.span_suggestion( - span_with_quotes, + full_lit_span, "if you meant to write a literal backslash (perhaps escaping in a regular expression), consider a raw string literal", format!("r\"{lit}\""), Applicability::MaybeIncorrect, @@ -158,7 +166,7 @@ pub(crate) fn emit_unescape_error( diag.emit(); } EscapeError::TooShortHexEscape => { - handler.emit_err(UnescapeError::TooShortHexEscape(span)); + handler.emit_err(UnescapeError::TooShortHexEscape(err_span)); } EscapeError::InvalidCharInHexEscape | EscapeError::InvalidCharInUnicodeEscape => { let (c, span) = last_char(); @@ -210,7 +218,7 @@ pub(crate) fn emit_unescape_error( err.emit(); } EscapeError::OutOfRangeHexEscape => { - handler.emit_err(UnescapeError::OutOfRangeHexEscape(span)); + handler.emit_err(UnescapeError::OutOfRangeHexEscape(err_span)); } EscapeError::LeadingUnderscoreUnicodeEscape => { let (c, span) = last_char(); @@ -220,10 +228,11 @@ pub(crate) fn emit_unescape_error( }); } EscapeError::OverlongUnicodeEscape => { - handler.emit_err(UnescapeError::OverlongUnicodeEscape(span)); + handler.emit_err(UnescapeError::OverlongUnicodeEscape(err_span)); } EscapeError::UnclosedUnicodeEscape => { - handler.emit_err(UnescapeError::UnclosedUnicodeEscape(span, span.shrink_to_hi())); + handler + .emit_err(UnescapeError::UnclosedUnicodeEscape(err_span, err_span.shrink_to_hi())); } EscapeError::NoBraceInUnicodeEscape => { let mut suggestion = "\\u{".to_owned(); @@ -238,34 +247,34 @@ pub(crate) fn emit_unescape_error( let (label, sub) = if suggestion_len > 0 { suggestion.push('}'); let hi = char_span.lo() + BytePos(suggestion_len as u32); - (None, NoBraceUnicodeSub::Suggestion { span: span.with_hi(hi), suggestion }) + (None, NoBraceUnicodeSub::Suggestion { span: err_span.with_hi(hi), suggestion }) } else { - (Some(span), NoBraceUnicodeSub::Help) + (Some(err_span), NoBraceUnicodeSub::Help) }; - handler.emit_err(UnescapeError::NoBraceInUnicodeEscape { span, label, sub }); + handler.emit_err(UnescapeError::NoBraceInUnicodeEscape { span: err_span, label, sub }); } EscapeError::UnicodeEscapeInByte => { - handler.emit_err(UnescapeError::UnicodeEscapeInByte(span)); + handler.emit_err(UnescapeError::UnicodeEscapeInByte(err_span)); } EscapeError::EmptyUnicodeEscape => { - handler.emit_err(UnescapeError::EmptyUnicodeEscape(span)); + handler.emit_err(UnescapeError::EmptyUnicodeEscape(err_span)); } EscapeError::ZeroChars => { - handler.emit_err(UnescapeError::ZeroChars(span)); + handler.emit_err(UnescapeError::ZeroChars(err_span)); } EscapeError::LoneSlash => { - handler.emit_err(UnescapeError::LoneSlash(span)); + handler.emit_err(UnescapeError::LoneSlash(err_span)); } EscapeError::UnskippedWhitespaceWarning => { let (c, char_span) = last_char(); handler.emit_warning(UnescapeError::UnskippedWhitespace { - span, + span: err_span, ch: escaped_char(c), char_span, }); } EscapeError::MultipleSkippedLinesWarning => { - handler.emit_warning(UnescapeError::MultipleSkippedLinesWarning(span)); + handler.emit_warning(UnescapeError::MultipleSkippedLinesWarning(err_span)); } } }