diff --git a/README.md b/README.md index b48ee8a914ec4..42fc0a63c0ffb 100644 --- a/README.md +++ b/README.md @@ -23,7 +23,7 @@ or reading the [rustc dev guide][rustcguidebuild]. [rustcguidebuild]: https://rustc-dev-guide.rust-lang.org/building/how-to-build-and-run.html -### Building on Unix-like system +### Building on a Unix-like system 1. Make sure you have installed the dependencies: * `g++` 5.1 or later or `clang++` 3.5 or later diff --git a/RELEASES.md b/RELEASES.md index 757821abcd153..fc9628bb365b4 100644 --- a/RELEASES.md +++ b/RELEASES.md @@ -43,7 +43,7 @@ Libraries - [Unicode 13 is now supported.][69929] - [`String` now implements `From<&mut str>`.][69661] - [`IoSlice` now implements `Copy`.][69403] -- [`Vec<T>` now implements `From<[T; N]>`.][68692] Where `N` is less than 32. +- [`Vec<T>` now implements `From<[T; N]>`.][68692] Where `N` is at most 32. - [`proc_macro::LexError` now implements `fmt::Display` and `Error`.][68899] - [`from_le_bytes`, `to_le_bytes`, `from_be_bytes`, `to_be_bytes`, `from_ne_bytes`, and `to_ne_bytes` methods are now `const` for all diff --git a/src/librustc_lexer/src/lib.rs b/src/librustc_lexer/src/lib.rs index c2139d07f378a..cf90c6d838635 100644 --- a/src/librustc_lexer/src/lib.rs +++ b/src/librustc_lexer/src/lib.rs @@ -29,7 +29,7 @@ mod tests; use self::LiteralKind::*; use self::TokenKind::*; use crate::cursor::{Cursor, EOF_CHAR}; -use std::convert::TryInto; +use std::convert::TryFrom; /// Parsed token. /// It doesn't contain information about data that has been parsed, @@ -142,84 +142,24 @@ pub enum LiteralKind { /// "b"abc"", "b"abc" ByteStr { terminated: bool }, /// "r"abc"", "r#"abc"#", "r####"ab"###"c"####", "r#"a" - RawStr(UnvalidatedRawStr), + RawStr { n_hashes: u16, err: Option<RawStrError> }, /// "br"abc"", "br#"abc"#", "br####"ab"###"c"####", "br#"a" - RawByteStr(UnvalidatedRawStr), -} - -/// Represents something that looks like a raw string, but may have some -/// problems. Use `.validate()` to convert it into something -/// usable. 
-#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)] -pub struct UnvalidatedRawStr { - /// The prefix (`r###"`) is valid - valid_start: bool, - - /// The postfix (`"###`) is valid - valid_end: bool, - - /// The number of leading `#` - n_start_hashes: usize, - /// The number of trailing `#`. `n_end_hashes` <= `n_start_hashes` - n_end_hashes: usize, - /// The offset starting at `r` or `br` where the user may have intended to end the string. - /// Currently, it is the longest sequence of pattern `"#+"`. - possible_terminator_offset: Option<usize>, + RawByteStr { n_hashes: u16, err: Option<RawStrError> }, } /// Error produced validating a raw string. Represents cases like: -/// - `r##~"abcde"##`: `LexRawStrError::InvalidStarter` -/// - `r###"abcde"##`: `LexRawStrError::NoTerminator { expected: 3, found: 2, possible_terminator_offset: Some(11)` -/// - Too many `#`s (>65536): `TooManyDelimiters` +/// - `r##~"abcde"##`: `InvalidStarter` +/// - `r###"abcde"##`: `NoTerminator { expected: 3, found: 2, possible_terminator_offset: Some(11) }` +/// - Too many `#`s (>65535): `TooManyDelimiters` #[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)] -pub enum LexRawStrError { +pub enum RawStrError { /// Non `#` characters exist between `r` and `"` eg. `r#~"..` - InvalidStarter, + InvalidStarter { bad_char: char }, /// The string was never terminated. `possible_terminator_offset` is the number of characters after `r` or `br` where they /// may have intended to terminate it. NoTerminator { expected: usize, found: usize, possible_terminator_offset: Option<usize> }, - /// More than 65536 `#`s exist. - TooManyDelimiters, -} - -/// Raw String that contains a valid prefix (`#+"`) and postfix (`"#+`) where -/// there are a matching number of `#` characters in both. Note that this will -/// not consume extra trailing `#` characters: `r###"abcde"####` is lexed as a -/// `ValidatedRawString { n_hashes: 3 }` followed by a `#` token. 
-#[derive(Debug, Eq, PartialEq, Copy, Clone)] -pub struct ValidatedRawStr { - n_hashes: u16, -} - -impl ValidatedRawStr { - pub fn num_hashes(&self) -> u16 { - self.n_hashes - } -} - -impl UnvalidatedRawStr { - pub fn validate(self) -> Result<ValidatedRawStr, LexRawStrError> { - if !self.valid_start { - return Err(LexRawStrError::InvalidStarter); - } - - // Only up to 65535 `#`s are allowed in raw strings - let n_start_safe: u16 = - self.n_start_hashes.try_into().map_err(|_| LexRawStrError::TooManyDelimiters)?; - - if self.n_start_hashes > self.n_end_hashes || !self.valid_end { - Err(LexRawStrError::NoTerminator { - expected: self.n_start_hashes, - found: self.n_end_hashes, - possible_terminator_offset: self.possible_terminator_offset, - }) - } else { - // Since the lexer should never produce a literal with n_end > n_start, if n_start <= n_end, - // they must be equal. - debug_assert_eq!(self.n_start_hashes, self.n_end_hashes); - Ok(ValidatedRawStr { n_hashes: n_start_safe }) - } - } + /// More than 65535 `#`s exist. + TooManyDelimiters { found: usize }, } /// Base of numeric literal encoding according to its prefix. 
@@ -354,12 +294,12 @@ impl Cursor<'_> { 'r' => match (self.first(), self.second()) { ('#', c1) if is_id_start(c1) => self.raw_ident(), ('#', _) | ('"', _) => { - let raw_str_i = self.raw_double_quoted_string(1); + let (n_hashes, err) = self.raw_double_quoted_string(1); let suffix_start = self.len_consumed(); - if raw_str_i.n_end_hashes == raw_str_i.n_start_hashes { + if err.is_none() { self.eat_literal_suffix(); } - let kind = RawStr(raw_str_i); + let kind = RawStr { n_hashes, err }; Literal { kind, suffix_start } } _ => self.ident(), @@ -389,14 +329,12 @@ impl Cursor<'_> { } ('r', '"') | ('r', '#') => { self.bump(); - let raw_str_i = self.raw_double_quoted_string(2); + let (n_hashes, err) = self.raw_double_quoted_string(2); let suffix_start = self.len_consumed(); - let terminated = raw_str_i.n_start_hashes == raw_str_i.n_end_hashes; - if terminated { + if err.is_none() { self.eat_literal_suffix(); } - - let kind = RawByteStr(raw_str_i); + let kind = RawByteStr { n_hashes, err }; Literal { kind, suffix_start } } _ => self.ident(), @@ -692,27 +630,34 @@ impl Cursor<'_> { false } - /// Eats the double-quoted string and returns an `UnvalidatedRawStr`. - fn raw_double_quoted_string(&mut self, prefix_len: usize) -> UnvalidatedRawStr { + /// Eats the double-quoted string and returns `n_hashes` and an error if encountered. + fn raw_double_quoted_string(&mut self, prefix_len: usize) -> (u16, Option<RawStrError>) { + // Wrap the actual function to handle the error with too many hashes. + // This way, it eats the whole raw string. 
+ let (n_hashes, err) = self.raw_string_unvalidated(prefix_len); + // Only up to 65535 `#`s are allowed in raw strings + match u16::try_from(n_hashes) { + Ok(num) => (num, err), + // We lie about the number of hashes here :P + Err(_) => (0, Some(RawStrError::TooManyDelimiters { found: n_hashes })), + } + } + + fn raw_string_unvalidated(&mut self, prefix_len: usize) -> (usize, Option<RawStrError>) { debug_assert!(self.prev() == 'r'); - let mut valid_start: bool = false; let start_pos = self.len_consumed(); - let (mut possible_terminator_offset, mut max_hashes) = (None, 0); + let mut possible_terminator_offset = None; + let mut max_hashes = 0; // Count opening '#' symbols. let n_start_hashes = self.eat_while(|c| c == '#'); // Check that string is started. match self.bump() { - Some('"') => valid_start = true, - _ => { - return UnvalidatedRawStr { - valid_start, - valid_end: false, - n_start_hashes, - n_end_hashes: 0, - possible_terminator_offset, - }; + Some('"') => (), c => { + let c = c.unwrap_or(EOF_CHAR); + return (n_start_hashes, Some(RawStrError::InvalidStarter { bad_char: c })); } } @@ -722,13 +667,14 @@ impl Cursor<'_> { self.eat_while(|c| c != '"'); if self.is_eof() { - return UnvalidatedRawStr { - valid_start, - valid_end: false, + return ( n_start_hashes, - n_end_hashes: max_hashes, - possible_terminator_offset, - }; + Some(RawStrError::NoTerminator { + expected: n_start_hashes, + found: max_hashes, + possible_terminator_offset, + }), + ); } // Eat closing double quote. @@ -737,7 +683,7 @@ impl Cursor<'_> { // Check that amount of closing '#' symbols // is equal to the amount of opening ones. // Note that this will not consume extra trailing `#` characters: - // `r###"abcde"####` is lexed as a `LexedRawString { n_hashes: 3 }` + // `r###"abcde"####` is lexed as a `RawStr { n_hashes: 3 }` // followed by a `#` token. 
let mut hashes_left = n_start_hashes; let is_closing_hash = |c| { @@ -751,13 +697,7 @@ impl Cursor<'_> { let n_end_hashes = self.eat_while(is_closing_hash); if n_end_hashes == n_start_hashes { - return UnvalidatedRawStr { - valid_start, - valid_end: true, - n_start_hashes, - n_end_hashes, - possible_terminator_offset: None, - }; + return (n_start_hashes, None); } else if n_end_hashes > max_hashes { // Keep track of possible terminators to give a hint about // where there might be a missing terminator diff --git a/src/librustc_lexer/src/tests.rs b/src/librustc_lexer/src/tests.rs index 725799374fc64..e6acc26ec2f34 100644 --- a/src/librustc_lexer/src/tests.rs +++ b/src/librustc_lexer/src/tests.rs @@ -2,77 +2,37 @@ mod tests { use crate::*; - fn check_raw_str( - s: &str, - expected: UnvalidatedRawStr, - validated: Result<ValidatedRawStr, LexRawStrError>, - ) { + fn check_raw_str(s: &str, expected_hashes: u16, expected_err: Option<RawStrError>) { let s = &format!("r{}", s); let mut cursor = Cursor::new(s); cursor.bump(); - let tok = cursor.raw_double_quoted_string(0); - assert_eq!(tok, expected); - assert_eq!(tok.validate(), validated); + let (n_hashes, err) = cursor.raw_double_quoted_string(0); + assert_eq!(n_hashes, expected_hashes); + assert_eq!(err, expected_err); } #[test] fn test_naked_raw_str() { - check_raw_str( - r#""abc""#, - UnvalidatedRawStr { - n_start_hashes: 0, - n_end_hashes: 0, - valid_start: true, - valid_end: true, - possible_terminator_offset: None, - }, - Ok(ValidatedRawStr { n_hashes: 0 }), - ); + check_raw_str(r#""abc""#, 0, None); } #[test] fn test_raw_no_start() { - check_raw_str( - r##""abc"#"##, - UnvalidatedRawStr { - n_start_hashes: 0, - n_end_hashes: 0, - valid_start: true, - valid_end: true, - possible_terminator_offset: None, - }, - Ok(ValidatedRawStr { n_hashes: 0 }), - ); + check_raw_str(r##""abc"#"##, 0, None); } #[test] fn test_too_many_terminators() { // this error is handled in the parser later - check_raw_str( - r###"#"abc"##"###, - UnvalidatedRawStr { - n_start_hashes: 1, - 
n_end_hashes: 1, - valid_end: true, - valid_start: true, - possible_terminator_offset: None, - }, - Ok(ValidatedRawStr { n_hashes: 1 }), - ); + check_raw_str(r###"#"abc"##"###, 1, None); } #[test] fn test_unterminated() { check_raw_str( r#"#"abc"#, - UnvalidatedRawStr { - n_start_hashes: 1, - n_end_hashes: 0, - valid_end: false, - valid_start: true, - possible_terminator_offset: None, - }, - Err(LexRawStrError::NoTerminator { + 1, + Some(RawStrError::NoTerminator { expected: 1, found: 0, possible_terminator_offset: None, @@ -80,14 +40,8 @@ mod tests { ); check_raw_str( r###"##"abc"#"###, - UnvalidatedRawStr { - n_start_hashes: 2, - n_end_hashes: 1, - valid_start: true, - valid_end: false, - possible_terminator_offset: Some(7), - }, - Err(LexRawStrError::NoTerminator { + 2, + Some(RawStrError::NoTerminator { expected: 2, found: 1, possible_terminator_offset: Some(7), @@ -96,14 +50,8 @@ mod tests { // We're looking for "# not just any # check_raw_str( r###"##"abc#"###, - UnvalidatedRawStr { - n_start_hashes: 2, - n_end_hashes: 0, - valid_start: true, - valid_end: false, - possible_terminator_offset: None, - }, - Err(LexRawStrError::NoTerminator { + 2, + Some(RawStrError::NoTerminator { expected: 2, found: 0, possible_terminator_offset: None, @@ -113,17 +61,7 @@ mod tests { #[test] fn test_invalid_start() { - check_raw_str( - r##"#~"abc"#"##, - UnvalidatedRawStr { - n_start_hashes: 1, - n_end_hashes: 0, - valid_start: false, - valid_end: false, - possible_terminator_offset: None, - }, - Err(LexRawStrError::InvalidStarter), - ); + check_raw_str(r##"#~"abc"#"##, 1, Some(RawStrError::InvalidStarter { bad_char: '~' })); } #[test] @@ -131,14 +69,8 @@ mod tests { // https://github.com/rust-lang/rust/issues/70677 check_raw_str( r#"""#, - UnvalidatedRawStr { - n_start_hashes: 0, - n_end_hashes: 0, - valid_start: true, - valid_end: false, - possible_terminator_offset: None, - }, - Err(LexRawStrError::NoTerminator { + 0, + Some(RawStrError::NoTerminator { expected: 0, found: 0, 
possible_terminator_offset: None, diff --git a/src/librustc_parse/lexer/mod.rs b/src/librustc_parse/lexer/mod.rs index 2b7d5e5adb432..7e59f06e44ae3 100644 --- a/src/librustc_parse/lexer/mod.rs +++ b/src/librustc_parse/lexer/mod.rs @@ -3,7 +3,7 @@ use rustc_ast::util::comments; use rustc_data_structures::sync::Lrc; use rustc_errors::{error_code, Applicability, DiagnosticBuilder, FatalError}; use rustc_lexer::Base; -use rustc_lexer::{unescape, LexRawStrError, UnvalidatedRawStr, ValidatedRawStr}; +use rustc_lexer::{unescape, RawStrError}; use rustc_session::parse::ParseSess; use rustc_span::symbol::{sym, Symbol}; use rustc_span::{BytePos, Pos, Span}; @@ -49,13 +49,12 @@ impl<'a> StringReader<'a> { // Make sure external source is loaded first, before accessing it. // While this can't show up during normal parsing, `retokenize` may // be called with a source file from an external crate. - sess.source_map().ensure_source_file_source_present(source_file.clone()); + sess.source_map().ensure_source_file_source_present(Lrc::clone(&source_file)); - // FIXME(eddyb) use `Lrc` or similar to avoid cloning the `String`. let src = if let Some(src) = &source_file.src { - src.clone() + Lrc::clone(&src) } else if let Some(src) = source_file.external_src.borrow().get_source() { - src.clone() + Lrc::clone(&src) } else { sess.span_diagnostic .bug(&format!("cannot lex `source_file` without source: {}", source_file.name)); @@ -125,10 +124,7 @@ impl<'a> StringReader<'a> { debug!("try_next_token: {:?}({:?})", token.kind, self.str_from(start)); - // This could use `?`, but that makes code significantly (10-20%) slower. 
- // https://github.com/rust-lang/rust/issues/37939 let kind = self.cook_lexer_token(token.kind, start); - let span = self.mk_sp(start, self.pos); Token::new(kind, span) } @@ -153,15 +149,6 @@ impl<'a> StringReader<'a> { self.err_span(self.mk_sp(from_pos, to_pos), m) } - fn struct_span_fatal( - &self, - from_pos: BytePos, - to_pos: BytePos, - m: &str, - ) -> DiagnosticBuilder<'a> { - self.sess.span_diagnostic.struct_span_fatal(self.mk_sp(from_pos, to_pos), m) - } - fn struct_fatal_span_char( &self, from_pos: BytePos, @@ -359,15 +346,13 @@ impl<'a> StringReader<'a> { } (token::ByteStr, Mode::ByteStr, 2, 1) // b" " } - rustc_lexer::LiteralKind::RawStr(unvalidated_raw_str) => { - let valid_raw_str = self.validate_and_report_errors(start, unvalidated_raw_str); - let n_hashes = valid_raw_str.num_hashes(); + rustc_lexer::LiteralKind::RawStr { n_hashes, err } => { + self.report_raw_str_error(start, err); let n = u32::from(n_hashes); (token::StrRaw(n_hashes), Mode::RawStr, 2 + n, 1 + n) // r##" "## } - rustc_lexer::LiteralKind::RawByteStr(unvalidated_raw_str) => { - let validated_raw_str = self.validate_and_report_errors(start, unvalidated_raw_str); - let n_hashes = validated_raw_str.num_hashes(); + rustc_lexer::LiteralKind::RawByteStr { n_hashes, err } => { + self.report_raw_str_error(start, err); let n = u32::from(n_hashes); (token::ByteStrRaw(n_hashes), Mode::RawByteStr, 3 + n, 1 + n) // br##" "## } @@ -382,12 +367,7 @@ impl<'a> StringReader<'a> { } rustc_lexer::LiteralKind::Float { base, empty_exponent } => { if empty_exponent { - let mut err = self.struct_span_fatal( - start, - self.pos, - "expected at least one digit in exponent", - ); - err.emit(); + self.err_span_(start, self.pos, "expected at least one digit in exponent"); } match base { @@ -459,33 +439,25 @@ impl<'a> StringReader<'a> { } } - fn validate_and_report_errors( - &self, - start: BytePos, - unvalidated_raw_str: UnvalidatedRawStr, - ) -> ValidatedRawStr { - match unvalidated_raw_str.validate() { - 
Err(LexRawStrError::InvalidStarter) => self.report_non_started_raw_string(start), - Err(LexRawStrError::NoTerminator { expected, found, possible_terminator_offset }) => { - self.report_unterminated_raw_string( - start, - expected, - possible_terminator_offset, - found, - ) + fn report_raw_str_error(&self, start: BytePos, opt_err: Option<RawStrError>) { + match opt_err { + Some(RawStrError::InvalidStarter { bad_char }) => { + self.report_non_started_raw_string(start, bad_char) + } + Some(RawStrError::NoTerminator { expected, found, possible_terminator_offset }) => self + .report_unterminated_raw_string(start, expected, possible_terminator_offset, found), + Some(RawStrError::TooManyDelimiters { found }) => { + self.report_too_many_hashes(start, found) } - Err(LexRawStrError::TooManyDelimiters) => self.report_too_many_hashes(start), - Ok(valid) => valid, + None => (), } } - fn report_non_started_raw_string(&self, start: BytePos) -> ! { - let bad_char = self.str_from(start).chars().last().unwrap(); + fn report_non_started_raw_string(&self, start: BytePos, bad_char: char) -> ! { self.struct_fatal_span_char( start, self.pos, - "found invalid character; only `#` is allowed \ - in raw string delimitation", + "found invalid character; only `#` is allowed in raw string delimitation", bad_char, ) .emit(); @@ -530,11 +502,17 @@ impl<'a> StringReader<'a> { FatalError.raise() } - fn report_too_many_hashes(&self, start: BytePos) -> ! { + /// Note: It was decided to not add a test case, because it would be too big. + /// https://github.com/rust-lang/rust/pull/50296#issuecomment-392135180 + fn report_too_many_hashes(&self, start: BytePos, found: usize) -> ! 
{ self.fatal_span_( start, self.pos, - "too many `#` symbols: raw strings may be delimited by up to 65535 `#` symbols", + &format!( + "too many `#` symbols: raw strings may be delimited \ + by up to 65535 `#` symbols, but found {}", + found + ), ) .raise(); } diff --git a/src/libstd/io/mod.rs b/src/libstd/io/mod.rs index ebe6d09023f42..717d2868abf98 100644 --- a/src/libstd/io/mod.rs +++ b/src/libstd/io/mod.rs @@ -1883,6 +1883,10 @@ pub trait BufRead: Read { /// /// If successful, this function will return the total number of bytes read. /// + /// This function is blocking and should be used carefully: it is possible for + /// an attacker to continuously send bytes without ever sending the delimiter + /// or EOF. + /// /// # Errors /// /// This function will ignore all instances of [`ErrorKind::Interrupted`] and @@ -1945,6 +1949,10 @@ pub trait BufRead: Read { /// /// If this function returns `Ok(0)`, the stream has reached EOF. /// + /// This function is blocking and should be used carefully: it is possible for + /// an attacker to continuously send bytes without ever sending a newline + /// or EOF. + /// /// # Errors /// /// This function has the same error semantics as [`read_until`] and will