From 1f147a2ed7671cacd8ab423d8979a1ccfa4443ab Mon Sep 17 00:00:00 2001
From: Julian Wollersberger
Date: Tue, 30 Nov 2021 16:06:58 +0100
Subject: [PATCH] Replace `nth_char(0)` with `next()` in `cursor.first()` and
 optimize the iterator returned by `tokenize()`. This improves lexer
 performance by 35%.

---
 compiler/rustc_lexer/src/cursor.rs | 33 +++++++++++++++++++-----------
 compiler/rustc_lexer/src/lib.rs    | 20 +++++++-----------
 2 files changed, 28 insertions(+), 25 deletions(-)

diff --git a/compiler/rustc_lexer/src/cursor.rs b/compiler/rustc_lexer/src/cursor.rs
index 297f3d19ca178..0ba6c56dbb501 100644
--- a/compiler/rustc_lexer/src/cursor.rs
+++ b/compiler/rustc_lexer/src/cursor.rs
@@ -2,10 +2,11 @@ use std::str::Chars;
 
 /// Peekable iterator over a char sequence.
 ///
-/// Next characters can be peeked via `nth_char` method,
+/// Next characters can be peeked via `first` method,
 /// and position can be shifted forward via `bump` method.
 pub(crate) struct Cursor<'a> {
     initial_len: usize,
+    /// Iterator over chars. Slightly faster than a `&str`.
     chars: Chars<'a>,
     #[cfg(debug_assertions)]
     prev: char,
@@ -37,22 +38,21 @@ impl<'a> Cursor<'a> {
         }
     }
 
-    /// Returns nth character relative to the current cursor position.
+    /// Peeks the next symbol from the input stream without consuming it.
     /// If requested position doesn't exist, `EOF_CHAR` is returned.
     /// However, getting `EOF_CHAR` doesn't always mean actual end of file,
     /// it should be checked with `is_eof` method.
-    fn nth_char(&self, n: usize) -> char {
-        self.chars().nth(n).unwrap_or(EOF_CHAR)
-    }
-
-    /// Peeks the next symbol from the input stream without consuming it.
     pub(crate) fn first(&self) -> char {
-        self.nth_char(0)
+        // `.next()` optimizes better than `.nth(0)`
+        self.chars.clone().next().unwrap_or(EOF_CHAR)
     }
 
     /// Peeks the second symbol from the input stream without consuming it.
     pub(crate) fn second(&self) -> char {
-        self.nth_char(1)
+        // `.next()` optimizes better than `.nth(1)`
+        let mut iter = self.chars.clone();
+        iter.next();
+        iter.next().unwrap_or(EOF_CHAR)
    }
 
     /// Checks if there is nothing more to consume.
@@ -65,9 +65,9 @@ impl<'a> Cursor<'a> {
         self.initial_len - self.chars.as_str().len()
     }
 
-    /// Returns a `Chars` iterator over the remaining characters.
-    fn chars(&self) -> Chars<'a> {
-        self.chars.clone()
+    /// Resets the number of bytes consumed to 0.
+    pub(crate) fn reset_len_consumed(&mut self) {
+        self.initial_len = self.chars.as_str().len();
     }
 
     /// Moves to the next character.
@@ -81,4 +81,13 @@ impl<'a> Cursor<'a> {
 
         Some(c)
     }
+
+    /// Eats symbols while predicate returns true or until the end of file is reached.
+    pub(crate) fn eat_while(&mut self, mut predicate: impl FnMut(char) -> bool) {
+        // An optimized version of this was tried for e.g. line comments, but
+        // LLVM can inline all of this and compile it down to fast iteration over bytes.
+        while predicate(self.first()) && !self.is_eof() {
+            self.bump();
+        }
+    }
 }
diff --git a/compiler/rustc_lexer/src/lib.rs b/compiler/rustc_lexer/src/lib.rs
index b64a891cb2526..08cd2d29c410f 100644
--- a/compiler/rustc_lexer/src/lib.rs
+++ b/compiler/rustc_lexer/src/lib.rs
@@ -225,14 +225,15 @@ pub fn first_token(input: &str) -> Token {
 }
 
 /// Creates an iterator that produces tokens from the input string.
-pub fn tokenize(mut input: &str) -> impl Iterator<Item = Token> + '_ {
+pub fn tokenize(input: &str) -> impl Iterator<Item = Token> + '_ {
+    let mut cursor = Cursor::new(input);
     std::iter::from_fn(move || {
-        if input.is_empty() {
-            return None;
+        if cursor.is_eof() {
+            None
+        } else {
+            cursor.reset_len_consumed();
+            Some(cursor.advance_token())
         }
-        let token = first_token(input);
-        input = &input[token.len..];
-        Some(token)
     })
 }
 
@@ -808,11 +809,4 @@ impl Cursor<'_> {
 
         self.eat_while(is_id_continue);
     }
-
-    /// Eats symbols while predicate returns true or until the end of file is reached.
-    fn eat_while(&mut self, mut predicate: impl FnMut(char) -> bool) {
-        while predicate(self.first()) && !self.is_eof() {
-            self.bump();
-        }
-    }
 }
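
Note (not part of the patch): the peeking trick behind the new `Cursor::first` can be reduced to a standalone sketch. Cloning a `Chars` iterator only copies a couple of pointers, so calling `.next()` on the clone looks one character ahead without advancing the real cursor, and it optimizes better than `.nth(0)`. `MiniCursor` and the `EOF_CHAR` value below are illustrative stand-ins rather than the actual rustc_lexer definitions.

use std::str::Chars;

const EOF_CHAR: char = '\0';

struct MiniCursor<'a> {
    chars: Chars<'a>,
}

impl<'a> MiniCursor<'a> {
    fn new(input: &'a str) -> MiniCursor<'a> {
        MiniCursor { chars: input.chars() }
    }

    /// Peeks the next char without consuming it; the clone is cheap.
    fn first(&self) -> char {
        self.chars.clone().next().unwrap_or(EOF_CHAR)
    }

    /// Consumes one char.
    fn bump(&mut self) -> Option<char> {
        self.chars.next()
    }
}

fn main() {
    let mut cursor = MiniCursor::new("fn");
    assert_eq!(cursor.first(), 'f'); // peeking does not advance
    assert_eq!(cursor.bump(), Some('f'));
    assert_eq!(cursor.first(), 'n');
    cursor.bump();
    assert_eq!(cursor.first(), EOF_CHAR); // exhausted input reports EOF_CHAR
}

The `tokenize` change follows the same spirit: instead of re-slicing `input` and building a fresh `Cursor` for every token, the iterator now reuses one `Cursor` and calls `reset_len_consumed` before each `advance_token`, so only the consumed-length counter is reset between tokens.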