diff --git a/Cargo.lock b/Cargo.lock
index 55e5bdc138d9..027d38df2cdf 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1579,6 +1579,7 @@ dependencies = [
  "rustc-literal-escaper 0.0.4",
  "stdx",
  "tracing",
+ "winnow",
 ]
 
 [[package]]
diff --git a/crates/parser/Cargo.toml b/crates/parser/Cargo.toml
index c7da654de6d9..8384d5bec21a 100644
--- a/crates/parser/Cargo.toml
+++ b/crates/parser/Cargo.toml
@@ -19,6 +19,7 @@ rustc-literal-escaper.workspace = true
 tracing = { workspace = true, optional = true }
 
 edition.workspace = true
+winnow = { version = "0.7.13", default-features = false }
 
 [dev-dependencies]
 expect-test = "1.5.1"
diff --git a/crates/parser/src/frontmatter.rs b/crates/parser/src/frontmatter.rs
new file mode 100644
index 000000000000..2747db4327c5
--- /dev/null
+++ b/crates/parser/src/frontmatter.rs
@@ -0,0 +1,348 @@
+// Copied from https://github.com/rust-lang/cargo/blob/367fd9f213750cd40317803dd0a5a3ce3f0c676d/src/cargo/util/frontmatter.rs
+#![expect(dead_code)] // avoid editing
+#![expect(unreachable_pub)] // avoid editing
+#![expect(clippy::useless_format)] // avoid editing
+
+type Span = std::ops::Range<usize>;
+
+#[derive(Debug)]
+pub struct ScriptSource<'s> {
+    /// The full file
+    raw: &'s str,
+    /// The `#!/usr/bin/env cargo` line, if present
+    shebang: Option<Span>,
+    /// The code fence opener (`---`)
+    open: Option<Span>,
+    /// Trailing text after `ScriptSource::open` that identifies the meaning of
+    /// `ScriptSource::frontmatter`
+    info: Option<Span>,
+    /// The lines between `ScriptSource::open` and `ScriptSource::close`
+    frontmatter: Option<Span>,
+    /// The code fence closer (`---`)
+    close: Option<Span>,
+    /// All content after the frontmatter and shebang
+    content: Span,
+}
+
+impl<'s> ScriptSource<'s> {
+    pub fn parse(raw: &'s str) -> Result<Self, FrontmatterError> {
+        use winnow::stream::FindSlice as _;
+        use winnow::stream::Location as _;
+        use winnow::stream::Offset as _;
+        use winnow::stream::Stream as _;
+
+        let content_end = raw.len();
+        let mut source = Self {
+            raw,
+            shebang: None,
+            open: None,
+            info: None,
+            frontmatter: None,
+            close: None,
+            content: 0..content_end,
+        };
+
+        let mut input = winnow::stream::LocatingSlice::new(raw);
+
+        if let Some(shebang_end) = strip_shebang(input.as_ref()) {
+            let shebang_start = input.current_token_start();
+            let _ = input.next_slice(shebang_end);
+            let shebang_end = input.current_token_start();
+            source.shebang = Some(shebang_start..shebang_end);
+            source.content = shebang_end..content_end;
+        }
+
+        // Whitespace may precede a frontmatter but must end with a newline
+        if let Some(nl_end) = strip_ws_lines(input.as_ref()) {
+            let _ = input.next_slice(nl_end);
+        }
+
+        // Opens with a line that starts with 3 or more `-` followed by an optional identifier
+        const FENCE_CHAR: char = '-';
+        let fence_length = input
+            .as_ref()
+            .char_indices()
+            .find_map(|(i, c)| (c != FENCE_CHAR).then_some(i))
+            .unwrap_or_else(|| input.eof_offset());
+        let open_start = input.current_token_start();
+        let fence_pattern = input.next_slice(fence_length);
+        let open_end = input.current_token_start();
+        match fence_length {
+            0 => {
+                return Ok(source);
+            }
+            1 | 2 => {
+                // either not a frontmatter or invalid frontmatter opening
+                return Err(FrontmatterError::new(
+                    format!(
+                        "found {fence_length} `{FENCE_CHAR}` in rust frontmatter, expected at least 3"
+                    ),
+                    raw.len()..raw.len(),
+                ).push_visible_span(open_start..open_end));
+            }
+            _ => {}
+        }
+        source.open = Some(open_start..open_end);
+        let Some(info_nl) = input.find_slice("\n") else {
+            return Err(FrontmatterError::new(
+                format!("unclosed frontmatter; expected `{fence_pattern}`"),
+                raw.len()..raw.len(),
+            )
+            .push_visible_span(open_start..open_end));
+        };
+        let info = input.next_slice(info_nl.start);
+        let info = info.strip_suffix('\r').unwrap_or(info); // already excludes `\n`
+        let info = info.trim_matches(is_horizontal_whitespace);
+        if !info.is_empty() {
+            let info_start = info.offset_from(&raw);
+            let info_end = info_start + info.len();
+            source.info = Some(info_start..info_end);
+        }
+
+        // Ends with a line that starts with a matching number of `-` only followed by whitespace
+        let nl_fence_pattern = format!("\n{fence_pattern}");
+        let Some(frontmatter_nl) = input.find_slice(nl_fence_pattern.as_str()) else {
+            for len in (2..(nl_fence_pattern.len() - 1)).rev() {
+                let Some(frontmatter_nl) = input.find_slice(&nl_fence_pattern[0..len]) else {
+                    continue;
+                };
+                let _ = input.next_slice(frontmatter_nl.start + 1);
+                let close_start = input.current_token_start();
+                let _ = input.next_slice(len);
+                let close_end = input.current_token_start();
+                let fewer_dashes = fence_length - len;
+                return Err(FrontmatterError::new(
+                    format!(
+                        "closing code fence has {fewer_dashes} less `-` than the opening fence"
+                    ),
+                    close_start..close_end,
+                )
+                .push_visible_span(open_start..open_end));
+            }
+            return Err(FrontmatterError::new(
+                format!("unclosed frontmatter; expected `{fence_pattern}`"),
+                raw.len()..raw.len(),
+            )
+            .push_visible_span(open_start..open_end));
+        };
+        let frontmatter_start = input.current_token_start() + 1; // skip nl from infostring
+        let _ = input.next_slice(frontmatter_nl.start + 1);
+        let frontmatter_end = input.current_token_start();
+        source.frontmatter = Some(frontmatter_start..frontmatter_end);
+        let close_start = input.current_token_start();
+        let _ = input.next_slice(fence_length);
+        let close_end = input.current_token_start();
+        source.close = Some(close_start..close_end);
+
+        let nl = input.find_slice("\n");
+        let after_closing_fence =
+            input.next_slice(nl.map(|span| span.end).unwrap_or_else(|| input.eof_offset()));
+        let content_start = input.current_token_start();
+        let extra_dashes = after_closing_fence.chars().take_while(|b| *b == FENCE_CHAR).count();
+        if 0 < extra_dashes {
+            let extra_start = close_end;
+            let extra_end = extra_start + extra_dashes;
+            return Err(FrontmatterError::new(
+                format!("closing code fence has {extra_dashes} more `-` than the opening fence"),
+                extra_start..extra_end,
+            )
+            .push_visible_span(open_start..open_end));
+        } else {
+            let after_closing_fence = strip_newline(after_closing_fence);
+            let after_closing_fence = after_closing_fence.trim_matches(is_horizontal_whitespace);
+            if !after_closing_fence.is_empty() {
+                // extra characters beyond the original fence pattern
+                let after_start = after_closing_fence.offset_from(&raw);
+                let after_end = after_start + after_closing_fence.len();
+                return Err(FrontmatterError::new(
+                    format!("unexpected characters after frontmatter close"),
+                    after_start..after_end,
+                )
+                .push_visible_span(open_start..open_end));
+            }
+        }
+
+        source.content = content_start..content_end;
+
+        if let Some(nl_end) = strip_ws_lines(input.as_ref()) {
+            let _ = input.next_slice(nl_end);
+        }
+        let fence_length = input
+            .as_ref()
+            .char_indices()
+            .find_map(|(i, c)| (c != FENCE_CHAR).then_some(i))
+            .unwrap_or_else(|| input.eof_offset());
+        if 0 < fence_length {
+            let fence_start = input.current_token_start();
+            let fence_end = fence_start + fence_length;
+            return Err(FrontmatterError::new(
+                format!("only one frontmatter is supported"),
+                fence_start..fence_end,
+            )
+            .push_visible_span(open_start..open_end)
+            .push_visible_span(close_start..close_end));
+        }
+
+        Ok(source)
+    }
+
+    pub fn shebang(&self) -> Option<&'s str> {
+        self.shebang.clone().map(|span| &self.raw[span])
+    }
+
+    pub fn shebang_span(&self) -> Option<Span> {
+        self.shebang.clone()
+    }
+
+    pub fn open_span(&self) -> Option<Span> {
+        self.open.clone()
+    }
+
+    pub fn info(&self) -> Option<&'s str> {
+        self.info.clone().map(|span| &self.raw[span])
+    }
+
+    pub fn info_span(&self) -> Option<Span> {
+        self.info.clone()
+    }
+
+    pub fn frontmatter(&self) -> Option<&'s str> {
+        self.frontmatter.clone().map(|span| &self.raw[span])
+    }
+
+    pub fn frontmatter_span(&self) -> Option<Span> {
+        self.frontmatter.clone()
+    }
+
+    pub fn close_span(&self) -> Option<Span> {
+        self.close.clone()
+    }
+
+    pub fn content(&self) -> &'s str {
+        &self.raw[self.content.clone()]
+    }
+
+    pub fn content_span(&self) -> Span {
+        self.content.clone()
+    }
+}
+
+/// Returns the index after the shebang line, if present
+pub fn strip_shebang(input: &str) -> Option<usize> {
+    // See rust-lang/rust's compiler/rustc_lexer/src/lib.rs's `strip_shebang`
+    // Shebang must start with `#!` literally, without any preceding whitespace.
+    // For simplicity we consider any line starting with `#!` a shebang,
+    // regardless of restrictions put on shebangs by specific platforms.
+    if let Some(rest) = input.strip_prefix("#!") {
+        // Ok, this is a shebang but if the next non-whitespace token is `[`,
+        // then it may be valid Rust code, so consider it Rust code.
+        //
+        // NOTE: rustc considers line and block comments to be whitespace but to avoid
+        // any more awareness of Rust grammar, we are excluding it.
+        if !rest.trim_start().starts_with('[') {
+            // No other choice than to consider this a shebang.
+            let newline_end = input.find('\n').map(|pos| pos + 1).unwrap_or(input.len());
+            return Some(newline_end);
+        }
+    }
+    None
+}
+
+/// Returns the index after any lines with only whitespace, if present
+pub fn strip_ws_lines(input: &str) -> Option<usize> {
+    let ws_end = input.find(|c| !is_whitespace(c)).unwrap_or(input.len());
+    if ws_end == 0 {
+        return None;
+    }
+
+    let nl_start = input[0..ws_end].rfind('\n')?;
+    let nl_end = nl_start + 1;
+    Some(nl_end)
+}
+
+/// True if `c` is considered a whitespace according to Rust language definition.
+/// See [Rust language reference](https://doc.rust-lang.org/reference/whitespace.html)
+/// for definitions of these classes.
+fn is_whitespace(c: char) -> bool {
+    // This is Pattern_White_Space.
+    //
+    // Note that this set is stable (ie, it doesn't change with different
+    // Unicode versions), so it's ok to just hard-code the values.
+
+    matches!(
+        c,
+        // End-of-line characters
+        | '\u{000A}' // line feed (\n)
+        | '\u{000B}' // vertical tab
+        | '\u{000C}' // form feed
+        | '\u{000D}' // carriage return (\r)
+        | '\u{0085}' // next line (from latin1)
+        | '\u{2028}' // LINE SEPARATOR
+        | '\u{2029}' // PARAGRAPH SEPARATOR
+
+        // `Default_Ignorable_Code_Point` characters
+        | '\u{200E}' // LEFT-TO-RIGHT MARK
+        | '\u{200F}' // RIGHT-TO-LEFT MARK
+
+        // Horizontal space characters
+        | '\u{0009}' // tab (\t)
+        | '\u{0020}' // space
+    )
+}
+
+/// True if `c` is considered horizontal whitespace according to Rust language definition.
+fn is_horizontal_whitespace(c: char) -> bool {
+    // This is Pattern_White_Space.
+    //
+    // Note that this set is stable (ie, it doesn't change with different
+    // Unicode versions), so it's ok to just hard-code the values.
+
+    matches!(
+        c,
+        // Horizontal space characters
+        '\u{0009}' // tab (\t)
+        | '\u{0020}' // space
+    )
+}
+
+fn strip_newline(text: &str) -> &str {
+    text.strip_suffix("\r\n").or_else(|| text.strip_suffix('\n')).unwrap_or(text)
+}
+
+#[derive(Debug)]
+pub struct FrontmatterError {
+    message: String,
+    primary_span: Span,
+    visible_spans: Vec<Span>,
+}
+
+impl FrontmatterError {
+    pub fn new(message: impl Into<String>, span: Span) -> Self {
+        Self { message: message.into(), primary_span: span, visible_spans: Vec::new() }
+    }
+
+    pub fn push_visible_span(mut self, span: Span) -> Self {
+        self.visible_spans.push(span);
+        self
+    }
+
+    pub fn message(&self) -> &str {
+        self.message.as_str()
+    }
+
+    pub fn primary_span(&self) -> Span {
+        self.primary_span.clone()
+    }
+
+    pub fn visible_spans(&self) -> &[Span] {
+        &self.visible_spans
+    }
+}
+
+impl std::fmt::Display for FrontmatterError {
+    fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        self.message.fmt(fmt)
+    }
+}
+
+impl std::error::Error for FrontmatterError {}
diff --git a/crates/parser/src/lexed_str.rs b/crates/parser/src/lexed_str.rs
index edc3f406a67e..7c78ba8faf5f 100644
--- a/crates/parser/src/lexed_str.rs
+++ b/crates/parser/src/lexed_str.rs
@@ -37,10 +37,17 @@ impl<'a> LexedStr<'a> {
     pub fn new(edition: Edition, text: &'a str) -> LexedStr<'a> {
         let _p = tracing::info_span!("LexedStr::new").entered();
         let mut conv = Converter::new(edition, text);
-        if let Some(shebang_len) = rustc_lexer::strip_shebang(text) {
-            conv.res.push(SHEBANG, conv.offset);
-            conv.offset = shebang_len;
-        };
+        if let Ok(script) = crate::frontmatter::ScriptSource::parse(text) {
+            if let Some(shebang) = script.shebang_span() {
+                conv.push(SHEBANG, shebang.end - shebang.start, Vec::new());
+            }
+            if script.frontmatter().is_some() {
+                conv.push(FRONTMATTER, script.content_span().start - conv.offset, Vec::new());
+            }
+        } else if let Some(shebang_len) = rustc_lexer::strip_shebang(text) {
+            // Leave error reporting to `rustc_lexer`
+            conv.push(SHEBANG, shebang_len, Vec::new());
+        }
 
         // Re-create the tokenizer from scratch every token because `GuardedStrPrefix` is one token in the lexer
         // but we want to split it to two in edition <2024.
diff --git a/crates/parser/src/lib.rs b/crates/parser/src/lib.rs
index 7963f00bb25c..53444ef52cff 100644
--- a/crates/parser/src/lib.rs
+++ b/crates/parser/src/lib.rs
@@ -26,6 +26,7 @@ extern crate ra_ap_rustc_lexer as rustc_lexer;
 extern crate rustc_lexer;
 
 mod event;
+mod frontmatter;
 mod grammar;
 mod input;
 mod lexed_str;
diff --git a/crates/parser/test_data/lexer/ok/frontmatter.rast b/crates/parser/test_data/lexer/ok/frontmatter.rast
new file mode 100644
index 000000000000..2c7d3cdb1227
--- /dev/null
+++ b/crates/parser/test_data/lexer/ok/frontmatter.rast
@@ -0,0 +1,12 @@
+FRONTMATTER "\n---\n[dependencies]\nclap = \"4\"\n---\n"
+WHITESPACE "\n"
+FN_KW "fn"
+WHITESPACE " "
+IDENT "main"
+L_PAREN "("
+R_PAREN ")"
+WHITESPACE " "
+L_CURLY "{"
+WHITESPACE "\n"
+R_CURLY "}"
+WHITESPACE "\n"
diff --git a/crates/parser/test_data/lexer/ok/frontmatter.rs b/crates/parser/test_data/lexer/ok/frontmatter.rs
new file mode 100644
index 000000000000..be7bf74fdba2
--- /dev/null
+++ b/crates/parser/test_data/lexer/ok/frontmatter.rs
@@ -0,0 +1,8 @@
+
+---
+[dependencies]
+clap = "4"
+---
+
+fn main() {
+}
diff --git a/crates/parser/test_data/lexer/ok/shebang_frontmatter.rast b/crates/parser/test_data/lexer/ok/shebang_frontmatter.rast
new file mode 100644
index 000000000000..fb4787f4001f
--- /dev/null
+++ b/crates/parser/test_data/lexer/ok/shebang_frontmatter.rast
@@ -0,0 +1,13 @@
+SHEBANG "#!/usr/bin/env cargo\n"
+FRONTMATTER "\n---\n[dependencies]\nclap = \"4\"\n---\n"
+WHITESPACE "\n"
+FN_KW "fn"
+WHITESPACE " "
+IDENT "main"
+L_PAREN "("
+R_PAREN ")"
+WHITESPACE " "
+L_CURLY "{"
+WHITESPACE "\n"
+R_CURLY "}"
+WHITESPACE "\n"
diff --git a/crates/parser/test_data/lexer/ok/shebang_frontmatter.rs b/crates/parser/test_data/lexer/ok/shebang_frontmatter.rs
new file mode 100644
index 000000000000..090b7713feb3
--- /dev/null
+++ b/crates/parser/test_data/lexer/ok/shebang_frontmatter.rs
@@ -0,0 +1,9 @@
+#!/usr/bin/env cargo
+
+---
+[dependencies]
+clap = "4"
+---
+
+fn main() {
+}
diff --git a/crates/parser/test_data/lexer/ok/single_line_comments.rast b/crates/parser/test_data/lexer/ok/single_line_comments.rast
index a7681e9f5086..c4e531b449f7 100644
--- a/crates/parser/test_data/lexer/ok/single_line_comments.rast
+++ b/crates/parser/test_data/lexer/ok/single_line_comments.rast
@@ -1,5 +1,4 @@
-SHEBANG "#!/usr/bin/env bash"
-WHITESPACE "\n"
+SHEBANG "#!/usr/bin/env bash\n"
 COMMENT "// hello"
 WHITESPACE "\n"
 COMMENT "//! World"
diff --git a/crates/parser/test_data/parser/err/0002_duplicate_shebang.rast b/crates/parser/test_data/parser/err/0002_duplicate_shebang.rast
index 3159a15a3b1c..7ee1ecfbb159 100644
--- a/crates/parser/test_data/parser/err/0002_duplicate_shebang.rast
+++ b/crates/parser/test_data/parser/err/0002_duplicate_shebang.rast
@@ -1,6 +1,5 @@
 SOURCE_FILE
-  SHEBANG "#!/use/bin/env rusti"
-  WHITESPACE "\n"
+  SHEBANG "#!/use/bin/env rusti\n"
   ATTR
     POUND "#"
     BANG "!"
diff --git a/xtask/src/tidy.rs b/xtask/src/tidy.rs
index 0462835f0675..40997eb93d35 100644
--- a/xtask/src/tidy.rs
+++ b/xtask/src/tidy.rs
@@ -259,7 +259,7 @@ impl TidyDocs {
 }
 
 fn is_exclude_file(d: &Path) -> bool {
-    let file_names = ["tests.rs", "famous_defs_fixture.rs"];
+    let file_names = ["tests.rs", "famous_defs_fixture.rs", "frontmatter.rs"];
 
     d.file_name()
        .unwrap_or_default()
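
Reviewer note, not part of the patch: a minimal sketch of how the copied `ScriptSource::parse` is expected to split the `shebang_frontmatter.rs` fixture above, written as a would-be unit test inside the crate-private `frontmatter` module (the test name and the test itself are illustrative; the assertions follow from the parsing code and the expected `.rast` output).

#[test]
fn splits_shebang_frontmatter_and_content() {
    // Same input as test_data/lexer/ok/shebang_frontmatter.rs.
    let text = "#!/usr/bin/env cargo\n\n---\n[dependencies]\nclap = \"4\"\n---\n\nfn main() {\n}\n";
    let script = ScriptSource::parse(text).unwrap();
    // The shebang span includes its trailing newline, matching the SHEBANG token.
    assert_eq!(script.shebang(), Some("#!/usr/bin/env cargo\n"));
    // No infostring follows the opening `---`.
    assert_eq!(script.info(), None);
    // Only the lines between the fences, without the fences themselves.
    assert_eq!(script.frontmatter(), Some("[dependencies]\nclap = \"4\"\n"));
    // Everything after the closing fence's newline; `LexedStr::new` emits one
    // FRONTMATTER token covering the gap between the shebang and this point.
    assert_eq!(script.content(), "\nfn main() {\n}\n");
}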