1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default.

1 change: 1 addition & 0 deletions crates/parser/Cargo.toml
@@ -19,6 +19,7 @@ rustc-literal-escaper.workspace = true
tracing = { workspace = true, optional = true }

edition.workspace = true
winnow = { version = "0.7.13", default-features = false }

[dev-dependencies]
expect-test = "1.5.1"
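
For context, the syntax this PR teaches the parser to recognize is the cargo script frontmatter: an optional `#!/usr/bin/env cargo` shebang followed by a fence of three or more `-` characters, optionally tagged with an infostring such as `cargo`, enclosing embedded manifest data. A minimal example of such a file follows; the manifest contents are illustrative, not taken from this PR.

#!/usr/bin/env cargo
---cargo
[dependencies]
clap = "4"
---

fn main() {
    println!("hello from a cargo script");
}
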
348 changes: 348 additions & 0 deletions crates/parser/src/frontmatter.rs
@@ -0,0 +1,348 @@
// Copied from https://github.com/rust-lang/cargo/blob/367fd9f213750cd40317803dd0a5a3ce3f0c676d/src/cargo/util/frontmatter.rs
#![expect(dead_code)] // avoid editing
#![expect(unreachable_pub)] // avoid editing
#![expect(clippy::useless_format)] // avoid editing

type Span = std::ops::Range<usize>;

#[derive(Debug)]
pub struct ScriptSource<'s> {
/// The full file
raw: &'s str,
/// The `#!/usr/bin/env cargo` line, if present
shebang: Option<Span>,
/// The code fence opener (`---`)
open: Option<Span>,
/// Trailing text after `ScriptSource::open` that identifies the meaning of
/// `ScriptSource::frontmatter`
info: Option<Span>,
/// The lines between `ScriptSource::open` and `ScriptSource::close`
frontmatter: Option<Span>,
/// The code fence closer (`---`)
close: Option<Span>,
/// All content after the frontmatter and shebang
content: Span,
}

impl<'s> ScriptSource<'s> {
pub fn parse(raw: &'s str) -> Result<Self, FrontmatterError> {
use winnow::stream::FindSlice as _;
use winnow::stream::Location as _;
use winnow::stream::Offset as _;
use winnow::stream::Stream as _;

let content_end = raw.len();
let mut source = Self {
raw,
shebang: None,
open: None,
info: None,
frontmatter: None,
close: None,
content: 0..content_end,
};

let mut input = winnow::stream::LocatingSlice::new(raw);

if let Some(shebang_end) = strip_shebang(input.as_ref()) {
let shebang_start = input.current_token_start();
let _ = input.next_slice(shebang_end);
let shebang_end = input.current_token_start();
source.shebang = Some(shebang_start..shebang_end);
source.content = shebang_end..content_end;
}

// Whitespace may precede a frontmatter but must end with a newline
if let Some(nl_end) = strip_ws_lines(input.as_ref()) {
let _ = input.next_slice(nl_end);
}

// Opens with a line that starts with 3 or more `-` followed by an optional identifier
const FENCE_CHAR: char = '-';
let fence_length = input
.as_ref()
.char_indices()
.find_map(|(i, c)| (c != FENCE_CHAR).then_some(i))
.unwrap_or_else(|| input.eof_offset());
let open_start = input.current_token_start();
let fence_pattern = input.next_slice(fence_length);
let open_end = input.current_token_start();
match fence_length {
0 => {
return Ok(source);
}
1 | 2 => {
// either not a frontmatter or invalid frontmatter opening
return Err(FrontmatterError::new(
format!(
"found {fence_length} `{FENCE_CHAR}` in rust frontmatter, expected at least 3"
),
raw.len()..raw.len(),
).push_visible_span(open_start..open_end));
}
_ => {}
}
source.open = Some(open_start..open_end);
let Some(info_nl) = input.find_slice("\n") else {
return Err(FrontmatterError::new(
format!("unclosed frontmatter; expected `{fence_pattern}`"),
raw.len()..raw.len(),
)
.push_visible_span(open_start..open_end));
};
let info = input.next_slice(info_nl.start);
let info = info.strip_suffix('\r').unwrap_or(info); // already excludes `\n`
let info = info.trim_matches(is_horizontal_whitespace);
if !info.is_empty() {
let info_start = info.offset_from(&raw);
let info_end = info_start + info.len();
source.info = Some(info_start..info_end);
}

// Ends with a line that starts with a matching number of `-` only followed by whitespace
let nl_fence_pattern = format!("\n{fence_pattern}");
let Some(frontmatter_nl) = input.find_slice(nl_fence_pattern.as_str()) else {
for len in (2..(nl_fence_pattern.len() - 1)).rev() {
let Some(frontmatter_nl) = input.find_slice(&nl_fence_pattern[0..len]) else {
continue;
};
let _ = input.next_slice(frontmatter_nl.start + 1);
let close_start = input.current_token_start();
let _ = input.next_slice(len);
let close_end = input.current_token_start();
let fewer_dashes = fence_length - len;
return Err(FrontmatterError::new(
format!(
"closing code fence has {fewer_dashes} less `-` than the opening fence"
),
close_start..close_end,
)
.push_visible_span(open_start..open_end));
}
return Err(FrontmatterError::new(
format!("unclosed frontmatter; expected `{fence_pattern}`"),
raw.len()..raw.len(),
)
.push_visible_span(open_start..open_end));
};
let frontmatter_start = input.current_token_start() + 1; // skip nl from infostring
let _ = input.next_slice(frontmatter_nl.start + 1);
let frontmatter_end = input.current_token_start();
source.frontmatter = Some(frontmatter_start..frontmatter_end);
let close_start = input.current_token_start();
let _ = input.next_slice(fence_length);
let close_end = input.current_token_start();
source.close = Some(close_start..close_end);

let nl = input.find_slice("\n");
let after_closing_fence =
input.next_slice(nl.map(|span| span.end).unwrap_or_else(|| input.eof_offset()));
let content_start = input.current_token_start();
let extra_dashes = after_closing_fence.chars().take_while(|b| *b == FENCE_CHAR).count();
if 0 < extra_dashes {
let extra_start = close_end;
let extra_end = extra_start + extra_dashes;
return Err(FrontmatterError::new(
format!("closing code fence has {extra_dashes} more `-` than the opening fence"),
extra_start..extra_end,
)
.push_visible_span(open_start..open_end));
} else {
let after_closing_fence = strip_newline(after_closing_fence);
let after_closing_fence = after_closing_fence.trim_matches(is_horizontal_whitespace);
if !after_closing_fence.is_empty() {
// extra characters beyond the original fence pattern
let after_start = after_closing_fence.offset_from(&raw);
let after_end = after_start + after_closing_fence.len();
return Err(FrontmatterError::new(
format!("unexpected characters after frontmatter close"),
after_start..after_end,
)
.push_visible_span(open_start..open_end));
}
}

source.content = content_start..content_end;

if let Some(nl_end) = strip_ws_lines(input.as_ref()) {
let _ = input.next_slice(nl_end);
}
let fence_length = input
.as_ref()
.char_indices()
.find_map(|(i, c)| (c != FENCE_CHAR).then_some(i))
.unwrap_or_else(|| input.eof_offset());
if 0 < fence_length {
let fence_start = input.current_token_start();
let fence_end = fence_start + fence_length;
return Err(FrontmatterError::new(
format!("only one frontmatter is supported"),
fence_start..fence_end,
)
.push_visible_span(open_start..open_end)
.push_visible_span(close_start..close_end));
}

Ok(source)
}

pub fn shebang(&self) -> Option<&'s str> {
self.shebang.clone().map(|span| &self.raw[span])
}

pub fn shebang_span(&self) -> Option<Span> {
self.shebang.clone()
}

pub fn open_span(&self) -> Option<Span> {
self.open.clone()
}

pub fn info(&self) -> Option<&'s str> {
self.info.clone().map(|span| &self.raw[span])
}

pub fn info_span(&self) -> Option<Span> {
self.info.clone()
}

pub fn frontmatter(&self) -> Option<&'s str> {
self.frontmatter.clone().map(|span| &self.raw[span])
}

pub fn frontmatter_span(&self) -> Option<Span> {
self.frontmatter.clone()
}

pub fn close_span(&self) -> Option<Span> {
self.close.clone()
}

pub fn content(&self) -> &'s str {
&self.raw[self.content.clone()]
}

pub fn content_span(&self) -> Span {
self.content.clone()
}
}

/// Returns the index after the shebang line, if present
pub fn strip_shebang(input: &str) -> Option<usize> {
// See rust-lang/rust's compiler/rustc_lexer/src/lib.rs's `strip_shebang`
// Shebang must start with `#!` literally, without any preceding whitespace.
// For simplicity we consider any line starting with `#!` a shebang,
// regardless of restrictions put on shebangs by specific platforms.
if let Some(rest) = input.strip_prefix("#!") {
// Ok, this is a shebang but if the next non-whitespace token is `[`,
// then it may be valid Rust code, so consider it Rust code.
//
// NOTE: rustc considers line and block comments to be whitespace but to avoid
// any more awareness of Rust grammar, we are excluding it.
if !rest.trim_start().starts_with('[') {
// No other choice than to consider this a shebang.
let newline_end = input.find('\n').map(|pos| pos + 1).unwrap_or(input.len());
return Some(newline_end);
}
}
None
}

/// Returns the index after any lines with only whitespace, if present
pub fn strip_ws_lines(input: &str) -> Option<usize> {
let ws_end = input.find(|c| !is_whitespace(c)).unwrap_or(input.len());
if ws_end == 0 {
return None;
}

let nl_start = input[0..ws_end].rfind('\n')?;
let nl_end = nl_start + 1;
Some(nl_end)
}

/// True if `c` is considered a whitespace according to Rust language definition.
/// See [Rust language reference](https://doc.rust-lang.org/reference/whitespace.html)
/// for definitions of these classes.
fn is_whitespace(c: char) -> bool {
// This is Pattern_White_Space.
//
// Note that this set is stable (ie, it doesn't change with different
// Unicode versions), so it's ok to just hard-code the values.

matches!(
c,
// End-of-line characters
| '\u{000A}' // line feed (\n)
| '\u{000B}' // vertical tab
| '\u{000C}' // form feed
| '\u{000D}' // carriage return (\r)
| '\u{0085}' // next line (from latin1)
| '\u{2028}' // LINE SEPARATOR
| '\u{2029}' // PARAGRAPH SEPARATOR

// `Default_Ignorable_Code_Point` characters
| '\u{200E}' // LEFT-TO-RIGHT MARK
| '\u{200F}' // RIGHT-TO-LEFT MARK

// Horizontal space characters
| '\u{0009}' // tab (\t)
| '\u{0020}' // space
)
}

/// True if `c` is considered horizontal whitespace according to Rust language definition.
fn is_horizontal_whitespace(c: char) -> bool {
// This is Pattern_White_Space.
//
// Note that this set is stable (ie, it doesn't change with different
// Unicode versions), so it's ok to just hard-code the values.

matches!(
c,
// Horizontal space characters
'\u{0009}' // tab (\t)
| '\u{0020}' // space
)
}

fn strip_newline(text: &str) -> &str {
text.strip_suffix("\r\n").or_else(|| text.strip_suffix('\n')).unwrap_or(text)
}

#[derive(Debug)]
pub struct FrontmatterError {
message: String,
primary_span: Span,
visible_spans: Vec<Span>,
}

impl FrontmatterError {
pub fn new(message: impl Into<String>, span: Span) -> Self {
Self { message: message.into(), primary_span: span, visible_spans: Vec::new() }
}

pub fn push_visible_span(mut self, span: Span) -> Self {
self.visible_spans.push(span);
self
}

pub fn message(&self) -> &str {
self.message.as_str()
}

pub fn primary_span(&self) -> Span {
self.primary_span.clone()
}

pub fn visible_spans(&self) -> &[Span] {
&self.visible_spans
}
}

impl std::fmt::Display for FrontmatterError {
fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
self.message.fmt(fmt)
}
}

impl std::error::Error for FrontmatterError {}
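
As a sketch of how the copied parser carves a script into spans, here is an illustrative crate-internal test (not part of the PR; the module name and sample inputs are made up) exercising `ScriptSource::parse` and its accessors:

#[cfg(test)]
mod frontmatter_examples {
    use crate::frontmatter::ScriptSource;

    #[test]
    fn splits_shebang_frontmatter_and_content() {
        let raw = "#!/usr/bin/env cargo\n---cargo\n[dependencies]\nclap = \"4\"\n---\nfn main() {}\n";
        let script = ScriptSource::parse(raw).unwrap();

        // The shebang span includes its trailing newline.
        assert_eq!(script.shebang(), Some("#!/usr/bin/env cargo\n"));
        // `info` is the trimmed text after the opening fence.
        assert_eq!(script.info(), Some("cargo"));
        // `frontmatter` covers the lines between the fences.
        assert_eq!(script.frontmatter(), Some("[dependencies]\nclap = \"4\"\n"));
        // `content` starts on the line after the closing fence.
        assert_eq!(script.content(), "fn main() {}\n");
    }

    #[test]
    fn inner_attribute_is_not_a_shebang() {
        // `#![...]` is treated as Rust code, not a shebang, so everything is content.
        let raw = "#![allow(dead_code)]\nfn main() {}\n";
        let script = ScriptSource::parse(raw).unwrap();
        assert_eq!(script.shebang(), None);
        assert_eq!(script.frontmatter(), None);
        assert_eq!(script.content(), raw);
    }
}
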
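`FrontmatterError` carries a primary span plus additional `visible_spans` so a caller can point a diagnostic at both fences. A hedged sketch of two error cases, again written as an illustrative crate-internal test; the expected messages are quoted from the strings in the code above:

#[cfg(test)]
mod frontmatter_error_examples {
    use crate::frontmatter::ScriptSource;

    #[test]
    fn reports_a_short_closing_fence() {
        // The opening fence has four dashes, the closing fence only three.
        let raw = "----\n[dependencies]\n---\nfn main() {}\n";
        let err = ScriptSource::parse(raw).unwrap_err();
        assert_eq!(err.message(), "closing code fence has 1 less `-` than the opening fence");
        // The primary span points at the short closing fence...
        assert_eq!(&raw[err.primary_span()], "---");
        // ...while the visible spans add the opening fence for context.
        assert_eq!(&raw[err.visible_spans()[0].clone()], "----");
    }

    #[test]
    fn rejects_a_second_frontmatter() {
        let raw = "---\n[dependencies]\n---\n---\n[package]\n---\nfn main() {}\n";
        let err = ScriptSource::parse(raw).unwrap_err();
        assert_eq!(err.message(), "only one frontmatter is supported");
    }
}
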
15 changes: 11 additions & 4 deletions crates/parser/src/lexed_str.rs
@@ -37,10 +37,17 @@ impl<'a> LexedStr<'a> {
     pub fn new(edition: Edition, text: &'a str) -> LexedStr<'a> {
         let _p = tracing::info_span!("LexedStr::new").entered();
         let mut conv = Converter::new(edition, text);
-        if let Some(shebang_len) = rustc_lexer::strip_shebang(text) {
-            conv.res.push(SHEBANG, conv.offset);
-            conv.offset = shebang_len;
-        };
+        if let Ok(script) = crate::frontmatter::ScriptSource::parse(text) {
+            if let Some(shebang) = script.shebang_span() {
+                conv.push(SHEBANG, shebang.end - shebang.start, Vec::new());
+            }
+            if script.frontmatter().is_some() {
+                conv.push(FRONTMATTER, script.content_span().start - conv.offset, Vec::new());
+            }
+        } else if let Some(shebang_len) = rustc_lexer::strip_shebang(text) {
+            // Leave error reporting to `rustc_lexer`
+            conv.push(SHEBANG, shebang_len, Vec::new());
+        }
 
         // Re-create the tokenizer from scratch every token because `GuardedStrPrefix` is one token in the lexer
         // but we want to split it to two in edition <2024.
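
Net effect of this hunk: when `ScriptSource::parse` succeeds, the lexer emits at most one SHEBANG token and one FRONTMATTER token before tokenizing the rest of the file, and the FRONTMATTER token runs from the end of the shebang (or from offset 0 if there is none) to the start of `content`, so it swallows any blank lines, both fences, the infostring, and the frontmatter body. A hedged sketch of that length arithmetic using only the `ScriptSource` API; the helper is illustrative and assumes, as the hunk does, that `conv.offset` equals the number of bytes already consumed, i.e. the shebang length:

// Illustrative helper (not in the PR): the byte lengths of the SHEBANG and
// FRONTMATTER tokens the lexer would emit for `text`.
fn leading_token_lengths(text: &str) -> (usize, usize) {
    let Ok(script) = crate::frontmatter::ScriptSource::parse(text) else {
        // The real code falls back to `rustc_lexer::strip_shebang` here.
        return (0, 0);
    };
    let shebang_len = script.shebang_span().map_or(0, |s| s.end - s.start);
    let frontmatter_len = if script.frontmatter().is_some() {
        // Everything between the shebang and the start of `content` becomes
        // a single FRONTMATTER token.
        script.content_span().start - shebang_len
    } else {
        0
    };
    (shebang_len, frontmatter_len)
}

For the sample script shown after the Cargo.toml change, that is 21 bytes of SHEBANG (`#!/usr/bin/env cargo` plus its newline) and everything from the opening `---cargo` through the line containing the closing `---` as FRONTMATTER.
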
1 change: 1 addition & 0 deletions crates/parser/src/lib.rs
@@ -26,6 +26,7 @@ extern crate ra_ap_rustc_lexer as rustc_lexer;
extern crate rustc_lexer;

mod event;
mod frontmatter;
mod grammar;
mod input;
mod lexed_str;