289: Add a name_span field to Rule. r=ltratt a=ratmice



Co-authored-by: matt rice <ratmice@gmail.com>
bors[bot] and ratmice committed May 14, 2022
2 parents c2cb807 + 2128db8 commit aa63908
Showing 9 changed files with 152 additions and 65 deletions.
3 changes: 3 additions & 0 deletions cfgrammar/src/lib/mod.rs
@@ -53,8 +53,11 @@
use serde::{Deserialize, Serialize};

mod idxnewtype;
pub mod span;
pub mod yacc;

pub use span::Span;

/// Newtypes for production (`PIdx`), rule (`RIdx`), symbol (`SIdx`) and token (`TIdx`) indices.
pub use crate::idxnewtype::{PIdx, RIdx, SIdx, TIdx};

45 changes: 45 additions & 0 deletions cfgrammar/src/lib/span.rs
@@ -0,0 +1,45 @@
#[cfg(feature = "serde")]
use serde::{Deserialize, Serialize};

/// A `Span` records what portion of the user's input something (e.g. a lexeme or production)
/// references (i.e. the `Span` doesn't hold a reference / copy of the actual input).
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub struct Span {
start: usize,
end: usize,
}

impl Span {
/// Create a new span starting at byte `start` and ending at byte `end`.
///
/// # Panics
///
/// If `end` is less than `start`.
pub fn new(start: usize, end: usize) -> Self {
if end < start {
panic!("Span starts ({}) after it ends ({})!", start, end);
}
Span { start, end }
}

/// Byte offset of the start of the span.
pub fn start(&self) -> usize {
self.start
}

/// Byte offset of the end of the span.
pub fn end(&self) -> usize {
self.end
}

/// Length in bytes of the span.
pub fn len(&self) -> usize {
self.end - self.start
}

/// Returns `true` if this `Span` covers 0 bytes, or `false` otherwise.
pub fn is_empty(&self) -> bool {
self.len() == 0
}
}
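
A quick sketch of how `Span` behaves, using the offsets that `test_token_spans` (further down in this commit) checks for; note that a `Span` stores only byte offsets, never the input itself:

use cfgrammar::Span;

fn main() {
    let src = "%%\nAB: 'a' | 'foo';";
    // The token 'a' occupies byte 8; the surrounding quotes are not part of the span.
    let span = Span::new(8, 9);
    assert_eq!((span.start(), span.end(), span.len()), (8, 9, 1));
    assert!(!span.is_empty());
    // Recover the text by indexing back into the original input.
    assert_eq!(&src[span.start()..span.end()], "a");
}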
4 changes: 4 additions & 0 deletions cfgrammar/src/lib/yacc/ast.rs
@@ -8,6 +8,8 @@ use indexmap::{IndexMap, IndexSet};

use super::Precedence;

use crate::Span;

/// An AST representing a grammar. This is built up gradually: when it is finished,
/// `complete_and_validate` must be called exactly once in order to finish the set-up. At that
/// point, any further mutations made to the struct lead to undefined behaviour.
@@ -17,6 +19,7 @@ pub struct GrammarAST {
pub rules: IndexMap<String, Rule>,
pub prods: Vec<Production>,
pub tokens: IndexSet<String>,
pub spans: Vec<Span>,
pub precs: HashMap<String, Precedence>,
pub avoid_insert: Option<HashSet<String>>,
pub implicit_tokens: Option<HashSet<String>>,
@@ -115,6 +118,7 @@ impl GrammarAST {
rules: IndexMap::new(), // Using an IndexMap means that we retain the order
// of rules as they're found in the input file.
prods: Vec::new(),
spans: Vec::new(),
tokens: IndexSet::new(),
precs: HashMap::new(),
avoid_insert: None,
43 changes: 32 additions & 11 deletions cfgrammar/src/lib/yacc/grammar.rs
@@ -12,7 +12,7 @@ use super::{
parser::{YaccParser, YaccParserError},
YaccKind,
};
- use crate::{PIdx, RIdx, SIdx, Symbol, TIdx};
+ use crate::{PIdx, RIdx, SIdx, Span, Symbol, TIdx};

const START_RULE: &str = "^";
const IMPLICIT_RULE: &str = "~";
@@ -42,9 +42,9 @@ pub struct YaccGrammar<StorageT = u32> {
rules_len: RIdx<StorageT>,
/// A mapping from `RIdx` -> `String`.
rule_names: Vec<String>,
- /// A mapping from `TIdx` -> `Option<String>`. Every user-specified token will have a name,
+ /// A mapping from `TIdx` -> `Option<(Span, String)>`. Every user-specified token will have a name,
/// but tokens inserted by cfgrammar (e.g. the EOF token) won't.
- token_names: Vec<Option<String>>,
+ token_names: Vec<Option<(Span, String)>>,
/// A mapping from `TIdx` -> `Option<Precedence>`
token_precs: Vec<Option<Precedence>>,
/// A mapping from `TIdx` -> `Option<String>` for the %epp declaration, giving pretty-printed
@@ -188,11 +188,11 @@ where
rule_map.insert(v.clone(), RIdx(i.as_()));
}

- let mut token_names: Vec<Option<String>> = Vec::with_capacity(ast.tokens.len() + 1);
+ let mut token_names: Vec<Option<(Span, String)>> = Vec::with_capacity(ast.tokens.len() + 1);
let mut token_precs: Vec<Option<Precedence>> = Vec::with_capacity(ast.tokens.len() + 1);
let mut token_epp: Vec<Option<String>> = Vec::with_capacity(ast.tokens.len() + 1);
- for k in &ast.tokens {
- token_names.push(Some(k.clone()));
+ for (i, k) in ast.tokens.iter().enumerate() {
+ token_names.push(Some((ast.spans[i], k.clone())));
token_precs.push(ast.precs.get(k).cloned());
token_epp.push(Some(ast.epp.get(k).unwrap_or(k).clone()));
}
@@ -202,7 +202,7 @@
token_epp.push(None);
let mut token_map = HashMap::<String, TIdx<StorageT>>::new();
for (i, v) in token_names.iter().enumerate() {
- if let Some(n) = v.as_ref() {
+ if let Some((_, n)) = v.as_ref() {
token_map.insert(n.clone(), TIdx(i.as_()));
}
}
@@ -460,7 +460,9 @@ where
/// Return the name of token `tidx` (where `None` indicates "the token has no name"). Panics if
/// `tidx` doesn't exist.
pub fn token_name(&self, tidx: TIdx<StorageT>) -> Option<&str> {
- self.token_names[usize::from(tidx)].as_deref()
+ self.token_names[usize::from(tidx)]
+ .as_ref()
+ .map(|x| x.1.as_str())
}

/// Return the precedence of token `tidx` (where `None` indicates "no precedence specified").
@@ -474,6 +476,11 @@
pub fn token_epp(&self, tidx: TIdx<StorageT>) -> Option<&str> {
self.token_epp[usize::from(tidx)].as_deref()
}

/// Return the `Span` covering token `tidx`'s occurrence in the grammar source (where `None`
/// indicates "the token was inserted by cfgrammar and has no source location"). Panics if
/// `tidx` doesn't exist.
pub fn token_span(&self, tidx: TIdx<StorageT>) -> Option<&Span> {
self.token_names[usize::from(tidx)]
.as_ref()
.map(|(span, _)| span)
}

/// Get the action for production `pidx`. Panics if `pidx` doesn't exist.
pub fn action(&self, pidx: PIdx<StorageT>) -> &Option<String> {
@@ -498,7 +505,7 @@
pub fn tokens_map(&self) -> HashMap<&str, TIdx<StorageT>> {
let mut m = HashMap::with_capacity(usize::from(self.tokens_len) - 1);
for tidx in self.iter_tidxs() {
- if let Some(n) = self.token_names[usize::from(tidx)].as_ref() {
+ if let Some((_, n)) = self.token_names[usize::from(tidx)].as_ref() {
m.insert(&**n, tidx);
}
}
@@ -509,7 +516,7 @@
pub fn token_idx(&self, n: &str) -> Option<TIdx<StorageT>> {
self.token_names
.iter()
- .position(|x| x.as_ref().map_or(false, |x| x == n))
+ .position(|x| x.as_ref().map_or(false, |(_, x)| x == n))
// The call to as_() is safe because token_names is guaranteed to be small
// enough to fit into StorageT
.map(|x| TIdx(x.as_()))
@@ -1032,7 +1039,7 @@ mod test {
super::{AssocKind, Precedence, YaccGrammar, YaccKind, YaccOriginalActionKind},
rule_max_costs, rule_min_costs, IMPLICIT_RULE, IMPLICIT_START_RULE,
};
- use crate::{PIdx, RIdx, Symbol, TIdx};
+ use crate::{PIdx, RIdx, Span, Symbol, TIdx};
use std::collections::HashMap;

#[test]
@@ -1460,4 +1467,18 @@
]
);
}

#[test]
fn test_token_spans() {
let src = "%%\nAB: 'a' | 'foo';";
let grm =
YaccGrammar::new(YaccKind::Original(YaccOriginalActionKind::NoAction), src).unwrap();
let token_map = grm.tokens_map();
let a_tidx = token_map.get("a");
let foo_tidx = token_map.get("foo");
let a_span = grm.token_span(*a_tidx.unwrap());
let foo_span = grm.token_span(*foo_tidx.unwrap());
assert_eq!(a_span, Some(&Span::new(8, 9)));
assert_eq!(foo_span, Some(&Span::new(14, 17)));
}
}
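
Combined with the original input, `token_span` lets a consumer recover exactly where a token was declared. A minimal sketch along the lines of `test_token_spans` (import paths assumed from the crate layout above):

use cfgrammar::yacc::{YaccGrammar, YaccKind, YaccOriginalActionKind};

fn main() {
    let src = "%%\nAB: 'a' | 'foo';";
    let grm =
        YaccGrammar::new(YaccKind::Original(YaccOriginalActionKind::NoAction), src).unwrap();
    let tidx = grm.token_idx("foo").unwrap();
    let span = grm.token_span(tidx).expect("user-specified tokens always have a span");
    // The stored span excludes the quotes, so this recovers exactly the token's text.
    assert_eq!(&src[span.start()..span.end()], "foo");
}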
38 changes: 27 additions & 11 deletions cfgrammar/src/lib/yacc/parser.rs
@@ -8,6 +8,8 @@ use regex::Regex;

type YaccResult<T> = Result<T, YaccParserError>;

use crate::Span;

use super::{
ast::{GrammarAST, Symbol},
AssocKind, Precedence, YaccKind,
@@ -144,8 +146,9 @@ impl YaccParser {
if self.lookahead_is("%", i).is_some() {
break;
}
- let (j, n) = self.parse_token(i)?;
+ let (j, n, span) = self.parse_token(i)?;
self.ast.tokens.insert(n);
+ self.ast.spans.push(span);
i = self.parse_ws(j, true)?;
}
continue;
@@ -176,7 +179,7 @@
}
if let Some(j) = self.lookahead_is("%epp", i) {
i = self.parse_ws(j, false)?;
- let (j, n) = self.parse_token(i)?;
+ let (j, n, _) = self.parse_token(i)?;
if self.ast.epp.contains_key(&n) {
return Err(self.mk_error(YaccParserErrorKind::DuplicateEPP, i));
}
@@ -213,8 +216,9 @@
self.ast.avoid_insert = Some(HashSet::new());
}
while j < self.src.len() && self.newlines.len() == num_newlines {
- let (j, n) = self.parse_token(i)?;
+ let (j, n, span) = self.parse_token(i)?;
self.ast.tokens.insert(n.clone());
+ self.ast.spans.push(span);
if self.ast.avoid_insert.as_ref().unwrap().contains(&n) {
return Err(
self.mk_error(YaccParserErrorKind::DuplicateAvoidInsertDeclaration, i)
@@ -247,8 +251,9 @@
self.ast.implicit_tokens = Some(HashSet::new());
}
while j < self.src.len() && self.newlines.len() == num_newlines {
- let (j, n) = self.parse_token(i)?;
+ let (j, n, span) = self.parse_token(i)?;
self.ast.tokens.insert(n.clone());
+ self.ast.spans.push(span);
if self.ast.implicit_tokens.as_ref().unwrap().contains(&n) {
return Err(self.mk_error(
YaccParserErrorKind::DuplicateImplicitTokensDeclaration,
@@ -280,7 +285,7 @@
i = self.parse_ws(k, false)?;
let num_newlines = self.newlines.len();
while i < self.src.len() && num_newlines == self.newlines.len() {
- let (j, n) = self.parse_token(i)?;
+ let (j, n, _) = self.parse_token(i)?;
if self.ast.precs.contains_key(&n) {
return Err(self.mk_error(YaccParserErrorKind::DuplicatePrecedence, i));
}
@@ -365,13 +370,14 @@
}

if self.lookahead_is("\"", i).is_some() || self.lookahead_is("'", i).is_some() {
- let (j, sym) = self.parse_token(i)?;
+ let (j, sym, span) = self.parse_token(i)?;
i = self.parse_ws(j, true)?;
self.ast.tokens.insert(sym.clone());
+ self.ast.spans.push(span);
syms.push(Symbol::Token(sym));
} else if let Some(j) = self.lookahead_is("%prec", i) {
i = self.parse_ws(j, true)?;
- let (k, sym) = self.parse_token(i)?;
+ let (k, sym, _) = self.parse_token(i)?;
if self.ast.tokens.contains(&sym) {
prec = Some(sym);
} else {
@@ -383,7 +389,7 @@
i = j;
action = Some(a);
} else {
- let (j, sym) = self.parse_token(i)?;
+ let (j, sym, _) = self.parse_token(i)?;
if self.ast.tokens.contains(&sym) {
syms.push(Symbol::Token(sym));
} else {
@@ -406,16 +412,26 @@
}
}

- fn parse_token(&self, i: usize) -> YaccResult<(usize, String)> {
+ fn parse_token(&self, i: usize) -> YaccResult<(usize, String, Span)> {
match RE_TOKEN.find(&self.src[i..]) {
Some(m) => {
assert!(m.start() == 0 && m.end() > 0);
match self.src[i..].chars().next().unwrap() {
'"' | '\'' => {
debug_assert!('"'.len_utf8() == 1 && '\''.len_utf8() == 1);
- Ok((i + m.end(), self.src[i + 1..i + m.end() - 1].to_string()))
+ let start_cidx = i + 1;
+ let end_cidx = i + m.end() - 1;
+ Ok((
+ i + m.end(),
+ self.src[start_cidx..end_cidx].to_string(),
+ Span::new(start_cidx, end_cidx),
+ ))
}
- _ => Ok((i + m.end(), self.src[i..i + m.end()].to_string())),
+ _ => Ok((
+ i + m.end(),
+ self.src[i..i + m.end()].to_string(),
+ Span::new(i, i + m.end()),
+ )),
}
}
None => Err(self.mk_error(YaccParserErrorKind::IllegalString, i)),
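
The index arithmetic above is what produces the quote-free spans checked by `test_token_spans`; a worked example against the same input:

// src = "%%\nAB: 'a' | 'foo';"
// For 'a':   the regex match starts at the opening quote (i = 7) and is 3 bytes long
//            (m.end() = 3), so start_cidx = 8 and end_cidx = 9: Span::new(8, 9) covers
//            just the `a`.
// For 'foo': i = 13 and m.end() = 5, giving Span::new(14, 17) -- again both quotes
//            are excluded.
// Unquoted names (e.g. in %token declarations) keep their full extent:
//            Span::new(i, i + m.end()).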
8 changes: 7 additions & 1 deletion lrlex/src/lib/ctbuilder.rs
@@ -400,11 +400,17 @@ pub fn lexerdef() -> {lexerdef_type} {{
Some(ref n) => format!("Some({:?}.to_string())", n),
None => "None".to_owned(),
};
let n_span = format!(
"lrpar::Span::new({}, {})",
r.name_span.start(),
r.name_span.end()
);
outs.push_str(&format!(
"
- Rule::new({}, {}, \"{}\".to_string()).unwrap(),",
+ Rule::new({}, {}, {}, \"{}\".to_string()).unwrap(),",
tok_id,
n,
n_span,
r.re_str.replace('\\', "\\\\").replace('"', "\\\"")
));
}
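
For a lexer rule such as `a 'A'` (the first rule in `test_token_span` below), the generated source now contains a line of roughly the following shape; the token id is assigned elsewhere in the builder, so the `Some(0)` here is a placeholder:

Rule::new(Some(0), Some("A".to_string()), lrpar::Span::new(6, 7), "a".to_string()).unwrap(),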
26 changes: 26 additions & 0 deletions lrlex/src/lib/lexer.rs
@@ -24,6 +24,7 @@ pub struct Rule<StorageT> {
/// This rule's name. If None, then text which matches this rule will be skipped (i.e. will not
/// create a lexeme).
pub name: Option<String>,
/// The span of this rule's name in the lexer definition source; rules without a name get an
/// empty span.
pub name_span: Span,
pub(super) re_str: String,
re: Regex,
}
@@ -35,6 +36,7 @@ impl<StorageT> Rule<StorageT> {
pub fn new(
tok_id: Option<StorageT>,
name: Option<String>,
name_span: Span,
re_str: String,
) -> Result<Rule<StorageT>, regex::Error> {
let re = RegexBuilder::new(&format!("\\A(?:{})", &re_str))
@@ -44,6 +46,7 @@
Ok(Rule {
tok_id,
name,
name_span,
re_str,
re,
})
@@ -661,4 +664,27 @@
assert_eq!(lexer.line_col(lexemes[0].span()), ((1, 1), (2, 3)));
assert_eq!(lexer.span_lines_str(lexemes[0].span()), "'a\nb'");
}

#[test]
fn test_token_span() {
let src = "%%
a 'A'
b 'B'
[ \\n] ;"
.to_string();
let lexerdef = LRNonStreamingLexerDef::<DefaultLexeme<u8>, u8>::from_str(&src).unwrap();
assert_eq!(
lexerdef.get_rule_by_name("A").unwrap().name_span,
lrpar::Span::new(6, 7)
);
assert_eq!(
lexerdef.get_rule_by_name("B").unwrap().name_span,
lrpar::Span::new(12, 13)
);
let anonymous_rules = lexerdef
.iter_rules()
.filter(|rule| rule.name.is_none())
.collect::<Vec<_>>();
assert_eq!(anonymous_rules[0].name_span, lrpar::Span::new(21, 21));
}
}
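
The payoff of recording `name_span` is that tools can point back into the lex source. A hypothetical helper (not part of this commit, and assuming `lrpar::Span` is the re-exported `cfgrammar` type, as the test above suggests) might slice a rule's name back out for a diagnostic:

// Recover the text of a rule's name from the lex source, e.g. for an error message.
// Returns the empty string for rules without a name, since their `name_span` is empty.
fn rule_name_text<'a>(src: &'a str, name_span: lrpar::Span) -> &'a str {
    &src[name_span.start()..name_span.end()]
}

// With `src` from `test_token_span` above, rule_name_text(&src, lrpar::Span::new(6, 7)) == "A".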