289: Add a name_span field to Rule. r=ltratt a=ratmice



Co-authored-by: matt rice <ratmice@gmail.com>
bors[bot] and ratmice committed May 14, 2022
2 parents c2cb807 + 2128db8 commit aa63908
Showing 9 changed files with 152 additions and 65 deletions.
3 changes: 3 additions & 0 deletions cfgrammar/src/lib/mod.rs
@@ -53,8 +53,11 @@
use serde::{Deserialize, Serialize};

mod idxnewtype;
pub mod span;
pub mod yacc;

pub use span::Span;

/// Newtypes for production (`PIdx`), rule (`RIdx`), symbol (`SIdx`) and token (`TIdx`) indices.
pub use crate::idxnewtype::{PIdx, RIdx, SIdx, TIdx};

45 changes: 45 additions & 0 deletions cfgrammar/src/lib/span.rs
@@ -0,0 +1,45 @@
#[cfg(feature = "serde")]
use serde::{Deserialize, Serialize};

/// A `Span` records what portion of the user's input something (e.g. a lexeme or production)
/// references (i.e. the `Span` doesn't hold a reference / copy of the actual input).
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub struct Span {
start: usize,
end: usize,
}

impl Span {
/// Create a new span starting at byte `start` and ending at byte `end`.
///
/// # Panics
///
/// If `end` is less than `start`.
pub fn new(start: usize, end: usize) -> Self {
if end < start {
panic!("Span starts ({}) after it ends ({})!", start, end);
}
Span { start, end }
}

/// Byte offset of the start of the span.
pub fn start(&self) -> usize {
self.start
}

/// Byte offset of the end of the span.
pub fn end(&self) -> usize {
self.end
}

/// Length in bytes of the span.
pub fn len(&self) -> usize {
self.end - self.start
}

/// Returns `true` if this `Span` covers 0 bytes, or `false` otherwise.
pub fn is_empty(&self) -> bool {
self.len() == 0
}
}
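
A quick sketch of how `Span` behaves, using the offsets that `test_token_spans` (further down in this commit) checks for; note that a `Span` stores only byte offsets, never the input itself:

use cfgrammar::Span;

fn main() {
    let src = "%%\nAB: 'a' | 'foo';";
    // The token 'a' occupies byte 8; the surrounding quotes are not part of the span.
    let span = Span::new(8, 9);
    assert_eq!((span.start(), span.end(), span.len()), (8, 9, 1));
    assert!(!span.is_empty());
    // Recover the text by indexing back into the original input.
    assert_eq!(&src[span.start()..span.end()], "a");
}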
4 changes: 4 additions & 0 deletions cfgrammar/src/lib/yacc/ast.rs
@@ -8,6 +8,8 @@ use indexmap::{IndexMap, IndexSet};

use super::Precedence;

use crate::Span;

/// An AST representing a grammar. This is built up gradually: when it is finished,
/// `complete_and_validate` must be called exactly once in order to finish the set-up. At that
/// point, any further mutations made to the struct lead to undefined behaviour.
@@ -17,6 +19,7 @@ pub struct GrammarAST {
pub rules: IndexMap<String, Rule>,
pub prods: Vec<Production>,
pub tokens: IndexSet<String>,
pub spans: Vec<Span>,
pub precs: HashMap<String, Precedence>,
pub avoid_insert: Option<HashSet<String>>,
pub implicit_tokens: Option<HashSet<String>>,
@@ -115,6 +118,7 @@ impl GrammarAST {
rules: IndexMap::new(), // Using an IndexMap means that we retain the order
// of rules as they're found in the input file.
prods: Vec::new(),
spans: Vec::new(),
tokens: IndexSet::new(),
precs: HashMap::new(),
avoid_insert: None,
43 changes: 32 additions & 11 deletions cfgrammar/src/lib/yacc/grammar.rs
@@ -12,7 +12,7 @@ use super::{
parser::{YaccParser, YaccParserError},
YaccKind,
};
- use crate::{PIdx, RIdx, SIdx, Symbol, TIdx};
+ use crate::{PIdx, RIdx, SIdx, Span, Symbol, TIdx};

const START_RULE: &str = "^";
const IMPLICIT_RULE: &str = "~";
@@ -42,9 +42,9 @@ pub struct YaccGrammar<StorageT = u32> {
rules_len: RIdx<StorageT>,
/// A mapping from `RIdx` -> `String`.
rule_names: Vec<String>,
- /// A mapping from `TIdx` -> `Option<String>`. Every user-specified token will have a name,
+ /// A mapping from `TIdx` -> `Option<(Span, String)>`. Every user-specified token will have a name,
/// but tokens inserted by cfgrammar (e.g. the EOF token) won't.
- token_names: Vec<Option<String>>,
+ token_names: Vec<Option<(Span, String)>>,
/// A mapping from `TIdx` -> `Option<Precedence>`
token_precs: Vec<Option<Precedence>>,
/// A mapping from `TIdx` -> `Option<String>` for the %epp declaration, giving pretty-printed
@@ -188,11 +188,11 @@ where
rule_map.insert(v.clone(), RIdx(i.as_()));
}

- let mut token_names: Vec<Option<String>> = Vec::with_capacity(ast.tokens.len() + 1);
+ let mut token_names: Vec<Option<(Span, String)>> = Vec::with_capacity(ast.tokens.len() + 1);
let mut token_precs: Vec<Option<Precedence>> = Vec::with_capacity(ast.tokens.len() + 1);
let mut token_epp: Vec<Option<String>> = Vec::with_capacity(ast.tokens.len() + 1);
- for k in &ast.tokens {
- token_names.push(Some(k.clone()));
+ for (i, k) in ast.tokens.iter().enumerate() {
+ token_names.push(Some((ast.spans[i], k.clone())));
token_precs.push(ast.precs.get(k).cloned());
token_epp.push(Some(ast.epp.get(k).unwrap_or(k).clone()));
}
@@ -202,7 +202,7 @@
token_epp.push(None);
let mut token_map = HashMap::<String, TIdx<StorageT>>::new();
for (i, v) in token_names.iter().enumerate() {
- if let Some(n) = v.as_ref() {
+ if let Some((_, n)) = v.as_ref() {
token_map.insert(n.clone(), TIdx(i.as_()));
}
}
@@ -460,7 +460,9 @@ where
/// Return the name of token `tidx` (where `None` indicates "the token has no name"). Panics if
/// `tidx` doesn't exist.
pub fn token_name(&self, tidx: TIdx<StorageT>) -> Option<&str> {
- self.token_names[usize::from(tidx)].as_deref()
+ self.token_names[usize::from(tidx)]
+ .as_ref()
+ .map(|x| x.1.as_str())
}

/// Return the precedence of token `tidx` (where `None` indicates "no precedence specified").
@@ -474,6 +476,11 @@
pub fn token_epp(&self, tidx: TIdx<StorageT>) -> Option<&str> {
self.token_epp[usize::from(tidx)].as_deref()
}

/// Return the `Span` covering token `tidx`'s occurrence in the grammar source (where `None`
/// indicates "the token was inserted by cfgrammar and has no source location"). Panics if
/// `tidx` doesn't exist.
pub fn token_span(&self, tidx: TIdx<StorageT>) -> Option<&Span> {
self.token_names[usize::from(tidx)]
.as_ref()
.map(|(span, _)| span)
}

/// Get the action for production `pidx`. Panics if `pidx` doesn't exist.
pub fn action(&self, pidx: PIdx<StorageT>) -> &Option<String> {
@@ -498,7 +505,7 @@
pub fn tokens_map(&self) -> HashMap<&str, TIdx<StorageT>> {
let mut m = HashMap::with_capacity(usize::from(self.tokens_len) - 1);
for tidx in self.iter_tidxs() {
- if let Some(n) = self.token_names[usize::from(tidx)].as_ref() {
+ if let Some((_, n)) = self.token_names[usize::from(tidx)].as_ref() {
m.insert(&**n, tidx);
}
}
@@ -509,7 +516,7 @@
pub fn token_idx(&self, n: &str) -> Option<TIdx<StorageT>> {
self.token_names
.iter()
- .position(|x| x.as_ref().map_or(false, |x| x == n))
+ .position(|x| x.as_ref().map_or(false, |(_, x)| x == n))
// The call to as_() is safe because token_names is guaranteed to be small
// enough to fit into StorageT
.map(|x| TIdx(x.as_()))
@@ -1032,7 +1039,7 @@ mod test {
super::{AssocKind, Precedence, YaccGrammar, YaccKind, YaccOriginalActionKind},
rule_max_costs, rule_min_costs, IMPLICIT_RULE, IMPLICIT_START_RULE,
};
- use crate::{PIdx, RIdx, Symbol, TIdx};
+ use crate::{PIdx, RIdx, Span, Symbol, TIdx};
use std::collections::HashMap;

#[test]
@@ -1460,4 +1467,18 @@
]
);
}

#[test]
fn test_token_spans() {
let src = "%%\nAB: 'a' | 'foo';";
let grm =
YaccGrammar::new(YaccKind::Original(YaccOriginalActionKind::NoAction), src).unwrap();
let token_map = grm.tokens_map();
let a_tidx = token_map.get("a");
let foo_tidx = token_map.get("foo");
let a_span = grm.token_span(*a_tidx.unwrap());
let foo_span = grm.token_span(*foo_tidx.unwrap());
assert_eq!(a_span, Some(&Span::new(8, 9)));
assert_eq!(foo_span, Some(&Span::new(14, 17)));
}
}
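
Combined with the original input, `token_span` lets a consumer recover exactly where a token was declared. A minimal sketch along the lines of `test_token_spans` (import paths assumed from the crate layout above):

use cfgrammar::yacc::{YaccGrammar, YaccKind, YaccOriginalActionKind};

fn main() {
    let src = "%%\nAB: 'a' | 'foo';";
    let grm =
        YaccGrammar::new(YaccKind::Original(YaccOriginalActionKind::NoAction), src).unwrap();
    let tidx = grm.token_idx("foo").unwrap();
    let span = grm.token_span(tidx).expect("user-specified tokens always have a span");
    // The stored span excludes the quotes, so this recovers exactly the token's text.
    assert_eq!(&src[span.start()..span.end()], "foo");
}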
38 changes: 27 additions & 11 deletions cfgrammar/src/lib/yacc/parser.rs
@@ -8,6 +8,8 @@ use regex::Regex;

type YaccResult<T> = Result<T, YaccParserError>;

use crate::Span;

use super::{
ast::{GrammarAST, Symbol},
AssocKind, Precedence, YaccKind,
@@ -144,8 +146,9 @@ impl YaccParser {
if self.lookahead_is("%", i).is_some() {
break;
}
- let (j, n) = self.parse_token(i)?;
+ let (j, n, span) = self.parse_token(i)?;
self.ast.tokens.insert(n);
+ self.ast.spans.push(span);
i = self.parse_ws(j, true)?;
}
continue;
@@ -176,7 +179,7 @@
}
if let Some(j) = self.lookahead_is("%epp", i) {
i = self.parse_ws(j, false)?;
- let (j, n) = self.parse_token(i)?;
+ let (j, n, _) = self.parse_token(i)?;
if self.ast.epp.contains_key(&n) {
return Err(self.mk_error(YaccParserErrorKind::DuplicateEPP, i));
}
@@ -213,8 +216,9 @@
self.ast.avoid_insert = Some(HashSet::new());
}
while j < self.src.len() && self.newlines.len() == num_newlines {
- let (j, n) = self.parse_token(i)?;
+ let (j, n, span) = self.parse_token(i)?;
self.ast.tokens.insert(n.clone());
+ self.ast.spans.push(span);
if self.ast.avoid_insert.as_ref().unwrap().contains(&n) {
return Err(
self.mk_error(YaccParserErrorKind::DuplicateAvoidInsertDeclaration, i)
@@ -247,8 +251,9 @@
self.ast.implicit_tokens = Some(HashSet::new());
}
while j < self.src.len() && self.newlines.len() == num_newlines {
- let (j, n) = self.parse_token(i)?;
+ let (j, n, span) = self.parse_token(i)?;
self.ast.tokens.insert(n.clone());
+ self.ast.spans.push(span);
if self.ast.implicit_tokens.as_ref().unwrap().contains(&n) {
return Err(self.mk_error(
YaccParserErrorKind::DuplicateImplicitTokensDeclaration,
@@ -280,7 +285,7 @@
i = self.parse_ws(k, false)?;
let num_newlines = self.newlines.len();
while i < self.src.len() && num_newlines == self.newlines.len() {
- let (j, n) = self.parse_token(i)?;
+ let (j, n, _) = self.parse_token(i)?;
if self.ast.precs.contains_key(&n) {
return Err(self.mk_error(YaccParserErrorKind::DuplicatePrecedence, i));
}
@@ -365,13 +370,14 @@
}

if self.lookahead_is("\"", i).is_some() || self.lookahead_is("'", i).is_some() {
- let (j, sym) = self.parse_token(i)?;
+ let (j, sym, span) = self.parse_token(i)?;
i = self.parse_ws(j, true)?;
self.ast.tokens.insert(sym.clone());
+ self.ast.spans.push(span);
syms.push(Symbol::Token(sym));
} else if let Some(j) = self.lookahead_is("%prec", i) {
i = self.parse_ws(j, true)?;
- let (k, sym) = self.parse_token(i)?;
+ let (k, sym, _) = self.parse_token(i)?;
if self.ast.tokens.contains(&sym) {
prec = Some(sym);
} else {
@@ -383,7 +389,7 @@
i = j;
action = Some(a);
} else {
- let (j, sym) = self.parse_token(i)?;
+ let (j, sym, _) = self.parse_token(i)?;
if self.ast.tokens.contains(&sym) {
syms.push(Symbol::Token(sym));
} else {
@@ -406,16 +412,26 @@
}
}

- fn parse_token(&self, i: usize) -> YaccResult<(usize, String)> {
+ fn parse_token(&self, i: usize) -> YaccResult<(usize, String, Span)> {
match RE_TOKEN.find(&self.src[i..]) {
Some(m) => {
assert!(m.start() == 0 && m.end() > 0);
match self.src[i..].chars().next().unwrap() {
'"' | '\'' => {
debug_assert!('"'.len_utf8() == 1 && '\''.len_utf8() == 1);
- Ok((i + m.end(), self.src[i + 1..i + m.end() - 1].to_string()))
+ let start_cidx = i + 1;
+ let end_cidx = i + m.end() - 1;
+ Ok((
+ i + m.end(),
+ self.src[start_cidx..end_cidx].to_string(),
+ Span::new(start_cidx, end_cidx),
+ ))
}
- _ => Ok((i + m.end(), self.src[i..i + m.end()].to_string())),
+ _ => Ok((
+ i + m.end(),
+ self.src[i..i + m.end()].to_string(),
+ Span::new(i, i + m.end()),
+ )),
}
}
None => Err(self.mk_error(YaccParserErrorKind::IllegalString, i)),
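
The index arithmetic above is what produces the quote-free spans checked by `test_token_spans`; a worked example against the same input:

// src = "%%\nAB: 'a' | 'foo';"
// For 'a':   the regex match starts at the opening quote (i = 7) and is 3 bytes long
//            (m.end() = 3), so start_cidx = 8 and end_cidx = 9: Span::new(8, 9) covers
//            just the `a`.
// For 'foo': i = 13 and m.end() = 5, giving Span::new(14, 17) -- again both quotes
//            are excluded.
// Unquoted names (e.g. in %token declarations) keep their full extent:
//            Span::new(i, i + m.end()).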
8 changes: 7 additions & 1 deletion lrlex/src/lib/ctbuilder.rs
@@ -400,11 +400,17 @@ pub fn lexerdef() -> {lexerdef_type} {{
Some(ref n) => format!("Some({:?}.to_string())", n),
None => "None".to_owned(),
};
let n_span = format!(
"lrpar::Span::new({}, {})",
r.name_span.start(),
r.name_span.end()
);
outs.push_str(&format!(
"
- Rule::new({}, {}, \"{}\".to_string()).unwrap(),",
+ Rule::new({}, {}, {}, \"{}\".to_string()).unwrap(),",
tok_id,
n,
n_span,
r.re_str.replace('\\', "\\\\").replace('"', "\\\"")
));
}
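
For a lexer rule such as `a 'A'` (the first rule in `test_token_span` below), the generated source now contains a line of roughly the following shape; the token id is assigned elsewhere in the builder, so the `Some(0)` here is a placeholder:

Rule::new(Some(0), Some("A".to_string()), lrpar::Span::new(6, 7), "a".to_string()).unwrap(),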
26 changes: 26 additions & 0 deletions lrlex/src/lib/lexer.rs
@@ -24,6 +24,7 @@ pub struct Rule<StorageT> {
/// This rule's name. If None, then text which matches this rule will be skipped (i.e. will not
/// create a lexeme).
pub name: Option<String>,
/// The span of this rule's name in the lexer definition source; rules without a name get an
/// empty span.
pub name_span: Span,
pub(super) re_str: String,
re: Regex,
}
@@ -35,6 +36,7 @@ impl<StorageT> Rule<StorageT> {
pub fn new(
tok_id: Option<StorageT>,
name: Option<String>,
name_span: Span,
re_str: String,
) -> Result<Rule<StorageT>, regex::Error> {
let re = RegexBuilder::new(&format!("\\A(?:{})", &re_str))
@@ -44,6 +46,7 @@
Ok(Rule {
tok_id,
name,
name_span,
re_str,
re,
})
@@ -661,4 +664,27 @@
assert_eq!(lexer.line_col(lexemes[0].span()), ((1, 1), (2, 3)));
assert_eq!(lexer.span_lines_str(lexemes[0].span()), "'a\nb'");
}

#[test]
fn test_token_span() {
let src = "%%
a 'A'
b 'B'
[ \\n] ;"
.to_string();
let lexerdef = LRNonStreamingLexerDef::<DefaultLexeme<u8>, u8>::from_str(&src).unwrap();
assert_eq!(
lexerdef.get_rule_by_name("A").unwrap().name_span,
lrpar::Span::new(6, 7)
);
assert_eq!(
lexerdef.get_rule_by_name("B").unwrap().name_span,
lrpar::Span::new(12, 13)
);
let anonymous_rules = lexerdef
.iter_rules()
.filter(|rule| rule.name.is_none())
.collect::<Vec<_>>();
assert_eq!(anonymous_rules[0].name_span, lrpar::Span::new(21, 21));
}
}
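
The payoff of recording `name_span` is that tools can point back into the lex source. A hypothetical helper (not part of this commit, and assuming `lrpar::Span` is the re-exported `cfgrammar` type, as the test above suggests) might slice a rule's name back out for a diagnostic:

// Recover the text of a rule's name from the lex source, e.g. for an error message.
// Returns the empty string for rules without a name, since their `name_span` is empty.
fn rule_name_text<'a>(src: &'a str, name_span: lrpar::Span) -> &'a str {
    &src[name_span.start()..name_span.end()]
}

// With `src` from `test_token_span` above, rule_name_text(&src, lrpar::Span::new(6, 7)) == "A".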