Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP
Newer
Older
100644 239 lines (192 sloc) 8.508 kB
74ebe2b @skeeto Split off rdp into it's own repository.
authored
1 ;;; rdp.el --- Recursive Descent Parser library
2
3 ;; This is free and unencumbered software released into the public domain.
4
5 ;; Author: Christopher Wellons <mosquitopsu@gmail.com>
4204621 @skeeto Fix repository URL.
authored
6 ;; URL: https://github.com/skeeto/rdp
74ebe2b @skeeto Split off rdp into it's own repository.
authored
7 ;; Version: 1.0
8
9 ;;; Commentary:
10
11 ;; This library provides a recursive descent parser for parsing
12 ;; languages in buffers. Some support is provided for implementing
13 ;; automatic indentation based on the parser.
14
15 ;; In general, the only two functions you need to worry about are:
16
81c453e @skeeto Oops, fix typos pointed out by csar.
authored
17 ;; * `rdp-parse' -- parse the current buffer
18 ;; * `rdp-parse-string' -- parse a string (in a temp buffer)
74ebe2b @skeeto Split off rdp into it's own repository.
authored
19
20 ;; A grammar is provided to the parser as an alist of patterns.
21 ;; Patterns are named by symbols, which can reference other
22 ;; patterns. The lisp object type indicates the type of the pattern:
23
24 ;; * string -- an Emacs regular expression
25 ;; * list -- "and" relationship, each pattern must match in order
26 ;; * vector -- "or" relationship, one of the patterns must match
27 ;; * symbol -- recursive reference to another pattern in the alist
28
29 ;; The global variable `rdp-best' indicates the furthest point reached
fd99a2e @skeeto Darnit, fix another typo.
authored
30 ;; in the buffer by the parser. If parsing failed (i.e. `rdp-best' is
74ebe2b @skeeto Split off rdp into it's own repository.
authored
31 ;; not at the end of the buffer), this is likely to be the position of
32 ;; the syntax error.
33
34 ;; For example, this grammar parses simple arithmetic with operator
35 ;; precedence and grouping.
36
37 ;; (defvar arith-tokens
38 ;; '((sum prod [([+ -] sum) no-sum])
39 ;; (prod value [([* /] prod) no-prod])
40 ;; (num . "-?[0-9]+\\(\\.[0-9]*\\)?")
41 ;; (+ . "\\+")
42 ;; (- . "-")
43 ;; (* . "\\*")
44 ;; (/ . "/")
45 ;; (pexpr "(" [sum prod num pexpr] ")")
46 ;; (value . [pexpr num])
47 ;; (no-prod . "")
48 ;; (no-sum . "")))
49
50 ;; Given just this grammar to `rdp-parse' it will return an
51 ;; s-expression of the input where each token match is `cons'ed with
52 ;; the token name. To make this more useful, the s-expression can be
53 ;; manipulated as it is read using an alist of token names and
54 ;; functions. This could be used to simplify the s-expression, build
55 ;; an interpreter that interprets during parsing, or even build a
56 ;; compiler.
57
58 ;; For example, this function alist evaluates the arithmetic as it is
59 ;; parsed:
60
61 ;; (defun arith-op (expr)
62 ;; (destructuring-bind (a (op b)) expr
63 ;; (funcall op a b)))
64 ;;
65 ;; (defvar arith-funcs
66 ;; `((sum . ,#'arith-op)
67 ;; (prod . ,#'arith-op)
68 ;; (num . ,#'string-to-number)
69 ;; (+ . ,#'intern)
70 ;; (- . ,#'intern)
71 ;; (* . ,#'intern)
72 ;; (/ . ,#'intern)
73 ;; (pexpr . ,#'cadr)
74 ;; (value . ,#'identity)
75 ;; (no-prod . ,(lambda (e) '(* 1)))
76 ;; (no-sum . ,(lambda (e) '(+ 0)))))
77
78 ;; Putting this all together:
79
80 ;; (defun arith (string)
81 ;; (rdp-parse-string string arith-tokens arith-funcs))
82 ;;
83 ;; (arith "(1 + 2 + 3 + 4 + 5) * -3/4.0")
84
85 ;; Tips:
86
87 ;; Recursive descent parsers *cannot* be left-recursive. It is
88 ;; important that a pattern does not recurse without first consuming
89 ;; some input. Any grammar can be made non-left-recursive but not
90 ;; necessarily simplistically.
91
92 ;; The parser requires a lot of stack! Consider increasing
93 ;; `max-lisp-eval-depth' by some factor before calling
94 ;; `rdp-parse'. After increasing it, running out of stack space is
95 ;; likely an indication of left-recursion somewhere in the grammar.
96
97 ;; Token functions should not have side effects. Due to the
98 ;; backtracking of the parser, just because the function was called
99 ;; doesn't mean there was actually a successful match. Also, these
100 ;; functions are free to return nil or the empty list as such a return
101 ;; is *not* an indication of failure.
102
b620192 @skeeto Add whitespace note.
authored
103 ;; By default, whitespace is automatically consumed between matches
104 ;; using the function `rdp-skip-whitespace'. If some kinds of
105 ;; whitespace are important or if there are other characters that need
106 ;; to be skipped, temporarily override this function with your own
107 ;; definition using `flet' when calling `rdp-parse'.
108
74ebe2b @skeeto Split off rdp into it's own repository.
authored
109 ;; In general don't try to parse comments in the grammar. Strip them
110 ;; from the buffer before calling the parser.
111
112 ;; Indentation facilities:
113
114 ;; To find out where in the parse tree a point lies, set `rdp-start'
115 ;; to the desired point before starting parsing. After parsing, either
116 ;; successfully or not,`rdp-point-stack' will contain a stack of
117 ;; tokens indicating roughly where in the parse tree the point
118 ;; lies.
119
81c453e @skeeto Oops, fix typos pointed out by csar.
authored
120 ;; To use this for rudimentary indentation, set `rdp-start' to the
74ebe2b @skeeto Split off rdp into it's own repository.
authored
121 ;; `beginning-of-line' of the current point and count how many
122 ;; indent-worthy tokens are in the stack once parsing is complete.
123
124 ;; See also:
125
126 ;; * http://emacswiki.org/emacs/peg.el
b8a0699 @skeeto Add a couple more links to "see also" in the header.
authored
127 ;; * http://www.gnu.org/software/emacs/manual/html_node/elisp/SMIE.html
128 ;; * http://cedet.sourceforge.net/semantic.shtml
74ebe2b @skeeto Split off rdp into it's own repository.
authored
129 ;; * http://en.wikipedia.org/wiki/Recursive_descent_parser
130 ;; * http://en.wikipedia.org/wiki/Parsing_expression_grammar
131
132 ;;; Code:
133
2d22624 @skeeto Add cl as a compile-time macro.
authored
134 (eval-when-compile (require 'cl))
135
74ebe2b @skeeto Split off rdp into it's own repository.
authored
136 (defvar rdp-best 0
137 "The furthest most point that parsing reached. This information
138 can be used to determine where parsing failed.")
139
140 (defvar rdp-start 0
141 "Position of point in original source buffer. The purpose is
142 for auto-indentation.")
143
144 (defvar rdp-point-stack ()
145 "The token stack that contains the point. This is used for
146 auto-indentation.")
147
148 (defvar rdp-token-stack ()
149 "Stack of tokens at this point.")
150
151 (defun rdp-box (value)
152 "Box a parse return value, allowing nil to be a valid return."
153 (vector value))
154
155 (defun rdp-unbox (box)
156 "Unbox a parse return value."
157 (aref box 0))
158
159 (defun rdp-get-token-func (token funcs)
160 "Get the manipulation function for the given token."
161 (cdr (assq token funcs)))
162
163 (defun rdp-parse (tokens &optional funcs pattern)
164 "Return the next item in the current buffer."
165 (setq rdp-best 0)
166 (setq rdp-token-stack ())
167 (if pattern
168 (rdp-unbox (rdp-match pattern tokens funcs))
169 (dolist (token tokens)
170 (let ((result (rdp-match (car token) tokens funcs)))
171 (if result (return (rdp-unbox result)))))))
172
173 (defun rdp-parse-string (string tokens &optional funcs pattern)
174 "Like `rdp-parse' but operates on a string."
175 (with-temp-buffer
176 (insert string)
177 (goto-char (point-min))
178 (rdp-parse tokens funcs pattern)))
179
180 (defun rdp-match-list (list tokens funcs)
181 "Match all patterns in a list."
182 (let ((result (rdp-match (car list) tokens funcs)))
183 (when result
184 (if (null (cdr list))
185 (rdp-box (list (rdp-unbox result)))
186 (let ((rest (rdp-match-list (cdr list) tokens funcs)))
187 (when rest
188 (rdp-box (cons (rdp-unbox result) (rdp-unbox rest)))))))))
189
190 (defun rdp-match-regex (regex tokens funcs)
191 "Match a regex."
192 (when (looking-at regex)
193 (prog1 (rdp-box (buffer-substring-no-properties (point) (match-end 0)))
194 (goto-char (match-end 0)))))
195
196 (defun rdp-match-token (token tokens funcs)
197 "Match a token by name (symbol)."
198 (push token rdp-token-stack)
199 (let* ((pattern (cdr (assq token tokens)))
200 (match (rdp-match pattern tokens funcs)))
201 (pop rdp-token-stack)
202 (when match
203 (let ((macro (rdp-get-token-func token funcs)))
204 (rdp-box (if macro
205 (funcall macro (rdp-unbox match))
206 (cons token (rdp-unbox match))))))))
207
208 (defun rdp-match-or (vec tokens funcs)
209 "Match at least one pattern in the vector."
210 (dolist (option (mapcar 'identity vec))
211 (let ((match (rdp-match option tokens funcs)))
212 (when match (return match)))))
213
214 (defun rdp-skip-whitespace ()
215 "Skip over all whitespace."
216 (search-forward-regexp "[[:space:]]*"))
217
218 (defun rdp-match (pattern tokens &optional funcs)
219 "Match the given pattern object of any type (toplevel)."
220 (rdp-skip-whitespace)
221 (let ((start (point))
222 (result (etypecase pattern
223 (string (rdp-match-regex pattern tokens funcs))
224 (list (rdp-match-list pattern tokens funcs))
225 (symbol (rdp-match-token pattern tokens funcs))
226 (vector (rdp-match-or pattern tokens funcs)))))
227 (when (and (<= (length rdp-point-stack) (length rdp-token-stack))
228 (> rdp-start start)
229 (> (point) rdp-start))
230 (setq rdp-point-stack (reverse rdp-token-stack)))
231 (unless result
232 (setq rdp-best (max rdp-best (point)))
233 (goto-char start))
234 result))
235
236 (provide 'rdp)
237
238 ;;; rdp.el ends here
Something went wrong with that request. Please try again.