forked from tunnelvisionlabs/antlrworks2
/
GrammarParser.g4
738 lines (637 loc) · 19.1 KB
/
GrammarParser.g4
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
/*
* [The "BSD license"]
* Copyright (c) 2012 Terence Parr
* Copyright (c) 2012 Sam Harwell
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/** An experimental ANTLR v4 grammar to parse ANTLR v4 grammars.
*/
parser grammar GrammarParser;
options {
tokenVocab = GrammarLexer;
abstract = true;
}
tokens {
COMBINED,
}
// The main entry point for parsing a V4 grammar.
//
grammarSpec
: // The grammar itself can have a documentation comment, which is the
// first terminal in the file.
//
DOC_COMMENT?
// Next we should see the type and name of the grammar file that
// we are about to parse.
//
grammarType id SEMI
// There now follows zero or more declaration sections that should
// be given to us before the rules are declared
//
// A number of things can be declared/stated before the grammar rules
// 'proper' are parsed. These include grammar imports (delegate), grammar
// options, imaginary token declarations, global scope declarations,
// and actions such as @header. In this rule we allow any number of
// these constructs in any order so that the grammar author is not
// constrained by some arbitrary order of declarations that nobody
// can remember. In the next phase of the parse, we verify that these
// constructs are valid, not repeated and so on.
prequelConstruct*
// We should now see at least one ANTLR EBNF style rule
// declaration. If the rules are missing we will let the
// semantic verification phase tell the user about it.
//
rules
modeSpec*
// And we force ANTLR to process everything it finds in the input
// stream by specifying hte need to match End Of File before the
// parse is complete.
//
EOF
;
grammarType
: LEXER GRAMMAR
| // A standalone parser specification
PARSER GRAMMAR
// A combined lexer and parser specification
| GRAMMAR
;
// This is the list of all constructs that can be declared before
// the set of rules that compose the grammar, and is invoked 0..n
// times by the grammarPrequel rule.
prequelConstruct
@version{6}
: // A list of options that affect analysis and/or code generation
optionsSpec
| // A list of grammars to which this grammar will delegate certain
// parts of the parsing sequence - a set of imported grammars
delegateGrammars
| // The declaration of any token types we need that are not already
// specified by a preceeding grammar, such as when a parser declares
// imaginary tokens with which to construct the AST, or a rewriting
// tree parser adds further imaginary tokens to ones defined in a prior
// {tree} parser.
tokensSpec
| // A list of custom channels used by the grammar
channelsSpec
| // A declaration of language target implemented constructs. All such
// action sections start with '@' and are given to the language target's
// StringTemplate group. For instance @parser::header and @lexer::header
// are gathered here.
action
;
// A list of options that affect analysis and/or code generation
optionsSpec
@version{3}
: OPTIONS (option SEMI)* RBRACE
| OPTIONS RBRACE
;
option
: id ASSIGN optionValue
;
// ------------
// Option Value
//
optionValue
: // If the option value is a single word that conforms to the
// lexical rules of token or rule names, then the user may skip quotes
// and so on. Many option values meet this description
qid
| ( // The value is a long string
STRING_LITERAL
| // The value was an integer number
INT
| // Asterisk, used for things like k=*
STAR
)
;
// A list of grammars to which this grammar will delegate certain
// parts of the parsing sequence - a set of imported grammars
delegateGrammars
: IMPORT delegateGrammar (COMMA delegateGrammar)* SEMI
;
// A possibly named grammar file that should be imported to this gramamr
// and delgated to for the rules it specifies
delegateGrammar
: id ASSIGN id
| id
;
/** The declaration of any token types we need that are not already
* specified by a preceeding grammar, such as when a parser declares
* imaginary tokens with which to construct the AST, or a rewriting
* tree parser adds further imaginary tokens to ones defined in a prior
* {tree} parser.
*/
tokensSpec
@version{1}
: TOKENS (id (COMMA id)* COMMA?)? RBRACE
;
channelsSpec
@version{6}
: CHANNELS (id (COMMA id)* COMMA?)? RBRACE
;
actionBlock
: BEGIN_ACTION
( actionBlock
| actionExpression
| actionScopeExpression
| ( ACTION_WS
| ACTION_NEWLINE
| ACTION_COMMENT
| ACTION_LITERAL
| ACTION_TEXT
| ACTION_LT
| ACTION_GT
| ACTION_LPAREN
| ACTION_RPAREN
| ACTION_LBRACK
| ACTION_RBRACK
| ACTION_EQUALS
| ACTION_COMMA
| ACTION_ESCAPE
| ACTION_WORD
| ACTION_REFERENCE
| ACTION_COLON
| ACTION_COLON2
| ACTION_MINUS
| ACTION_DOT
)
)*
END_ACTION
;
actionExpression
: ref=ACTION_REFERENCE ignored* op=ACTION_DOT ignored* member=ACTION_WORD
;
actionScopeExpression
: ref=ACTION_REFERENCE ignored* (ACTION_LBRACK ignored* (neg=ACTION_MINUS ignored*)? index=ACTION_WORD ignored* ACTION_RBRACK ignored*)? op=ACTION_COLON2 ignored* member=ACTION_WORD
;
argActionBlock
@version{1}
: BEGIN_ARG_ACTION
( ARG_ACTION_ELEMENT
| ARG_ACTION_TEXT
| ARG_ACTION_LT
| ARG_ACTION_GT
| ARG_ACTION_LPAREN
| ARG_ACTION_RPAREN
| ARG_ACTION_EQUALS
| ARG_ACTION_COMMA
| ARG_ACTION_ESCAPE
| ARG_ACTION_WORD
| ARG_ACTION_WS
| ARG_ACTION_NEWLINE
)*
END_ARG_ACTION
;
argActionParameters
: BEGIN_ARG_ACTION
ignored* (parameters+=argActionParameter ignored* (ARG_ACTION_COMMA ignored* parameters+=argActionParameter ignored*)*)?
END_ARG_ACTION
;
argActionParameter
@version{7}
: type=argActionParameterType? ignored* name=ARG_ACTION_WORD (ignored* ARG_ACTION_EQUALS ignored* argActionParameterType?)?
;
argActionParameterType
: argActionParameterTypePart (ignored* argActionParameterTypePart)*
;
argActionParameterTypePart
@version{7}
: ARG_ACTION_WORD
| ARG_ACTION_LT ignored* (argActionParameterType (ignored* ARG_ACTION_COMMA (ignored* argActionParameterType)?)* ignored*)? ARG_ACTION_GT
| ARG_ACTION_LPAREN ignored* (argActionParameterType (ignored* ARG_ACTION_COMMA (ignored* argActionParameterType)?)* ignored*)? ARG_ACTION_RPAREN
;
ignored
: ACTION_WS
| ACTION_NEWLINE
| ACTION_COMMENT
| ARG_ACTION_WS
| ARG_ACTION_NEWLINE
;
// A declaration of a language target specifc section,
// such as @header, @includes and so on. We do not verify these
// sections, they are just passed on to the language target.
/** Match stuff like @parser::members {int i;} */
action
: AT (actionScopeName COLONCOLON)? id actionBlock
;
/** Sometimes the scope names will collide with keywords; allow them as
* ids for action scopes.
*/
actionScopeName
: id
| ( LEXER
| PARSER
)
;
modeSpec
@version{3}
: MODE id SEMI ruleSpec*
;
rules
: ruleSpec*
;
ruleSpec
: parserRuleSpec
| lexerRule
;
// The specification of an EBNF rule in ANTLR style, with all the
// rule level parameters, declarations, actions, rewrite specs and so
// on.
//
// Note that here we allow any number of rule declaration sections (such
// as scope, returns, etc) in any order and we let the upcoming semantic
// verification of the AST determine if things are repeated or if a
// particular functional element is not valid in the context of the
// grammar type, such as using returns in lexer rules and so on.
parserRuleSpec
: // A rule may start with an optional documentation comment
DOC_COMMENT?
// Following the documentation, we can declare a rule to be
// public, private and so on. This is only valid for some
// language targets of course but the target will ignore these
// modifiers if they make no sense in that language.
ruleModifiers?
// Next comes the rule name. Here we do not distinguish between
// parser or lexer rules, the semantic verification phase will
// reject any rules that make no sense, such as lexer rules in
// a pure parser or tree parser.
name=RULE_REF
// Immediately following the rulename, there may be a specification
// of input parameters for the rule. We do not do anything with the
// parameters here except gather them for future phases such as
// semantic verifcation, type assignment etc. We require that
// the input parameters are the next syntactically significant element
// following the rule id.
argActionParameters?
ruleReturns?
throwsSpec?
localsSpec?
// Now, before the rule specification itself, which is introduced
// with a COLON, we may have zero or more configuration sections.
// As usual we just accept anything that is syntactically valid for
// one form of the rule or another and let the semantic verification
// phase throw out anything that is invalid.
// At the rule level, a programmer may specify a number of sections, such
// as scope declarations, rule return elements, @ sections (which may be
// language target specific) and so on. We allow any number of these in any
// order here and as usual rely onthe semantic verification phase to reject
// anything invalid using its addinotal context information. Here we are
// context free and just accept anything that is a syntactically correct
// construct.
//
rulePrequels
COLON
// The rule is, at the top level, just a list of alts, with
// finer grained structure defined within the alts.
ruleBlock
SEMI
exceptionGroup
;
// Many language targets support exceptions and the rule will
// generally be able to throw the language target equivalent
// of a recognition exception. The grammar programmar can
// specify a list of exceptions to catch or a generic catch all
// and the target language code generation template is
// responsible for generating code that makes sense.
exceptionGroup
: exceptionHandler* finallyClause?
;
// Specifies a handler for a particular type of exception
// thrown by a rule
exceptionHandler
: CATCH argActionBlock actionBlock
;
finallyClause
: FINALLY actionBlock
;
rulePrequels
: rulePrequel*
;
// An individual rule level configuration as referenced by the ruleActions
// rule above.
//
rulePrequel
: optionsSpec
| ruleAction
;
// A rule can return elements that it constructs as it executes.
// The return values are specified in a 'returns' prequel element,
// which contains COMMA separated declarations, where the declaration
// is target language specific. Here we see the returns declaration
// as a single lexical action element, to be processed later.
//
ruleReturns
: RETURNS argActionParameters
;
// --------------
// Exception spec
//
// Some target languages, such as Java and C# support exceptions
// and they are specified as a prequel element for each rule that
// wishes to throw its own exception type. Note that the name of the
// exception is just a single word, so the header section of the grammar
// must specify the correct import statements (or language equivalent).
// Target languages that do not support exceptions just safely ignore
// them.
//
throwsSpec
: THROWS qid (COMMA qid)*
;
localsSpec
: LOCALS argActionParameters
;
// @ Sections are generally target language specific things
// such as local variable declarations, code to run before the
// rule starts and so on. Fir instance most targets support the
// @init {} section where declarations and code can be placed
// to run before the rule is entered. The C target also has
// an @declarations {} section, where local variables are declared
// in order that the generated code is C89 copmliant.
//
/** Match stuff like @init {int i;} */
ruleAction
: AT id actionBlock
;
// A set of access modifiers that may be applied to rule declarations
// and which may or may not mean something to the target language.
// Note that the parser allows any number of these in any order and the
// semantic pass will throw out invalid combinations.
//
ruleModifiers
: ruleModifier+
;
// An individual access modifier for a rule. The 'fragment' modifier
// is an internal indication for lexer rules that they do not match
// from the input but are like subroutines for other lexer rules to
// reuse for certain lexical patterns. The other modifiers are passed
// to the code generation templates and may be ignored by the template
// if they are of no use in that language.
ruleModifier
: PUBLIC
| PRIVATE
| PROTECTED
| FRAGMENT
;
// A set of alts, rewritten as a BLOCK for generic processing
// in tree walkers. Used by the rule 'rule' so that the list of
// alts for a rule appears as a BLOCK containing the alts and
// can be processed by the generic BLOCK rule. Note that we
// use a separate rule so that the BLOCK node has start and stop
// boundaries set correctly by rule post processing of rewrites.
ruleBlock
: ruleAltList
;
ruleAltList
: labeledAlt (OR labeledAlt)*
;
labeledAlt
@version{1}
: alternative (POUND id)?
;
lexerRule
: DOC_COMMENT? FRAGMENT?
name=TOKEN_REF COLON lexerRuleBlock SEMI
;
lexerRuleBlock
: lexerAltList
;
lexerAltList
: lexerAlt (OR lexerAlt)*
;
lexerAlt
@version{3}
: lexerElements? lexerCommands?
;
lexerElements
: lexerElement+
;
lexerElement
: labeledLexerElement ebnfSuffix?
| lexerAtom ebnfSuffix?
| lexerBlock ebnfSuffix?
| actionBlock QUESTION? // actions only allowed at end of outer alt actually,
// but preds can be anywhere
;
labeledLexerElement
: id ass=(ASSIGN|PLUS_ASSIGN)
( lexerAtom
| block
)
;
lexerBlock
@version{1}
: LPAREN (optionsSpec COLON)? lexerAltList RPAREN
;
// channel=HIDDEN, skip, more, mode(INSIDE), push(INSIDE), pop
lexerCommands
: RARROW lexerCommand (COMMA lexerCommand)*
;
lexerCommand
@version{1}
: lexerCommandName LPAREN lexerCommandExpr RPAREN
| lexerCommandName
;
lexerCommandName
: id
| MODE
;
lexerCommandExpr
@version{1}
: id
| INT
;
altList
: alternative (OR alternative)*
;
// An individual alt with an optional rewrite clause for the
// elements of the alt.
alternative
@version{5}
: elementOptions?
( elements
| // empty alt
)
;
elements
: element+
;
element
@version{5}
: labeledElement
( ebnfSuffix
|
)
| atom
( ebnfSuffix
|
)
| ebnf
| actionBlock QUESTION? elementOptions? // SEMPRED is actionBlock followed by QUESTION
;
labeledElement
: label=id ass=(ASSIGN|PLUS_ASSIGN)
( atom
| block
)
;
// A block of gramamr structure optionally followed by standard EBNF
// notation, or ANTLR specific notation. I.E. ? + ^ and so on
ebnf
: block
// And now we see if we have any of the optional suffixs and rewrite
// the AST for this rule accordingly
( blockSuffix
|
)
;
// The standard EBNF suffixes with additional components that make
// sense only to ANTLR, in the context of a grammar block.
blockSuffix
: ebnfSuffix // Standard EBNF
;
ebnfSuffix
@version{1}
: QUESTION QUESTION?
| STAR QUESTION?
| PLUS QUESTION?
;
lexerAtom
@version{1}
: range
| terminal
| RULE_REF
| notSet
| LEXER_CHAR_SET
| // Wildcard '.' means any character in a lexer, any
// token in parser and any node or subtree in a tree parser
// Because the terminal rule is allowed to be the node
// specification for the start of a tree rule, we must
// later check that wildcard was not used for that.
DOT elementOptions?
;
atom
: // Qualified reference delegate.rule. This must be
// lexically contiguous (no spaces either side of the DOT)
// otherwise it is two references with a wildcard in between
// and not a qualified reference.
range // Range x..y - only valid in lexers
| terminal
| ruleref
| notSet
| // Wildcard '.' means any character in a lexer, any
// token in parser and any node or subtree in a tree parser
// Because the terminal rule is allowed to be the node
// specification for the start of a tree rule, we must
// later check that wildcard was not used for that.
DOT elementOptions?
;
// --------------------
// Inverted element set
//
// A set of characters (in a lexer) or terminal tokens, if a parser,
// that are then used to create the inverse set of them.
notSet
: NOT setElement
| NOT blockSet
;
blockSet
: LPAREN setElement (OR setElement)* RPAREN
;
setElement
@version{4}
: ( TOKEN_REF
| STRING_LITERAL
| LEXER_CHAR_SET
)
elementOptions?
| range
;
// -------------
// Grammar Block
//
// Anywhere where an element is valid, the grammar may start a new block
// of alts by surrounding that block with ( ). A new block may also have a set
// of options, which apply only to that block.
//
block
: LPAREN
( optionsSpec? ruleAction* COLON )?
altList
RPAREN
;
// ----------------
// Parser rule ref
//
// Reference to a parser rule with optional arguments and optional
// directive to become the root node or ignore the tree produced
//
ruleref
@version{5}
: RULE_REF argActionBlock? elementOptions?
;
// ---------------
// Character Range
//
// Specifies a range of characters. Valid for lexer rules only, but
// we do not check that here, the tree walkers shoudl do that.
// Note also that the parser also allows through more than just
// character literals so that we can produce a much nicer semantic
// error about any abuse of the .. operator.
//
range
: STRING_LITERAL RANGE STRING_LITERAL
;
terminal
: TOKEN_REF elementOptions?
| STRING_LITERAL elementOptions?
;
// Terminals may be adorned with certain options when
// reference in the grammar: TOK<,,,>
elementOptions
@version{3}
: LT (elementOption (COMMA elementOption)*) GT
;
// WHen used with elements we can specify what the tree node type can
// be and also assign settings of various options (which we do not check here)
elementOption
: // This format indicates the default node option
qid
| // This format indicates option assignment
id ASSIGN (qid | STRING_LITERAL)
;
// The name of the grammar, and indeed some other grammar elements may
// come through to the parser looking like a rule reference or a token
// reference, hence this rule is used to pick up whichever it is and rewrite
// it as a generic ID token.
id
@version{1}
: RULE_REF
| TOKEN_REF
;
qid
: id (DOT id)*
;