From 2045d5fbfa3ed857a9eac3722e6f9ecc301593c6 Mon Sep 17 00:00:00 2001 From: Hamish Friedlander Date: Thu, 10 Mar 2011 12:00:49 +1300 Subject: [PATCH] ENHANCEMENT: Add the ability to extend rules from other rules, specify expression values as arguments, clear up expressions, make bare expressions mean dynamic match not raw php calling, and add some basic tests --- Compiler.php | 331 +++++++++++++++++++++----------- Parser.php | 143 ++++++-------- README.md | 155 ++++++++++++--- tests/ParserInheritanceTest.php | 123 ++++++++++++ tests/ParserSyntaxTest.php | 26 +++ tests/ParserTestBase.php | 48 +++++ tests/ParserVariablesTest.php | 55 ++++++ 7 files changed, 650 insertions(+), 231 deletions(-) create mode 100644 tests/ParserInheritanceTest.php create mode 100644 tests/ParserSyntaxTest.php create mode 100644 tests/ParserTestBase.php create mode 100644 tests/ParserVariablesTest.php diff --git a/Compiler.php b/Compiler.php index 2acd72d..5d34482 100644 --- a/Compiler.php +++ b/Compiler.php @@ -215,22 +215,19 @@ function compile() { } if ( $this->tag && !($this instanceof TokenRecurse ) ) { - $rid = $this->varid() ; $code = PHPBuilder::build() ->l( - '$substack[] = $result;', - '$result = $this->construct( "'.$this->tag.'" );', + '$stack[] = $result; $result = $this->construct( $matchrule, "'.$this->tag.'" ); ', $code->replace(array( 'MATCH' => PHPBuilder::build() ->l( - '$subres = $result ;', - '$result = array_pop( $substack ) ;', + '$subres = $result; $result = array_pop($stack);', '$this->store( $result, $subres, \''.$this->tag.'\' );', 'MATCH' ), 'FAIL' => PHPBuilder::build() ->l( - '$result = array_pop( $substack ) ;', + '$result = array_pop($stack);', 'FAIL' ) ))); @@ -255,31 +252,30 @@ protected function match_code( $value ) { abstract class TokenExpressionable extends TokenTerminal { - static $expression_rx = '/\$(\w+)/' ; + static $expression_rx = '/ \$(\w+) | { \$(\w+) } /x'; function contains_expression(){ return preg_match(self::$expression_rx, 
$this->value); } + function expression_replace($matches) { + return '\'.$this->expression($result, $stack, \'' . (!empty($matches[1]) ? $matches[1] : $matches[2]) . "').'"; + } + function match_code( $value ) { - if (!$this->contains_expression()) parent::match_code($value); - - $id = $this->varid() ; - return PHPBuilder::build()->l( - '$'.$id.' = new ParserExpression( $this, $substack, $result );', - parent::match_code('$'.$id.'->expand('.$value.')') - ); + $value = preg_replace_callback(self::$expression_rx, array($this, 'expression_replace'), $value); + return parent::match_code($value); } } class TokenLiteral extends TokenExpressionable { function __construct( $value ) { - parent::__construct( 'literal', $value ); + parent::__construct( 'literal', "'" . substr($value,1,-1) . "'" ); } function match_code() { // We inline single-character matches for speed - if ( strlen( eval( 'return '. $this->value . ';' ) ) == 1 ) { + if ( !$this->contains_expression() && strlen( eval( 'return '. $this->value . ';' ) ) == 1 ) { return $this->match_fail_conditional( 'substr($this->string,$this->pos,1) == '.$this->value, PHPBuilder::build()->l( '$this->pos += 1;', @@ -319,37 +315,18 @@ function match_code() { } } -class TokenPHP extends TokenTerminal { - function __construct( $value ) { - parent::__construct( 'php', $value ) ; - } - - /* Call recursion indirectly */ - function match_code() { - $id = $this->varid() ; - return PHPBuilder::build() - ->l( - '$'.$id.' 
= new ParserExpression( $this, $substack, $result );', - $this->match_fail_block( '( $subres = $'.$id.'->match( \''.$this->value.'\' ) ) !== FALSE', - PHPBuilder::build() - ->b( 'if ( is_string( $subres ) )', - $this->set_text('$subres') - ) - ->b( 'else', - '$this->store($result, $subres);' - ) - )); - } -} - class TokenRecurse extends Token { function __construct( $value ) { parent::__construct( 'recurse', $value ) ; } + function match_function() { + return "'".$this->function_name($this->value)."'"; + } + function match_code() { - $function = $this->function_name( $this->value ) ; - $storetag = $this->function_name( $this->tag ? $this->tag : $this->value ) ; + $function = $this->match_function() ; + $storetag = $this->function_name( $this->tag ? $this->tag : $this->match_function() ) ; if ( ParserCompiler::$debug ) { $debug_header = PHPBuilder::build() @@ -358,7 +335,7 @@ function match_code() { '$this->depth += 2;', '$sub = ( strlen( $this->string ) - $this->pos > 20 ) ? ( substr( $this->string, $this->pos, 20 ) . "..." ) : substr( $this->string, $this->pos );', '$sub = preg_replace( \'/(\r|\n)+/\', " {NL} ", $sub );', - 'print( $indent."Matching against '.$function.' (".$sub.")\n" );' + 'print( $indent."Matching against $matcher (".$sub.")\n" );' ); $debug_match = PHPBuilder::build() @@ -378,9 +355,9 @@ function match_code() { } return PHPBuilder::build()->l( + '$matcher = \'match_\'.'.$function.'; $key = $matcher; $pos = $this->pos;', $debug_header, - '$key = "'.$function.'"; $pos = $this->pos;', // :{$this->pos}";', - '$subres = ( $this->packhas( $key, $pos ) ? $this->packread( $key, $pos ) : $this->packwrite( $key, $pos, $this->match_'.$function.'(array_merge($substack, array($result))) ) );', + '$subres = ( $this->packhas( $key, $pos ) ? 
$this->packread( $key, $pos ) : $this->packwrite( $key, $pos, $this->$matcher(array_merge($stack, array($result))) ) );', $this->match_fail_conditional( '$subres !== FALSE', PHPBuilder::build()->l( $debug_match, @@ -395,6 +372,12 @@ function match_code() { } } +class TokenExpressionedRecurse extends TokenRecurse { + function match_function() { + return '$this->expression($result, $stack, \''.$this->value.'\')'; + } +} + class TokenSequence extends Token { function __construct( $value ) { parent::__construct( 'sequence', $value ) ; @@ -479,40 +462,115 @@ function apply_if_present( $on ) { */ class Rule extends PHPWriter { - static $rule_rx = '@^[\x20\t]+(.*)@' ; - static $func_rx = '@^[\x20\t]+function\s+([^\s(]+)\s*\(([^)]*)\)@' ; + static $rule_rx = '@ + (? \w+) # The name of the rule + ( \s+ extends \s+ (?\w+) )? # The extends word + ( \s* \( (?.*) \) )? # Any variable setters + ( + \s*(?:) | # Marks the matching rule start + \s*(?;) | # Marks the replacing rule start + \s*$ + ) + (?[\s\S]*) + @x'; + + static $argument_rx = '@ + ( [^=]+ ) # Name + = # Seperator + ( [^=,]+ ) # Variable + (,|$) + @x'; + + static $replacement_rx = '@ + ( ([^=]|=[^>])+ ) # What to replace + => # The replacement mark + ( [^,]+ ) # What to replace it with + (,|$) + @x'; + + static $function_rx = '@^\s+function\s+([^\s(]+)\s*(.*)@' ; - function __construct( $indent, $rules, $match ) { - $this->indent = $indent; - $this->name = $match[1][0] ; - $this->rule = $match[2][0] ; + protected $parser; + protected $lines; + + public $name; + public $extends; + public $mode; + public $rule; + + function __construct($parser, $lines) { + $this->parser = $parser; + $this->lines = $lines; + + // Find the first line (if any) that's an attached function definition. 
Can skip first line (unless this block is malformed) + for ($i = 1; $i < count($lines); $i++) { + if (preg_match(self::$function_rx, $lines[$i])) break; + } + + // Then split into the two parts + $spec = array_slice($lines, 0, $i); + $funcs = array_slice($lines, $i); + + // Parse out the spec + $spec = implode("\n", $spec); + if (!preg_match(self::$rule_rx, $spec, $specmatch)) user_error('Malformed rule spec ' . $spec, E_USER_ERROR); + + $this->name = $specmatch['name']; + + if ($specmatch['extends']) { + $this->extends = $this->parser->rules[$specmatch['extends']]; + if (!$this->extends) user_error('Extended rule '.$specmatch['extends'].' is not defined before being extended', E_USER_ERROR); + } + + $this->arguments = array(); + + if ($specmatch['arguments']) { + preg_match_all(self::$argument_rx, $specmatch['arguments'], $arguments, PREG_SET_ORDER); + + foreach ($arguments as $argument){ + $this->arguments[trim($argument[1])] = trim($argument[2]); + } + } + + $this->mode = $specmatch['matchmark'] ? 'rule' : 'replace'; + + if ($this->mode == 'rule') { + $this->rule = $specmatch['rule']; + $this->parse_rule() ; + } + else { + if (!$this->extends) user_error('Replace matcher, but not on an extends rule', E_USER_ERROR); + + $this->replacements = array(); + preg_match_all(self::$replacement_rx, $specmatch['rule'], $replacements, PREG_SET_ORDER); + + $rule = $this->extends->rule; + + foreach ($replacements as $replacement) { + $search = trim($replacement[1]); + $replace = trim($replacement[3]); if ($replace == "''" || $replace == '""') $replace = ""; + + $rule = str_replace($search, ' '.$replace.' 
', $rule); + } + + $this->rule = $rule; + $this->parse_rule() ; + } + + // Parse out the functions + $this->functions = array() ; $active_function = NULL ; - /* Find all the lines following the rule start which are indented */ - $offset = $match[0][1] + strlen( $match[0][0] ) ; - $lines = preg_split( '/\r\n|\r|\n/', substr( $rules, $offset ) ) ; - - $rule_rx = '@^'.preg_quote($indent).'[\x20\t]+(.*)@' ; - $func_rx = '@^'.preg_quote($indent).'[\x20\t]+function\s+([^\s(]+)\s*\(([^)]*)\)@' ; - - foreach( $lines as $line ) { - if ( !trim( $line ) ) continue ; - if ( !preg_match( $rule_rx, $line, $match ) ) break ; - + foreach( $funcs as $line ) { /* Handle function definitions */ - if ( preg_match( $func_rx, $line, $func_match, 0 ) ) { - $active_function = $func_match[1] ; - $this->functions[$active_function] = array( $func_match[2], "" ) ; - } - else { - if ( $active_function ) $this->functions[$active_function][1] .= $line . PHP_EOL ; - else $this->rule .= PHP_EOL . trim($line) ; + if ( preg_match( self::$function_rx, $line, $func_match, 0 ) ) { + $active_function = $func_match[1]; + $this->functions[$active_function] = $func_match[2] . PHP_EOL; } + else $this->functions[$active_function] .= $line . PHP_EOL ; } - - $this->parse_rule() ; } /* Manual parsing, because we can't bootstrap ourselves yet */ @@ -528,6 +586,7 @@ function parse_rule() { $this->tokenize( $rule, $tokens ) ; $this->parsed = ( count( $tokens ) == 1 ? 
array_pop( $tokens ) : new TokenSequence( $tokens ) ) ; } + } static $rx_rx = '{^/( @@ -574,7 +633,7 @@ function tokenize( $str, &$tokens, $o = 0 ) { } /* Handle $ call literals */ elseif ( preg_match( '/^\$(\w+)/', $sub, $match ) ) { - $tokens[] = $t = new TokenPHP( $match[1] ) ; $pending->apply_if_present( $t ) ; + $tokens[] = $t = new TokenExpressionedRecurse( $match[1] ) ; $pending->apply_if_present( $t ) ; $o += strlen( $match[0] ) ; } /* Handle flags */ @@ -652,46 +711,105 @@ function tokenize( $str, &$tokens, $o = 0 ) { /** * Generate the PHP code for a function to match against a string for this rule */ - function compile() { + function compile($indent) { $function_name = $this->function_name( $this->name ) ; + + // Build the typestack + $typestack = array(); $class=$this; + do { + $typestack[] = $this->function_name($class->name); + } + while($class = $class->extends); - $match = PHPBuilder::build() ; - - if ( $this->parsed instanceof TokenRegex ) { - $match->b( "function match_{$function_name} (\$substack = array())", - '$result = array("name"=>"'.$function_name.'", "text"=>"");', - $this->parsed->compile()->replace(array( - 'MATCH' => 'return $result;', - 'FAIL' => 'return FALSE;' - )) - ); + $typestack = "array('" . implode("','", $typestack) . 
"')"; + + // Build an array of additional arguments to add to result node (if any) + if (empty($this->arguments)) { + $arguments = 'null'; } else { - $match->b( "function match_{$function_name} (\$substack = array())", - '$result = $this->construct( "'.$function_name.'" );', - $this->parsed->compile()->replace(array( - 'MATCH' => 'return $this->finalise( "'.$function_name.'", $result );', - 'FAIL' => 'return FALSE;' - )) - ); + $arguments = "array("; + foreach ($this->arguments as $k=>$v) { $arguments .= "'$k' => '$v'"; } + $arguments .= ")"; } + + $match = PHPBuilder::build() ; + + $match->l("protected \$match_{$function_name}_typestack = $typestack;"); + + $match->b( "function match_{$function_name} (\$stack = array())", + '$matchrule = "'.$function_name.'"; $result = $this->construct($matchrule, $matchrule, '.$arguments.');', + $this->parsed->compile()->replace(array( + 'MATCH' => 'return $this->finalise($result);', + 'FAIL' => 'return FALSE;' + )) + ); $functions = array() ; foreach( $this->functions as $name => $function ) { $function_name = $this->function_name( preg_match( '/^_/', $name ) ? $this->name.$name : $this->name.'_'.$name ) ; $functions[] = implode( PHP_EOL, array( - 'function ' . $function_name . ' ( ' . $function[0] . ' ) { ', - $function[1], + 'function ' . $function_name . ' ' . $function )); } // print_r( $match ) ; return '' ; - return $match->render(NULL, $this->indent) . PHP_EOL . PHP_EOL . implode( PHP_EOL, $functions ) ; + return $match->render(NULL, $indent) . PHP_EOL . PHP_EOL . implode( PHP_EOL, $functions ) ; + } +} + +class RuleSet { + public $rules = array(); + + function addRule($indent, $lines, &$out) { + $rule = new Rule($this, $lines) ; + $this->rules[$rule->name] = $rule; + + $out[] = $indent . '/* ' . $rule->name . ':' . $rule->rule . ' */' . 
PHP_EOL ; + $out[] = $rule->compile($indent) ; + $out[] = PHP_EOL ; + } + + function compile($indent, $rulestr) { + $indentrx = '@^'.preg_quote($indent).'@'; + + $out = array(); + $block = array(); + + foreach (preg_split('/\r\n|\r|\n/', $rulestr) as $line) { + // Ignore blank lines + if (!trim($line)) continue; + // Ignore comments + if (preg_match('/^[\x20|\t]+#/', $line)) continue; + + // Strip off indent + if (!empty($indent)) { + if (strpos($line, $indent) === 0) $line = substr($line, strlen($indent)); + else user_error('Non-blank line with inconsistent index in parser block', E_USER_ERROR); + } + + // Any indented line, add to current set of lines + if (preg_match('/^\x20|\t/', $line)) $block[] = $line; + + // Any non-indented line marks a new block. Add a rule for the current block, then start a new block + else { + if (count($block)) $this->addRule($indent, $block, $out); + $block = array($line); + } + } + + // Any unfinished block add a rule for + if (count($block)) $this->addRule($indent, $block, $out); + + // And return the compiled version + return implode( '', $out ) ; } } class ParserCompiler { + static $parsers = array(); + static $debug = false; static $currentClass = null; @@ -700,17 +818,11 @@ static function create_parser( $match ) { /* We allow indenting of the whole rule block, but only to the level of the comment start's indent */ $indent = $match[1]; - /* The regex to match a rule */ - $rx = '@^'.preg_quote($indent).'([\w\-]+):(.*)$@m' ; - - /* Class isn't actually used ATM. 
Eventually it might be used for rule inlineing optimization */ + /* Get the parser name for this block */ if ($class = trim($match[2])) self::$currentClass = $class; elseif (self::$currentClass) $class = self::$currentClass; else $class = self::$currentClass = 'Anonymous Parser'; - /* Get the actual body of the parser rule set */ - $rulestr = $match[3] ; - /* Check for pragmas */ if (strpos($class, '!') === 0) { switch ($class) { @@ -718,7 +830,7 @@ static function create_parser( $match ) { // NOP - dont output return ''; case '!insert_autogen_warning': - return $ident . implode(PHP_EOL.$ident, array( + return $indent . implode(PHP_EOL.$indent, array( '/*', 'WARNING: This file has been machine generated. Do not edit it, or your changes will be overwritten next time it is compiled.', '*/' @@ -731,22 +843,9 @@ static function create_parser( $match ) { throw new Exception("Unknown pragma $class encountered when compiling parser"); } - $rules = array(); - - preg_match_all( $rx, $rulestr, $matches, PREG_SET_ORDER | PREG_OFFSET_CAPTURE ) ; - foreach ( $matches as $match ) { - $rules[] = new Rule( $indent, $rulestr, $match ) ; - } - - $out = array() ; - - foreach ( $rules as $rule ) { - $out[] = $indent . '/* ' . $rule->name . ':' . $rule->rule . ' */' . 
PHP_EOL ; - $out[] = $rule->compile() ; - $out[] = PHP_EOL ; - } - - return implode( '', $out ) ; + if (!isset(self::$parsers[$class])) self::$parsers[$class] = new RuleSet(); + + return self::$parsers[$class]->compile($indent, $match[3]); } static function compile( $string ) { diff --git a/Parser.php b/Parser.php index acae096..364f00e 100644 --- a/Parser.php +++ b/Parser.php @@ -1,59 +1,5 @@ parser = $parser ; - $this->substack = $substack ; - $this->result = $result ; - } - - function find( $exp ) { - $rule_callback = array( $this->parser, "{$this->result['name']}_DLR{$exp}" ) ; - $pars_callback = array( $this->parser, "DLR{$exp}" ) ; - - /* If the current result has that expression, return it */ - if ( isset( $this->result[$exp] ) ) return $this->result[$exp] ; - - /* Search backwards through the sub-expression stacks */ - for ( $i = count( $this->substack ) - 1 ; $i >= 0 ; $i-- ) { - if ( isset( $this->substack[$i][$exp] ) ) return $this->substack[$i][$exp] ; - } - - /* If we have a rule-attached method, call that */ - if ( is_callable( $rule_callback ) ) return call_user_func( $rule_callback, $result ) ; - - /* If we have a class-wide method, call that */ - if ( is_callable( $pars_callback ) ) return call_user_func( $pars_callback, $result ) ; - - /* If we have a global function, call that */ - if ( function_exists( $exp ) ) return call_user_func( $exp, $result ) ; - - /* If we have a global constant, call that */ - if ( defined( $exp ) ) return constant( $expression ) ; - - return FALSE ; - } - - function callback( $m ) { - $res = $this->find( $m[1] ) ; - if ( $res === FALSE ) return "" ; - if ( is_string( $res ) ) return $res ; - if ( isset( $res['text'] ) ) return $res['text'] ; - - // If we find no matches, assume we don't want a replacement, and replace it with itself - return $m[0] ; - } - - function expand( $var ) { - return preg_replace_callback( '/\$(\w+)/', array( $this, 'callback' ), $var ) ; - } - - function match( $var ) { - return $this->find( 
$var ) ; - } -} - /** * We cache the last regex result. This is a low-cost optimization, because we have to do an un-anchored match + check match position anyway * (alternative is to do an anchored match on a string cut with substr, but that is very slow for long strings). We then don't need to recheck @@ -133,23 +79,27 @@ function rx( $rx ) { return $this->regexps[$rx]->match() ; } - function expand( $var, $substack, $result ) { - $cb = new Parser_ExpressionCallback( $this, $substack, $result ) ; - $v = preg_replace_callback( '/\$(\w+)/', array( $cb, 'callback' ), $var ) ; - print "Expanded var: $v" ; - return $v ; - } + function expression( $result, $stack, $value ) { + $stack[] = $result; $rv = false; + + /* Search backwards through the sub-expression stacks */ + for ( $i = count($stack) - 1 ; $i >= 0 ; $i-- ) { + $node = $stack[$i]; + + if ( isset($node[$value]) ) { $rv = $node[$value]; break; } + + foreach ($this->typestack($node['_matchrule']) as $type) { + $callback = array($this, "{$type}_DLR{$value}"); + if ( is_callable( $callback ) ) { $rv = call_user_func( $callback ) ; if ($rv !== FALSE) break; } + } + } - function php( $var, $substack, $result ) { - $ex = $this->get_expression( $var, $substack, $result ) ; - print_r( $result ) ; + if ($rv === false) $rv = @$this->$value; + if ($rv === false) $rv = @$this->$value(); - if ( is_string( $ex ) ) { - return ( preg_match( '{^\s*/}', $ex ) ? $this->rx( $ex ) : $this->literal( $ex ) ) ; - } - return $ex ; + return is_array($rv) ? $rv['text'] : ($rv ? 
$rv : ''); } - + function packhas( $key, $pos ) { return false ; } @@ -162,21 +112,33 @@ function packwrite( $key, $pos, $res ) { return $res ; } - function construct( $name ) { - $result = array( 'type' => 'node', 'name' => $name, 'text' => '' ) ; - - $callback = array( $this, "{$name}__construct" ) ; - if ( is_callable( $callback ) ) { - call_user_func_array( $callback, array( &$result ) ) ; + function typestack( $name ) { + $prop = "match_{$name}_typestack"; + return $this->$prop; + } + + function construct( $matchrule, $name, $arguments = null ) { + $result = array( '_matchrule' => $matchrule, 'name' => $name, 'text' => '' ); + if ($arguments) $result = array_merge($result, $arguments) ; + + foreach ($this->typestack($matchrule) as $type) { + $callback = array( $this, "{$type}__construct" ) ; + if ( is_callable( $callback ) ) { + call_user_func_array( $callback, array( &$result ) ) ; + break; + } } return $result ; } - function finalise( $name, &$result ) { - $callback = array( $this, "{$name}__finalise" ) ; - if ( is_callable( $callback ) ) { - call_user_func_array( $callback, array( &$result ) ) ; + function finalise( &$result ) { + foreach ($this->typestack($result['_matchrule']) as $type) { + $callback = array( $this, "{$type}__finalise" ) ; + if ( is_callable( $callback ) ) { + call_user_func_array( $callback, array( &$result ) ) ; + break; + } } return $result ; @@ -185,16 +147,23 @@ function finalise( $name, &$result ) { function store ( &$result, $subres, $storetag = NULL ) { $result['text'] .= $subres['text'] ; - $globalcb = array( $this, "{$result['name']}_STR" ) ; - $callback = array( $this, $storetag ? 
"{$result['name']}_{$storetag}" : "{$result['name']}_{$subres['name']}" ) ; + $storecalled = false; - if ( is_callable( $callback ) ) { - call_user_func_array( $callback, array( &$result, $subres ) ) ; - } - elseif ( is_callable( $globalcb ) ) { - call_user_func_array( $globalcb, array( &$result, $subres ) ) ; - } - elseif ( $storetag ) { + foreach ($this->typestack($result['_matchrule']) as $type) { + $callback = array( $this, $storetag ? "{$type}_{$storetag}" : "{$type}_{$subres['name']}" ) ; + if ( is_callable( $callback ) ) { + call_user_func_array( $callback, array( &$result, $subres ) ) ; + $storecalled = true; break; + } + + $globalcb = array( $this, "{$type}_STR" ) ; + if ( is_callable( $globalcb ) ) { + call_user_func_array( $globalcb, array( &$result, $subres ) ) ; + $storecalled = true; break; + } + } + + if ( $storetag && !$storecalled ) { if ( !isset( $result[$storetag] ) ) $result[$storetag] = $subres ; else { if ( isset( $result[$storetag]['text'] ) ) $result[$storetag] = array( $result[$storetag] ) ; diff --git a/README.md b/README.md index d327dc2..807b3b7 100644 --- a/README.md +++ b/README.md @@ -19,9 +19,6 @@ and lexing in a single top down grammar. For a basic overview of the subject, se Parsers are contained within a PHP file, in one or more special comment blocks that start with `/*!* [name | !pragma]` (like a docblock, but with an exclamation mark in the middle of the stars) -Lexically, these blocks are a set of rules, each consisting of a name token, a matching rule and a set of attached functions. -The name token must contain no whitespace and end with a `:` character. The matching rule and functions are on the same line or on the indented lines below. - You can have multiple comment blocks, all of which are treated as contiguous for the purpose of compiling. During compilation these blocks will be replaced with a set of "matching" functions (functions which match a string against their rules) for each rule in the block. 
@@ -30,6 +27,30 @@ If unspecified, it defaults to the same name as the previous parser comment bloc If the name starts with an '!' symbol, that comment block is a pragma, and is treated not as some part of the parser, but as a special block of meta-data +Lexically, these blocks are a set of rules & comments. A rule can be a base rule or an extension rule + +##### Base rules + +Base rules consist of a name for the rule, some optional arguments, the matching rule itself, and an optional set of attached functions + +NAME ( "(" ARGUMENT, ... ")" )? ":" MATCHING_RULE + ATTACHED_FUNCTIONS? + +Names must be the characters a-z, A-Z, 0-9 and _ only, and must not start with a number + +Base rules can be split over multiple lines as long as subsequent lines are indented + +##### Extension rules + +Extension rules are either the same as a base rule but with an additional name of the rule to extend, or as a replacing extension consist of +a name for the rule, the name of the rule to extend, and optionally: some arguments, some replacements, and a set of attached functions + +NAME extends BASENAME ( "(" ARGUMENT, ... ")" )? ":" MATCHING_RULE + ATTACHED_FUNCTIONS? + +NAME extends BASENAME ( "(" ARGUMENT, ... ")" )? ( ";" REPLACE "=>" REPLACE_WITH, ... )? + ATTACHED_FUNCTIONS? + ##### Tricks and traps We allow indenting a parser block, but only in a consistant manner - whatever the indent of the /*** marker becomes the "base" indent, and needs to be used @@ -46,7 +67,7 @@ This might get looser if I get around to re-writing the internal "parser parser" PEG matching rules try to follow standard PEG format, summarised thusly: -
+

 	token* - Token is optionally repeated
 	token+ - Token is repeated at least one
 	token? - Token is optionally present
@@ -62,7 +83,7 @@ PEG matching rules try to follow standard PEG format, summarised thusly:
 
 But with these extensions:
 
-
+

 	< or > - Optionally match whitespace
 	[ or ] - Require some whitespace
 
@@ -87,32 +108,56 @@ just split with a space (as in / foo \s* /) ### Expressions Expressions allow run-time calculated matching. You can embed an expression within a literal or regex token to -match against a calculated value, or simply specify the expression as a token to (optionally) internally handle matching -and generate a result. +match against a calculated value, or simply specify the expression as a token to match against a dynamic rule. + +#### Expression stack -Expressions will try a variety of scopes to find a value. It will look for variables already set in the current result, -rule-attached functions and a variety of other functions and constants. +When getting a value to use for an expression, the parser will travel up the stack looking for a set value. The expression +stack is a list of all the rules passed through to get to this point. For example, given the parser -Tried in this order +

+	A: $a
+	B: A
+	C: B
+
+ +The expression stack for finding $a will be C, B, A - in other words, the A rule will be checked first, followed by B, followed by C -- against current result -- against containing expression stack in order (for sub-expressions only) - - against parser instance as variable - - against parser instance as rule-attached method INCLUDING `$` ( i.e. `function $foo()` ) - - against parser instance as method INCLUDING `$` - - as global method -- as constant +#### In terminals (literals and regexes) -##### Tricks and traps +The token will be replaced by the looked up value. To find the value for the token, the expression stack will be +travelled up checking for one of the following: + + - A key / value pair in the result array node + - A rule-attached method INCLUDING `$` ( i.e. `function $foo()` ) + +If no value is found it will then check if a method or a property excluding the $ exists on the parser. If neither of those is found +the expression will be replaced with an empty string. + +#### As tokens + +The token will be looked up to find a value, which must be the name of a matching rule. That rule will then be matched +against as if the token was a recurse token for that rule. -Be careful against matching against results +To find the name of the rule to match against, the expression stack will be travelled up checking for one of the following: + + - A key / value pair in the result array node + - A rule-attached method INCLUDING `$` ( i.e. `function $foo()` ) + +If no value is found it will then check if a method or a property excluding the $ exists on the parser. If neither of those is found +the rule will fail to match. + +#### Tricks and traps + +Be careful against using a token expression when you meant to use a terminal expression

 	quoted_good: q:/['"]/ string "$q"
 	quoted_bad:  q:/['"]/ string $q
 
-`"$q"` matches against the value of q again. `$q` simply returns the value of q, without doing any matching +`"$q"` matches against the value of q again. `$q` tries to match against a rule named `"` or `'` (both of which are illegal rule +names, and will therefore fail) ### Named matching rules @@ -149,16 +194,16 @@ All these definitions define the same rule-attached function

 	class A extends Parser {
-	/**Parser
-	foo: bar baz
-	  function bar() {}
-	* /
+		/*!* Parser
+		foo: bar baz
+			function bar() {}
+		*/
 
-	  function foo_bar() {}
+		function foo_bar() {}
 	}
 
 	class B extends A {
-	  function foo_bar() {}
+		function foo_bar() {}
 	}
 
@@ -206,6 +251,62 @@ You can also specify a rule-attached function called `*`, which will be called w By default all matches are added to the 'text' property of a result. By prepending a member with `.` that match will not be added to the ['text'] member. This doesn't affect the other result properties that named rules' add. +### Inheritance + +Rules can inherit off other rules using the keyword extends. There are several ways to change the matching of the rule, but +they all share a common feature - when building a result set the rule will also check the inherited-from rule's rule-attached +functions for storage handlers. This lets you do something like + +

+A: Foo Bar Baz
+  function *(){ /* Generic store handler */ }
+  
+B extends A
+  function Bar(){ /* Custom handling for Bar - Foo and Baz will still fall through to the A#* function defined above */ }
+
+ +The actual matching rule can be specified in three ways: + +#### Duplication + +If you don't specify a new rule or a replacement set the matching rule is copied as is. This is useful when you want to +override some storage logic but not the rule itself + +#### Text replacement + +You can replace some parts of the inherited rule using text replacement by using a ';' instead of an ':' after the name + of the extended rule. You can then put replacements in a comma separated list. An example might help + +

+A: Foo | Bar | Baz
+
+# Makes B the equivalent of Foo | Bar | (Baz | Qux)
+B extends A; Baz => (Baz | Qux)
+
+ +Note that the replacements are not quoted. The exception is when you want to replace with the empty string, e.g. + +

+A: Foo | Bar | Baz
+
+# Makes B the equivalent of Foo | Bar
+B extends A; | Baz => ""
+
+ +Currently there is no escaping supported - if you want to replace "," or "=>" characters you'll have to use full replacement + +#### Full replacement + +You can specify an entirely new rule in the same format as a non-inheriting rule, eg. + +

+A: Foo | Bar | Baz
+
+B extends A: Foo | Bar | (Baz Qux)
+
+ +This is useful is the rule changes too much for text replacement to be readable, but want to keep the storage logic + ### Pragmas When opening a parser comment block, if instead of a name (or no name) you put a word starting with '!', that comment block is treated as a pragma - not @@ -225,6 +326,4 @@ part of the parser language itself, but some other instruction to the compiler. - Allow inline-ing of rules into other rules for speed - More optimisation - Make Parser-parser be self-generated, instead of a bad hand rolled parser like it is now. -- Slighly more powerfull expressions: `${parent.q}`, `${foo()->bar}`, etc. -- Need to properly escape all literals. Expressions currently need to be in '', not "" - PHP token parser, and other token streams, instead of strings only like now diff --git a/tests/ParserInheritanceTest.php b/tests/ParserInheritanceTest.php new file mode 100644 index 0000000..c652fac --- /dev/null +++ b/tests/ParserInheritanceTest.php @@ -0,0 +1,123 @@ +buildParser(' + /*!* BasicInheritanceTestParser + Foo: "a" + Bar extends Foo + */ + '); + + $this->assertTrue($parser->matches('Foo', 'a')); + $this->assertTrue($parser->matches('Bar', 'a')); + + $this->assertFalse($parser->matches('Foo', 'b')); + $this->assertFalse($parser->matches('Bar', 'b')); + } + + + public function testBasicInheritanceConstructFallback() { + + $parser = $this->buildParser(' + /*!* BasicInheritanceConstructFallbackParser + Foo: "a" + function __construct(&$res){ $res["test"] = "test"; } + Bar extends Foo + */ + '); + + $res = $parser->match('Foo', 'a'); + $this->assertEquals($res['test'], 'test'); + + $res = $parser->match('Bar', 'a'); + $this->assertEquals($res['test'], 'test'); + + $parser = $this->buildParser(' + /*!* BasicInheritanceConstructFallbackParser2 + Foo: "a" + function __construct(&$res){ $res["testa"] = "testa"; } + Bar extends Foo + function __construct(&$res){ $res["testb"] = "testb"; } + */ + '); + + $res = $parser->match('Foo', 'a'); + 
$this->assertArrayHasKey('testa', $res); + $this->assertEquals($res['testa'], 'testa'); + $this->assertArrayNotHasKey('testb', $res); + + $res = $parser->match('Bar', 'a'); + $this->assertArrayHasKey('testb', $res); + $this->assertEquals($res['testb'], 'testb'); + $this->assertArrayNotHasKey('testa', $res); + + } + + public function testBasicInheritanceStoreFallback() { + + $parser = $this->buildParser(' + /*!* BasicInheritanceStoreFallbackParser + Foo: Pow:"a" + function *(&$res, $sub){ $res["test"] = "test"; } + Bar extends Foo + */ + '); + + $res = $parser->match('Foo', 'a'); + $this->assertEquals($res['test'], 'test'); + + $res = $parser->match('Bar', 'a'); + $this->assertEquals($res['test'], 'test'); + + $parser = $this->buildParser(' + /*!* BasicInheritanceStoreFallbackParser2 + Foo: Pow:"a" Zap:"b" + function *(&$res, $sub){ $res["testa"] = "testa"; } + Bar extends Foo + function *(&$res, $sub){ $res["testb"] = "testb"; } + Baz extends Foo + function Zap(&$res, $sub){ $res["testc"] = "testc"; } + */ + '); + + $res = $parser->match('Foo', 'ab'); + $this->assertArrayHasKey('testa', $res); + $this->assertEquals($res['testa'], 'testa'); + $this->assertArrayNotHasKey('testb', $res); + + $res = $parser->match('Bar', 'ab'); + $this->assertArrayHasKey('testb', $res); + $this->assertEquals($res['testb'], 'testb'); + $this->assertArrayNotHasKey('testa', $res); + + $res = $parser->match('Baz', 'ab'); + $this->assertArrayHasKey('testa', $res); + $this->assertEquals($res['testa'], 'testa'); + $this->assertArrayHasKey('testc', $res); + $this->assertEquals($res['testc'], 'testc'); + $this->assertArrayNotHasKey('testb', $res); + } + + public function testInheritanceByReplacement() { + $parser = $this->buildParser(' + /*!* InheritanceByReplacementParser + A: "a" + B: "b" + Foo: A B + Bar extends Foo; B => A + Baz extends Foo; A => "" + */ + '); + + $parser->assertMatches('Foo', 'ab'); + $parser->assertMatches('Bar', 'aa'); + $parser->assertMatches('Baz', 'b'); + } + + +} \ 
No newline at end of file diff --git a/tests/ParserSyntaxTest.php b/tests/ParserSyntaxTest.php new file mode 100644 index 0000000..74defdd --- /dev/null +++ b/tests/ParserSyntaxTest.php @@ -0,0 +1,26 @@ +buildParser(' + /*!* BasicRuleSyntax + Foo: "a" "b" + Bar: "a" + "b" + Baz: + "a" "b" + Qux: + "a" + "b" + */ + '); + + $parser->assertMatches('Foo', 'ab'); + $parser->assertMatches('Bar', 'ab'); + $parser->assertMatches('Baz', 'ab'); + $parser->assertMatches('Qux', 'ab'); + } +} \ No newline at end of file diff --git a/tests/ParserTestBase.php b/tests/ParserTestBase.php new file mode 100644 index 0000000..90a148c --- /dev/null +++ b/tests/ParserTestBase.php @@ -0,0 +1,48 @@ +testcase = $testcase; + $this->class = $class; + } + + function match($method, $string, $allowPartial = false){ + $class = $this->class; + $func = 'match_'.$method; + + $parser = new $class($string); + $res = $parser->$func(); + return ($allowPartial || $parser->pos == strlen($string)) ? $res : false; + } + + function matches($method, $string, $allowPartial = false){ + return $this->match($method, $string, $allowPartial) !== false; + } + + function assertMatches($method, $string, $message = null){ + $this->testcase->assertTrue($this->matches($method, $string), $message ? $message : "Assert parser method $method matches string $string"); + } + + function assertDoesntMatch($method, $string, $message = null){ + $this->testcase->assertFalse($this->matches($method, $string), $message ? $message : "Assert parser method $method doesn't match string $string"); + } +} + +class ParserTestBase extends PHPUnit_Framework_TestCase { + + function buildParser($parser) { + $class = 'Parser'.sha1($parser); + + echo ParserCompiler::compile("class $class extends Parser {\n $parser\n}") . 
"\n\n\n"; + eval(ParserCompiler::compile("class $class extends Parser {\n $parser\n}")); + return new ParserTestWrapper($this, $class); + } + +} \ No newline at end of file diff --git a/tests/ParserVariablesTest.php b/tests/ParserVariablesTest.php new file mode 100644 index 0000000..e941200 --- /dev/null +++ b/tests/ParserVariablesTest.php @@ -0,0 +1,55 @@ +buildParser(' + /*!* BasicVariables + Foo: Letter:"a" "$Letter" + Bar: Letter:"b" "$Letter $Letter" + Baz: Letter:"c" "$Letter a $Letter a" + Qux: Letter:"d" "{$Letter}a{$Letter}a" + */ + '); + + $parser->assertMatches('Foo', 'aa'); + $parser->assertMatches('Bar', 'bb b'); + $parser->assertMatches('Baz', 'cc a c a'); + $parser->assertMatches('Qux', 'ddada'); + } + + public function testRecurseOnVariables() { + $parser = $this->buildParser(' + /*!* RecurseOnVariablesParser + A: "a" + B: "b" + Foo: $Template + Bar: Foo + function __construct(&$res){ $res["Template"] = "A"; } + Baz: Foo + function __construct(&$res){ $res["Template"] = "B"; } + */ + '); + + $parser->assertMatches('Bar', 'a'); $parser->assertDoesntMatch('Bar', 'b'); + $parser->assertMatches('Baz', 'b'); $parser->assertDoesntMatch('Baz', 'a'); + } + + public function testSetOnRuleVariables() { + $parser = $this->buildParser(' + /*!* SetOnRuleVariablesParser + A: "a" + B: "b" + Foo: $Template + Bar (Template = A): Foo + Baz (Template = B): Foo + */ + '); + + $parser->assertMatches('Bar', 'a'); + $parser->assertMatches('Baz', 'b'); + } + +} \ No newline at end of file