From feebfe8a55ceac8b62de8a12c117d7c42a2df2d7 Mon Sep 17 00:00:00 2001 From: Ryan Davis Date: Thu, 12 Apr 2012 13:14:07 -0800 Subject: [PATCH] Added compare18 and compare19 tasks to help me diff and compare against MRI - Renamed awords to qwords to match stupid MRI naming. (1.8, 1.9) :( - Fixed reswords to match MRI (1.8, 1.9) - Entirely reworked block arg handling. (1.8) - Added missing gvar arg error. (1.8) - Split block_var from for_var. (1.8, 1.9) - Made lambda w/o arg list zero out the arg slot. - Split 1.8 from 1.9 open paren lexer. Gawd that's ugly code. Refactored both 1.8 and 1.9 open paren lexing code into separate methods. Added d method to help debugging output inline with debugging racc output. Added block_var to handle generating all block_var nodes [git-p4: depot-paths = "//src/ruby_parser/dev/": change = 7324] --- Rakefile | 16 +++++++ lib/ruby18_parser.y | 83 +++++++++++++++++++++++++++++----- lib/ruby19_parser.y | 75 +++++++++++++++++++++++++++---- lib/ruby_lexer.rb | 93 ++++++++++++++++++++++++++------------- lib/ruby_parser_extras.rb | 31 ++++++++++++- test/test_ruby_lexer.rb | 10 ++--- 6 files changed, 253 insertions(+), 55 deletions(-) diff --git a/Rakefile b/Rakefile index a19c95e3..1fd7be05 100644 --- a/Rakefile +++ b/Rakefile @@ -122,4 +122,20 @@ task :isolate => :phony file "lib/ruby18_parser.rb" => :isolate file "lib/ruby19_parser.rb" => :isolate +task :compare18 do + sh "./yack.rb lib/ruby18_parser.output > racc18.txt" + sh "./yack.rb parse18.output > yacc18.txt" + sh "diff -du racc18.txt yacc18.txt || true" + puts + sh "diff -du racc18.txt yacc18.txt | wc -l" +end + +task :compare19 do + sh "./yack.rb lib/ruby19_parser.output > racc19.txt" + sh "./yack.rb parse19.output > yacc19.txt" + sh "diff -du racc19.txt yacc19.txt || true" + puts + sh "diff -du racc19.txt yacc19.txt | wc -l" +end + # vim: syntax=Ruby diff --git a/lib/ruby18_parser.y b/lib/ruby18_parser.y index 7b2b1f2a..68213dff 100644 --- a/lib/ruby18_parser.y +++ b/lib/ruby18_parser.y @@ -15,7 +15,7 @@ token kCLASS kMODULE kDEF kUNDEF kBEGIN kRESCUE kENSURE kEND kIF kUNLESS tLBRACK tRBRACK tLBRACE tLBRACE_ARG tSTAR tSTAR2 tAMPER tAMPER2 tTILDE tPERCENT tDIVIDE tPLUS tMINUS tLT tGT tPIPE tBANG tCARET tLCURLY tRCURLY tBACK_REF2 tSYMBEG tSTRING_BEG tXSTRING_BEG tREGEXP_BEG - tWORDS_BEG tAWORDS_BEG tSTRING_DBEG tSTRING_DVAR tSTRING_END tSTRING + tWORDS_BEG tQWORDS_BEG tSTRING_DBEG tSTRING_DVAR tSTRING_END tSTRING tSYMBOL tNL tEH tCOLON tCOMMA tSPACE tSEMI tLAST_TOKEN prechigh @@ -495,7 +495,7 @@ rule | kFOR | kIN | kMODULE | kNEXT | kNIL | kNOT | kOR | kREDO | kRESCUE | kRETRY | kRETURN | kSELF | kSUPER | kTHEN | kTRUE | kUNDEF | kWHEN | kYIELD - | kIF_MOD | kUNLESS_MOD | kWHILE_MOD | kUNTIL_MOD | kRESCUE_MOD + | kIF | kUNLESS | kWHILE | kUNTIL arg: lhs tEQL arg { @@ -881,7 +881,7 @@ rule { result = val[1] } - | none_block_pass + | none args: arg_value { @@ -910,7 +910,7 @@ rule | xstring | regexp | words - | awords + | qwords | var_ref | backref | tFID @@ -1044,7 +1044,7 @@ rule { result = new_case nil, val[3] } - | kFOR block_var kIN + | kFOR for_var kIN { lexer.cond.push true } @@ -1184,12 +1184,71 @@ rule result = val[1] } - block_var: lhs + for_var: lhs | mlhs { val[0].delete_at 1 if val[0][1].nil? 
# HACK } + block_par: mlhs_item + { + result = s(:array, val[0]) + } + | block_par tCOMMA mlhs_item + { + result = self.list_append val[0], val[2] + } + + block_var: block_par + { + result = block_var val[0], nil, nil + } + | block_par tCOMMA + { + result = block_var val[0], nil, nil + } + | block_par tCOMMA tAMPER lhs + { + result = block_var val[0], nil, val[3] + } + | block_par tCOMMA tSTAR lhs tCOMMA tAMPER lhs + { + result = block_var val[0], val[3], val[6] + } + | block_par tCOMMA tSTAR tCOMMA tAMPER lhs + { + result = block_var val[0], s(:splat), val[5] + } + | block_par tCOMMA tSTAR lhs + { + result = block_var val[0], val[3], nil + } + | block_par tCOMMA tSTAR + { + result = block_var val[0], s(:splat), nil + } + | tSTAR lhs tCOMMA tAMPER lhs + { + result = block_var nil, val[1], val[4] + } + | tSTAR tCOMMA tAMPER lhs + { + result = block_var nil, s(:splat), val[3] + } + | tSTAR lhs + { + result = block_var nil, val[1], nil + } + | tSTAR + { + result = block_var nil, s(:splat), nil + } + | tAMPER lhs + { + result = block_var nil, nil, val[1] + } + ; + opt_block_var: none | tPIPE tPIPE { @@ -1429,11 +1488,11 @@ rule result = self.literal_concat val[0], val[1] } - awords: tAWORDS_BEG tSPACE tSTRING_END + qwords: tQWORDS_BEG tSPACE tSTRING_END { result = s(:array) } - | tAWORDS_BEG qword_list tSTRING_END + | tQWORDS_BEG qword_list tSTRING_END { result = val[1] } @@ -1650,12 +1709,16 @@ xstring_contents: none f_norm_arg: tCONSTANT { - yyerror "formal argument cannot be a constant: #{val[0]}" + yyerror "formal argument cannot be a constant" } | tIVAR { yyerror "formal argument cannot be an instance variable" } + | tGVAR + { + yyerror "formal argument cannot be a global variable" + } | tCVAR { yyerror "formal argument cannot be a class variable" @@ -1788,8 +1851,6 @@ xstring_contents: none none: { result = nil } - none_block_pass: { result = nil } - end ---- inner diff --git a/lib/ruby19_parser.y b/lib/ruby19_parser.y index 408c8855..f35eb82e 100644 --- a/lib/ruby19_parser.y +++ b/lib/ruby19_parser.y @@ -15,7 +15,7 @@ token kCLASS kMODULE kDEF kUNDEF kBEGIN kRESCUE kENSURE kEND kIF kUNLESS tLBRACK tRBRACK tLBRACE tLBRACE_ARG tSTAR tSTAR2 tAMPER tAMPER2 tTILDE tPERCENT tDIVIDE tPLUS tMINUS tLT tGT tPIPE tBANG tCARET tLCURLY tRCURLY tBACK_REF2 tSYMBEG tSTRING_BEG tXSTRING_BEG tREGEXP_BEG - tWORDS_BEG tAWORDS_BEG tSTRING_DBEG tSTRING_DVAR tSTRING_END tSTRING + tWORDS_BEG tQWORDS_BEG tSTRING_DBEG tSTRING_DVAR tSTRING_END tSTRING tSYMBOL tNL tEH tCOLON tCOMMA tSPACE tSEMI tLAST_TOKEN tLAMBDA tLAMBEG prechigh @@ -495,7 +495,7 @@ rule | kFOR | kIN | kMODULE | kNEXT | kNIL | kNOT | kOR | kREDO | kRESCUE | kRETRY | kRETURN | kSELF | kSUPER | kTHEN | kTRUE | kUNDEF | kWHEN | kYIELD - | kIF_MOD | kUNLESS_MOD | kWHILE_MOD | kUNTIL_MOD | kRESCUE_MOD + | kIF | kUNLESS | kWHILE | kUNTIL arg: lhs tEQL arg { @@ -925,7 +925,7 @@ rule | xstring | regexp | words - | awords + | qwords | var_ref | backref | tFID @@ -1063,7 +1063,7 @@ rule { result = new_case nil, val[3] } - | kFOR block_var kIN + | kFOR for_var kIN { lexer.cond.push true } @@ -1203,12 +1203,71 @@ rule result = val[1] } - block_var: lhs + for_var: lhs | mlhs { val[0].delete_at 1 if val[0][1].nil? 
# HACK } + block_par: mlhs_item + { + result = s(:array, val[0]) + } + | block_par tCOMMA mlhs_item + { + result = self.list_append val[0], val[2] + } + + block_var: block_par + { + result = block_var val[0], nil, nil + } + | block_par tCOMMA + { + result = block_var val[0], nil, nil + } + | block_par tCOMMA tAMPER lhs + { + result = block_var val[0], nil, val[3] + } + | block_par tCOMMA tSTAR lhs tCOMMA tAMPER lhs + { + result = block_var val[0], val[3], val[6] + } + | block_par tCOMMA tSTAR tCOMMA tAMPER lhs + { + result = block_var val[0], s(:splat), val[5] + } + | block_par tCOMMA tSTAR lhs + { + result = block_var val[0], val[3], nil + } + | block_par tCOMMA tSTAR + { + result = block_var val[0], s(:splat), nil + } + | tSTAR lhs tCOMMA tAMPER lhs + { + result = block_var nil, val[1], val[4] + } + | tSTAR tCOMMA tAMPER lhs + { + result = block_var nil, s(:splat), val[3] + } + | tSTAR lhs + { + result = block_var nil, val[1], nil + } + | tSTAR + { + result = block_var nil, s(:splat), nil + } + | tAMPER lhs + { + result = block_var nil, nil, val[1] + } + ; + opt_block_var: none | tPIPE tPIPE { @@ -1299,7 +1358,7 @@ rule lambda: lambda_body { call = s(:call, nil, :lambda, s(:arglist)) - result = s(:iter, call, nil, val[0]) + result = s(:iter, call, 0, val[0]) } | f_larglist lambda_body { @@ -1492,11 +1551,11 @@ rule result = self.literal_concat val[0], val[1] } - awords: tAWORDS_BEG tSPACE tSTRING_END + qwords: tQWORDS_BEG tSPACE tSTRING_END { result = s(:array) } - | tAWORDS_BEG qword_list tSTRING_END + | tQWORDS_BEG qword_list tSTRING_END { result = val[1] } diff --git a/lib/ruby_lexer.rb b/lib/ruby_lexer.rb index e6d722a2..2399615c 100644 --- a/lib/ruby_lexer.rb +++ b/lib/ruby_lexer.rb @@ -42,7 +42,7 @@ class RubyLexer STR_FUNC_ESCAPE = 0x01 # TODO: remove and replace with REGEXP STR_FUNC_EXPAND = 0x02 STR_FUNC_REGEXP = 0x04 - STR_FUNC_AWORDS = 0x08 + STR_FUNC_QWORDS = 0x08 STR_FUNC_SYMBOL = 0x10 STR_FUNC_INDENT = 0x20 # <<-HEREDOC @@ -314,10 +314,10 @@ def parse_quote # 58 lines [:tSTRING_BEG, STR_SQUOTE] when 'W' then src.scan(/\s*/) - [:tWORDS_BEG, STR_DQUOTE | STR_FUNC_AWORDS] + [:tWORDS_BEG, STR_DQUOTE | STR_FUNC_QWORDS] when 'w' then src.scan(/\s*/) - [:tAWORDS_BEG, STR_SQUOTE | STR_FUNC_AWORDS] + [:tQWORDS_BEG, STR_SQUOTE | STR_FUNC_QWORDS] when 'x' then [:tXSTRING_BEG, STR_XQUOTE] when 'r' then @@ -343,7 +343,7 @@ def parse_string(quote) # 65 lines paren = open term_re = Regexp.escape term - awords = (func & STR_FUNC_AWORDS) != 0 + qwords = (func & STR_FUNC_QWORDS) != 0 regexp = (func & STR_FUNC_REGEXP) != 0 expand = (func & STR_FUNC_EXPAND) != 0 @@ -352,10 +352,10 @@ def parse_string(quote) # 65 lines return :tSTRING_END end - space = true if awords and src.scan(/\s+/) + space = true if qwords and src.scan(/\s+/) if self.nest == 0 && src.scan(/#{term_re}/) then - if awords then + if qwords then quote[1] = nil return :tSPACE elsif regexp then @@ -508,7 +508,7 @@ def tokadd_escape term # 20 lines end def tokadd_string(func, term, paren) # 105 lines - awords = (func & STR_FUNC_AWORDS) != 0 + qwords = (func & STR_FUNC_QWORDS) != 0 escape = (func & STR_FUNC_ESCAPE) != 0 expand = (func & STR_FUNC_EXPAND) != 0 regexp = (func & STR_FUNC_REGEXP) != 0 @@ -528,7 +528,7 @@ def tokadd_string(func, term, paren) # 105 lines self.nest += 1 when src.scan(term_re) then self.nest -= 1 - when awords && src.scan(/\s/) then + when qwords && src.scan(/\s/) then src.pos -= 1 break when expand && src.scan(/#(?=[\$\@\{])/) then @@ -538,10 +538,10 @@ def tokadd_string(func, term, paren) # 105 lines # do 
nothing when src.check(/\\/) then case - when awords && src.scan(/\\\n/) then + when qwords && src.scan(/\\\n/) then string_buffer << "\n" next - when awords && src.scan(/\\\s/) then + when qwords && src.scan(/\\\s/) then c = ' ' when expand && src.scan(/\\\n/) then next @@ -570,7 +570,7 @@ def tokadd_string(func, term, paren) # 105 lines t = Regexp.escape term x = Regexp.escape(paren) if paren && paren != "\000" - re = if awords then + re = if qwords then /[^#{t}#{x}\#\0\\\n\ ]+|./ # |. to pick up whatever else /[^#{t}#{x}\#\0\\]+|./ @@ -710,24 +710,11 @@ def yylex # 826 lines return :tDOT end elsif src.scan(/\(/) then - result = :tLPAREN2 - - self.command_start = true if ruby18 - - if lex_state == :expr_beg || lex_state == :expr_mid then - result = :tLPAREN - elsif space_seen then - if lex_state == :expr_cmdarg then - result = :tLPAREN_ARG - elsif lex_state == :expr_arg then - self.tern.push false - warning("don't put space before argument parentheses") - - result = :tLPAREN2 - end - else - self.tern.push false - end + result = if ruby18 then + yylex_paren18 space_seen + else + yylex_paren19 space_seen + end self.expr_beg_push "(" @@ -1244,6 +1231,53 @@ def yylex # 826 lines end end + def yylex_paren18 space_seen + self.command_start = true + result = :tLPAREN2 + + if lex_state == :expr_beg || lex_state == :expr_mid then + result = :tLPAREN + elsif space_seen then + if lex_state == :expr_cmdarg then + result = :tLPAREN_ARG + elsif lex_state == :expr_arg then + self.tern.push false + warning "don't put space before argument parentheses" + end + else + self.tern.push false + end + + result + end + + def yylex_paren19 space_seen + if (lex_state == :expr_beg || lex_state == :expr_mid || + lex_state == :expr_value || lex_state == :expr_class) then + result = :tLPAREN + elsif ((lex_state == :expr_arg || lex_state == :expr_cmdarg) and + space_seen) then + result = :tLPAREN_ARG + else + self.tern.push false + result = :tLPAREN2 + end + # HACK paren_nest++; + + # HACK: this is a mess, but it makes the tests pass, so suck it + # (stolen from the 1.8 side) + if lex_state == :expr_beg || lex_state == :expr_mid then + # do nothing + elsif space_seen then + if lex_state == :expr_arg then + self.tern.push false + end + else + self.tern.push false + end + result + end + def process_token(command_state) token << src.matched if token =~ /^\w/ && src.scan(/[\!\?](?!=)/) @@ -1251,7 +1285,6 @@ def process_token(command_state) result = nil last_state = lex_state - case token when /^\$/ then self.lex_state, result = :expr_end, :tGVAR diff --git a/lib/ruby_parser_extras.rb b/lib/ruby_parser_extras.rb index 4fb3e8e6..4f539d4b 100644 --- a/lib/ruby_parser_extras.rb +++ b/lib/ruby_parser_extras.rb @@ -3,6 +3,10 @@ require 'sexp' require 'strscan' +def d o + $stderr.puts o.inspect +end + # WHY do I have to do this?!? class Regexp ONCE = 0 unless defined? 
ONCE # FIX: remove this - it makes no sense @@ -66,7 +70,7 @@ def getch alias :old_scan :scan def scan re s = old_scan re - p :scan => [s, caller.first] if s + d :scan => [s, caller.first] if s s end end @@ -137,6 +141,31 @@ def arg_concat node1, node2 # TODO: nuke node1 end + def block_var ary, splat, block + ary ||= s(:array) + + if splat then + if splat == s(:splat) then + ary << splat + else + ary << s(:splat, splat) + end + end + + if block then + block[-1] = :"&#{block[-1]}" + ary << block + end + + result = if ary.length > 2 or ary.splat then + s(:masgn, ary) + else + ary.last + end + + result + end + def args arg, optarg, rest_arg, block_arg, post_arg = nil arg ||= s(:args) diff --git a/test/test_ruby_lexer.rb b/test/test_ruby_lexer.rb index 8470b6f2..e7fe9bdf 100755 --- a/test/test_ruby_lexer.rb +++ b/test/test_ruby_lexer.rb @@ -1693,7 +1693,7 @@ def test_yylex_string_pct_other def test_yylex_string_pct_w util_bad_token("%w[s1 s2 ", - :tAWORDS_BEG, "%w[", + :tQWORDS_BEG, "%w[", :tSTRING_CONTENT, "s1", :tSPACE, nil, :tSTRING_CONTENT, "s2", @@ -1702,7 +1702,7 @@ def test_yylex_string_pct_w def test_yylex_string_pct_w_bs_nl util_lex_token("%w[s1 \\\ns2]", - :tAWORDS_BEG, "%w[", + :tQWORDS_BEG, "%w[", :tSTRING_CONTENT, "s1", :tSPACE, nil, :tSTRING_CONTENT, "\ns2", @@ -1712,7 +1712,7 @@ def test_yylex_string_pct_w_bs_nl def test_yylex_string_pct_w_bs_sp util_lex_token("%w[s\\ 1 s\\ 2]", - :tAWORDS_BEG, "%w[", + :tQWORDS_BEG, "%w[", :tSTRING_CONTENT, "s 1", :tSPACE, nil, :tSTRING_CONTENT, "s 2", @@ -1722,7 +1722,7 @@ def test_yylex_string_pct_w_bs_sp def test_yylex_string_pct_w_tab util_lex_token("%w[abc\tdef]", - :tAWORDS_BEG, "%w[", + :tQWORDS_BEG, "%w[", :tSTRING_CONTENT, "abc\tdef", :tSPACE, nil, :tSTRING_END, nil) @@ -1899,7 +1899,7 @@ def util_lex_token input, *args token = args.shift value = args.shift assert @lex.advance, "no more tokens" - assert_equal [token, value], [@lex.token, [@lex.yacc_value].flatten.first] + assert_equal [token, value], [@lex.token, [@lex.yacc_value].flatten.first], input end deny @lex.advance, "must be empty, but had #{[@lex.token, @lex.yacc_value].inspect}"
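
For reference, the surface syntax behind the reworked block_par/block_var rules. This is an illustrative sketch, not part of the patch; the shapes are written so the snippet runs under a current Ruby, and each comment names the grammar alternative it corresponds to.

  # Block-parameter shapes matched by the new block_var rule.
  blk = proc { |a|         }  # block_par
  blk = proc { |a,|        }  # block_par tCOMMA
  blk = proc { |a, &b|     }  # block_par tCOMMA tAMPER lhs
  blk = proc { |a, *r, &b| }  # block_par tCOMMA tSTAR lhs tCOMMA tAMPER lhs
  blk = proc { |a, *, &b|  }  # block_par tCOMMA tSTAR tCOMMA tAMPER lhs
  blk = proc { |a, *r|     }  # block_par tCOMMA tSTAR lhs
  blk = proc { |a, *|      }  # block_par tCOMMA tSTAR
  blk = proc { |*r, &b|    }  # tSTAR lhs tCOMMA tAMPER lhs
  blk = proc { |*, &b|     }  # tSTAR tCOMMA tAMPER lhs
  blk = proc { |*r|        }  # tSTAR lhs
  blk = proc { |*|         }  # tSTAR
  blk = proc { |&b|        }  # tAMPER lhs
  blk                         # reference the last proc so the variable is used

Plain for loops go through the separate for_var rule instead (for a, b in pairs ... end), which keeps the old lhs | mlhs behavior.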
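
The new block_var helper in ruby_parser_extras.rb turns those pieces into either a bare lhs node or a :masgn wrapper. Below is a minimal, self-contained sketch of that decision using plain Ruby arrays in place of Sexps; toy_block_var and the array literals are illustrative only, not the library's API.

  # Mirrors the shape decision in the patch's block_var helper: one plain
  # parameter collapses to its lhs node; two or more entries, a splat, or a
  # block argument get wrapped in a :masgn around the :array.
  def toy_block_var params, splat, block
    ary = [:array, *params]
    ary << (splat == [:splat] ? splat : [:splat, splat]) if splat  # bare * vs *name
    ary << [:lasgn, :"&#{block}"] if block                         # &name

    splatted = ary.any? { |e| e.is_a?(Array) && e.first == :splat }

    if ary.length > 2 || splatted then
      [:masgn, ary]
    else
      ary.last
    end
  end

  p toy_block_var([[:lasgn, :a]], nil, nil)
  # => [:lasgn, :a]                                          { |a| ... }
  p toy_block_var([[:lasgn, :a], [:lasgn, :b]], nil, nil)
  # => [:masgn, [:array, [:lasgn, :a], [:lasgn, :b]]]        { |a, b| ... }
  p toy_block_var([[:lasgn, :a]], [:lasgn, :r], :b)
  # => [:masgn, [:array, [:lasgn, :a], [:splat, [:lasgn, :r]], [:lasgn, :"&b"]]]
  #                                                          { |a, *r, &b| ... }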
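
On the awords -> qwords rename: MRI's parse.y calls the non-interpolating %w form qwords and the interpolating %W form words, which is what the tQWORDS_BEG / STR_FUNC_QWORDS names now line up with. The difference in plain Ruby:

  name = "world"

  p %w[hello #{name} a\ b]  # qwords (%w): no interpolation -> ["hello", "\#{name}", "a b"]
  p %W[hello #{name} a\ b]  # words  (%W): interpolates     -> ["hello", "world", "a b"]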
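
The f_norm_arg change fills in the missing global-variable case, so every non-local formal argument form now gets a specific error. A quick way to see all four, assuming the Ruby18Parser class generated from this grammar and that parse failures surface as Racc::ParseError or a SyntaxError subclass (both assumptions; adjust to your build):

  require "ruby_parser"

  ["def m(Foo) end",   # formal argument cannot be a constant
   "def m(@a) end",    # formal argument cannot be an instance variable
   "def m($a) end",    # formal argument cannot be a global variable (the new case)
   "def m(@@a) end",   # formal argument cannot be a class variable
  ].each do |src|
    begin
      Ruby18Parser.new.parse src
    rescue Racc::ParseError, SyntaxError => e
      puts "#{src.inspect}: #{e.message}"
    end
  end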