From 83defc01c0587f738632a671d095e4cca6d319dd Mon Sep 17 00:00:00 2001
From: Daniel Pittman <daniel@puppetlabs.com>
Date: Thu, 12 Jul 2012 16:00:07 -0700
Subject: [PATCH] zaml: rework strings for correctness and speed

This changes the implementation of string output in the ZAML encoder.  On my
test that delivers around a fifty percent reduction in output time, from 4.11
to 1.89 seconds on my test of ~ 35,000 output strings.

It also adds a bunch of tests to validate the correctness of binary output
handling, and some test inputs derived from various security related UTF-8
encoding problems to validate that (sadly) we pass them through as
binary data.

Optimally we would treat binary and text input as distinct in the parser and
compiler of Puppet, then only allow legal UTF-8 in strings - but that is not
the world in which we live.

It finally fixes a bug in the binary output along the way - which, thankfully,
nobody actually seems to have run into since I introduced it in ea0dd1483fc2.

Because the Ruby BASE64 encoder will split lines at the 60 character mark, a
long binary input would end up with embedded newlines in places that were not
legal for YAML.

Instead of just emitting that, we run through the string encoder over the
transformed data, which leads to a solid output format that works entirely
as expected.

Signed-off-by: Daniel Pittman <daniel@puppetlabs.com>

Conflicts:
	lib/puppet/util/zaml.rb
	spec/unit/util/zaml_spec.rb

Signed-off-by: Daniel Pittman <daniel@puppetlabs.com>
---
 lib/puppet/util/zaml.rb     | 91 +++++++++++++++++------------------
 spec/unit/util/zaml_spec.rb | 94 ++++++++++++++++++++++++++++++++++++-
 2 files changed, 138 insertions(+), 47 deletions(-)

diff --git a/lib/puppet/util/zaml.rb b/lib/puppet/util/zaml.rb
index dfd2236e691..445b10d13bd 100644
--- a/lib/puppet/util/zaml.rb
+++ b/lib/puppet/util/zaml.rb
@@ -249,51 +249,52 @@ def self.yaml_new( klass, tag, val )
 end
 
 class String
-  ZAML_ESCAPES = %w{\x00 \x01 \x02 \x03 \x04 \x05 \x06 \a \x08 \t \n \v \f \r \x0e \x0f \x10 \x11 \x12 \x13 \x14 \x15 \x16 \x17 \x18 \x19 \x1a \e \x1c \x1d \x1e \x1f }
-  def escaped_for_zaml
-    # JJM (Note the trailing dots to construct a multi-line method chain.) This
-    # code is meant to escape all bytes which are not ASCII-8BIT printable
-    # characters.  Multi-byte unicode characters are handled just fine because
-    # each byte of the character results in an escaped string emitted to the
-    # YAML stream.  When the YAML is de-serialized back into a String the bytes
-    # will be reconstructed properly into the unicode character.
-    self.to_ascii8bit.gsub( /\x5C/n, "\\\\\\" ).  # Demi-kludge for Maglev/rubinius; the regexp should be /\\/ but parsetree chokes on that.
-    gsub( /"/n, "\\\"" ).
-    gsub( /([\x00-\x1F])/n ) { |x| ZAML_ESCAPES[ x.unpack("C")[0] ] }.
-    gsub( /([\x80-\xFF])/n ) { |x| "\\x#{x.unpack("C")[0].to_s(16)}" }
-  end
+  ZAML_ESCAPES = {
+    "\a" => "\\a", "\e" => "\\e", "\f" => "\\f", "\n" => "\\n",
+    "\r" => "\\r", "\t" => "\\t", "\v" => "\\v"
+  }
+
   def to_zaml(z)
-    z.first_time_only(self) {
-      hex_num = '0x[a-f\d]+'
-      float = '\d+\.?\d*'
-      num = "[-+]?(?:#{float}|#{hex_num})"
+    z.first_time_only(self) do
       case
-        when self == ''
-          z.emit('""')
-        # when self =~ /[\x00-\x08\x0B\x0C\x0E-\x1F\x80-\xFF]/
-        #   z.emit("!binary |\n")
-        #   z.emit([self].pack("m*"))
-        when (
-          (self =~ /\A(true|false|yes|no|on|null|off|#{num}(:#{num})*|!|=|~)$/i) or
-          (self =~ /\A\n* /) or
-          (self =~ /[\s:]$/) or
-          (self =~ /^[>|][-+\d]*\s/i) or
-          (self[-1..-1] =~ /\s/) or
-          # This regular expression assumes the string is a byte sequence.
-          # It does not concern itself with characters so we convert the string
-          # to ASCII-8BIT for Ruby 1.9 to match up encodings.
-          (self.to_ascii8bit=~ /[\x00-\x08\x0B\x0C\x0E-\x1F\x80-\xFF]/n) or
-          (self =~ /[,\[\]\{\}\r\t]|:\s|\s#/) or
-          (self =~ /\A([-:?!#&*'"]|<<|%.+:.)/)
-          )
-          z.emit("\"#{escaped_for_zaml}\"")
-        when self =~ /\n/
-          if self[-1..-1] == "\n" then z.emit('|+') else z.emit('|-') end
-          z.nested { split("\n",-1).each { |line| z.nl; z.emit(line.chomp("\n")) } }
-        else
-          z.emit(self)
+      when self == ''
+        z.emit('""')
+      when self.to_ascii8bit !~ /\A(?: # ?: non-capturing group (grouping with no back references)
+                 [\x09\x0A\x0D\x20-\x7E]            # ASCII
+               | [\xC2-\xDF][\x80-\xBF]             # non-overlong 2-byte
+               |  \xE0[\xA0-\xBF][\x80-\xBF]        # excluding overlongs
+               | [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}  # straight 3-byte
+               |  \xED[\x80-\x9F][\x80-\xBF]        # excluding surrogates
+               |  \xF0[\x90-\xBF][\x80-\xBF]{2}     # planes 1-3
+               | [\xF1-\xF3][\x80-\xBF]{3}          # planes 4-15
+               |  \xF4[\x80-\x8F][\x80-\xBF]{2}     # plane 16
+               )*\z/mnx
+        # Emit the binary tag, then recurse. Ruby splits BASE64 output at the 60
+        # character mark when packing strings, and we can wind up a multi-line
+        # string here.  We could reimplement the multi-line string logic,
+        # but why would we - this does just as well for producing solid output.
+        z.emit("!binary ")
+        [self].pack("m*").to_zaml(z)
+
+      # Only legal UTF-8 characters can make it this far, so we are safe
+      # against emitting something dubious. That means we don't need to mess
+      # about, just emit them directly. --daniel 2012-07-14
+      when self =~ /\n/
+        # embedded newline, split line-wise in quoted string block form.
+        if self[-1..-1] == "\n" then z.emit('|+') else z.emit('|-') end
+        z.nested { split("\n",-1).each { |line| z.nl; z.emit(line.chomp("\n")) } }
+      when ((self =~ /^[a-zA-Z\/][-\[\]_\/.:a-zA-Z0-9]*$/) and
+          (self !~ /^(?:true|false|yes|no|on|null|off)$/i))
+        # simple string literal, safe to emit unquoted.
+        z.emit(self)
+      else
+        # ...though we still have to escape unsafe characters.
+        escaped = gsub(/[\\"\x00-\x1F]/) do |c|
+          ZAML_ESCAPES[c] || "\\x#{c[0].ord.to_s(16)}"
+        end
+        z.emit("\"#{escaped}\"")
       end
-    }
+    end
   end
 
   # Return a guranteed ASCII-8BIT encoding for Ruby 1.9 This is a helper
@@ -301,12 +302,12 @@ def to_zaml(z)
   # sequences deliberately rather than dealing with characters.
   # The method may or may not return a new instance.
   if String.method_defined?(:encoding)
-    ASCII = Encoding.find("ASCII-8BIT")
+    ASCII_ENCODING = Encoding.find("ASCII-8BIT")
     def to_ascii8bit
-      if self.encoding.name == "ASCII-8BIT"
+      if self.encoding == ASCII_ENCODING
         self
       else
-        self.dup.force_encoding("ASCII-8BIT")
+        self.dup.force_encoding(ASCII_ENCODING)
       end
     end
   else
diff --git a/spec/unit/util/zaml_spec.rb b/spec/unit/util/zaml_spec.rb
index d5e8e024a3e..0086fb52763 100755
--- a/spec/unit/util/zaml_spec.rb
+++ b/spec/unit/util/zaml_spec.rb
@@ -25,12 +25,12 @@
   "-0x123abc"  => '--- "-0x123abc"',
   "-0x123"     => '--- "-0x123"',
   "+0x123"     => '--- "+0x123"',
-  "0x123.456"  => "--- 0x123.456",
+  "0x123.456"  => '--- "0x123.456"',
   'test'       => "--- test",
   []           => "--- []",
   :symbol      => "--- !ruby/sym symbol",
   {:a => "A"}  => "--- \n  !ruby/sym a: A",
-  {:a => "x\ny"} => "--- \n  !ruby/sym a: |-\n    x\n    y" 
+  {:a => "x\ny"} => "--- \n  !ruby/sym a: |-\n    x\n    y"
   }.each { |o,y|
     it "should convert the #{o.class} #{o.inspect} to yaml" do
       o.to_yaml.should == y
@@ -104,3 +104,93 @@
     end
   end
 end
+
+describe "binary data" do
+  subject { "M\xC0\xDF\xE5tt\xF6" }
+
+  if String.method_defined?(:encoding)
+    def binary(str)
+      str.force_encoding('binary')
+    end
+  else
+    def binary(str)
+      str
+    end
+  end
+
+  it "should not explode encoding binary data" do
+    expect { subject.to_yaml }.not_to raise_error
+  end
+
+  it "should mark the binary data as binary" do
+    subject.to_yaml.should =~ /!binary/
+  end
+
+  it "should round-trip the data" do
+    yaml = subject.to_yaml
+    read = YAML.load(yaml)
+    binary(read).should == binary(subject)
+  end
+
+  [
+    "\xC0\xAE",                 # over-long UTF-8 '.' character
+    "\xC0\x80",                 # over-long NULL byte
+    "\xC0\xFF",
+    "\xC1\xAE",
+    "\xC1\x80",
+    "\xC1\xFF",
+    "\x80",                     # first continuation byte
+    "\xbf",                     # last continuation byte
+    # all possible continuation bytes in one shot
+    "\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8A\x8B\x8C\x8D\x8E\x8F" +
+    "\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9A\x9B\x9C\x9D\x9E\x9F" +
+    "\xA0\xA1\xA2\xA3\xA4\xA5\xA6\xA7\xA8\xA9\xAA\xAB\xAC\xAD\xAE\xAF" +
+    "\xB0\xB1\xB2\xB3\xB4\xB5\xB6\xB7\xB8\xB9\xBA\xBB\xBC\xBD\xBE\xBF",
+    # lonely start characters - first, all possible two byte sequences
+    "\xC0 \xC1 \xC2 \xC3 \xC4 \xC5 \xC6 \xC7 \xC8 \xC9 \xCA \xCB \xCC \xCD \xCE \xCF " +
+    "\xD0 \xD1 \xD2 \xD3 \xD4 \xD5 \xD6 \xD7 \xD8 \xD9 \xDA \xDB \xDC \xDD \xDE \xDF ",
+    # and so for three byte sequences, four, five, and six, as follow.
+    "\xE0 \xE1 \xE2 \xE3 \xE4 \xE5 \xE6 \xE7 \xE8 \xE9 \xEA \xEB \xEC \xED \xEE \xEF ",
+    "\xF0 \xF1 \xF2 \xF3 \xF4 \xF5 \xF6 \xF7 ",
+    "\xF8 \xF9 \xFA \xFB ",
+    "\xFC \xFD ",
+    # sequences with the last byte missing
+    "\xC0", "\xE0", "\xF0\x80\x80", "\xF8\x80\x80\x80", "\xFC\x80\x80\x80\x80",
+    "\xDF", "\xEF\xBF", "\xF7\xBF\xBF", "\xFB\xBF\xBF\xBF", "\xFD\xBF\xBF\xBF\xBF",
+    # impossible bytes
+    "\xFE", "\xFF", "\xFE\xFE\xFF\xFF",
+    # over-long '/' character
+    "\xC0\xAF",
+    "\xE0\x80\xAF",
+    "\xF0\x80\x80\xAF",
+    "\xF8\x80\x80\x80\xAF",
+    "\xFC\x80\x80\x80\x80\xAF",
+    # maximum overlong sequences
+    "\xc1\xbf",
+    "\xe0\x9f\xbf",
+    "\xf0\x8f\xbf\xbf",
+    "\xf8\x87\xbf\xbf\xbf",
+    "\xfc\x83\xbf\xbf\xbf\xbf",
+    # overlong NUL
+    "\xc0\x80",
+    "\xe0\x80\x80",
+    "\xf0\x80\x80\x80",
+    "\xf8\x80\x80\x80\x80",
+    "\xfc\x80\x80\x80\x80\x80",
+  ].each do |input|
+    # It might seem like we should more correctly reject these sequences in
+    # the encoder, and I would personally agree, but the sad reality is that
+    # we do not distinguish binary and textual data in our language, and so we
+    # wind up with the same thing - a string - containing both.
+    #
+    # That leads to the position where we must treat these invalid sequences,
+    # which are both legitimate binary content, and illegitimate potential
+    # attacks on the system, as something that passes through correctly in
+    # a string. --daniel 2012-07-14
+    it "binary encode highly dubious non-compliant UTF-8 input #{input.inspect}" do
+      encoded = ZAML.dump(binary(input))
+      encoded.should =~ /!binary/
+      YAML.load(encoded).should == input
+    end
+  end
+end