First commit

silverstripe · Mar 19, 2013 · 31e6a72 · 31e6a72
commit 31e6a72
Show file tree

Hide file tree

Showing 13 changed files with 6,901 additions and 0 deletions.
diff --git a/README.md b/README.md
@@ -0,0 +1,25 @@
+# HTML5 support for SilverStripe
+
+## Maintainer Contact
+
+* Hamish Friedlander <hamish@silverstripe.com>
+
+## Requirements
+
+* SilverStripe 3.1 Beta 3 or higher
+
+## Summary
+
+This module allows SilverStripe to support HTML5 in HTMLText and HTMLVarchar fields. It
+provides a subclass of HTMLValue that uses the third-party html5lib, and configures the
+Injector to use this subclass by default.
+
+SilverStripe stores HTMLText and HTMLVarchar fields in models as strings, but
+sometimes needs to convert these to DOM objects (for instance, to process shortcodes).
+
+Default SilverStripe behavior is to do this with DOMDocument#loadHTML, but that method
+throws an error when it encounters the new HTML5 element types. It also doesn't deal
+with unclosed elements and invalid HTML in the manner prescribed by the HTML5 spec.
+
+This module replaces the code that does this conversion with code that uses html5lib, which
+supports HTML 5 as per the spec.
diff --git a/_config.php b/_config.php
@@ -0,0 +1,3 @@
+<?php
+
+// Directory of the bundled html5lib, resolved relative to this file.
+define('HTML5LIB_PATH', dirname(__FILE__).'/thirdparty/html5lib');
diff --git a/_config/html.yml b/_config/html.yml
@@ -0,0 +1,6 @@
+---
+After: 'framework/html'
+---
+# Have the Injector instantiate SS_HTML5Value wherever an HTMLValue is requested.
+Injector:
+  HTMLValue:
+    class: SS_HTML5Value
diff --git a/code/HTML5Value.php b/code/HTML5Value.php
@@ -0,0 +1,35 @@
+<?php
+
+/**
+ * HTML value whose content is parsed with html5lib's HTML5_Parser.
+ */
+class SS_HTML5Value extends SS_HTMLValue {
+
+	/**
+	 * Parse an HTML fragment and store the resulting document.
+	 *
+	 * The fragment is wrapped in an html/head/body skeleton whose meta tag
+	 * declares UTF-8 before it is handed to HTML5_Parser::parse(). While
+	 * parsing, PHP errors covered by the current error_reporting() level are
+	 * turned into exceptions; any exception marks this value as invalid
+	 * instead of propagating to the caller.
+	 *
+	 * @param string $content HTML fragment to parse
+	 */
+	public function setContent($content) {
+		require_once(HTML5LIB_PATH.'/HTML5/Parser.php');
+
+		// Temporary handler: surface PHP errors as exceptions so the
+		// catch block below can treat them as a parse failure
+		$raiseAsException = function($errno, $errstr) {
+			throw new Exception("HTML Parse Error: ".$errstr);
+		};
+		set_error_handler($raiseAsException, error_reporting());
+
+		$parsed = false;
+		try {
+			// The markup is assembled inside the try block so that any error
+			// raised while converting $content to a string is also caught
+			$markup =
+				'<html><head><meta http-equiv="content-type" content="text/html; charset=utf-8"></head>'.
+				'<body>'.$content.'</body></html>';
+
+			$parsed = HTML5_Parser::parse($markup);
+		}
+		catch (Exception $failure) {
+			$parsed = false;
+		}
+
+		// Put back whichever error handler was active before this call
+		restore_error_handler();
+
+		// No usable document: flag the error state and stop
+		if (!$parsed) {
+			$this->setInvalid();
+			return;
+		}
+
+		$this->setDocument($parsed);
+	}
+
+}
diff --git a/tests/HTML5ValueTest.php b/tests/HTML5ValueTest.php
@@ -0,0 +1,65 @@
+<?php
+/**
+ * Exercises SS_HTML5Value::setContent() / getContent() against malformed
+ * markup, UTF-8 text, whitespace and mixed newline styles.
+ *
+ * @package framework
+ * @subpackage tests
+ */
+class SS_HTML5ValueTest extends SapphireTest {
+
+	/**
+	 * Malformed markup is repaired into the expected serialised output.
+	 */
+	public function testInvalidHTMLParsing() {
+		$value = new SS_HTML5Value();
+
+		// Malformed input => output expected after a parse/serialise round trip
+		$invalid = array (
+			'<p>Enclosed Value</p></p>'                              => '<p>Enclosed Value</p><p></p>',
+			'<meta content="text/html"></meta>'                      => '<meta content="text/html">',
+			'<p><div class="example"></div></p>'                     => '<p></p><div class="example"></div><p></p>'
+		);
+
+		foreach($invalid as $input => $expected) {
+			$value->setContent($input);
+			$this->assertEquals($expected, $value->getContent(), 'Invalid HTML can be parsed');
+		}
+	}
+
+	/**
+	 * Multi-byte UTF-8 characters come back unchanged.
+	 */
+	public function testUtf8Saving() {
+		$value = new SS_HTML5Value();
+
+		$value->setContent('<p>ö ß ā い 家</p>');
+		$this->assertEquals('<p>ö ß ā い 家</p>', $value->getContent());
+	}
+
+	/**
+	 * Whitespace between sibling elements is preserved.
+	 */
+	public function testWhitespaceHandling() {
+		$value = new SS_HTML5Value();
+
+		$value->setContent('<p></p> <p></p>');
+		$this->assertEquals('<p></p> <p></p>', $value->getContent());
+	}
+
+	/**
+	 * Attributes remain reachable through the DOM even when the surrounding
+	 * tags are mis-nested or misspelled.
+	 */
+	public function testInvalidHTMLTagNames() {
+		$value = new SS_HTML5Value();
+
+		$invalid = array(
+			'<p><div><a href="test-link"></p></div>',
+			'<html><div><a href="test-link"></a></a></html_>'
+		);
+
+		foreach($invalid as $input) {
+			$value->setContent($input);
+
+			// item(0) is null when no <a> element exists; assert on it first so
+			// a missing link is reported as a test failure rather than a fatal
+			// "member function on null" error that aborts the run
+			$link = $value->getElementsByTagName('a')->item(0);
+			$this->assertNotNull($link, "A link element can be found in malformed HTML: $input");
+
+			$this->assertEquals(
+				'test-link',
+				$link->getAttribute('href'),
+				'Link data can be extracted from malformed HTML'
+			);
+		}
+	}
+
+	/**
+	 * CRLF line endings in the input come back as LF.
+	 */
+	public function testMixedNewlines() {
+		$value = new SS_HTML5Value();
+
+		$value->setContent("<p>paragraph</p>\n<ul><li>1</li>\r\n</ul>");
+		$this->assertEquals(
+			"<p>paragraph</p>\n<ul><li>1</li>\n</ul>",
+			$value->getContent(),
+			'Newlines get converted'
+		);
+	}
+}
diff --git a/thirdparty/html5lib/HTML5/Data.php b/thirdparty/html5lib/HTML5/Data.php
@@ -0,0 +1,114 @@
+<?php
+
+// warning: this file is encoded in UTF-8!
+
+/**
+ * Static lookup data and helpers for character references: the numeric
+ * reference remapping table, the named reference table loader and a
+ * code point to UTF-8 encoder.
+ */
+class HTML5_Data
+{
+
+    // Despite its name, this is the numeric entity dereference table: it
+    // maps a numeric reference to the code point that should be used
+    // instead. Possible future optimizations: load it from a .ser file,
+    // or store UTF-8 bytes rather than Unicode code points.
+    protected static $realCodepointTable = array(
+        0x00 => 0xFFFD, // REPLACEMENT CHARACTER
+        0x0D => 0x000A, // LINE FEED (LF)
+        0x80 => 0x20AC, // EURO SIGN ('€')
+        0x81 => 0x0081, // <control>
+        0x82 => 0x201A, // SINGLE LOW-9 QUOTATION MARK ('‚')
+        0x83 => 0x0192, // LATIN SMALL LETTER F WITH HOOK ('ƒ')
+        0x84 => 0x201E, // DOUBLE LOW-9 QUOTATION MARK ('„')
+        0x85 => 0x2026, // HORIZONTAL ELLIPSIS ('…')
+        0x86 => 0x2020, // DAGGER ('†')
+        0x87 => 0x2021, // DOUBLE DAGGER ('‡')
+        0x88 => 0x02C6, // MODIFIER LETTER CIRCUMFLEX ACCENT ('ˆ')
+        0x89 => 0x2030, // PER MILLE SIGN ('‰')
+        0x8A => 0x0160, // LATIN CAPITAL LETTER S WITH CARON ('Š')
+        0x8B => 0x2039, // SINGLE LEFT-POINTING ANGLE QUOTATION MARK ('‹')
+        0x8C => 0x0152, // LATIN CAPITAL LIGATURE OE ('Œ')
+        0x8D => 0x008D, // <control>
+        0x8E => 0x017D, // LATIN CAPITAL LETTER Z WITH CARON ('Ž')
+        0x8F => 0x008F, // <control>
+        0x90 => 0x0090, // <control>
+        0x91 => 0x2018, // LEFT SINGLE QUOTATION MARK ('‘')
+        0x92 => 0x2019, // RIGHT SINGLE QUOTATION MARK ('’')
+        0x93 => 0x201C, // LEFT DOUBLE QUOTATION MARK ('“')
+        0x94 => 0x201D, // RIGHT DOUBLE QUOTATION MARK ('”')
+        0x95 => 0x2022, // BULLET ('•')
+        0x96 => 0x2013, // EN DASH ('–')
+        0x97 => 0x2014, // EM DASH ('—')
+        0x98 => 0x02DC, // SMALL TILDE ('˜')
+        0x99 => 0x2122, // TRADE MARK SIGN ('™')
+        0x9A => 0x0161, // LATIN SMALL LETTER S WITH CARON ('š')
+        0x9B => 0x203A, // SINGLE RIGHT-POINTING ANGLE QUOTATION MARK ('›')
+        0x9C => 0x0153, // LATIN SMALL LIGATURE OE ('œ')
+        0x9D => 0x009D, // <control>
+        0x9E => 0x017E, // LATIN SMALL LETTER Z WITH CARON ('ž')
+        0x9F => 0x0178, // LATIN CAPITAL LETTER Y WITH DIAERESIS ('Ÿ')
+    );
+
+    // Cache for the unserialized named character reference table
+    protected static $namedCharacterReferences;
+
+    protected static $namedCharacterReferenceMaxLength;
+
+    /**
+     * Looks up the replacement code point for a numeric character
+     * reference.
+     *
+     * @param int $ref Code point taken from the numeric reference
+     * @return int|false Replacement code point, or false when the table
+     *                   has no entry for $ref
+     */
+    public static function getRealCodepoint($ref) {
+        return isset(self::$realCodepointTable[$ref])
+            ? self::$realCodepointTable[$ref]
+            : false;
+    }
+
+    /**
+     * Returns the named character reference table, reading and
+     * unserializing named-character-references.ser (located next to
+     * this file) when no cached table is available yet.
+     */
+    public static function getNamedCharacterReferences() {
+        if (self::$namedCharacterReferences) {
+            return self::$namedCharacterReferences;
+        }
+
+        $serialized = file_get_contents(dirname(__FILE__) . '/named-character-references.ser');
+        self::$namedCharacterReferences = unserialize($serialized);
+
+        return self::$namedCharacterReferences;
+    }
+
+    /**
+     * Encodes a Unicode code point as a sequence of UTF-8 bytes.
+     *
+     * No range validation is performed: values above 0x10FFFF, negative
+     * values and surrogates (0xD800-0xDFFF) are pushed through the same
+     * bit arithmetic as any other input.
+     *
+     * @note Taken from HTML Purifier, which in turn took it from Feyd
+     *       (which is in the public domain).
+     * @param int $code Code point to encode
+     * @return string One to four bytes
+     */
+    public static function utf8chr($code) {
+        // One byte: plain ASCII
+        if ($code < 0x80) {
+            return chr($code);
+        }
+
+        // Every multi-byte form ends with the low six bits
+        $last = chr(($code & 0x3F) | 0x80);
+
+        // Two bytes
+        if ($code < 0x800) {
+            return chr((($code & 0x7FF) >> 6) | 0xC0) . $last;
+        }
+
+        // Three and four byte forms share this continuation byte
+        $middle = chr((($code & 0xFC0) >> 6) | 0x80);
+
+        // Three bytes
+        if ($code < 0x10000) {
+            return chr((($code >> 12) & 0x0F) | 0xE0) . $middle . $last;
+        }
+
+        // Four bytes
+        $lead   = chr((($code >> 18) & 0x07) | 0xF0);
+        $second = chr((($code >> 12) & 0x3F) | 0x80);
+
+        return $lead . $second . $middle . $last;
+    }
+
+}