Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Hamish Friedlander
committed
Mar 19, 2013
0 parents
commit 31e6a72
Showing
13 changed files
with
6,901 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
# HTML5 support for SilverStripe | ||
|
||
## Maintainer Contact | ||
|
||
* Hamish Friedlander <hamish@silverstripe.com> | ||
|
||
## Requirements | ||
|
||
* SilverStripe 3.1 Beta 3 or higher | ||
|
||
## Summary | ||
|
||
This module allows SilverStripe to support HTML5 in HTMLText and HTMLVarchar fields by | ||
providing a subclass of HTMLValue that uses the third-party html5lib, and by configuring the | ||
Injector to use this subclass by default. | ||
|
||
SilverStripe stores HTMLText and HTMLVarchar fields in models as strings, but | ||
sometimes needs to convert these to DOM objects (for instance, to process shortcodes). | ||
|
||
Default SilverStripe behavior is to do this with DOMDocument#loadHTML, but that method | ||
throws an error when it encounters the new HTML5 element types. It also doesn't deal | ||
with unclosed elements and invalid HTML in the manner prescribed by the HTML5 spec. | ||
|
||
This module replaces the code that does this conversion with code that uses html5lib, which | ||
supports HTML 5 as per the spec. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
<?php | ||
|
||
// Absolute path to the bundled html5lib, located in thirdparty/html5lib next to this file.
define('HTML5LIB_PATH', __DIR__ . '/thirdparty/html5lib');
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
---
After: 'framework/html'
---
# Have the Injector instantiate SS_HTML5Value whenever an HTMLValue is requested.
# NOTE(review): the "After" header above presumably orders this fragment after the
# framework's own 'framework/html' fragment so this binding wins - confirm.
Injector:
  HTMLValue:
    class: SS_HTML5Value
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
<?php | ||
|
||
/**
 * HTML value holder that parses its content with html5lib instead of the
 * parent class's parser.
 */
class SS_HTML5Value extends SS_HTMLValue {

    /**
     * Parse an HTML fragment with html5lib and store the result.
     *
     * The fragment is wrapped in a minimal html/head/body skeleton that
     * declares a UTF-8 content type before it is handed to the parser. While
     * parsing, every PHP error covered by the current error_reporting() mask
     * is turned into an Exception; any Exception marks the value as invalid
     * instead of propagating to the caller.
     *
     * @param string $content HTML fragment to parse
     */
    public function setContent($content) {
        require_once(HTML5LIB_PATH . '/HTML5/Parser.php');

        // Promote PHP errors raised during parsing to exceptions so that they
        // can be handled by the catch block below.
        $raiseAsException = function($severity, $message) {
            throw new Exception("HTML Parse Error: " . $message);
        };
        set_error_handler($raiseAsException, error_reporting());

        $parsed = false;
        try {
            // The fragment is interpolated here, while the handler is active,
            // so conversion problems are reported the same way as parse errors.
            $parsed = HTML5_Parser::parse(
                '<html><head><meta http-equiv="content-type" content="text/html; charset=utf-8"></head>' .
                "<body>$content</body></html>"
            );
        } catch (Exception $failure) {
            $parsed = false;
        }

        // Put back whichever error handler was active before this call.
        restore_error_handler();

        // A falsy result means parsing failed: flag the value as invalid.
        if (!$parsed) {
            $this->setInvalid();
            return;
        }

        $this->setDocument($parsed);
    }

}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
<?php | ||
/** | ||
* @package framework | ||
* @subpackage tests | ||
*/ | ||
class SS_HTML5ValueTest extends SapphireTest {

    /**
     * Malformed markup is accepted and comes back out as the repaired markup
     * listed on the right-hand side of each pair.
     */
    public function testInvalidHTMLParsing() {
        $value = new SS_HTML5Value();

        // input markup => expected serialised output
        $invalid = array(
            '<p>Enclosed Value</p></p>' => '<p>Enclosed Value</p><p></p>',
            '<meta content="text/html"></meta>' => '<meta content="text/html">',
            '<p><div class="example"></div></p>' => '<p></p><div class="example"></div><p></p>'
        );

        foreach ($invalid as $input => $expected) {
            $value->setContent($input);
            $this->assertEquals($expected, $value->getContent(), 'Invalid HTML can be parsed');
        }
    }

    /**
     * Multi-byte UTF-8 characters survive a set/get round trip unchanged.
     */
    public function testUtf8Saving() {
        $value = new SS_HTML5Value();

        $value->setContent('<p>ö ß ā い 家</p>');
        $this->assertEquals('<p>ö ß ā い 家</p>', $value->getContent());
    }

    /**
     * Whitespace between sibling elements is preserved.
     */
    public function testWhitespaceHandling() {
        $value = new SS_HTML5Value();

        $value->setContent('<p></p> <p></p>');
        $this->assertEquals('<p></p> <p></p>', $value->getContent());
    }

    /**
     * The first anchor's href can still be read from badly nested markup and
     * from markup with an invalid closing tag name.
     */
    public function testInvalidHTMLTagNames() {
        $value = new SS_HTML5Value();

        $invalid = array(
            '<p><div><a href="test-link"></p></div>',
            '<html><div><a href="test-link"></a></a></html_>'
        );

        foreach ($invalid as $input) {
            $value->setContent($input);

            // Check the node exists before dereferencing it: item(0) yields
            // null when no anchor was found, and calling getAttribute() on
            // null would abort the run with a fatal error rather than
            // reporting a test failure.
            $link = $value->getElementsByTagName('a')->item(0);
            $this->assertNotNull($link, 'An anchor element can be found in malformed HTML');

            $this->assertEquals(
                'test-link',
                $link->getAttribute('href'),
                'Link data can be extracted from malformed HTML'
            );
        }
    }

    /**
     * A CRLF line ending in the input comes back as a plain LF.
     */
    public function testMixedNewlines() {
        $value = new SS_HTML5Value();

        $value->setContent("<p>paragraph</p>\n<ul><li>1</li>\r\n</ul>");
        $this->assertEquals(
            "<p>paragraph</p>\n<ul><li>1</li>\n</ul>",
            $value->getContent(),
            'Newlines get converted'
        );
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,114 @@ | ||
<?php | ||
|
||
// warning: this file is encoded in UTF-8! | ||
|
||
/**
 * Static lookup data and helpers for character references: a replacement
 * table for certain numeric references, lazy loading of the named character
 * reference table, and a code point to UTF-8 encoder.
 */
class HTML5_Data
{

    // at some point this should be moved to a .ser file. Another
    // possible optimization is to give UTF-8 bytes, not Unicode
    // codepoints
    // XXX: Not quite sure why it's named this; this is
    // actually the numeric entity dereference table.
    protected static $realCodepointTable = array(
        0x00 => 0xFFFD, // REPLACEMENT CHARACTER
        0x0D => 0x000A, // LINE FEED (LF)
        0x80 => 0x20AC, // EURO SIGN ('€')
        0x81 => 0x0081, // <control>
        0x82 => 0x201A, // SINGLE LOW-9 QUOTATION MARK ('‚')
        0x83 => 0x0192, // LATIN SMALL LETTER F WITH HOOK ('ƒ')
        0x84 => 0x201E, // DOUBLE LOW-9 QUOTATION MARK ('„')
        0x85 => 0x2026, // HORIZONTAL ELLIPSIS ('…')
        0x86 => 0x2020, // DAGGER ('†')
        0x87 => 0x2021, // DOUBLE DAGGER ('‡')
        0x88 => 0x02C6, // MODIFIER LETTER CIRCUMFLEX ACCENT ('ˆ')
        0x89 => 0x2030, // PER MILLE SIGN ('‰')
        0x8A => 0x0160, // LATIN CAPITAL LETTER S WITH CARON ('Š')
        0x8B => 0x2039, // SINGLE LEFT-POINTING ANGLE QUOTATION MARK ('‹')
        0x8C => 0x0152, // LATIN CAPITAL LIGATURE OE ('Œ')
        0x8D => 0x008D, // <control>
        0x8E => 0x017D, // LATIN CAPITAL LETTER Z WITH CARON ('Ž')
        0x8F => 0x008F, // <control>
        0x90 => 0x0090, // <control>
        0x91 => 0x2018, // LEFT SINGLE QUOTATION MARK ('‘')
        0x92 => 0x2019, // RIGHT SINGLE QUOTATION MARK ('’')
        0x93 => 0x201C, // LEFT DOUBLE QUOTATION MARK ('“')
        0x94 => 0x201D, // RIGHT DOUBLE QUOTATION MARK ('”')
        0x95 => 0x2022, // BULLET ('•')
        0x96 => 0x2013, // EN DASH ('–')
        0x97 => 0x2014, // EM DASH ('—')
        0x98 => 0x02DC, // SMALL TILDE ('˜')
        0x99 => 0x2122, // TRADE MARK SIGN ('™')
        0x9A => 0x0161, // LATIN SMALL LETTER S WITH CARON ('š')
        0x9B => 0x203A, // SINGLE RIGHT-POINTING ANGLE QUOTATION MARK ('›')
        0x9C => 0x0153, // LATIN SMALL LIGATURE OE ('œ')
        0x9D => 0x009D, // <control>
        0x9E => 0x017E, // LATIN SMALL LETTER Z WITH CARON ('ž')
        0x9F => 0x0178, // LATIN CAPITAL LETTER Y WITH DIAERESIS ('Ÿ')
    );

    // Cache for the unserialized named character reference table; filled on
    // the first call to getNamedCharacterReferences().
    protected static $namedCharacterReferences;

    // Not read or written by any method of this class.
    protected static $namedCharacterReferenceMaxLength;

    /**
     * Returns the "real" Unicode codepoint of a malformed character
     * reference.
     *
     * @param int $ref Code point taken from a numeric character reference
     * @return int|false The replacement code point, or false when the table
     *                   has no entry for $ref
     */
    public static function getRealCodepoint($ref) {
        if (!isset(self::$realCodepointTable[$ref])) return false;
        else return self::$realCodepointTable[$ref];
    }

    /**
     * Returns the named character reference table, loading it on first use
     * from 'named-character-references.ser' in this file's directory and
     * keeping it in a static property for later calls.
     *
     * @return mixed Whatever unserialize() yields for that file
     */
    public static function getNamedCharacterReferences() {
        if (!self::$namedCharacterReferences) {
            self::$namedCharacterReferences = unserialize(
                file_get_contents(dirname(__FILE__) . '/named-character-references.ser'));
        }
        return self::$namedCharacterReferences;
    }

    /**
     * Converts a Unicode codepoint to sequence of UTF-8 bytes.
     *
     * Values that are not Unicode scalar values (negative numbers, the
     * surrogate range U+D800..U+DFFF, anything above U+10FFFF) cannot be
     * encoded as valid UTF-8; for those the encoding of U+FFFD REPLACEMENT
     * CHARACTER is returned instead.
     *
     * @note Shamelessly stolen from HTML Purifier, which is also
     *       shamelessly stolen from Feyd (which is in public domain).
     *
     * @param int $code Unicode code point
     * @return string One to four bytes of UTF-8
     */
    public static function utf8chr($code) {
        // Without this guard, surrogates are encoded as ill-formed three-byte
        // sequences, values above 0x1FFFFF are silently truncated by the
        // 3-bit mask on the lead byte, and negative values are passed
        // straight to chr().
        if ($code > 0x10FFFF or $code < 0x0 or
            ($code >= 0xD800 and $code <= 0xDFFF)) {
            return "\xEF\xBF\xBD";
        }

        // $x is the last byte; $y, $z, $w are the preceding bytes and stay 0
        // when the encoding is too short to need them.
        $x = $y = $z = $w = 0;
        if ($code < 0x80) {
            // regular ASCII character
            $x = $code;
        } else {
            // set up bits for UTF-8
            $x = ($code & 0x3F) | 0x80;
            if ($code < 0x800) {
                // two-byte sequence: $y is the lead byte
                $y = (($code & 0x7FF) >> 6) | 0xC0;
            } else {
                $y = (($code & 0xFC0) >> 6) | 0x80;
                if ($code < 0x10000) {
                    // three-byte sequence: $z is the lead byte
                    $z = (($code >> 12) & 0x0F) | 0xE0;
                } else {
                    // four-byte sequence: $w is the lead byte
                    $z = (($code >> 12) & 0x3F) | 0x80;
                    $w = (($code >> 18) & 0x07) | 0xF0;
                }
            }
        }
        // set up the actual character
        $ret = '';
        if ($w) $ret .= chr($w);
        if ($z) $ret .= chr($z);
        if ($y) $ret .= chr($y);
        $ret .= chr($x);

        return $ret;
    }

}
Oops, something went wrong.