Skip to content

Commit

Permalink
First commit
Browse files Browse the repository at this point in the history
  • Loading branch information
Hamish Friedlander committed Mar 19, 2013
0 parents commit 31e6a72
Show file tree
Hide file tree
Showing 13 changed files with 6,901 additions and 0 deletions.
25 changes: 25 additions & 0 deletions README.md
@@ -0,0 +1,25 @@
# HTML5 support for SilverStripe

## Maintainer Contact

* Hamish Friedlander <hamish@silverstripe.com>

## Requirements

* SilverStripe 3.1 Beta 3 or higher

## Summary

This module allows SilverStripe to support HTML5 in HTMLText and HTMLVarchar fields. It
provides a subclass of HTMLValue that uses the third-party html5lib library, and configures
the Injector to use this subclass by default.

SilverStripe stores HTMLText and HTMLVarchar fields in models as strings, but
sometimes needs to convert these to DOM objects (for instance, to process shortcodes).

Default SilverStripe behavior is to do this with DOMDocument#loadHTML, but that method
throws an error when it encounters the new HTML5 element types. It also doesn't deal
with unclosed elements and invalid HTML in the manner prescribed by the HTML5 spec.

This module replaces that string-to-DOM conversion with code that uses html5lib, which
parses HTML5 as prescribed by the spec.
3 changes: 3 additions & 0 deletions _config.php
@@ -0,0 +1,3 @@
<?php

// Absolute path to the bundled html5lib sources, resolved relative to this config file.
define('HTML5LIB_PATH', __DIR__ . '/thirdparty/html5lib');
6 changes: 6 additions & 0 deletions _config/html.yml
@@ -0,0 +1,6 @@
---
After: 'framework/html'
---
# Make the Injector hand out SS_HTML5Value wherever an HTMLValue is requested.
Injector:
  HTMLValue:
    class: SS_HTML5Value
35 changes: 35 additions & 0 deletions code/HTML5Value.php
@@ -0,0 +1,35 @@
<?php

/**
 * HTML value holder that parses its content with html5lib.
 */
class SS_HTML5Value extends SS_HTMLValue {

    /**
     * Parse an HTML fragment with html5lib and store the result.
     *
     * The fragment is wrapped in a minimal document that declares a UTF-8
     * content type before being handed to HTML5_Parser::parse(). While
     * parsing, PHP errors covered by the current error_reporting() mask are
     * turned into exceptions; any exception marks the value as invalid
     * instead of propagating.
     *
     * @param string $content HTML fragment to parse
     */
    public function setContent($content) {
        require_once(HTML5LIB_PATH.'/HTML5/Parser.php');

        // Promote PHP errors raised during parsing to exceptions
        $raiseAsException = function($no, $str) {
            throw new Exception("HTML Parse Error: ".$str);
        };
        set_error_handler($raiseAsException, error_reporting());

        // The markup is assembled inside the try block so that problems
        // converting $content to a string are handled like parse failures
        try {
            $markup =
                '<html><head><meta http-equiv="content-type" content="text/html; charset=utf-8"></head>'.
                "<body>$content</body></html>";
            $document = HTML5_Parser::parse($markup);
        }
        catch (Exception $e) {
            $document = false;
        }

        // Put the previously installed error handler back
        restore_error_handler();

        // No usable document means the value is flagged as invalid
        if (!$document) {
            $this->setInvalid();
            return;
        }

        $this->setDocument($document);
    }

}
65 changes: 65 additions & 0 deletions tests/HTML5ValueTest.php
@@ -0,0 +1,65 @@
<?php
/**
 * Exercises SS_HTML5Value::setContent() / getContent() against malformed
 * markup, UTF-8 text, whitespace and mixed newline styles.
 *
 * @package framework
 * @subpackage tests
 */
class SS_HTML5ValueTest extends SapphireTest {

    /**
     * Malformed fragments are expected to be repaired into the listed
     * serialised form.
     */
    public function testInvalidHTMLParsing() {
        $value = new SS_HTML5Value();

        // input fragment => expected serialised output
        $invalid = array (
            '<p>Enclosed Value</p></p>' => '<p>Enclosed Value</p><p></p>',
            '<meta content="text/html"></meta>' => '<meta content="text/html">',
            '<p><div class="example"></div></p>' => '<p></p><div class="example"></div><p></p>'
        );

        foreach($invalid as $input => $expected) {
            $value->setContent($input);
            $this->assertEquals($expected, $value->getContent(), 'Invalid HTML can be parsed');
        }
    }

    /**
     * Multi-byte UTF-8 characters must round-trip unchanged.
     */
    public function testUtf8Saving() {
        $value = new SS_HTML5Value();

        $value->setContent('<p>ö ß ā い 家</p>');
        $this->assertEquals('<p>ö ß ā い 家</p>', $value->getContent());
    }

    /**
     * Whitespace between sibling elements must be preserved.
     */
    public function testWhitespaceHandling() {
        $value = new SS_HTML5Value();

        $value->setContent('<p></p> <p></p>');
        $this->assertEquals('<p></p> <p></p>', $value->getContent());
    }

    /**
     * Link attributes must remain reachable even when the surrounding
     * markup is badly nested or uses bogus tag names.
     */
    public function testInvalidHTMLTagNames() {
        $value = new SS_HTML5Value();

        $invalid = array(
            '<p><div><a href="test-link"></p></div>',
            '<html><div><a href="test-link"></a></a></html_>'
        );

        foreach($invalid as $input) {
            $value->setContent($input);

            // DOMNodeList::item() yields null when no <a> was recovered;
            // assert first so that case is reported as a test failure
            // instead of a fatal "member function on null" error.
            $link = $value->getElementsByTagName('a')->item(0);
            $this->assertNotNull($link, 'A link element can be found in malformed HTML');

            $this->assertEquals(
                'test-link',
                $link->getAttribute('href'),
                'Link data can be extracted from malformed HTML'
            );
        }
    }

    /**
     * A CRLF line ending in the input is expected to come back as LF.
     */
    public function testMixedNewlines() {
        $value = new SS_HTML5Value();

        $value->setContent("<p>paragraph</p>\n<ul><li>1</li>\r\n</ul>");
        $this->assertEquals(
            "<p>paragraph</p>\n<ul><li>1</li>\n</ul>",
            $value->getContent(),
            'Newlines get converted'
        );
    }
}
114 changes: 114 additions & 0 deletions thirdparty/html5lib/HTML5/Data.php
@@ -0,0 +1,114 @@
<?php

// warning: this file is encoded in UTF-8!

/**
 * Static lookup data and helpers: the numeric character reference
 * replacement table, the named character reference table, and a
 * codepoint-to-UTF-8 encoder.
 */
class HTML5_Data
{

    // Numeric entity dereference table: maps the number written in a
    // numeric character reference to the codepoint that is used instead.
    // Possible future optimisations: move this to a .ser file, and/or
    // store UTF-8 bytes rather than Unicode codepoints.
    protected static $realCodepointTable = array(
        0x00 => 0xFFFD, // REPLACEMENT CHARACTER
        0x0D => 0x000A, // LINE FEED (LF)
        0x80 => 0x20AC, // EURO SIGN ('€')
        0x81 => 0x0081, // <control>
        0x82 => 0x201A, // SINGLE LOW-9 QUOTATION MARK ('‚')
        0x83 => 0x0192, // LATIN SMALL LETTER F WITH HOOK ('ƒ')
        0x84 => 0x201E, // DOUBLE LOW-9 QUOTATION MARK ('„')
        0x85 => 0x2026, // HORIZONTAL ELLIPSIS ('…')
        0x86 => 0x2020, // DAGGER ('†')
        0x87 => 0x2021, // DOUBLE DAGGER ('‡')
        0x88 => 0x02C6, // MODIFIER LETTER CIRCUMFLEX ACCENT ('ˆ')
        0x89 => 0x2030, // PER MILLE SIGN ('‰')
        0x8A => 0x0160, // LATIN CAPITAL LETTER S WITH CARON ('Š')
        0x8B => 0x2039, // SINGLE LEFT-POINTING ANGLE QUOTATION MARK ('‹')
        0x8C => 0x0152, // LATIN CAPITAL LIGATURE OE ('Œ')
        0x8D => 0x008D, // <control>
        0x8E => 0x017D, // LATIN CAPITAL LETTER Z WITH CARON ('Ž')
        0x8F => 0x008F, // <control>
        0x90 => 0x0090, // <control>
        0x91 => 0x2018, // LEFT SINGLE QUOTATION MARK ('‘')
        0x92 => 0x2019, // RIGHT SINGLE QUOTATION MARK ('’')
        0x93 => 0x201C, // LEFT DOUBLE QUOTATION MARK ('“')
        0x94 => 0x201D, // RIGHT DOUBLE QUOTATION MARK ('”')
        0x95 => 0x2022, // BULLET ('•')
        0x96 => 0x2013, // EN DASH ('–')
        0x97 => 0x2014, // EM DASH ('—')
        0x98 => 0x02DC, // SMALL TILDE ('˜')
        0x99 => 0x2122, // TRADE MARK SIGN ('™')
        0x9A => 0x0161, // LATIN SMALL LETTER S WITH CARON ('š')
        0x9B => 0x203A, // SINGLE RIGHT-POINTING ANGLE QUOTATION MARK ('›')
        0x9C => 0x0153, // LATIN SMALL LIGATURE OE ('œ')
        0x9D => 0x009D, // <control>
        0x9E => 0x017E, // LATIN SMALL LETTER Z WITH CARON ('ž')
        0x9F => 0x0178, // LATIN CAPITAL LETTER Y WITH DIAERESIS ('Ÿ')
    );

    // Cache for the unserialized named character reference table
    protected static $namedCharacterReferences;

    protected static $namedCharacterReferenceMaxLength;

    /**
     * Looks up the replacement codepoint for a numeric character
     * reference.
     *
     * @param int $ref Number taken from the character reference
     * @return int|false Replacement codepoint, or false when the table
     *                   has no entry for $ref
     */
    public static function getRealCodepoint($ref) {
        return isset(self::$realCodepointTable[$ref])
            ? self::$realCodepointTable[$ref]
            : false;
    }

    /**
     * Returns the named character reference table, reading it from
     * named-character-references.ser (next to this file) on first use.
     *
     * @return mixed Whatever the serialized file unserializes to
     */
    public static function getNamedCharacterReferences() {
        if (self::$namedCharacterReferences) {
            return self::$namedCharacterReferences;
        }

        $serialized = file_get_contents(dirname(__FILE__) . '/named-character-references.ser');
        self::$namedCharacterReferences = unserialize($serialized);

        return self::$namedCharacterReferences;
    }

    /**
     * Encodes a Unicode codepoint as a sequence of UTF-8 bytes.
     *
     * No range validation is performed: surrogates and values above
     * 0x10FFFF are encoded as-is rather than replaced.
     *
     * @note Derived from HTML Purifier, which in turn took it from
     *       Feyd (public domain).
     * @param int $code Unicode codepoint
     * @return string One to four bytes
     */
    public static function utf8chr($code) {
        // Plain ASCII: a single byte
        if ($code < 0x80) {
            return chr($code);
        }

        // Every multi-byte form ends with a continuation byte carrying the low six bits
        $tail = chr(($code & 0x3F) | 0x80);

        // Two-byte form
        if ($code < 0x800) {
            return chr((($code & 0x7FF) >> 6) | 0xC0) . $tail;
        }

        // Three- and four-byte forms share the second-to-last continuation byte
        $tail = chr((($code & 0xFC0) >> 6) | 0x80) . $tail;

        // Three-byte form
        if ($code < 0x10000) {
            return chr((($code >> 12) & 0x0F) | 0xE0) . $tail;
        }

        // Four-byte form
        return chr((($code >> 18) & 0x07) | 0xF0)
            . chr((($code >> 12) & 0x3F) | 0x80)
            . $tail;
    }

}

0 comments on commit 31e6a72

Please sign in to comment.