Permalink
Browse files

Make ShortcodeParser use HTMLValue for HTML parsing

  • Loading branch information...
Hamish Friedlander authored and sminnee committed Mar 11, 2013
1 parent 168f071 commit 53128c5d2ff44cdb958e69a2dd4a1ebb9355929a
View
@@ -123,6 +123,20 @@ public function __call($method, $arguments) {
}
}
+ /**
+ * Get the body element, or false if there isn't one (we haven't loaded any content
+ * or this instance is in an invalid state)
+ */
+ public function getBody() {
+ $doc = $this->getDocument();
+ if (!$doc) return false;
+
+ $body = $doc->getElementsByTagName('body');
+ if (!$body->length) return false;
+
+ return $body->item(0);
+ }
+
/**
* Make an xpath query against this HTML
*
@@ -321,9 +321,8 @@ protected function replaceTagsWithText($content, $tags, $generator) {
*
* @param DOMDocument $doc
*/
- protected function replaceAttributeTagsWithContent($doc) {
- $xp = new DOMXPath($doc);
- $attributes = $xp->query('//@*[contains(.,"[")][contains(.,"]")]');
+ protected function replaceAttributeTagsWithContent($htmlvalue) {
+ $attributes = $htmlvalue->query('//@*[contains(.,"[")][contains(.,"]")]');
$parser = $this;
for($i = 0; $i < $attributes->length; $i++) {
@@ -462,7 +461,7 @@ protected function moveMarkerToCompliantHome($node, $parent, $location) {
}
/**
- * Given a node with represents a shortcode marker and some informationabout the shortcode, call the
+ * Given a node which represents a shortcode marker and some information about the shortcode, call the
* shortcode handler & replace the marker with the actual content
*
* @param DOMElement $node
@@ -488,57 +487,14 @@ protected function replaceMarkerWithContent($node, $tag) {
}
if ($content) {
- $parsed = HTML5_Parser::parseFragment($content, 'div');
- $this->insertListAfter($parsed, $node);
+ $parsed = Injector::inst()->create('HTMLValue', $content);
+ $body = $parsed->getBody();
+ if ($body) $this->insertListAfter($body->childNodes, $node);
}
$this->removeNode($node);
}
- protected function loadHTML($html) {
- require_once(THIRDPARTY_PATH.'/html5lib/HTML5/Parser.php');
-
- // Convert any errors to exceptions
- set_error_handler(
- function($no, $str){
- throw new Exception("HTML Parse Error: ".$str);
- },
- error_reporting()
- );
-
- // Use HTML5lib to parse the HTML fragment
- try {
- $bases = HTML5_Parser::parseFragment(trim($html), 'div');
- }
- catch (Exception $e) {
- $bases = null;
- }
-
- // Disable our error handler (restoring to previous value)
- restore_error_handler();
-
- return $bases;
- }
-
- protected function saveHTML($doc) {
- if (version_compare(PHP_VERSION, '5.3.6', '>=')){
- $res = '';
- foreach($doc->firstChild->childNodes as $child) $res .= $doc->saveHTML($child);
- }
- else {
- $res = preg_replace(
- array(
- '/^(.*?)<html>/is',
- '/<\/html>(.*?)$/is',
- ),
- '',
- $doc->saveHTML()
- );
- }
-
- return $res;
- }
-
/**
* Parse a string, and replace any registered shortcodes within it with the result of the mapped callback.
*
@@ -556,11 +512,10 @@ public function parse($content) {
// use a proper DOM
list($content, $tags) = $this->replaceElementTagsWithMarkers($content);
+ $htmlvalue = Injector::inst()->create('HTMLValue', $content);
+
// Now parse the result into a DOM
- $bases = $this->loadHTML($content);
-
- // If we couldn't parse the HTML, error out
- if (!$bases || !$bases->length) {
+ if (!$htmlvalue->isValid()){
if(self::$error_behavior == self::ERROR) {
user_error('Couldn\'t decode HTML when processing short codes', E_USER_ERRROR);
}
@@ -569,15 +524,11 @@ public function parse($content) {
}
}
- $doc = $bases->item(0)->ownerDocument;
-
- $xp = new DOMXPath($doc);
-
// First, replace any shortcodes that are in attributes
- $this->replaceAttributeTagsWithContent($doc);
+ $this->replaceAttributeTagsWithContent($htmlvalue);
// Find all the element scoped shortcode markers
- $shortcodes = $xp->query('//img[@class="'.self::$marker_class.'"]');
+ $shortcodes = $htmlvalue->query('//img[@class="'.self::$marker_class.'"]');
// Find the parents. Do this before DOM modification, since SPLIT might cause parents to move otherwise
$parents = $this->findParentsForMarkers($shortcodes);
@@ -605,8 +556,8 @@ public function parse($content) {
$this->replaceMarkerWithContent($shortcode, $tag);
}
-
- return $this->saveHTML($doc);
+
+ return $htmlvalue->getContent();
}
@@ -1,114 +0,0 @@
-<?php
-
-// warning: this file is encoded in UTF-8!
-
-class HTML5_Data
-{
-
- // at some point this should be moved to a .ser file. Another
- // possible optimization is to give UTF-8 bytes, not Unicode
- // codepoints
- // XXX: Not quite sure why it's named this; this is
- // actually the numeric entity dereference table.
- protected static $realCodepointTable = array(
- 0x00 => 0xFFFD, // REPLACEMENT CHARACTER
- 0x0D => 0x000A, // LINE FEED (LF)
- 0x80 => 0x20AC, // EURO SIGN ('€')
- 0x81 => 0x0081, // <control>
- 0x82 => 0x201A, // SINGLE LOW-9 QUOTATION MARK ('‚')
- 0x83 => 0x0192, // LATIN SMALL LETTER F WITH HOOK ('ƒ')
- 0x84 => 0x201E, // DOUBLE LOW-9 QUOTATION MARK ('„')
- 0x85 => 0x2026, // HORIZONTAL ELLIPSIS ('…')
- 0x86 => 0x2020, // DAGGER ('†')
- 0x87 => 0x2021, // DOUBLE DAGGER ('‡')
- 0x88 => 0x02C6, // MODIFIER LETTER CIRCUMFLEX ACCENT ('ˆ')
- 0x89 => 0x2030, // PER MILLE SIGN ('‰')
- 0x8A => 0x0160, // LATIN CAPITAL LETTER S WITH CARON ('Š')
- 0x8B => 0x2039, // SINGLE LEFT-POINTING ANGLE QUOTATION MARK ('‹')
- 0x8C => 0x0152, // LATIN CAPITAL LIGATURE OE ('Œ')
- 0x8D => 0x008D, // <control>
- 0x8E => 0x017D, // LATIN CAPITAL LETTER Z WITH CARON ('Ž')
- 0x8F => 0x008F, // <control>
- 0x90 => 0x0090, // <control>
- 0x91 => 0x2018, // LEFT SINGLE QUOTATION MARK ('‘')
- 0x92 => 0x2019, // RIGHT SINGLE QUOTATION MARK ('’')
- 0x93 => 0x201C, // LEFT DOUBLE QUOTATION MARK ('“')
- 0x94 => 0x201D, // RIGHT DOUBLE QUOTATION MARK ('”')
- 0x95 => 0x2022, // BULLET ('•')
- 0x96 => 0x2013, // EN DASH ('–')
- 0x97 => 0x2014, // EM DASH ('—')
- 0x98 => 0x02DC, // SMALL TILDE ('˜')
- 0x99 => 0x2122, // TRADE MARK SIGN ('™')
- 0x9A => 0x0161, // LATIN SMALL LETTER S WITH CARON ('š')
- 0x9B => 0x203A, // SINGLE RIGHT-POINTING ANGLE QUOTATION MARK ('›')
- 0x9C => 0x0153, // LATIN SMALL LIGATURE OE ('œ')
- 0x9D => 0x009D, // <control>
- 0x9E => 0x017E, // LATIN SMALL LETTER Z WITH CARON ('ž')
- 0x9F => 0x0178, // LATIN CAPITAL LETTER Y WITH DIAERESIS ('Ÿ')
- );
-
- protected static $namedCharacterReferences;
-
- protected static $namedCharacterReferenceMaxLength;
-
- /**
- * Returns the "real" Unicode codepoint of a malformed character
- * reference.
- */
- public static function getRealCodepoint($ref) {
- if (!isset(self::$realCodepointTable[$ref])) return false;
- else return self::$realCodepointTable[$ref];
- }
-
- public static function getNamedCharacterReferences() {
- if (!self::$namedCharacterReferences) {
- self::$namedCharacterReferences = unserialize(
- file_get_contents(dirname(__FILE__) . '/named-character-references.ser'));
- }
- return self::$namedCharacterReferences;
- }
-
- /**
- * Converts a Unicode codepoint to sequence of UTF-8 bytes.
- * @note Shamelessly stolen from HTML Purifier, which is also
- * shamelessly stolen from Feyd (which is in public domain).
- */
- public static function utf8chr($code) {
- /* We don't care: we live dangerously
- * if($code > 0x10FFFF or $code < 0x0 or
- ($code >= 0xD800 and $code <= 0xDFFF) ) {
- // bits are set outside the "valid" range as defined
- // by UNICODE 4.1.0
- return "\xEF\xBF\xBD";
- }*/
-
- $x = $y = $z = $w = 0;
- if ($code < 0x80) {
- // regular ASCII character
- $x = $code;
- } else {
- // set up bits for UTF-8
- $x = ($code & 0x3F) | 0x80;
- if ($code < 0x800) {
- $y = (($code & 0x7FF) >> 6) | 0xC0;
- } else {
- $y = (($code & 0xFC0) >> 6) | 0x80;
- if($code < 0x10000) {
- $z = (($code >> 12) & 0x0F) | 0xE0;
- } else {
- $z = (($code >> 12) & 0x3F) | 0x80;
- $w = (($code >> 18) & 0x07) | 0xF0;
- }
- }
- }
- // set up the actual character
- $ret = '';
- if($w) $ret .= chr($w);
- if($z) $ret .= chr($z);
- if($y) $ret .= chr($y);
- $ret .= chr($x);
-
- return $ret;
- }
-
-}
Oops, something went wrong.

0 comments on commit 53128c5

Please sign in to comment.