Skip to content

Commit

Permalink
This release fixes an IRI parsing bug reported recently. It also
Browse files Browse the repository at this point in the history
replaces regex used to find rel links with xpath, and more cleanly
separates microformats discovery and parsing when php-mf2 is not
included.
  • Loading branch information
mblaney committed Jun 14, 2016
1 parent 2f272a0 commit 426dc5b
Show file tree
Hide file tree
Showing 6 changed files with 174 additions and 109 deletions.
85 changes: 35 additions & 50 deletions library/SimplePie.php
Expand Up @@ -1614,25 +1614,44 @@ protected function fetch_data(&$cache)
$copyContentType = $file->headers['content-type'];
try
{
// First check for h-entry microformats in the current file.
$microformats = false;
$position = 0;
while ($position = strpos($file->body, 'h-entry', $position))
{
$start = $position < 200 ? 0 : $position - 200;
$check = substr($file->body, $start, 400);
if ($microformats = preg_match('/class="[^"]*h-entry/', $check))
if (function_exists('Mf2\parse')) {
// Check for both h-feed and h-entry, as both a feed with no entries
// and a list of entries without an h-feed wrapper are both valid.
$position = 0;
while ($position = strpos($file->body, 'h-feed', $position))
{
break;
$start = $position < 200 ? 0 : $position - 200;
$check = substr($file->body, $start, 400);
if ($microformats = preg_match('/class="[^"]*h-feed/', $check))
{
break;
}
$position += 7;
}
$position = 0;
while ($position = strpos($file->body, 'h-entry', $position))
{
$start = $position < 200 ? 0 : $position - 200;
$check = substr($file->body, $start, 400);
if ($microformats = preg_match('/class="[^"]*h-entry/', $check))
{
break;
}
$position += 7;
}
$position += 7;
}
// Now also do feed discovery, but if an h-entry was found don't
// overwrite the current value of file.
$discovered = $locate->find($this->autodiscovery,
$this->all_discovered_feeds);
if ($microformats)
{
if ($hub = $locate->get_rel_link('hub'))
{
$self = $locate->get_rel_link('self');
$this->store_links($file, $hub, $self);
}
// Push the current file onto all_discovered feeds so the user can
// be shown this as one of the options.
if (isset($this->all_discovered_feeds)) {
Expand Down Expand Up @@ -1681,7 +1700,6 @@ protected function fetch_data(&$cache)

$this->raw_data = $file->body;
$this->permanent_url = $file->permanent_url;
$this->store_links($file);
$headers = $file->headers;
$sniffer = $this->registry->create('Content_Type_Sniffer', array(&$file));
$sniffed = $sniffer->get_type();
Expand Down Expand Up @@ -3221,52 +3239,19 @@ public static function merge_items($urls, $start = 0, $end = 0, $limit = 0)
*
* There is no way to find PuSH links in the body of a microformats feed,
* so they are added to the headers when found, to be used later by get_links.
* @param SimplePie_File
* @param SimplePie_File $file
* @param string $hub
* @param string $self
*/
private function store_links(&$file) {
private function store_links(&$file, $hub, $self) {
if (isset($file->headers['link']['hub']) ||
(isset($file->headers['link']) &&
preg_match('/rel=hub/', $file->headers['link'])))
{
return;
}
$hub = '';
$self = '';
$position = 0;
$regex1 = '/<(?:link|a) href="([^"]*)" rel="[^"]*hub[^"]*"/';
$regex2 = '/<(?:link|a) rel="[^"]*hub[^"]*" href="([^"]*)"/';
while ($position = strpos($file->body, 'rel="hub"', $position + 7))
{
$start = $position < 200 ? 0 : $position - 200;
$check = substr($file->body, $start, 400);
if (preg_match($regex1, $check, $match))
{
$hub = $match[1] === '' ? $file->url : $match[1];
}
else if (preg_match($regex2, $check, $match))
{
$hub = $match[1] === '' ? $file->url : $match[1];
}
if ($hub !== '') break;
}
$position = 0;
$regex1 = '/<(?:link|a) href="([^"]*)" rel="[^"]*self[^"]*"/';
$regex2 = '/<(?:link|a) rel="[^"]*self[^"]*" href="([^"]*)"/';
while ($position = strpos($file->body, 'rel="self"', $position + 7))
{
$start = $position < 200 ? 0 : $position - 200;
$check = substr($file->body, $start, 400);
if (preg_match($regex1, $check, $match))
{
$self = $match[1] === '' ? $file->url : $match[1];
}
if (preg_match($regex2, $check, $match))
{
$self = $match[1] === '' ? $file->url : $match[1];
}
if ($self !== '') break;
}
if ($hub !== '')

if ($hub)
{
if (isset($file->headers['link']))
{
Expand All @@ -3280,7 +3265,7 @@ private function store_links(&$file) {
$file->headers['link'] = '';
}
$file->headers['link'] .= '<'.$hub.'>; rel=hub';
if ($self !== '')
if ($self)
{
$file->headers['link'] .= ', <'.$self.'>; rel=self';
}
Expand Down
32 changes: 14 additions & 18 deletions library/SimplePie/IRI.php
Expand Up @@ -776,24 +776,20 @@ protected function scheme_normalization()
*/
public function is_valid()
{
// Reports whether the path component is consistent with the other
// components of this IRI. Returns true when it is, false otherwise.
//
// NOTE(review): the body contains two consecutive sets of checks and
// computes $isauthority twice; this looks like an earlier compound check
// followed by its step-by-step replacement. Confirm which statements are
// meant to remain before relying on the combined behaviour.

// An authority is present when any of userinfo, host or port is set.
$isauthority = $this->iuserinfo !== null || $this->ihost !== null || $this->port !== null;
// Compound rejection test for a non-empty path. `&&` binds tighter than `||`,
// so the condition reads: (authority AND bad leading slashes) OR (colon rule).
//  - With an authority, the path must start with '/' and must not start
//    with '//'.
//  - With neither scheme nor authority, a ':' in the path is rejected when
//    the path has no '/' at all or when the ':' comes before the first '/'.
// The `$this->ipath[0]` access is safe because the path is known to be
// non-empty at that point.
if ($this->ipath !== '' &&
(
$isauthority && (
$this->ipath[0] !== '/' ||
substr($this->ipath, 0, 2) === '//'
) ||
(
$this->scheme === null &&
!$isauthority &&
strpos($this->ipath, ':') !== false &&
(strpos($this->ipath, '/') === false ? true : strpos($this->ipath, ':') < strpos($this->ipath, '/'))
)
)
)
{
return false;
}
// An empty path is always accepted; this also guards the `[0]` access below.
if ($this->ipath === '') return true;

$isauthority = $this->iuserinfo !== null || $this->ihost !== null ||
$this->port !== null;
// With an authority, a path that begins with '/' is accepted immediately.
if ($isauthority && $this->ipath[0] === '/') return true;

// Without an authority, a path beginning with '//' is rejected.
if (!$isauthority && (substr($this->ipath, 0, 2) === '//')) return false;

// Relative urls cannot have a colon in the first path segment (and the
// slashes themselves are not included so skip the first character).
// The scheme test here is a truthiness test (`!$this->scheme`), and the
// rejection only applies when a '/' exists at offset 1 or later.
if (!$this->scheme && !$isauthority &&
strpos($this->ipath, ':') !== false &&
strpos($this->ipath, '/', 1) !== false &&
strpos($this->ipath, ':') < strpos($this->ipath, '/', 1)) return false;

// Nothing above rejected the path.
return true;
}
Expand Down
53 changes: 52 additions & 1 deletion library/SimplePie/Locator.php
Expand Up @@ -281,7 +281,7 @@ public function get_links()
{
$href = trim($link->getAttribute('href'));
$parsed = $this->registry->call('Misc', 'parse_url', array($href));
if ($parsed['scheme'] === '' || preg_match('/^(http(s)|feed)?$/i', $parsed['scheme']))
if ($parsed['scheme'] === '' || preg_match('/^(https?|feed)?$/i', $parsed['scheme']))
{
if (method_exists($link, 'getLineNo') && $this->base_location < $link->getLineNo())
{
Expand Down Expand Up @@ -318,6 +318,57 @@ public function get_links()
return null;
}

/**
 * Find the first <a> or <link> element that carries the given link
 * relation and return its href as an absolute URL.
 *
 * Only elements with both a rel and an href attribute are considered, and
 * only hrefs with no scheme or an http/https scheme are accepted. A link
 * whose href cannot be made absolute is skipped, so that one bad link does
 * not hide a later usable one.
 *
 * @param string $rel Link relation to look for (e.g. 'hub' or 'self'),
 *                    matched case-insensitively against the rel tokens
 * @return string|null Absolute URL of the first usable match, or null when
 *                     there is none
 * @throws SimplePie_Exception When no DOM is available or DOMXpath is missing
 */
public function get_rel_link($rel)
{
    if ($this->dom === null)
    {
        throw new SimplePie_Exception('DOMDocument not found, unable to use '.
                                      'locator');
    }
    if (!class_exists('DOMXpath'))
    {
        throw new SimplePie_Exception('DOMXpath not found, unable to use '.
                                      'get_rel_link');
    }

    // The rel tokens are lower-cased below, so normalise the needle as well.
    $rel = strtolower((string) $rel);

    $xpath = new DOMXpath($this->dom);
    $query = '//a[@rel and @href] | //link[@rel and @href]';
    foreach ($xpath->query($query) as $link)
    {
        // rel holds a whitespace separated token list; split on any run of
        // whitespace so tabs, newlines and repeated spaces are handled.
        $rel_values = preg_split('/\s+/', strtolower($link->getAttribute('rel')),
                                 -1, PREG_SPLIT_NO_EMPTY);
        if (!in_array($rel, $rel_values, true))
        {
            // Not the relation asked for: skip before doing any URL work.
            continue;
        }

        $href = trim($link->getAttribute('href'));
        $parsed = $this->registry->call('Misc', 'parse_url', array($href));
        if ($parsed['scheme'] !== '' &&
            !preg_match('/^https?$/i', $parsed['scheme']))
        {
            continue;
        }

        // Links located after $this->base_location are resolved against
        // $this->base, all others against $this->http_base.
        // NOTE(review): presumably base_location is the line of the
        // document's <base> element; confirm where it is set.
        if (method_exists($link, 'getLineNo') &&
            $this->base_location < $link->getLineNo())
        {
            $base = $this->base;
        }
        else
        {
            $base = $this->http_base;
        }

        $absolute = $this->registry->call('Misc', 'absolutize_url',
                                          array($href, $base));
        if ($absolute !== false)
        {
            return $absolute;
        }
        // Could not be made absolute: keep looking for another candidate.
    }
    return null;
}

public function extension(&$array)
{
foreach ($array as $key => $value)
Expand Down
63 changes: 39 additions & 24 deletions library/SimplePie/Parser.php
Expand Up @@ -76,14 +76,27 @@ public function set_registry(SimplePie_Registry $registry)

public function parse(&$data, $encoding, $url = '')
{
$position = 0;
while ($position = strpos($data, 'h-entry', $position)) {
$start = $position < 200 ? 0 : $position - 200;
$check = substr($data, $start, 400);
if (preg_match('/class="[^"]*h-entry/', $check)) {
return $this->parse_microformats($data, $url);
if (function_exists('Mf2\parse')) {
// Check for both h-feed and h-entry, as both a feed with no entries
// and a list of entries without an h-feed wrapper are both valid.
$position = 0;
while ($position = strpos($data, 'h-feed', $position)) {
$start = $position < 200 ? 0 : $position - 200;
$check = substr($data, $start, 400);
if (preg_match('/class="[^"]*h-feed/', $check)) {
return $this->parse_microformats($data, $url);
}
$position += 7;
}
$position = 0;
while ($position = strpos($data, 'h-entry', $position)) {
$start = $position < 200 ? 0 : $position - 200;
$check = substr($data, $start, 400);
if (preg_match('/class="[^"]*h-entry/', $check)) {
return $this->parse_microformats($data, $url);
}
$position += 7;
}
$position += 7;
}

// Use UTF-8 if we get passed US-ASCII, as every US-ASCII character is a UTF-8 character
Expand Down Expand Up @@ -439,10 +452,8 @@ private function parse_hcard($data, $category = false) {
}

private function parse_microformats(&$data, $url) {
if (!function_exists('Mf2\parse')) return false;

$feed_title = '';
$icon = '';
$feed_author = NULL;
$author_cache = array();
$items = array();
$entries = array();
Expand All @@ -458,23 +469,20 @@ private function parse_microformats(&$data, $url) {
if (!isset($mf_item['children'][0]['type'])) continue;
if (in_array('h-feed', $mf_item['children'][0]['type'])) {
$h_feed = $mf_item['children'][0];
// In this case the parent of the h-feed may be an h-card, so use it as
// the feed_author.
if (in_array('h-card', $mf_item['type'])) $feed_author = $mf_item;
break;
}
}
if (isset($h_feed['children'])) {
$entries = $h_feed['children'];
// Also set the feed title and icon from the h-feed if available.
// Also set the feed title and store author from the h-feed if available.
if (isset($mf['items'][0]['properties']['name'][0])) {
$feed_title = $mf['items'][0]['properties']['name'][0];
}
if (isset($mf['items'][0]['properties']['author'][0])) {
$author = $mf['items'][0]['properties']['author'][0];
if (is_array($author) &&
isset($author['type']) && in_array('h-card', $author['type'])) {
if (isset($author['properties']['photo'][0])) {
$icon = $author['properties']['photo'][0];
}
}
$feed_author = $mf['items'][0]['properties']['author'][0];
}
}
else {
Expand All @@ -501,12 +509,13 @@ private function parse_microformats(&$data, $url) {
if (isset($title['value'])) $title = $title['value'];
$item['title'] = array(array('data' => $title));
}
if (isset($entry['properties']['author'][0])) {
if (isset($entry['properties']['author'][0]) || isset($feed_author)) {
// author is a special case, it can be plain text or an h-card array.
// If it's plain text it can also be a url that should be followed to
// get the actual h-card.
$author = $entry['properties']['author'][0];
if (is_array($author)) {
$author = isset($entry['properties']['author'][0]) ?
$entry['properties']['author'][0] : $feed_author;
if (!is_string($author)) {
$author = $this->parse_hcard($author);
}
else if (strpos($author, 'http') === 0) {
Expand Down Expand Up @@ -574,6 +583,11 @@ private function parse_microformats(&$data, $url) {
$item['title'] = array(array('data' => $title));
}
$description .= $entry['properties']['content'][0]['html'];
if (isset($entry['properties']['in-reply-to'][0]['value'])) {
$in_reply_to = $entry['properties']['in-reply-to'][0]['value'];
$description .= '<p><span class="in-reply-to"></span> '.
'<a href="'.$in_reply_to.'">'.$in_reply_to.'</a><p>';
}
$item['description'] = array(array('data' => $description));
}
if (isset($entry['properties']['category'])) {
Expand Down Expand Up @@ -608,9 +622,10 @@ private function parse_microformats(&$data, $url) {
// Mimic RSS data format when storing microformats.
$link = array(array('data' => $url));
$image = '';
if ($icon !== '') {
array(array('child' => array('' =>
array('url' => array(array('data' => $icon))))));
if (!is_string($feed_author) &&
isset($feed_author['properties']['photo'][0])) {
$image = array(array('child' => array('' => array('url' =>
array(array('data' => $feed_author['properties']['photo'][0]))))));
}
// Use the name given for the h-feed, or get the title from the html.
if ($feed_title !== '') {
Expand Down
5 changes: 3 additions & 2 deletions library/SimplePie/Sanitize.php
Expand Up @@ -368,8 +368,9 @@ public function sanitize($data, $type, $base = '')

// Finally, convert to a HTML string
$data = trim($document->saveHTML());

list($_, $data, $_) = explode($unique_tag, $data);
$result = explode($unique_tag, $data);
// The tags may not be found again if there was invalid markup.
$data = count($result) === 3 ? $result[1] : '';

if ($this->remove_div)
{
Expand Down

0 comments on commit 426dc5b

Please sign in to comment.