Permalink
Browse files

Added additional publish date search functions to PublishDateExtractor() (#34)

* Added PublishDateExtractor->getDateFromSchemaOrg() to seek published dates from nodes using Schema.org's datePublished property.

* Added PublishDateExtractor->getDateFromDublinCore() to seek published dates from nodes using Dublin Core standards.

* Added retrieval of specific OpenGraph type values based on og:type to MetaExtractor->getOpenGraph().

* Added PublishDateExtractor->getDateFromOpenGraph() to seek published dates from nodes using Open Graph standards.

* Added PublishDateExtractor->getDateFromParsely() to seek published dates from nodes using Parsely metadata.

* Fixed a broken UTF-8 character from 2509908.

* Added parsely-page node detection to PublishDateExtractor->getDateFromParsely().

* Added unit tests for PublishDateExtractor->getDateFromSchemaOrg().

* Added unit tests for PublishDateExtractor->getDateFromDublinCore().

* Added unit tests for PublishDateExtractor->getDateFromOpenGraph().

* Corrected PublishDateExtractor->getDateFromParsely() to use "content" attribute from parsely-page node.

* Added unit tests for PublishDateExtractor->getDateFromParsely().

* Added new publish date search functions to PublishDateExtractor->run().
  • Loading branch information...
cdubz authored and scotteh committed Jun 2, 2016
1 parent 140b40d commit 7cb46acbe2fb2226f3857c161d132389b9512215
@@ -21,7 +21,7 @@ class MetaExtractor extends AbstractModule implements ModuleInterface {
/** @var string[] */
protected static $SPLITTER_CHARS = [
'|', '-', '»', ':',
'|', '-', '»', ':',
];
/** @var string */
@@ -81,6 +81,17 @@ private function getOpenGraph() {
$results[$property[1]] = $node->attr('content');
}
// Additionally retrieve type values based on provided og:type (http://ogp.me/#types)
if (isset($results['type'])) {
$nodes = $this->article()->getDoc()->find('meta[property^="' . $results['type'] .':"]');
foreach ($nodes as $node) {
$property = explode(':', $node->attr('property'));
$results[$property[1]] = $node->attr('content');
}
}
return $results;
}
@@ -6,6 +6,7 @@
use Goose\Traits\ArticleMutatorTrait;
use Goose\Modules\AbstractModule;
use Goose\Modules\ModuleInterface;
use DOMWrap\Element;
/**
* Publish Date Extractor
@@ -19,12 +20,32 @@ class PublishDateExtractor extends AbstractModule implements ModuleInterface {
/**
 * Determine the article's publish date and store it on the article.
 *
 * The sources are consulted in a fixed order: Schema.org, Open Graph,
 * the URL, Dublin Core and finally Parsely. The first source that yields
 * a date wins. When no source yields one, the publish date is set to null.
 *
 * @param Article $article
 *
 * @return void
 */
public function run(Article $article) {
    $this->article($article);

    $dt = $this->getDateFromSchemaOrg();

    if ($dt === null) {
        $dt = $this->getDateFromOpenGraph();
    }

    if ($dt === null) {
        $dt = $this->getDateFromURL();
    }

    if ($dt === null) {
        $dt = $this->getDateFromDublinCore();
    }

    if ($dt === null) {
        $dt = $this->getDateFromParsely();
    }

    // Single write: the result is stored once, after all sources were considered.
    $article->setPublishDate($dt);
}
private function getDateFromURL() {
@@ -44,4 +65,197 @@ private function getDateFromURL() {
return null;
}
/**
 * Check for and determine dates from Schema.org's datePublished property.
 *
 * Checks HTML tags (e.g. <meta>, <time>, etc.) first and JSON-LD second.
 * For HTML tags the "datetime" attribute is preferred over "content"; if one
 * of them is empty or cannot be parsed, the other is still tried.
 *
 * Empty and non-string values are skipped on purpose: \DateTime treats an
 * empty string as "now", which would otherwise be reported as a publish date.
 *
 * @return \DateTime|null The first parsable date found, or null if there is none.
 *
 * @see https://schema.org/datePublished
 */
private function getDateFromSchemaOrg() {
    // Check for HTML tags (<meta>, <time>, etc.)
    $nodes = $this->article()->getRawDoc()->find('*[itemprop="datePublished"]');

    /* @var $node Element */
    foreach ($nodes as $node) {
        foreach (['datetime', 'content'] as $attribute) {
            if (!$node->hasAttribute($attribute)) {
                continue;
            }

            $value = trim($node->getAttribute($attribute));

            if ($value === '') {
                continue;
            }

            try {
                return new \DateTime($value);
            }
            catch (\Exception $e) {
                // Unrecognizable date information; keep looking.
            }
        }
    }

    // Check for JSON-LD
    $nodes = $this->article()->getRawDoc()->find('script[type="application/ld+json"]');

    /* @var $node Element */
    foreach ($nodes as $node) {
        $json = json_decode($node->text());

        // json_decode() gives an array for a top-level JSON list, an object for
        // a single JSON object and null for invalid JSON.
        $candidates = is_array($json) ? $json : [$json];

        foreach ($candidates as $candidate) {
            if (!is_object($candidate) || !isset($candidate->datePublished)) {
                continue;
            }

            // Only strings are handed to \DateTime; other JSON types are ignored.
            if (!is_string($candidate->datePublished)) {
                continue;
            }

            $value = trim($candidate->datePublished);

            if ($value === '') {
                continue;
            }

            try {
                return new \DateTime($value);
            }
            catch (\Exception $e) {
                // Unrecognizable date information; keep looking.
            }
        }
    }

    return null;
}
/**
 * Check for and determine dates based on Dublin Core standards.
 *
 * Looks at nodes named "dc.date", "dc.date.issued" or "DC.date.issued" and
 * parses their "content" attribute. Nodes without the attribute, with an
 * empty value, or with a value that cannot be parsed are skipped.
 *
 * An empty value is skipped on purpose: \DateTime treats an empty string as
 * "now", which would otherwise be reported as a publish date.
 *
 * @return \DateTime|null The first parsable date found, or null if there is none.
 *
 * @see http://dublincore.org/documents/dcmi-terms/#elements-date
 * @see http://dublincore.org/documents/2000/07/16/usageguide/qualified-html.shtml
 */
private function getDateFromDublinCore() {
    $nodes = $this->article()->getRawDoc()->find('*[name="dc.date"], *[name="dc.date.issued"], *[name="DC.date.issued"]');

    /* @var $node Element */
    foreach ($nodes as $node) {
        if (!$node->hasAttribute('content')) {
            continue;
        }

        $value = trim($node->getAttribute('content'));

        if ($value === '') {
            continue;
        }

        try {
            return new \DateTime($value);
        }
        catch (\Exception $e) {
            // Unrecognizable date information; try the next node.
        }
    }

    return null;
}
/**
 * Check for and determine dates based on OpenGraph standards.
 *
 * Reads the article's Open Graph data and tries "published_time" first,
 * then "pubdate". Each key is parsed independently, so an unparsable
 * "published_time" no longer prevents "pubdate" from being used.
 *
 * Empty and non-string values are skipped on purpose: \DateTime treats an
 * empty string as "now", which would otherwise be reported as a publish date.
 *
 * @return \DateTime|null The first parsable date found, or null if there is none.
 *
 * @see http://ogp.me/
 * @see http://ogp.me/#type_article
 */
private function getDateFromOpenGraph() {
    $openGraph = $this->article()->getOpenGraph();

    foreach (['published_time', 'pubdate'] as $key) {
        if (!isset($openGraph[$key]) || !is_string($openGraph[$key])) {
            continue;
        }

        $value = trim($openGraph[$key]);

        if ($value === '') {
            continue;
        }

        try {
            return new \DateTime($value);
        }
        catch (\Exception $e) {
            // Unrecognizable date information; fall through to the next key.
        }
    }

    return null;
}
/**
 * Check for and determine dates based on Parsely metadata.
 *
 * Checks, in this order:
 *  1. JSON-LD blocks, using the "dateCreated" property.
 *  2. <meta name="parsely-pub-date"> tags, using the "content" attribute.
 *  3. <meta name="parsely-page"> tags, whose "content" attribute holds JSON
 *     with a "pub_date" property.
 *
 * Empty and non-string values are skipped on purpose: \DateTime treats an
 * empty string as "now", which would otherwise be reported as a publish date.
 *
 * @return \DateTime|null The first parsable date found, or null if there is none.
 *
 * @see https://www.parsely.com/help/integration/jsonld/
 * @see https://www.parsely.com/help/integration/metatags/
 * @see https://www.parsely.com/help/integration/ppage/
 */
private function getDateFromParsely() {
    // JSON-LD
    $nodes = $this->article()->getRawDoc()->find('script[type="application/ld+json"]');

    /* @var $node Element */
    foreach ($nodes as $node) {
        $json = json_decode($node->text());

        // json_decode() gives an array for a top-level JSON list, an object for
        // a single JSON object and null for invalid JSON.
        $candidates = is_array($json) ? $json : [$json];

        foreach ($candidates as $candidate) {
            if (!is_object($candidate) || !isset($candidate->dateCreated)) {
                continue;
            }

            // Only strings are handed to \DateTime; other JSON types are ignored.
            if (!is_string($candidate->dateCreated)) {
                continue;
            }

            $value = trim($candidate->dateCreated);

            if ($value === '') {
                continue;
            }

            try {
                return new \DateTime($value);
            }
            catch (\Exception $e) {
                // Unrecognizable date information; keep looking.
            }
        }
    }

    // <meta> tags
    $nodes = $this->article()->getRawDoc()->find('meta[name="parsely-pub-date"]');

    /* @var $node Element */
    foreach ($nodes as $node) {
        if (!$node->hasAttribute('content')) {
            continue;
        }

        $value = trim($node->getAttribute('content'));

        if ($value === '') {
            continue;
        }

        try {
            return new \DateTime($value);
        }
        catch (\Exception $e) {
            // Unrecognizable date information; keep looking.
        }
    }

    // parsely-page
    $nodes = $this->article()->getRawDoc()->find('meta[name="parsely-page"]');

    /* @var $node Element */
    foreach ($nodes as $node) {
        if (!$node->hasAttribute('content')) {
            continue;
        }

        $json = json_decode($node->getAttribute('content'));

        if (!is_object($json) || !isset($json->pub_date) || !is_string($json->pub_date)) {
            continue;
        }

        $value = trim($json->pub_date);

        if ($value === '') {
            continue;
        }

        try {
            return new \DateTime($value);
        }
        catch (\Exception $e) {
            // Unrecognizable date information; keep looking.
        }
    }

    return null;
}
}
Oops, something went wrong.

0 comments on commit 7cb46ac

Please sign in to comment.