Merge pull request #101 from joaquingx/add-support-dublin-core-metadata

[MRG+1] Add dublincore metadata
scrapinghub · Oct 6, 2020 · d78167c · d78167c
2 parents 7d245b3 + 043a479
commit d78167c
Show file tree

Hide file tree

Showing 10 changed files with 417 additions and 8 deletions.
diff --git a/README.rst b/README.rst
@@ -24,6 +24,7 @@ Currently, *extruct* supports:
 - `Microformat`_ via `mf2py`_
 - `Facebook's Open Graph`_
 - (experimental) `RDFa`_ via `rdflib`_
+- `Dublin Core Metadata (DC-HTML-2003)`_
 
 .. _W3C's HTML Microdata: http://www.w3.org/TR/microdata/
 .. _embedded JSON-LD: http://www.w3.org/TR/json-ld/#embedding-json-ld-in-html-documents
@@ -32,6 +33,7 @@ Currently, *extruct* supports:
 .. _Microformat: http://microformats.org/wiki/Main_Page
 .. _mf2py: https://github.com/microformats/mf2py
 .. _Facebook's Open Graph: http://ogp.me/
+.. _Dublin Core Metadata (DC-HTML-2003): https://www.dublincore.org/specifications/dublin-core/dcq-html/2003-11-30/
 
 The microdata algorithm is a revisit of `this Scrapinghub blog post`_ showing how to use EXSLT extensions.
 
@@ -71,7 +73,17 @@ First fetch the HTML using python-requests and then feed the response body to ``
   >>> data = extruct.extract(r.text, base_url=base_url)
   >>>
   >>> pp.pprint(data)
-  { 'json-ld': [ { '@context': 'https://schema.org',
+  { 'dublincore': [ { 'elements': [ { 'URI': 'http://purl.org/dc/elements/1.1/description',
+                                        'content': 'What is Open Graph Protocol '
+                                                   'and why you need it? Learn to '
+                                                   'implement Open Graph Protocol '
+                                                   'for Facebook on your website. '
+                                                   'Open Graph Protocol Meta Tags.',
+                                        'name': 'description'}],
+                        'namespaces': {},
+                        'terms': []}],
+
+  'json-ld': [ { '@context': 'https://schema.org',
                    '@id': '#organization',
                    '@type': 'Organization',
                    'logo': 'https://www.optimizesmart.com/wp-content/uploads/2016/03/optimize-smart-Twitter-logo.jpg',
@@ -163,7 +175,7 @@ First fetch the HTML using python-requests and then feed the response body to ``
 
 Select syntaxes
 +++++++++++++++
-It is possible to select which syntaxes to extract by passing a list with the desired ones to extract. Valid values: 'microdata', 'json-ld', 'opengraph', 'microformat', 'rdfa'. If no list is passed all syntaxes will be extracted and returned::
+It is possible to select which syntaxes to extract by passing a list with the desired ones to extract. Valid values: 'microdata', 'json-ld', 'opengraph', 'microformat', 'rdfa' and 'dublincore'. If no list is passed all syntaxes will be extracted and returned::
 
   >>> r = requests.get('http://www.songkick.com/artists/236156-elysian-fields')
   >>> base_url = get_base_url(r.text, r.url)
@@ -207,9 +219,9 @@ It is possible to select which syntaxes to extract by passing a list with the de
 
 Uniform
 +++++++
-Another option is to uniform the output of microformat, opengraph, microdata and json-ld syntaxes to the following structure: ::
+Another option is to uniform the output of microformat, opengraph, microdata, dublincore and json-ld syntaxes to the following structure: ::
 
-    {'@context': 'http://example.com', 
+    {'@context': 'http://example.com',
                  '@type': 'example_type',
                  /* All other the properties in keys here */
                  }
@@ -584,6 +596,80 @@ Microformat extraction
       }
    }]
 
+DublinCore extraction
+++++++++++++++++++++++++++++++
+::
+
+    >>> import pprint
+    >>> pp = pprint.PrettyPrinter(indent=2)
+    >>> from extruct.dublincore import DublinCoreExtractor
+    >>> html = '''<head profile="http://dublincore.org/documents/dcq-html/">
+    ... <title>Expressing Dublin Core in HTML/XHTML meta and link elements</title>
+    ... <link rel="schema.DC" href="http://purl.org/dc/elements/1.1/" />
+    ... <link rel="schema.DCTERMS" href="http://purl.org/dc/terms/" />
+    ...
+    ...
+    ... <meta name="DC.title" lang="en" content="Expressing Dublin Core
+    ... in HTML/XHTML meta and link elements" />
+    ... <meta name="DC.creator" content="Andy Powell, UKOLN, University of Bath" />
+    ... <meta name="DCTERMS.issued" scheme="DCTERMS.W3CDTF" content="2003-11-01" />
+    ... <meta name="DC.identifier" scheme="DCTERMS.URI"
+    ... content="http://dublincore.org/documents/dcq-html/" />
+    ... <link rel="DCTERMS.replaces" hreflang="en"
+    ... href="http://dublincore.org/documents/2000/08/15/dcq-html/" />
+    ... <meta name="DCTERMS.abstract" content="This document describes how
+    ... qualified Dublin Core metadata can be encoded
+    ... in HTML/XHTML &lt;meta&gt; elements" />
+    ... <meta name="DC.format" scheme="DCTERMS.IMT" content="text/html" />
+    ... <meta name="DC.type" scheme="DCTERMS.DCMIType" content="Text" />
+    ... <meta name="DC.Date.modified" content="2001-07-18" />
+    ... <meta name="DCTERMS.modified" content="2001-07-18" />'''
+    >>> dublinlde = DublinCoreExtractor()
+    >>> data = dublinlde.extract(html)
+    >>> pp.pprint(data)
+    [ { 'elements': [ { 'URI': 'http://purl.org/dc/elements/1.1/title',
+                        'content': 'Expressing Dublin Core\n'
+                                   'in HTML/XHTML meta and link elements',
+                        'lang': 'en',
+                        'name': 'DC.title'},
+                      { 'URI': 'http://purl.org/dc/elements/1.1/creator',
+                        'content': 'Andy Powell, UKOLN, University of Bath',
+                        'name': 'DC.creator'},
+                      { 'URI': 'http://purl.org/dc/elements/1.1/identifier',
+                        'content': 'http://dublincore.org/documents/dcq-html/',
+                        'name': 'DC.identifier',
+                        'scheme': 'DCTERMS.URI'},
+                      { 'URI': 'http://purl.org/dc/elements/1.1/format',
+                        'content': 'text/html',
+                        'name': 'DC.format',
+                        'scheme': 'DCTERMS.IMT'},
+                      { 'URI': 'http://purl.org/dc/elements/1.1/type',
+                        'content': 'Text',
+                        'name': 'DC.type',
+                        'scheme': 'DCTERMS.DCMIType'}],
+        'namespaces': { 'DC': 'http://purl.org/dc/elements/1.1/',
+                        'DCTERMS': 'http://purl.org/dc/terms/'},
+        'terms': [ { 'URI': 'http://purl.org/dc/terms/issued',
+                     'content': '2003-11-01',
+                     'name': 'DCTERMS.issued',
+                     'scheme': 'DCTERMS.W3CDTF'},
+                   { 'URI': 'http://purl.org/dc/terms/abstract',
+                     'content': 'This document describes how\n'
+                                'qualified Dublin Core metadata can be encoded\n'
+                                'in HTML/XHTML <meta> elements',
+                     'name': 'DCTERMS.abstract'},
+                   { 'URI': 'http://purl.org/dc/terms/modified',
+                     'content': '2001-07-18',
+                     'name': 'DC.Date.modified'},
+                   { 'URI': 'http://purl.org/dc/terms/modified',
+                     'content': '2001-07-18',
+                     'name': 'DCTERMS.modified'},
+                   { 'URI': 'http://purl.org/dc/terms/replaces',
+                     'href': 'http://dublincore.org/documents/2000/08/15/dcq-html/',
+                     'hreflang': 'en',
+                     'rel': 'DCTERMS.replaces'}]}]
+
+
 
 Command Line Tool
 -----------------
@@ -622,7 +708,7 @@ those, you can pass their individual names collected in a list through 'syntaxes
 For example, this command extracts only Microdata and JSON-LD metadata from
 "http://example.com"::
 
-    extruct "http://example.com" --syntaxes microdata json-ld 
+    extruct "http://example.com" --syntaxes microdata json-ld
 
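-NB syntaxes names passed must correspond to these: microdata, json-ld, rdfa, opengraph, microformat
+NB syntaxes names passed must correspond to these: microdata, json-ld, rdfa, opengraph, microformat, dublincore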
 

diff --git a/extruct/_extruct.py b/extruct/_extruct.py
@@ -6,11 +6,12 @@
 from extruct.w3cmicrodata import MicrodataExtractor
 from extruct.opengraph import OpenGraphExtractor
 from extruct.microformat import MicroformatExtractor
-from extruct.uniform import _umicrodata_microformat, _uopengraph
+from extruct.dublincore import DublinCoreExtractor
+from extruct.uniform import _umicrodata_microformat, _uopengraph, _udublincore
 from extruct.utils import parse_xmldom_html
 
 logger = logging.getLogger(__name__)
-SYNTAXES = ['microdata', 'opengraph', 'json-ld', 'microformat', 'rdfa']
+SYNTAXES = ['microdata', 'opengraph', 'json-ld', 'microformat', 'rdfa', 'dublincore']
 
 
 def extract(htmlstring,
@@ -96,6 +97,11 @@ def extract(htmlstring,
             ('rdfa', RDFaExtractor().extract_items,
              tree,
              ))
+    if 'dublincore' in syntaxes:
+        processors.append(
+            ('dublincore', DublinCoreExtractor().extract_items,
+             tree,
+             ))
     output = {}
     for syntax, extract, document in processors:
         try:
@@ -132,10 +138,20 @@ def extract(htmlstring,
                  output['opengraph'],
                  None,
                  ))
+        if 'dublincore' in syntaxes:
+            uniform_processors.append(
+                ('dublincore',
+                 _udublincore,
+                 output['dublincore'],
+                 None,
+                 ))
+
         for syntax, uniform, raw, schema_context in uniform_processors:
             try:
                 if syntax == 'opengraph':
                     output[syntax] = uniform(raw, with_og_array=with_og_array)
+                elif syntax == 'dublincore':
+                    output[syntax] = uniform(raw)
                 else:
                     output[syntax] = uniform(raw, schema_context)
             except Exception as e:

diff --git a/extruct/dublincore.py b/extruct/dublincore.py
@@ -0,0 +1,156 @@
+import re
+
+from extruct.utils import parse_html
+from w3lib.html import strip_html5_whitespace
+
+# Lower-cased Dublin Core element names mapped to their element URIs.
+# Keys must stay lower-case: lookups use the output of get_lower_attrib().
+_DC_ELEMENTS = {  # Defined according to DCMES (DCMI Version 1.1): http://dublincore.org/documents/dces/
+    'contributor': 'http://purl.org/dc/elements/1.1/contributor',
+    'coverage': 'http://purl.org/dc/elements/1.1/coverage',
+    'creator': 'http://purl.org/dc/elements/1.1/creator',
+    'date': 'http://purl.org/dc/elements/1.1/date',
+    'description': 'http://purl.org/dc/elements/1.1/description',
+    'format': 'http://purl.org/dc/elements/1.1/format',
+    'identifier': 'http://purl.org/dc/elements/1.1/identifier',
+    'language': 'http://purl.org/dc/elements/1.1/language',
+    'publisher': 'http://purl.org/dc/elements/1.1/publisher',
+    'relation': 'http://purl.org/dc/elements/1.1/relation',
+    'rights': 'http://purl.org/dc/elements/1.1/rights',
+    'source': 'http://purl.org/dc/elements/1.1/source',
+    'subject': 'http://purl.org/dc/elements/1.1/subject',
+    'title': 'http://purl.org/dc/elements/1.1/title',
+    'type': 'http://purl.org/dc/elements/1.1/type',
+}
+
+# Lower-cased DCMI term names mapped to their term URIs.
+# Keys must stay lower-case: lookups use the output of get_lower_attrib(),
+# so a key containing an upper-case letter can never match.
+_DC_TERMS = {  # Defined according to: http://dublincore.org/documents/2008/01/14/dcmi-terms/
+    'abstract': 'http://purl.org/dc/terms/abstract',
+    'description': 'http://purl.org/dc/terms/description',
+    'accessrights': 'http://purl.org/dc/terms/accessRights',
+    'rights': 'http://purl.org/dc/terms/rights',
+    'rightsstatement': 'http://purl.org/dc/terms/RightsStatement',
+    'accrualmethod': 'http://purl.org/dc/terms/accrualMethod',
+    'collection': 'http://purl.org/dc/terms/Collection',
+    'methodofaccrual': 'http://purl.org/dc/terms/MethodOfAccrual',
+    'accrualperiodicity': 'http://purl.org/dc/terms/accrualPeriodicity',
+    'frequency': 'http://purl.org/dc/terms/Frequency',
+    'accrualpolicy': 'http://purl.org/dc/terms/accrualPolicy',
+    'policy': 'http://purl.org/dc/terms/Policy',
+    'alternative': 'http://purl.org/dc/terms/alternative',
+    'title': 'http://purl.org/dc/terms/title',
+    'audience': 'http://purl.org/dc/terms/audience',
+    'agentclass': 'http://purl.org/dc/terms/AgentClass',
+    'available': 'http://purl.org/dc/terms/available',
+    'date': 'http://purl.org/dc/terms/date',
+    'bibliographiccitation': 'http://purl.org/dc/terms/bibliographicCitation',
+    'identifier': 'http://purl.org/dc/terms/identifier',
+    'bibliographicresource': 'http://purl.org/dc/terms/BibliographicResource',
+    'conformsto': 'http://purl.org/dc/terms/conformsTo',
+    'relation': 'http://purl.org/dc/terms/relation',
+    'standard': 'http://purl.org/dc/terms/Standard',
+    'contributor': 'http://purl.org/dc/terms/contributor',
+    'agent': 'http://purl.org/dc/terms/Agent',
+    'coverage': 'http://purl.org/dc/terms/coverage',
+    'locationperiodorjurisdiction': 'http://purl.org/dc/terms/LocationPeriodOrJurisdiction',
+    'created': 'http://purl.org/dc/terms/created',
+    'creator': 'http://purl.org/dc/terms/creator',
+    'dateaccepted': 'http://purl.org/dc/terms/dateAccepted',
+    'datecopyrighted': 'http://purl.org/dc/terms/dateCopyrighted',
+    'datesubmitted': 'http://purl.org/dc/terms/dateSubmitted',
+    'educationlevel': 'http://purl.org/dc/terms/educationLevel',
+    'extent': 'http://purl.org/dc/terms/extent',
+    'format': 'http://purl.org/dc/terms/format',
+    'sizeorduration': 'http://purl.org/dc/terms/SizeOrDuration',
+    'mediatypeorextent': 'http://purl.org/dc/terms/MediaTypeOrExtent',
+    'hasformat': 'http://purl.org/dc/terms/hasFormat',
+    'haspart': 'http://purl.org/dc/terms/hasPart',
+    'hasversion': 'http://purl.org/dc/terms/hasVersion',
+    'instructionalmethod': 'http://purl.org/dc/terms/instructionalMethod',
+    'methodofinstruction': 'http://purl.org/dc/terms/MethodOfInstruction',
+    'isformatof': 'http://purl.org/dc/terms/isFormatOf',
+    'ispartof': 'http://purl.org/dc/terms/isPartOf',
+    'isreferencedby': 'http://purl.org/dc/terms/isReferencedBy',
+    'isreplacedby': 'http://purl.org/dc/terms/isReplacedBy',
+    'isrequiredby': 'http://purl.org/dc/terms/isRequiredBy',
+    'issued': 'http://purl.org/dc/terms/issued',
+    'isversionof': 'http://purl.org/dc/terms/isVersionOf',
+    'language': 'http://purl.org/dc/terms/language',
+    'linguisticsystem': 'http://purl.org/dc/terms/LinguisticSystem',
+    'license': 'http://purl.org/dc/terms/license',
+    'licensedocument': 'http://purl.org/dc/terms/LicenseDocument',
+    'mediator': 'http://purl.org/dc/terms/mediator',
+    'medium': 'http://purl.org/dc/terms/medium',
+    'physicalresource': 'http://purl.org/dc/terms/PhysicalResource',
+    'physicalmedium': 'http://purl.org/dc/terms/PhysicalMedium',
+    'modified': 'http://purl.org/dc/terms/modified',
+    'provenance': 'http://purl.org/dc/terms/provenance',
+    'provenancestatement': 'http://purl.org/dc/terms/ProvenanceStatement',
+    'publisher': 'http://purl.org/dc/terms/publisher',
+    'references': 'http://purl.org/dc/terms/references',
+    'replaces': 'http://purl.org/dc/terms/replaces',
+    'requires': 'http://purl.org/dc/terms/requires',
+    'rightsholder': 'http://purl.org/dc/terms/rightsHolder',
+    'source': 'http://purl.org/dc/terms/source',
+    'spatial': 'http://purl.org/dc/terms/spatial',
+    'location': 'http://purl.org/dc/terms/Location',
+    'subject': 'http://purl.org/dc/terms/subject',
+    'tableofcontents': 'http://purl.org/dc/terms/tableOfContents',
+    'temporal': 'http://purl.org/dc/terms/temporal',
+    'periodoftime': 'http://purl.org/dc/terms/PeriodOfTime',
+    'type': 'http://purl.org/dc/terms/type',
+    'valid': 'http://purl.org/dc/terms/valid',
+}
+
+# Namespace URLs accepted from <link rel="schema.*" href="..."> declarations;
+# schema links whose href is not listed here are ignored by the extractor.
+_URL_NAMESPACES = ['http://purl.org/dc/terms/', 'http://purl.org/dc/elements/1.1/']
+
+
+def get_lower_attrib(name):
+    """Return the lower-cased part of ``name`` that follows its last dot.
+
+    The result is the key compared against ``_DC_ELEMENTS`` and
+    ``_DC_TERMS``; for example ``'DC.Date.modified'`` becomes ``'modified'``.
+    A name without any dot is simply lower-cased.
+    """
+    unprefixed = re.sub(r".*\.", "", name)
+    return unprefixed.lower()
+
+
+class DublinCoreExtractor(object):
+    """Extract Dublin Core metadata from ``<meta>`` and ``<link>`` elements.
+
+    Follows the extruct extractor API: ``extract`` takes an HTML string,
+    ``extract_items`` takes an already parsed document.
+    """
+
+    def extract(self, htmlstring, base_url=None, encoding='UTF-8'):
+        """Parse ``htmlstring`` and return the extracted items as a list."""
+        tree = parse_html(htmlstring, encoding=encoding)
+        return list(self.extract_items(tree, base_url=base_url))
+
+    def extract_items(self, document, base_url=None):
+        """Yield a single dict with ``namespaces``, ``elements`` and ``terms``.
+
+        ``namespaces`` maps each declared prefix (the ``rel`` value of a
+        ``<link rel="schema.X">`` with ``schema.`` removed) to its URL, for
+        URLs listed in ``_URL_NAMESPACES``. ``elements`` and ``terms`` hold
+        one dict per matching ``<meta name=...>`` / ``<link rel=...>`` node:
+        the node's attributes plus a ``URI`` key.
+
+        ``base_url`` is accepted for API compatibility and is not used.
+        The passed ``document`` is left unmodified.
+        """
+        elements = []
+        terms = []
+
+        def populate_results(node, main_attrib):
+            # Classify the node as a DC element or DC term and record it.
+            name = node.attrib.get(main_attrib)
+            if name is None:
+                return
+
+            lower_name = get_lower_attrib(name)
+            if lower_name in _DC_ELEMENTS:
+                uri, target = _DC_ELEMENTS[lower_name], elements
+            elif lower_name in _DC_TERMS:
+                uri, target = _DC_TERMS[lower_name], terms
+            else:
+                return
+
+            # Work on a copy: writing 'URI' into node.attrib would leave a
+            # synthetic attribute on a document the caller may reuse.
+            item = dict(node.attrib.items())
+            item['URI'] = uri
+            target.append(item)
+
+        namespaces = {}
+        for schema_node in document.xpath('//link[contains(@rel,"schema")]'):
+            href = schema_node.attrib.get('href')
+            if href is None:
+                # A schema link without href declares no namespace.
+                continue
+            url = strip_html5_whitespace(href)
+            if url in _URL_NAMESPACES:
+                prefix = re.sub(r"schema\.", "", schema_node.attrib['rel'])
+                namespaces[prefix] = url
+
+        for meta_node in document.xpath('//meta'):
+            populate_results(meta_node, 'name')
+
+        for link_node in document.xpath('//link'):
+            populate_results(link_node, 'rel')
+
+        yield {'namespaces': namespaces, 'elements': elements, 'terms': terms}
diff --git a/extruct/uniform.py b/extruct/uniform.py
@@ -1,4 +1,6 @@
+import copy
 from six.moves.urllib.parse import urlparse, urljoin
+from extruct.dublincore import get_lower_attrib
 
 
 def _uopengraph(extracted, with_og_array=False):
@@ -42,6 +44,23 @@ def _umicrodata_microformat(extracted, schema_context):
     return res
 
 
+def _udublincore(extracted):
+    """Return a uniform copy of the Dublin Core extractor output.
+
+    For every extracted object the ``namespaces`` entry is moved to
+    ``@context`` (``None`` when absent). Each element whose ``name`` (or
+    ``rel``) identifies the DC ``type`` element and that carries a
+    ``content`` value is removed from ``elements`` and its content stored
+    as ``@type``; when several such elements exist the last one wins.
+    The input is deep-copied and never modified.
+    """
+    out = []
+    for obj in copy.deepcopy(extracted):
+        obj['@context'] = obj.pop('namespaces', None)
+        # Build a new list instead of removing items from the list being
+        # iterated, which would skip the element after each removal.
+        remaining = []
+        for element in obj['elements']:
+            # Only the identifying attribute is inspected, so a value such
+            # as content="Type" is not mistaken for a type declaration.
+            identifier = element.get('name') or element.get('rel') or ''
+            if get_lower_attrib(identifier) == 'type' and 'content' in element:
+                obj['@type'] = element['content']
+            else:
+                # Includes <link rel="DC.type" href=...>, which has no content.
+                remaining.append(element)
+        obj['elements'] = remaining
+        out.append(obj)
+    return out
+
+
 def _flatten(element, schema_context):
     if isinstance(element, dict):
         element = flatten_dict(element, schema_context, False)