diff --git a/README.rst b/README.rst index f6d92454..5cbc809a 100644 --- a/README.rst +++ b/README.rst @@ -24,6 +24,7 @@ Currently, *extruct* supports: - `Microformat`_ via `mf2py`_ - `Facebook's Open Graph`_ - (experimental) `RDFa`_ via `rdflib`_ +- `Dublin Core Metadata (DC-HTML-2003)`_ .. _W3C's HTML Microdata: http://www.w3.org/TR/microdata/ .. _embedded JSON-LD: http://www.w3.org/TR/json-ld/#embedding-json-ld-in-html-documents @@ -32,6 +33,7 @@ Currently, *extruct* supports: .. _Microformat: http://microformats.org/wiki/Main_Page .. _mf2py: https://github.com/microformats/mf2py .. _Facebook's Open Graph: http://ogp.me/ +.. _Dublin Core Metadata (DC-HTML-2003): https://www.dublincore.org/specifications/dublin-core/dcq-html/2003-11-30/ The microdata algorithm is a revisit of `this Scrapinghub blog post`_ showing how to use EXSLT extensions. @@ -71,7 +73,17 @@ First fetch the HTML using python-requests and then feed the response body to `` >>> data = extruct.extract(r.text, base_url=base_url) >>> >>> pp.pprint(data) - { 'json-ld': [ { '@context': 'https://schema.org', + { 'dublincore': [ { 'elements': [ { 'URI': 'http://purl.org/dc/elements/1.1/description', + 'content': 'What is Open Graph Protocol ' + 'and why you need it? Learn to ' + 'implement Open Graph Protocol ' + 'for Facebook on your website. ' + 'Open Graph Protocol Meta Tags.', + 'name': 'description'}], + 'namespaces': {}, + 'terms': []}], + + 'json-ld': [ { '@context': 'https://schema.org', '@id': '#organization', '@type': 'Organization', 'logo': 'https://www.optimizesmart.com/wp-content/uploads/2016/03/optimize-smart-Twitter-logo.jpg', @@ -163,7 +175,7 @@ First fetch the HTML using python-requests and then feed the response body to `` Select syntaxes +++++++++++++++ -It is possible to select which syntaxes to extract by passing a list with the desired ones to extract. Valid values: 'microdata', 'json-ld', 'opengraph', 'microformat', 'rdfa'. 
If no list is passed all syntaxes will be extracted and returned:: +It is possible to select which syntaxes to extract by passing a list with the desired ones to extract. Valid values: 'microdata', 'json-ld', 'opengraph', 'microformat', 'rdfa' and 'dublincore'. If no list is passed all syntaxes will be extracted and returned:: >>> r = requests.get('http://www.songkick.com/artists/236156-elysian-fields') >>> base_url = get_base_url(r.text, r.url) @@ -207,9 +219,9 @@ It is possible to select which syntaxes to extract by passing a list with the de Uniform +++++++ -Another option is to uniform the output of microformat, opengraph, microdata and json-ld syntaxes to the following structure: :: +Another option is to uniform the output of microformat, opengraph, microdata, dublincore and json-ld syntaxes to the following structure: :: - {'@context': 'http://example.com', + {'@context': 'http://example.com', '@type': 'example_type', /* All other the properties in keys here */ } @@ -584,6 +596,80 @@ Microformat extraction } }] +DublinCore extraction +++++++++++++++++++++++++++++++ +:: + + >>> import pprint + >>> pp = pprint.PrettyPrinter(indent=2) + >>> from extruct.dublincore import DublinCoreExtractor + >>> html = ''' + ... Expressing Dublin Core in HTML/XHTML meta and link elements + ... + ... + ... + ... + ... + ... + ... + ... + ... + ... + ... + ... + ... + ... 
''' + >>> dublinlde = DublinCoreExtractor() + >>> data = dublinlde.extract(html) + >>> pp.pprint(data) + [ { 'elements': [ { 'URI': 'http://purl.org/dc/elements/1.1/title', + 'content': 'Expressing Dublin Core\n' + 'in HTML/XHTML meta and link elements', + 'lang': 'en', + 'name': 'DC.title'}, + { 'URI': 'http://purl.org/dc/elements/1.1/creator', + 'content': 'Andy Powell, UKOLN, University of Bath', + 'name': 'DC.creator'}, + { 'URI': 'http://purl.org/dc/elements/1.1/identifier', + 'content': 'http://dublincore.org/documents/dcq-html/', + 'name': 'DC.identifier', + 'scheme': 'DCTERMS.URI'}, + { 'URI': 'http://purl.org/dc/elements/1.1/format', + 'content': 'text/html', + 'name': 'DC.format', + 'scheme': 'DCTERMS.IMT'}, + { 'URI': 'http://purl.org/dc/elements/1.1/type', + 'content': 'Text', + 'name': 'DC.type', + 'scheme': 'DCTERMS.DCMIType'}], + 'namespaces': { 'DC': 'http://purl.org/dc/elements/1.1/', + 'DCTERMS': 'http://purl.org/dc/terms/'}, + 'terms': [ { 'URI': 'http://purl.org/dc/terms/issued', + 'content': '2003-11-01', + 'name': 'DCTERMS.issued', + 'scheme': 'DCTERMS.W3CDTF'}, + { 'URI': 'http://purl.org/dc/terms/abstract', + 'content': 'This document describes how\n' + 'qualified Dublin Core metadata can be encoded\n' + 'in HTML/XHTML elements', + 'name': 'DCTERMS.abstract'}, + { 'URI': 'http://purl.org/dc/terms/modified', + 'content': '2001-07-18', + 'name': 'DC.Date.modified'}, + { 'URI': 'http://purl.org/dc/terms/modified', + 'content': '2001-07-18', + 'name': 'DCTERMS.modified'}, + { 'URI': 'http://purl.org/dc/terms/replaces', + 'href': 'http://dublincore.org/documents/2000/08/15/dcq-html/', + 'hreflang': 'en', + 'rel': 'DCTERMS.replaces'}]}] + + Command Line Tool ----------------- @@ -622,7 +708,7 @@ those, you can pass their individual names collected in a list through 'syntaxes For example, this command extracts only Microdata and JSON-LD metadata from "http://example.com":: - extruct "http://example.com" --syntaxes microdata json-ld + extruct 
"http://example.com" --syntaxes microdata json-ld NB syntaxes names passed must correspond to these: microdata, json-ld, rdfa, opengraph, microformat diff --git a/extruct/_extruct.py b/extruct/_extruct.py index adbe7320..5bc247f8 100644 --- a/extruct/_extruct.py +++ b/extruct/_extruct.py @@ -6,11 +6,12 @@ from extruct.w3cmicrodata import MicrodataExtractor from extruct.opengraph import OpenGraphExtractor from extruct.microformat import MicroformatExtractor -from extruct.uniform import _umicrodata_microformat, _uopengraph +from extruct.dublincore import DublinCoreExtractor +from extruct.uniform import _umicrodata_microformat, _uopengraph, _udublincore from extruct.utils import parse_xmldom_html logger = logging.getLogger(__name__) -SYNTAXES = ['microdata', 'opengraph', 'json-ld', 'microformat', 'rdfa'] +SYNTAXES = ['microdata', 'opengraph', 'json-ld', 'microformat', 'rdfa', 'dublincore'] def extract(htmlstring, @@ -96,6 +97,11 @@ def extract(htmlstring, ('rdfa', RDFaExtractor().extract_items, tree, )) + if 'dublincore' in syntaxes: + processors.append( + ('dublincore', DublinCoreExtractor().extract_items, + tree, + )) output = {} for syntax, extract, document in processors: try: @@ -132,10 +138,20 @@ def extract(htmlstring, output['opengraph'], None, )) + if 'dublincore' in syntaxes: + uniform_processors.append( + ('dublincore', + _udublincore, + output['dublincore'], + None, + )) + for syntax, uniform, raw, schema_context in uniform_processors: try: if syntax == 'opengraph': output[syntax] = uniform(raw, with_og_array=with_og_array) + elif syntax == 'dublincore': + output[syntax] = uniform(raw) else: output[syntax] = uniform(raw, schema_context) except Exception as e: diff --git a/extruct/dublincore.py b/extruct/dublincore.py new file mode 100644 index 00000000..3e05209e --- /dev/null +++ b/extruct/dublincore.py @@ -0,0 +1,156 @@ +import re + +from extruct.utils import parse_html +from w3lib.html import strip_html5_whitespace + +_DC_ELEMENTS = { # Defined according 
DCMES(DCM Version 1.1): http://dublincore.org/documents/dces/
+    'contributor': 'http://purl.org/dc/elements/1.1/contributor',
+    'coverage': 'http://purl.org/dc/elements/1.1/coverage',
+    'creator': 'http://purl.org/dc/elements/1.1/creator',
+    'date': 'http://purl.org/dc/elements/1.1/date',
+    'description': 'http://purl.org/dc/elements/1.1/description',
+    'format': 'http://purl.org/dc/elements/1.1/format',
+    'identifier': 'http://purl.org/dc/elements/1.1/identifier',
+    'language': 'http://purl.org/dc/elements/1.1/language',
+    'publisher': 'http://purl.org/dc/elements/1.1/publisher',
+    'relation': 'http://purl.org/dc/elements/1.1/relation',
+    'rights': 'http://purl.org/dc/elements/1.1/rights',
+    'source': 'http://purl.org/dc/elements/1.1/source',
+    'subject': 'http://purl.org/dc/elements/1.1/subject',
+    'title': 'http://purl.org/dc/elements/1.1/title',
+    'type': 'http://purl.org/dc/elements/1.1/type',
+}
+
+_DC_TERMS = {  # Defined according to: http://dublincore.org/documents/2008/01/14/dcmi-terms/
+    'abstract': 'http://purl.org/dc/terms/abstract',
+    'description': 'http://purl.org/dc/terms/description',
+    'accessrights': 'http://purl.org/dc/terms/accessRights',
+    'rights': 'http://purl.org/dc/terms/rights',
+    'rightsstatement': 'http://purl.org/dc/terms/RightsStatement',
+    'accrualmethod': 'http://purl.org/dc/terms/accrualMethod',
+    'collection': 'http://purl.org/dc/terms/Collection',
+    'methodofaccrual': 'http://purl.org/dc/terms/MethodOfAccrual',  # keys must be all-lowercase: get_lower_attrib() lowercases names before lookup
+    'accrualperiodicity': 'http://purl.org/dc/terms/accrualPeriodicity',
+    'frequency': 'http://purl.org/dc/terms/Frequency',
+    'accrualpolicy': 'http://purl.org/dc/terms/accrualPolicy',
+    'policy': 'http://purl.org/dc/terms/Policy',
+    'alternative': 'http://purl.org/dc/terms/alternative',
+    'title': 'http://purl.org/dc/terms/title',
+    'audience': 'http://purl.org/dc/terms/audience',
+    'agentclass': 'http://purl.org/dc/terms/AgentClass',
+    'available': 'http://purl.org/dc/terms/available',
+    'date':
'http://purl.org/dc/terms/date', + 'bibliographiccitation': 'http://purl.org/dc/terms/bibliographicCitation', + 'identifier': 'http://purl.org/dc/terms/identifier', + 'bibliographicresource': 'http://purl.org/dc/terms/BibliographicResource', + 'conformsto': 'http://purl.org/dc/terms/conformsTo', + 'relation': 'http://purl.org/dc/terms/relation', + 'standard': 'http://purl.org/dc/terms/Standard', + 'contributor': 'http://purl.org/dc/terms/contributor', + 'agent': 'http://purl.org/dc/terms/Agent', + 'coverage': 'http://purl.org/dc/terms/coverage', + 'locationperiodorjurisdiction': 'http://purl.org/dc/terms/LocationPeriodOrJurisdiction', + 'created': 'http://purl.org/dc/terms/created', + 'creator': 'http://purl.org/dc/terms/creator', + 'dateaccepted': 'http://purl.org/dc/terms/dateAccepted', + 'datecopyrighted': 'http://purl.org/dc/terms/dateCopyrighted', + 'datesubmitted': 'http://purl.org/dc/terms/dateSubmitted', + 'educationlevel': 'http://purl.org/dc/terms/educationLevel', + 'extent': 'http://purl.org/dc/terms/extent', + 'format': 'http://purl.org/dc/terms/format', + 'sizeorduration': 'http://purl.org/dc/terms/SizeOrDuration', + 'mediatypeorextent': 'http://purl.org/dc/terms/MediaTypeOrExtent', + 'hasformat': 'http://purl.org/dc/terms/hasFormat', + 'haspart': 'http://purl.org/dc/terms/hasPart', + 'hasversion': 'http://purl.org/dc/terms/hasVersion', + 'instructionalmethod': 'http://purl.org/dc/terms/instructionalMethod', + 'methodofinstruction': 'http://purl.org/dc/terms/MethodOfInstruction', + 'isformatof': 'http://purl.org/dc/terms/isFormatOf', + 'ispartof': 'http://purl.org/dc/terms/isPartOf', + 'isreferencedby': 'http://purl.org/dc/terms/isReferencedBy', + 'isreplacedby': 'http://purl.org/dc/terms/isReplacedBy', + 'isrequiredby': 'http://purl.org/dc/terms/isRequiredBy', + 'issued': 'http://purl.org/dc/terms/issued', + 'isversionof': 'http://purl.org/dc/terms/isVersionOf', + 'language': 'http://purl.org/dc/terms/language', + 'linguisticsystem': 
'http://purl.org/dc/terms/LinguisticSystem',
+    'license': 'http://purl.org/dc/terms/license',
+    'licensedocument': 'http://purl.org/dc/terms/LicenseDocument',
+    'mediator': 'http://purl.org/dc/terms/mediator',
+    'medium': 'http://purl.org/dc/terms/medium',
+    'physicalresource': 'http://purl.org/dc/terms/PhysicalResource',
+    'physicalmedium': 'http://purl.org/dc/terms/PhysicalMedium',
+    'modified': 'http://purl.org/dc/terms/modified',
+    'provenance': 'http://purl.org/dc/terms/provenance',
+    'provenancestatement': 'http://purl.org/dc/terms/ProvenanceStatement',
+    'publisher': 'http://purl.org/dc/terms/publisher',
+    'references': 'http://purl.org/dc/terms/references',
+    'replaces': 'http://purl.org/dc/terms/replaces',
+    'requires': 'http://purl.org/dc/terms/requires',
+    'rightsholder': 'http://purl.org/dc/terms/rightsHolder',
+    'source': 'http://purl.org/dc/terms/source',
+    'spatial': 'http://purl.org/dc/terms/spatial',
+    'location': 'http://purl.org/dc/terms/Location',
+    'subject': 'http://purl.org/dc/terms/subject',
+    'tableofcontents': 'http://purl.org/dc/terms/tableOfContents',
+    'temporal': 'http://purl.org/dc/terms/temporal',
+    'periodoftime': 'http://purl.org/dc/terms/PeriodOfTime',
+    'type': 'http://purl.org/dc/terms/type',
+    'valid': 'http://purl.org/dc/terms/valid',
+}
+
+_URL_NAMESPACES = ['http://purl.org/dc/terms/', 'http://purl.org/dc/elements/1.1/']
+
+
+def get_lower_attrib(name):
+    # drop everything up to the last '.' and lowercase ('DC.Date.modified' -> 'modified'); the result is the key looked up in _DC_TERMS / _DC_ELEMENTS
+    return re.sub(r".*\.", "", name).lower()
+
+
+class DublinCoreExtractor(object):
+    """DublinCore extractor following extruct API; yields one dict with 'namespaces', 'elements' and 'terms'."""
+
+    def extract(self, htmlstring, base_url=None, encoding='UTF-8'):
+        tree = parse_html(htmlstring, encoding=encoding)
+        return list(self.extract_items(tree, base_url=base_url))
+
+    def extract_items(self, document, base_url=None):
+        elements = []
+        terms = []
+
+        def attrib_to_dict(attribs):
+            # copy the node's attribute mapping into a plain dict
+            return dict(attribs.items())
+
+        def populate_results(node, main_attrib):
+            # record the node under elements or terms, keyed by its main_attrib ('name' or 'rel'); _DC_ELEMENTS takes precedence
+            node_attrib = node.attrib
+            if main_attrib not in node_attrib:
+                return
+
+            name = node_attrib[main_attrib]
+            lower_name = get_lower_attrib(name)
+            if lower_name in _DC_ELEMENTS:
+                # add 'URI' to a copy: writing it into node.attrib would modify the caller's parsed document
+                elements.append(dict(attrib_to_dict(node_attrib), URI=_DC_ELEMENTS[lower_name]))
+
+            elif lower_name in _DC_TERMS:
+                # add 'URI' to a copy: writing it into node.attrib would modify the caller's parsed document
+                terms.append(dict(attrib_to_dict(node_attrib), URI=_DC_TERMS[lower_name]))
+
+        namespaces_nodes = document.xpath('//link[contains(@rel,"schema")]')
+        namespaces = {}
+        for i in namespaces_nodes:
+            url = strip_html5_whitespace(i.attrib.get('href', ''))  # a schema link without href is skipped instead of raising KeyError
+            if url in _URL_NAMESPACES:
+                namespaces.update({re.sub(r"schema\.", "", i.attrib['rel']): url})
+
+        list_meta_node = document.xpath('//meta')
+        for meta_node in list_meta_node:
+            populate_results(meta_node, 'name')
+
+        list_link_node = document.xpath('//link')
+        for link_node in list_link_node:
+            populate_results(link_node, 'rel')
+
+        yield {'namespaces': namespaces, 'elements': elements, 'terms': terms}
diff --git a/extruct/uniform.py b/extruct/uniform.py
index 1b5de7ed..0fac40ee 100644
--- a/extruct/uniform.py
+++ b/extruct/uniform.py
@@ -1,4 +1,6 @@
+import copy
 from six.moves.urllib.parse import urlparse, urljoin
+from extruct.dublincore import get_lower_attrib
 
 
 def _uopengraph(extracted, with_og_array=False):
@@ -42,6 +44,23 @@ def _umicrodata_microformat(extracted, schema_context):
     return res
 
 
+def _udublincore(extracted):
+    """Uniform DublinCore output: 'namespaces' moves to '@context', the content of the DC 'type' element to '@type'."""
+    out = []
+    for obj in copy.deepcopy(extracted):  # deep copy so the caller's raw output is left untouched
+        obj['@context'] = obj.pop('namespaces', None)
+        kept = []
+        for element in obj['elements']:
+            name = element.get('name', element.get('rel', ''))  # only name/rel identify the DC property, never content/scheme
+            if get_lower_attrib(name) == 'type' and 'content' in element:
+                obj['@type'] = element['content']
+            else:
+                kept.append(element)
+        obj['elements'] = kept  # rebuilt list: calling list.remove() inside the loop skipped the next entry
+        out.append(obj)
+    return out
+
+
 def _flatten(element, schema_context):
     if isinstance(element, dict):
         element = 
flatten_dict(element, schema_context, False) diff --git a/tests/samples/misc/dublincore_test.html b/tests/samples/misc/dublincore_test.html new file mode 100644 index 00000000..44a192ce --- /dev/null +++ b/tests/samples/misc/dublincore_test.html @@ -0,0 +1,21 @@ + +Expressing Dublin Core in HTML/XHTML meta and link elements + + + + + + + + + + + + + + \ No newline at end of file diff --git a/tests/samples/misc/dublincore_test.json b/tests/samples/misc/dublincore_test.json new file mode 100644 index 00000000..7dbb9e5e --- /dev/null +++ b/tests/samples/misc/dublincore_test.json @@ -0,0 +1,22 @@ +[ + { + "namespaces": { + "DC": "http://purl.org/dc/elements/1.1/", + "DCTERMS": "http://purl.org/dc/terms/" + }, + "elements": [ + {"name": "DC.title", "lang": "en", "content": "Expressing Dublin Core\nin HTML/XHTML meta and link elements", "URI": "http://purl.org/dc/elements/1.1/title"}, + {"name": "DC.creator", "content": "Andy Powell, UKOLN, University of Bath", "URI": "http://purl.org/dc/elements/1.1/creator"}, + {"name": "DC.identifier", "scheme": "DCTERMS.URI", "content": "http://dublincore.org/documents/dcq-html/", "URI": "http://purl.org/dc/elements/1.1/identifier"}, + {"name": "DC.format", "scheme": "DCTERMS.IMT", "content": "text/html", "URI": "http://purl.org/dc/elements/1.1/format"}, + {"name": "DC.type", "scheme": "DCTERMS.DCMIType", "content": "Text", "URI": "http://purl.org/dc/elements/1.1/type"} + ], + "terms": [ + {"name": "DCTERMS.issued", "scheme": "DCTERMS.W3CDTF", "content": "2003-11-01", "URI": "http://purl.org/dc/terms/issued"}, + {"name": "DCTERMS.abstract", "content": "This document describes how\nqualified Dublin Core metadata can be encoded\nin HTML/XHTML elements", "URI": "http://purl.org/dc/terms/abstract"}, + {"name": "DC.Date.modified", "content": "2001-07-18", "URI": "http://purl.org/dc/terms/modified"}, + {"name": "DCTERMS.modified", "content": "2001-07-18", "URI": "http://purl.org/dc/terms/modified"}, + {"rel": "DCTERMS.replaces", 
"hreflang": "en", "href": "http://dublincore.org/documents/2000/08/15/dcq-html/", "URI": "http://purl.org/dc/terms/replaces"} + ] + } +] diff --git a/tests/samples/songkick/elysianfields.json b/tests/samples/songkick/elysianfields.json index 70b51c63..61dae29d 100644 --- a/tests/samples/songkick/elysianfields.json +++ b/tests/samples/songkick/elysianfields.json @@ -268,5 +268,20 @@ } ] } + ], + "dublincore": [ + { + "namespaces": { + }, + "elements": [ + { + "name": "description", + "content": "Buy tickets for an upcoming Elysian Fields concert near you. List of all Elysian Fields tickets and tour dates for 2017.", + "URI": "http://purl.org/dc/elements/1.1/description" + } + ], + "terms": [ + ] + } ] } diff --git a/tests/samples/songkick/tovestyrke.json b/tests/samples/songkick/tovestyrke.json index 7e17abdd..068df67a 100644 --- a/tests/samples/songkick/tovestyrke.json +++ b/tests/samples/songkick/tovestyrke.json @@ -188,5 +188,19 @@ ] } ], - "microformat": [] + "microformat": [], + "dublincore": [ + { + "namespaces": {}, + "elements": [ + { + "name": "description", + "content": "Past concert. 
Tove Styrke concert with Geowulf at Hoxton Square Bar & Kitchen in London on 12 Jun 2017.", + "URI": "http://purl.org/dc/elements/1.1/description" + } + ], + "terms": [ + ] + } + ] } \ No newline at end of file diff --git a/tests/test_dublincore.py b/tests/test_dublincore.py new file mode 100644 index 00000000..a11ce603 --- /dev/null +++ b/tests/test_dublincore.py @@ -0,0 +1,19 @@ +# -*- coding: utf-8 -*- +import json +import unittest + +from extruct.dublincore import DublinCoreExtractor +from tests import get_testdata, jsonize_dict + + +class TestDublincore(unittest.TestCase): + + maxDiff = None + + def test_dublincore(self): + body = get_testdata('misc', 'dublincore_test.html') + expected = json.loads(get_testdata('misc', 'dublincore_test.json').decode('UTF-8')) + + dublincorext = DublinCoreExtractor() + data = dublincorext.extract(body) + self.assertEqual(jsonize_dict(data), expected) diff --git a/tests/test_uniform.py b/tests/test_uniform.py index 7a9f29af..9b7acf7a 100644 --- a/tests/test_uniform.py +++ b/tests/test_uniform.py @@ -165,6 +165,47 @@ def test_umicrodata(self): data = extruct.extract(body, syntaxes=['microdata'], uniform=True) self.assertEqual(data['microdata'], expected) + def test_udublincore(self): + expected = [{'elements': [{'name': 'DC.title', + 'lang': 'en', + 'content': 'Expressing Dublin Core\nin HTML/XHTML meta and link elements', + 'URI': 'http://purl.org/dc/elements/1.1/title'}, + {'name': 'DC.creator', + 'content': 'Andy Powell, UKOLN, University of Bath', + 'URI': 'http://purl.org/dc/elements/1.1/creator'}, + {'name': 'DC.identifier', + 'scheme': 'DCTERMS.URI', + 'content': 'http://dublincore.org/documents/dcq-html/', + 'URI': 'http://purl.org/dc/elements/1.1/identifier'}, + {'name': 'DC.format', + 'scheme': 'DCTERMS.IMT', + 'content': 'text/html', + 'URI': 'http://purl.org/dc/elements/1.1/format'}], + 'terms': [{'name': 'DCTERMS.issued', + 'scheme': 'DCTERMS.W3CDTF', + 'content': '2003-11-01', + 'URI': 
'http://purl.org/dc/terms/issued'}, + {'name': 'DCTERMS.abstract', + 'content': 'This document describes how\nqualified Dublin Core metadata can be encoded\nin HTML/XHTML elements', + 'URI': 'http://purl.org/dc/terms/abstract'}, + {'name': 'DC.Date.modified', + 'content': '2001-07-18', + 'URI': 'http://purl.org/dc/terms/modified'}, + {'name': 'DCTERMS.modified', + 'content': '2001-07-18', + 'URI': 'http://purl.org/dc/terms/modified'}, + {'rel': 'DCTERMS.replaces', + 'hreflang': 'en', + 'href': 'http://dublincore.org/documents/2000/08/15/dcq-html/', + 'URI': 'http://purl.org/dc/terms/replaces'}], + '@context': {'DC': 'http://purl.org/dc/elements/1.1/', + 'DCTERMS': 'http://purl.org/dc/terms/'}, + '@type': 'Text'}] + body = get_testdata('misc', 'dublincore_test.html') + data = extruct.extract(body, syntaxes=['dublincore'], uniform=True) + self.assertEqual(data['dublincore'], expected) + + def test_infer_context(self): context = 'http://schema.org/UsedCondition'