From ba568a32410ce1f1be98288e3aa6b1f421ae991b Mon Sep 17 00:00:00 2001 From: Joaquin Date: Thu, 13 Dec 2018 17:51:15 -0500 Subject: [PATCH 01/11] Add dublincore schema --- extruct/_extruct.py | 8 +- extruct/dublincore.py | 156 ++++++++++++++++++++++++ tests/samples/misc/dublincore_test.html | 21 ++++ tests/samples/misc/dublincore_test.json | 22 ++++ tests/tests_dublincore.py | 19 +++ 5 files changed, 225 insertions(+), 1 deletion(-) create mode 100644 extruct/dublincore.py create mode 100644 tests/samples/misc/dublincore_test.html create mode 100644 tests/samples/misc/dublincore_test.json create mode 100644 tests/tests_dublincore.py diff --git a/extruct/_extruct.py b/extruct/_extruct.py index ba35a6fa..c6fd0c74 100644 --- a/extruct/_extruct.py +++ b/extruct/_extruct.py @@ -6,11 +6,12 @@ from extruct.w3cmicrodata import MicrodataExtractor from extruct.opengraph import OpenGraphExtractor from extruct.microformat import MicroformatExtractor +from extruct.dublincore import DublinCoreExtractor from extruct.uniform import _umicrodata_microformat, _uopengraph from extruct.utils import parse_xmldom_html logger = logging.getLogger(__name__) -SYNTAXES = ['microdata', 'opengraph', 'json-ld', 'microformat', 'rdfa'] +SYNTAXES = ['microdata', 'opengraph', 'json-ld', 'microformat', 'rdfa', 'dublincore'] def extract(htmlstring, @@ -95,6 +96,11 @@ def extract(htmlstring, ('rdfa', RDFaExtractor().extract_items, tree, )) + if 'dublincore' in syntaxes: + processors.append( + ('dublincore', DublinCoreExtractor().extract_items, + tree, + )) output = {} for syntax, extract, document in processors: try: diff --git a/extruct/dublincore.py b/extruct/dublincore.py new file mode 100644 index 00000000..bce9c857 --- /dev/null +++ b/extruct/dublincore.py @@ -0,0 +1,156 @@ +import re + +from extruct.utils import parse_html + +_DC_ELEMENTS = { # Defined according DCMES(DCM Version 1.1): http://dublincore.org/documents/dces/ + 'contributor': 'http://purl.org/dc/elements/1.1/contributor', + 
'coverage': 'http://purl.org/dc/elements/1.1/coverage', + 'creator': 'http://purl.org/dc/elements/1.1/creator', + 'date': 'http://purl.org/dc/elements/1.1/date', + 'description': 'http://purl.org/dc/elements/1.1/description', + 'format': 'http://purl.org/dc/elements/1.1/format', + 'identifier': 'http://purl.org/dc/elements/1.1/identifier', + 'language': 'http://purl.org/dc/elements/1.1/language', + 'publisher': 'http://purl.org/dc/elements/1.1/publisher', + 'relation': 'http://purl.org/dc/elements/1.1/relation', + 'rights': 'http://purl.org/dc/elements/1.1/rights', + 'source': 'http://purl.org/dc/elements/1.1/source', + 'subject': 'http://purl.org/dc/elements/1.1/subject', + 'title': 'http://purl.org/dc/elements/1.1/title', + 'type': 'http://purl.org/dc/elements/1.1/type', +} + +_DC_TERMS = { # Defined according to: http://dublincore.org/documents/2008/01/14/dcmi-terms/ + 'abstract': 'http://purl.org/dc/terms/abstract', + 'description': 'http://purl.org/dc/terms/description', + 'accessrights': 'http://purl.org/dc/terms/accessRights', + 'rights': 'http://purl.org/dc/terms/rights', + 'rightsstatement': 'http://purl.org/dc/terms/RightsStatement', + 'accrualmethod': 'http://purl.org/dc/terms/accrualMethod', + 'collection': 'http://purl.org/dc/terms/Collection', + 'methodofaccrual': 'http://purl.org/dc/terms/MethodOfAccrual', + 'accrualperiodicity': 'http://purl.org/dc/terms/accrualPeriodicity', + 'frequency': 'http://purl.org/dc/terms/Frequency', + 'accrualpolicy': 'http://purl.org/dc/terms/accrualPolicy', + 'policy': 'http://purl.org/dc/terms/Policy', + 'alternative': 'http://purl.org/dc/terms/alternative', + 'title': 'http://purl.org/dc/terms/title', + 'audience': 'http://purl.org/dc/terms/audience', + 'agentclass': 'http://purl.org/dc/terms/AgentClass', + 'available': 'http://purl.org/dc/terms/available', + 'date': 'http://purl.org/dc/terms/date', + 'bibliographiccitation': 'http://purl.org/dc/terms/bibliographicCitation', + 'identifier':
'http://purl.org/dc/terms/identifier', + 'bibliographicresource': 'http://purl.org/dc/terms/BibliographicResource', + 'conformsto': 'http://purl.org/dc/terms/conformsTo', + 'relation': 'http://purl.org/dc/terms/relation', + 'standard': 'http://purl.org/dc/terms/Standard', + 'contributor': 'http://purl.org/dc/terms/contributor', + 'agent': 'http://purl.org/dc/terms/Agent', + 'coverage': 'http://purl.org/dc/terms/coverage', + 'locationperiodorjurisdiction': 'http://purl.org/dc/terms/LocationPeriodOrJurisdiction', + 'created': 'http://purl.org/dc/terms/created', + 'creator': 'http://purl.org/dc/terms/creator', + 'dateaccepted': 'http://purl.org/dc/terms/dateAccepted', + 'datecopyrighted': 'http://purl.org/dc/terms/dateCopyrighted', + 'datesubmitted': 'http://purl.org/dc/terms/dateSubmitted', + 'educationlevel': 'http://purl.org/dc/terms/educationLevel', + 'extent': 'http://purl.org/dc/terms/extent', + 'format': 'http://purl.org/dc/terms/format', + 'sizeorduration': 'http://purl.org/dc/terms/SizeOrDuration', + 'mediatypeorextent': 'http://purl.org/dc/terms/MediaTypeOrExtent', + 'hasformat': 'http://purl.org/dc/terms/hasFormat', + 'haspart': 'http://purl.org/dc/terms/hasPart', + 'hasversion': 'http://purl.org/dc/terms/hasVersion', + 'instructionalmethod': 'http://purl.org/dc/terms/instructionalMethod', + 'methodofinstruction': 'http://purl.org/dc/terms/MethodOfInstruction', + 'isformatof': 'http://purl.org/dc/terms/isFormatOf', + 'ispartof': 'http://purl.org/dc/terms/isPartOf', + 'isreferencedby': 'http://purl.org/dc/terms/isReferencedBy', + 'isreplacedby': 'http://purl.org/dc/terms/isReplacedBy', + 'isrequiredby': 'http://purl.org/dc/terms/isRequiredBy', + 'issued': 'http://purl.org/dc/terms/issued', + 'isversionof': 'http://purl.org/dc/terms/isVersionOf', + 'language': 'http://purl.org/dc/terms/language', + 'linguisticsystem': 'http://purl.org/dc/terms/LinguisticSystem', + 'license': 'http://purl.org/dc/terms/license', + 'licensedocument': 
'http://purl.org/dc/terms/LicenseDocument', + 'mediator': 'http://purl.org/dc/terms/mediator', + 'medium': 'http://purl.org/dc/terms/medium', + 'physicalresource': 'http://purl.org/dc/terms/PhysicalResource', + 'physicalmedium': 'http://purl.org/dc/terms/PhysicalMedium', + 'modified': 'http://purl.org/dc/terms/modified', + 'provenance': 'http://purl.org/dc/terms/provenance', + 'provenancestatement': 'http://purl.org/dc/terms/ProvenanceStatement', + 'publisher': 'http://purl.org/dc/terms/publisher', + 'references': 'http://purl.org/dc/terms/references', + 'replaces': 'http://purl.org/dc/terms/replaces', + 'requires': 'http://purl.org/dc/terms/requires', + 'rightsholder': 'http://purl.org/dc/terms/rightsHolder', + 'source': 'http://purl.org/dc/terms/source', + 'spatial': 'http://purl.org/dc/terms/spatial', + 'location': 'http://purl.org/dc/terms/Location', + 'subject': 'http://purl.org/dc/terms/subject', + 'tableofcontents': 'http://purl.org/dc/terms/tableOfContents', + 'temporal': 'http://purl.org/dc/terms/temporal', + 'periodoftime': 'http://purl.org/dc/terms/PeriodOfTime', + 'type': 'http://purl.org/dc/terms/type', + 'valid': 'http://purl.org/dc/terms/valid', +} + +_URL_NAMESPACES = ['http://purl.org/dc/terms/', 'http://purl.org/dc/elements/1.1/'] + + +class DublinCoreExtractor(object): + """DublinCore extractor following extruct API.""" + + def extract(self, htmlstring, base_url=None, encoding='UTF-8'): + tree = parse_html(htmlstring, encoding=encoding) + return list(self.extract_items(tree, base_url=base_url)) + + def extract_items(self, document, base_url=None): + elements = [] + terms = [] + + def get_lower_attrib(name): + # get attribute to compare against _DC_TERMS or _DC_ELEMENTS + return re.sub(".*\.", "", name).lower() + + def attrib_to_dict(attribs): + # convert _attrib type to dict + node_dict = {} + for attrib, value in attribs.items(): + node_dict.update({attrib: value}) + return node_dict + + def populate_results(node, main_attrib): + # fill list 
with DC Elements or DC Terms + node_attrib = node.attrib + if main_attrib not in node_attrib: + return + + name = node.attrib[main_attrib] + lower_name = get_lower_attrib(name) + if lower_name in _DC_ELEMENTS: + node.attrib.update({'URI': _DC_ELEMENTS[lower_name]}) + elements.append(attrib_to_dict(node.attrib)) + + elif lower_name in _DC_TERMS: + node.attrib.update({'URI': _DC_TERMS[lower_name]}) + terms.append(attrib_to_dict(node.attrib)) + + namespaces_nodes = document.xpath('//link[contains(@rel,"schema")]') + namespaces = {} + for i in namespaces_nodes: + if i.attrib['href'] in _URL_NAMESPACES: + namespaces.update({re.sub("schema\.", "", i.attrib['rel']): i.attrib['href']}) + + list_meta_node = document.xpath('//meta') + for meta_node in list_meta_node: + populate_results(meta_node, 'name') + + list_link_node = document.xpath('//link') + for link_node in list_link_node: + populate_results(link_node, 'rel') + + yield {'namespaces': namespaces, 'elements': elements, 'terms': terms} diff --git a/tests/samples/misc/dublincore_test.html b/tests/samples/misc/dublincore_test.html new file mode 100644 index 00000000..44a192ce --- /dev/null +++ b/tests/samples/misc/dublincore_test.html @@ -0,0 +1,21 @@ + +Expressing Dublin Core in HTML/XHTML meta and link elements + + + + + + + + + + + + + + \ No newline at end of file diff --git a/tests/samples/misc/dublincore_test.json b/tests/samples/misc/dublincore_test.json new file mode 100644 index 00000000..7dbb9e5e --- /dev/null +++ b/tests/samples/misc/dublincore_test.json @@ -0,0 +1,22 @@ +[ + { + "namespaces": { + "DC": "http://purl.org/dc/elements/1.1/", + "DCTERMS": "http://purl.org/dc/terms/" + }, + "elements": [ + {"name": "DC.title", "lang": "en", "content": "Expressing Dublin Core\nin HTML/XHTML meta and link elements", "URI": "http://purl.org/dc/elements/1.1/title"}, + {"name": "DC.creator", "content": "Andy Powell, UKOLN, University of Bath", "URI": "http://purl.org/dc/elements/1.1/creator"}, + {"name": 
"DC.identifier", "scheme": "DCTERMS.URI", "content": "http://dublincore.org/documents/dcq-html/", "URI": "http://purl.org/dc/elements/1.1/identifier"}, + {"name": "DC.format", "scheme": "DCTERMS.IMT", "content": "text/html", "URI": "http://purl.org/dc/elements/1.1/format"}, + {"name": "DC.type", "scheme": "DCTERMS.DCMIType", "content": "Text", "URI": "http://purl.org/dc/elements/1.1/type"} + ], + "terms": [ + {"name": "DCTERMS.issued", "scheme": "DCTERMS.W3CDTF", "content": "2003-11-01", "URI": "http://purl.org/dc/terms/issued"}, + {"name": "DCTERMS.abstract", "content": "This document describes how\nqualified Dublin Core metadata can be encoded\nin HTML/XHTML elements", "URI": "http://purl.org/dc/terms/abstract"}, + {"name": "DC.Date.modified", "content": "2001-07-18", "URI": "http://purl.org/dc/terms/modified"}, + {"name": "DCTERMS.modified", "content": "2001-07-18", "URI": "http://purl.org/dc/terms/modified"}, + {"rel": "DCTERMS.replaces", "hreflang": "en", "href": "http://dublincore.org/documents/2000/08/15/dcq-html/", "URI": "http://purl.org/dc/terms/replaces"} + ] + } +] diff --git a/tests/tests_dublincore.py b/tests/tests_dublincore.py new file mode 100644 index 00000000..a11ce603 --- /dev/null +++ b/tests/tests_dublincore.py @@ -0,0 +1,19 @@ +# -*- coding: utf-8 -*- +import json +import unittest + +from extruct.dublincore import DublinCoreExtractor +from tests import get_testdata, jsonize_dict + + +class TestDublincore(unittest.TestCase): + + maxDiff = None + + def test_dublincore(self): + body = get_testdata('misc', 'dublincore_test.html') + expected = json.loads(get_testdata('misc', 'dublincore_test.json').decode('UTF-8')) + + dublincorext = DublinCoreExtractor() + data = dublincorext.extract(body) + self.assertEqual(jsonize_dict(data), expected) From edc1f64483a90474b6158cae9ad362e88a698c24 Mon Sep 17 00:00:00 2001 From: Joaquin Date: Mon, 17 Dec 2018 23:10:44 -0500 Subject: [PATCH 02/11] Update tests and change to raw strings --- extruct/dublincore.py | 
4 ++-- tests/samples/songkick/elysianfields.json | 16 +++++++++++++++- tests/samples/songkick/tovestyrke.json | 15 ++++++++++++++- 3 files changed, 31 insertions(+), 4 deletions(-) diff --git a/extruct/dublincore.py b/extruct/dublincore.py index bce9c857..3c9a358d 100644 --- a/extruct/dublincore.py +++ b/extruct/dublincore.py @@ -114,7 +114,7 @@ def extract_items(self, document, base_url=None): def get_lower_attrib(name): # get attribute to compare against _DC_TERMS or _DC_ELEMENTS - return re.sub(".*\.", "", name).lower() + return re.sub(r".*\.", "", name).lower() def attrib_to_dict(attribs): # convert _attrib type to dict @@ -143,7 +143,7 @@ def populate_results(node, main_attrib): namespaces = {} for i in namespaces_nodes: if i.attrib['href'] in _URL_NAMESPACES: - namespaces.update({re.sub("schema\.", "", i.attrib['rel']): i.attrib['href']}) + namespaces.update({re.sub(r"schema\.", "", i.attrib['rel']): i.attrib['href']}) list_meta_node = document.xpath('//meta') for meta_node in list_meta_node: diff --git a/tests/samples/songkick/elysianfields.json b/tests/samples/songkick/elysianfields.json index 0e67e302..9ad1d6f5 100644 --- a/tests/samples/songkick/elysianfields.json +++ b/tests/samples/songkick/elysianfields.json @@ -261,5 +261,19 @@ } ] } - ] + ], + "dublincore": [ + { + "namespaces": { + }, + "elements": [ + { + "name": "description", + "content": "Buy tickets for an upcoming Elysian Fields concert near you. 
List of all Elysian Fields tickets and tour dates for 2017.", + "URI": "http://purl.org/dc/elements/1.1/description" + } + ], + "terms": [ + + ]}] } \ No newline at end of file diff --git a/tests/samples/songkick/tovestyrke.json b/tests/samples/songkick/tovestyrke.json index 7e17abdd..4e47acce 100644 --- a/tests/samples/songkick/tovestyrke.json +++ b/tests/samples/songkick/tovestyrke.json @@ -188,5 +188,18 @@ ] } ], - "microformat": [] + "microformat": [], + "dublincore": [ + { + "namespaces": { + + }, + "elements": [ + { + "name": "description", + "content": "Past concert. Tove Styrke concert with Geowulf at Hoxton Square Bar & Kitchen in London on 12 Jun 2017.", + "URI": "http://purl.org/dc/elements/1.1/description" + }], + "terms": [ + ]}] } \ No newline at end of file From cd01c5ff27e76f37df6ee295b3874557984bfaa3 Mon Sep 17 00:00:00 2001 From: Joaquin Date: Mon, 17 Dec 2018 23:42:04 -0500 Subject: [PATCH 03/11] Fix file typo --- tests/{tests_dublincore.py => test_dublincore.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename tests/{tests_dublincore.py => test_dublincore.py} (100%) diff --git a/tests/tests_dublincore.py b/tests/test_dublincore.py similarity index 100% rename from tests/tests_dublincore.py rename to tests/test_dublincore.py From 3d4bf5d6258bf3d3ecec45c3646585772881b6aa Mon Sep 17 00:00:00 2001 From: Joaquin Date: Sun, 23 Dec 2018 02:15:09 -0500 Subject: [PATCH 04/11] Add uniform option --- extruct/_extruct.py | 12 ++++++++++-- extruct/dublincore.py | 9 +++++---- extruct/uniform.py | 17 +++++++++++++++++ tests/test_uniform.py | 41 +++++++++++++++++++++++++++++++++++++++++ 4 files changed, 73 insertions(+), 6 deletions(-) diff --git a/extruct/_extruct.py b/extruct/_extruct.py index c6fd0c74..81a951f6 100644 --- a/extruct/_extruct.py +++ b/extruct/_extruct.py @@ -7,7 +7,7 @@ from extruct.opengraph import OpenGraphExtractor from extruct.microformat import MicroformatExtractor from extruct.dublincore import DublinCoreExtractor -from 
extruct.uniform import _umicrodata_microformat, _uopengraph +from extruct.uniform import _umicrodata_microformat, _uopengraph, _udublincore from extruct.utils import parse_xmldom_html logger = logging.getLogger(__name__) @@ -137,9 +137,17 @@ def extract(htmlstring, output['opengraph'], None, )) + if 'dublincore' in syntaxes: + uniform_processors.append( + ('dublincore', + _udublincore, + output['dublincore'], + None, + )) + for syntax, uniform, raw, schema_context in uniform_processors: try: - if syntax == 'opengraph': + if syntax in ['opengraph', 'dublincore']: output[syntax] = uniform(raw) else: output[syntax] = uniform(raw, schema_context) diff --git a/extruct/dublincore.py b/extruct/dublincore.py index 3c9a358d..14261e75 100644 --- a/extruct/dublincore.py +++ b/extruct/dublincore.py @@ -101,6 +101,11 @@ _URL_NAMESPACES = ['http://purl.org/dc/terms/', 'http://purl.org/dc/elements/1.1/'] +def get_lower_attrib(name): + # get attribute to compare against _DC_TERMS or _DC_ELEMENTS + return re.sub(r".*\.", "", name).lower() + + class DublinCoreExtractor(object): """DublinCore extractor following extruct API.""" @@ -112,10 +117,6 @@ def extract_items(self, document, base_url=None): elements = [] terms = [] - def get_lower_attrib(name): - # get attribute to compare against _DC_TERMS or _DC_ELEMENTS - return re.sub(r".*\.", "", name).lower() - def attrib_to_dict(attribs): # convert _attrib type to dict node_dict = {} diff --git a/extruct/uniform.py b/extruct/uniform.py index 9a530b53..824f03ba 100644 --- a/extruct/uniform.py +++ b/extruct/uniform.py @@ -1,4 +1,5 @@ from six.moves.urllib.parse import urlparse, urljoin +from extruct.dublincore import get_lower_attrib def _uopengraph(extracted): @@ -23,6 +24,22 @@ def _umicrodata_microformat(extracted, schema_context): return res +def _udublincore(extracted): + out = [] + for obj in extracted: + context = obj.pop('namespaces', None) + obj['@context'] = context + elements = obj['elements'] + for element in elements: + for 
key, value in element.items(): + if get_lower_attrib(value) == 'type': + obj['@type'] = element['content'] + elements.remove(element) + break + out.append(obj) + return out + + def _flatten(element, schema_context): if isinstance(element, dict): element = flatten_dict(element, schema_context, False) diff --git a/tests/test_uniform.py b/tests/test_uniform.py index db178f51..185ca4b0 100644 --- a/tests/test_uniform.py +++ b/tests/test_uniform.py @@ -84,6 +84,47 @@ def test_umicrodata(self): data = extruct.extract(body, syntaxes=['microdata'], uniform=True) self.assertEqual(data['microdata'], expected) + def test_udublincore(self): + expected = [{'elements': [{'name': 'DC.title', + 'lang': 'en', + 'content': 'Expressing Dublin Core\nin HTML/XHTML meta and link elements', + 'URI': 'http://purl.org/dc/elements/1.1/title'}, + {'name': 'DC.creator', + 'content': 'Andy Powell, UKOLN, University of Bath', + 'URI': 'http://purl.org/dc/elements/1.1/creator'}, + {'name': 'DC.identifier', + 'scheme': 'DCTERMS.URI', + 'content': 'http://dublincore.org/documents/dcq-html/', + 'URI': 'http://purl.org/dc/elements/1.1/identifier'}, + {'name': 'DC.format', + 'scheme': 'DCTERMS.IMT', + 'content': 'text/html', + 'URI': 'http://purl.org/dc/elements/1.1/format'}], + 'terms': [{'name': 'DCTERMS.issued', + 'scheme': 'DCTERMS.W3CDTF', + 'content': '2003-11-01', + 'URI': 'http://purl.org/dc/terms/issued'}, + {'name': 'DCTERMS.abstract', + 'content': 'This document describes how\nqualified Dublin Core metadata can be encoded\nin HTML/XHTML elements', + 'URI': 'http://purl.org/dc/terms/abstract'}, + {'name': 'DC.Date.modified', + 'content': '2001-07-18', + 'URI': 'http://purl.org/dc/terms/modified'}, + {'name': 'DCTERMS.modified', + 'content': '2001-07-18', + 'URI': 'http://purl.org/dc/terms/modified'}, + {'rel': 'DCTERMS.replaces', + 'hreflang': 'en', + 'href': 'http://dublincore.org/documents/2000/08/15/dcq-html/', + 'URI': 'http://purl.org/dc/terms/replaces'}], + '@context': {'DC': 
'http://purl.org/dc/elements/1.1/', + 'DCTERMS': 'http://purl.org/dc/terms/'}, + '@type': 'Text'}] + body = get_testdata('misc', 'dublincore_test.html') + data = extruct.extract(body, syntaxes=['dublincore'], uniform=True) + self.assertEqual(data['dublincore'], expected) + + def test_infer_context(self): context = 'http://schema.org/UsedCondition' From 031427f0feaca4fb39875211e266951a5ef2ecbb Mon Sep 17 00:00:00 2001 From: Joaquin Garmendia Cabrera Date: Mon, 14 Jan 2019 16:04:48 -0500 Subject: [PATCH 05/11] Update Readme with DublinCore Options --- README.rst | 303 +++++++++++++++++++++++++++++++++++------------------ 1 file changed, 201 insertions(+), 102 deletions(-) diff --git a/README.rst b/README.rst index 1c71c690..9f7ec438 100644 --- a/README.rst +++ b/README.rst @@ -26,6 +26,7 @@ Currently, *extruct* supports: - `Microformat`_ via `mf2py`_ - `Facebook's Open Graph`_ - (experimental) `RDFa`_ via `rdflib`_ +- `Dublin Core Metadata`_ .. _W3C's HTML Microdata: http://www.w3.org/TR/microdata/ .. _embedded JSON-LD: http://www.w3.org/TR/json-ld/#embedding-json-ld-in-html-documents @@ -34,6 +35,7 @@ Currently, *extruct* supports: .. _Microformat: http://microformats.org/wiki/Main_Page .. _mf2py: https://github.com/microformats/mf2py .. _Facebook's Open Graph: http://ogp.me/ +.. _Dublin Core Metadata: http://dublincore.org/documents/dcq-html/ The microdata algorithm is a revisit of `this Scrapinghub blog post`_ showing how to use EXSLT extensions. 
@@ -62,110 +64,132 @@ Let's try this on a webpage that uses all the syntaxes supported (RDFa with `ogp First fetch the HTML using python-requests and then feed the response body to ``extruct``:: - >>> import extruct - >>> import requests - >>> import pprint - >>> from w3lib.html import get_base_url - >>> - >>> pp = pprint.PrettyPrinter(indent=2) - >>> r = requests.get('https://www.optimizesmart.com/how-to-use-open-graph-protocol/') - >>> base_url = get_base_url(r.text, r.url) - >>> data = extruct.extract(r.text, base_url=base_url) - >>> - >>> pp.pprint(data) - { 'json-ld': [ { '@context': 'https://schema.org', - '@id': '#organization', - '@type': 'Organization', - 'logo': 'https://www.optimizesmart.com/wp-content/uploads/2016/03/optimize-smart-Twitter-logo.jpg', - 'name': 'Optimize Smart', - 'sameAs': [ 'https://www.facebook.com/optimizesmart/', - 'https://uk.linkedin.com/in/analyticsnerd', - 'https://www.youtube.com/user/optimizesmart', - 'https://twitter.com/analyticsnerd'], - 'url': 'https://www.optimizesmart.com/'}], - 'microdata': [ { 'properties': {'headline': ''}, - 'type': 'http://schema.org/WPHeader'}], - 'microformat': [ { 'children': [ { 'properties': { 'category': [ 'specialized-tracking'], - 'name': [ 'Open Graph ' - 'Protocol for ' - 'Facebook ' - 'explained with ' - 'examples\n' - '\n' - 'Specialized ' - 'Tracking\n' - '\n' - '\n' - (...) - 'Follow ' - '@analyticsnerd\n' - '!function(d,s,id){var ' - "js,fjs=d.getElementsByTagName(s)[0],p=/^http:/.test(d.location)?'http':'https';if(!d.getElementById(id)){js=d.createElement(s);js.id=id;js.src=p+'://platform.twitter.com/widgets.js';fjs.parentNode.insertBefore(js,fjs);}}(document, " - "'script', " - "'twitter-wjs');"]}, - 'type': ['h-entry']}], - 'properties': { 'name': [ 'Open Graph Protocol for ' - 'Facebook explained with ' - 'examples\n' - (...) 
- 'Follow @analyticsnerd\n' - '!function(d,s,id){var ' - "js,fjs=d.getElementsByTagName(s)[0],p=/^http:/.test(d.location)?'http':'https';if(!d.getElementById(id)){js=d.createElement(s);js.id=id;js.src=p+'://platform.twitter.com/widgets.js';fjs.parentNode.insertBefore(js,fjs);}}(document, " - "'script', 'twitter-wjs');"]}, - 'type': ['h-feed']}], - 'opengraph': [ { 'namespace': {'og': 'http://ogp.me/ns#'}, - 'properties': [ ('og:locale', 'en_US'), - ('og:type', 'article'), - ( 'og:title', - 'Open Graph Protocol for Facebook ' - 'explained with examples'), - ( 'og:description', - 'What is Open Graph Protocol and why you ' - 'need it? Learn to implement Open Graph ' - 'Protocol for Facebook on your website. ' - 'Open Graph Protocol Meta Tags.'), - ( 'og:url', - 'https://www.optimizesmart.com/how-to-use-open-graph-protocol/'), - ('og:site_name', 'Optimize Smart'), - ( 'og:updated_time', - '2018-03-09T16:26:35+00:00'), - ( 'og:image', - 'https://www.optimizesmart.com/wp-content/uploads/2010/07/open-graph-protocol.jpg'), - ( 'og:image:secure_url', - 'https://www.optimizesmart.com/wp-content/uploads/2010/07/open-graph-protocol.jpg')]}], - 'rdfa': [ { '@id': 'https://www.optimizesmart.com/how-to-use-open-graph-protocol/#header', - 'http://www.w3.org/1999/xhtml/vocab#role': [ { '@id': 'http://www.w3.org/1999/xhtml/vocab#banner'}]}, - { '@id': 'https://www.optimizesmart.com/how-to-use-open-graph-protocol/', - 'article:modified_time': [ { '@value': '2018-03-09T16:26:35+00:00'}], - 'article:published_time': [ { '@value': '2010-07-02T18:57:23+00:00'}], - 'article:publisher': [ { '@value': 'https://www.facebook.com/optimizesmart/'}], - 'article:section': [{'@value': 'Specialized Tracking'}], - 'http://ogp.me/ns#description': [ { '@value': 'What is Open ' - 'Graph Protocol ' - 'and why you need ' - 'it? Learn to ' - 'implement Open ' - 'Graph Protocol ' - 'for Facebook on ' - 'your website. 
' - 'Open Graph ' - 'Protocol Meta ' - 'Tags.'}], - 'http://ogp.me/ns#image': [ { '@value': 'https://www.optimizesmart.com/wp-content/uploads/2010/07/open-graph-protocol.jpg'}], - 'http://ogp.me/ns#image:secure_url': [ { '@value': 'https://www.optimizesmart.com/wp-content/uploads/2010/07/open-graph-protocol.jpg'}], - 'http://ogp.me/ns#locale': [{'@value': 'en_US'}], - 'http://ogp.me/ns#site_name': [{'@value': 'Optimize Smart'}], - 'http://ogp.me/ns#title': [ { '@value': 'Open Graph Protocol for ' - 'Facebook explained with ' - 'examples'}], - 'http://ogp.me/ns#type': [{'@value': 'article'}], - 'http://ogp.me/ns#updated_time': [ { '@value': '2018-03-09T16:26:35+00:00'}], - 'http://ogp.me/ns#url': [ { '@value': 'https://www.optimizesmart.com/how-to-use-open-graph-protocol/'}], - 'https://api.w.org/': [ { '@id': 'https://www.optimizesmart.com/wp-json/'}]}]} + >>> import extruct + >>> import requests + >>> import pprint + >>> from w3lib.html import get_base_url + >>> pp = pprint.PrettyPrinter(indent=2) + >>> r = requests.get('https://www.optimizesmart.com/how-to-use-open-graph-protocol/') + >>> base_url = get_base_url(r.text, r.url) + >>> data = extruct.extract(r.text, base_url=base_url) + >>> pp.pprint(data) + { 'dublincore': [ { 'elements': [ { 'URI': 'http://purl.org/dc/elements/1.1/description', + 'content': 'What is Open Graph Protocol ' + 'and why you need it? Learn to ' + 'implement Open Graph Protocol ' + 'for Facebook on your website. 
' + 'Open Graph Protocol Meta Tags.', + 'name': 'description'}], + 'namespaces': {}, + 'terms': []}], + 'json-ld': [ { '@context': 'https://schema.org', + '@id': 'https://www.optimizesmart.com/#organization', + '@type': 'Organization', + 'logo': 'https://www.optimizesmart.com/wp-content/uploads/2016/03/optimize-smart-Twitter-logo.jpg', + 'name': 'Optimize Smart', + 'sameAs': [ 'https://www.facebook.com/optimizesmart/', + 'https://uk.linkedin.com/in/analyticsnerd', + 'https://www.youtube.com/user/optimizesmart', + 'https://twitter.com/analyticsnerd'], + 'url': 'https://www.optimizesmart.com/'}, + { '@context': 'http://schema.org', + '@id': '', + '@type': 'ProfessionalService', + 'address': { '@type': 'PostalAddress', + 'addressCountry': 'GB', + 'addressLocality': 'Southampton', + 'postalCode': '', + 'streetAddress': ''}, + 'image': 'https://www.optimizesmart.com/wp-content/themes/Sept17OptimizeSmartDEV/images/logo-small.png', + 'name': 'Optimize Smart', + 'openingHoursSpecification': { '@type': 'OpeningHoursSpecification', + 'closes': '23:59', + 'dayOfWeek': [ 'Monday', + 'Tuesday', + 'Wednesday', + 'Thursday', + 'Friday', + 'Saturday', + 'Sunday'], + 'opens': '00:00'}, + 'sameAs': [ 'https://www.facebook.com/optimizesmart/', + 'https://twitter.com/OptimizeSmart', + 'https://www.youtube.com/user/optimizesmart', + 'https://www.linkedin.com/in/analyticsnerd/'], + 'telephone': '', + 'url': 'https://www.optimizesmart.com'}], + 'microdata': [ { 'properties': {'headline': ''}, + 'type': 'http://schema.org/WPHeader'}], + 'microformat': [ { 'children': [ { 'properties': { 'category': [ 'facebook-tracking', + 'specialized-tracking']}, + 'type': ['h-entry']}], + 'properties': {}, + 'type': ['h-feed']}], + 'opengraph': [ { 'namespace': { 'article': 'http://ogp.me/ns/article#', + 'og': 'http://ogp.me/ns#'}, + 'properties': [ ('og:locale', 'en_US'), + ('og:type', 'article'), + ( 'og:title', + 'Open Graph Protocol for Facebook ' + 'explained with examples'), + ( 'og:description', 
+ 'What is Open Graph Protocol and why you ' + 'need it? Learn to implement Open Graph ' + 'Protocol for Facebook on your website. ' + 'Open Graph Protocol Meta Tags.'), + ( 'og:url', + 'https://www.optimizesmart.com/how-to-use-open-graph-protocol/'), + ('og:site_name', 'Optimize Smart'), + ( 'article:publisher', + 'https://www.facebook.com/optimizesmart/'), + ('article:section', 'Facebook Tracking'), + ( 'article:published_time', + '2017-02-02T18:57:23+00:00'), + ( 'article:modified_time', + '2019-01-11T10:49:01+00:00'), + ( 'og:updated_time', + '2019-01-11T10:49:01+00:00'), + ( 'og:image', + 'https://www.optimizesmart.com/wp-content/uploads/2010/07/open-graph-protocol.jpg'), + ( 'og:image:secure_url', + 'https://www.optimizesmart.com/wp-content/uploads/2010/07/open-graph-protocol.jpg'), + ('og:image:width', '711'), + ('og:image:height', '309')]}], + 'rdfa': [ { '@id': 'https://www.optimizesmart.com/how-to-use-open-graph-protocol/#header', + 'http://www.w3.org/1999/xhtml/vocab#role': [ { '@id': 'http://www.w3.org/1999/xhtml/vocab#banner'}]}, + { '@id': 'https://www.optimizesmart.com/how-to-use-open-graph-protocol/', + 'article:modified_time': [ { '@value': '2019-01-11T10:49:01+00:00'}], + 'article:published_time': [ { '@value': '2017-02-02T18:57:23+00:00'}], + 'article:publisher': [ { '@value': 'https://www.facebook.com/optimizesmart/'}], + 'article:section': [{'@value': 'Facebook Tracking'}], + 'http://ogp.me/ns#description': [ { '@value': 'What is Open ' + 'Graph Protocol ' + 'and why you need ' + 'it? Learn to ' + 'implement Open ' + 'Graph Protocol ' + 'for Facebook on ' + 'your website. 
' + 'Open Graph ' + 'Protocol Meta ' + 'Tags.'}], + 'http://ogp.me/ns#image': [ { '@value': 'https://www.optimizesmart.com/wp-content/uploads/2010/07/open-graph-protocol.jpg'}], + 'http://ogp.me/ns#image:height': [{'@value': '309'}], + 'http://ogp.me/ns#image:secure_url': [ { '@value': 'https://www.optimizesmart.com/wp-content/uploads/2010/07/open-graph-protocol.jpg'}], + 'http://ogp.me/ns#image:width': [{'@value': '711'}], + 'http://ogp.me/ns#locale': [{'@value': 'en_US'}], + 'http://ogp.me/ns#site_name': [{'@value': 'Optimize Smart'}], + 'http://ogp.me/ns#title': [ { '@value': 'Open Graph Protocol for ' + 'Facebook explained with ' + 'examples'}], + 'http://ogp.me/ns#type': [{'@value': 'article'}], + 'http://ogp.me/ns#updated_time': [ { '@value': '2019-01-11T10:49:01+00:00'}], + 'http://ogp.me/ns#url': [ { '@value': 'https://www.optimizesmart.com/how-to-use-open-graph-protocol/'}], + 'https://api.w.org/': [ { '@id': 'https://www.optimizesmart.com/wp-json/'}]}]} Select syntaxes +++++++++++++++ -It is possible to select which syntaxes to extract by passing a list with the desired ones to extract. Valid values: 'microdata', 'json-ld', 'opengraph', 'microformat', 'rdfa'. If no list is passed all syntaxes will be extracted and returned:: +It is possible to select which syntaxes to extract by passing a list with the desired ones to extract. Valid values: 'microdata', 'json-ld', 'opengraph', 'microformat', 'rdfa', 'dublincore'. 
If no list is passed all syntaxes will be extracted and returned:: >>> r = requests.get('http://www.songkick.com/artists/236156-elysian-fields') >>> base_url = get_base_url(r.text, r.url) @@ -209,7 +233,7 @@ It is possible to select which syntaxes to extract by passing a list with the de Uniform +++++++ -Another option is to uniform the output of microformat, opengraph, microdata and json-ld syntaxes to the following structure: :: +Another option is to uniform the output of microformat, opengraph, microdata, dublincore and json-ld syntaxes to the following structure: :: {'@context': 'http://example.com', '@type': 'example_type', @@ -585,6 +609,81 @@ Microformat extraction ] } }] + +DublinCore extraction +++++++++++++++++++++++++++++++ +:: + + >>> import pprint + >>> pp = pprint.PrettyPrinter(indent=2) + >>> from extruct.dublincore import DublinCoreExtractor + >>> html = ''' + ... Expressing Dublin Core in HTML/XHTML meta and link elements + ... + ... + ... + ... + ... + ... + ... + ... + ... + ... + ... + ... + ... + ... 
''' + >>> dublinlde = DublinCoreExtractor() + >>> data = dublinlde.extract(html) + >>> pp.pprint(data) + [ { 'elements': [ { 'URI': 'http://purl.org/dc/elements/1.1/title', + 'content': 'Expressing Dublin Core\n' + 'in HTML/XHTML meta and link elements', + 'lang': 'en', + 'name': 'DC.title'}, + { 'URI': 'http://purl.org/dc/elements/1.1/creator', + 'content': 'Andy Powell, UKOLN, University of Bath', + 'name': 'DC.creator'}, + { 'URI': 'http://purl.org/dc/elements/1.1/identifier', + 'content': 'http://dublincore.org/documents/dcq-html/', + 'name': 'DC.identifier', + 'scheme': 'DCTERMS.URI'}, + { 'URI': 'http://purl.org/dc/elements/1.1/format', + 'content': 'text/html', + 'name': 'DC.format', + 'scheme': 'DCTERMS.IMT'}, + { 'URI': 'http://purl.org/dc/elements/1.1/type', + 'content': 'Text', + 'name': 'DC.type', + 'scheme': 'DCTERMS.DCMIType'}], + 'namespaces': { 'DC': 'http://purl.org/dc/elements/1.1/', + 'DCTERMS': 'http://purl.org/dc/terms/'}, + 'terms': [ { 'URI': 'http://purl.org/dc/terms/issued', + 'content': '2003-11-01', + 'name': 'DCTERMS.issued', + 'scheme': 'DCTERMS.W3CDTF'}, + { 'URI': 'http://purl.org/dc/terms/abstract', + 'content': 'This document describes how\n' + 'qualified Dublin Core metadata can be encoded\n' + 'in HTML/XHTML elements', + 'name': 'DCTERMS.abstract'}, + { 'URI': 'http://purl.org/dc/terms/modified', + 'content': '2001-07-18', + 'name': 'DC.Date.modified'}, + { 'URI': 'http://purl.org/dc/terms/modified', + 'content': '2001-07-18', + 'name': 'DCTERMS.modified'}, + { 'URI': 'http://purl.org/dc/terms/replaces', + 'href': 'http://dublincore.org/documents/2000/08/15/dcq-html/', + 'hreflang': 'en', + 'rel': 'DCTERMS.replaces'}]}] + + REST API service ---------------- From 8cc838509351ec8fb0d5581bd3ec97557d6523f4 Mon Sep 17 00:00:00 2001 From: Joaquin Date: Mon, 14 Jan 2019 19:23:05 -0500 Subject: [PATCH 06/11] Fix list iteration --- extruct/uniform.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/extruct/uniform.py 
b/extruct/uniform.py index 824f03ba..aefbcc7b 100644 --- a/extruct/uniform.py +++ b/extruct/uniform.py @@ -30,11 +30,11 @@ def _udublincore(extracted): context = obj.pop('namespaces', None) obj['@context'] = context elements = obj['elements'] - for element in elements: + for element in list(elements): for key, value in element.items(): if get_lower_attrib(value) == 'type': obj['@type'] = element['content'] - elements.remove(element) + obj['elements'].remove(element) break out.append(obj) return out From ac2bdfce89a8fdff98d809570d76018f40b8b3ba Mon Sep 17 00:00:00 2001 From: Joaquin Date: Thu, 17 Jan 2019 16:55:47 -0500 Subject: [PATCH 07/11] Make requested changes --- extruct/dublincore.py | 10 ++++------ extruct/uniform.py | 2 +- tests/samples/songkick/elysianfields.json | 9 +++++---- 3 files changed, 10 insertions(+), 11 deletions(-) diff --git a/extruct/dublincore.py b/extruct/dublincore.py index 14261e75..fed54ab4 100644 --- a/extruct/dublincore.py +++ b/extruct/dublincore.py @@ -1,6 +1,7 @@ import re from extruct.utils import parse_html +from w3lib.html import strip_html5_whitespace _DC_ELEMENTS = { # Defined according DCMES(DCM Version 1.1): http://dublincore.org/documents/dces/ 'contributor': 'http://purl.org/dc/elements/1.1/contributor', @@ -119,10 +120,7 @@ def extract_items(self, document, base_url=None): def attrib_to_dict(attribs): # convert _attrib type to dict - node_dict = {} - for attrib, value in attribs.items(): - node_dict.update({attrib: value}) - return node_dict + return dict(attribs.items()) def populate_results(node, main_attrib): # fill list with DC Elements or DC Terms @@ -143,8 +141,8 @@ def populate_results(node, main_attrib): namespaces_nodes = document.xpath('//link[contains(@rel,"schema")]') namespaces = {} for i in namespaces_nodes: - if i.attrib['href'] in _URL_NAMESPACES: - namespaces.update({re.sub(r"schema\.", "", i.attrib['rel']): i.attrib['href']}) + if strip_html5_whitespace(i.attrib['href']) in _URL_NAMESPACES: + 
namespaces.update({re.sub(r"schema\.", "", i.attrib['rel']): strip_html5_whitespace(i.attrib['href'])}) list_meta_node = document.xpath('//meta') for meta_node in list_meta_node: diff --git a/extruct/uniform.py b/extruct/uniform.py index aefbcc7b..cf97717c 100644 --- a/extruct/uniform.py +++ b/extruct/uniform.py @@ -26,7 +26,7 @@ def _umicrodata_microformat(extracted, schema_context): def _udublincore(extracted): out = [] - for obj in extracted: + for obj in list(extracted): context = obj.pop('namespaces', None) obj['@context'] = context elements = obj['elements'] diff --git a/tests/samples/songkick/elysianfields.json b/tests/samples/songkick/elysianfields.json index 9ad1d6f5..f7c10169 100644 --- a/tests/samples/songkick/elysianfields.json +++ b/tests/samples/songkick/elysianfields.json @@ -265,7 +265,7 @@ "dublincore": [ { "namespaces": { - }, + }, "elements": [ { "name": "description", @@ -273,7 +273,8 @@ "URI": "http://purl.org/dc/elements/1.1/description" } ], - "terms": [ - - ]}] + "terms": [ + ] + } + ] } \ No newline at end of file From 32e416b80dbbf248dc5fb568bb282b37ec4876bb Mon Sep 17 00:00:00 2001 From: Joaquin Date: Fri, 18 Jan 2019 10:10:46 -0500 Subject: [PATCH 08/11] Add local variable to improve legibility --- extruct/dublincore.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/extruct/dublincore.py b/extruct/dublincore.py index fed54ab4..3e05209e 100644 --- a/extruct/dublincore.py +++ b/extruct/dublincore.py @@ -141,8 +141,9 @@ def populate_results(node, main_attrib): namespaces_nodes = document.xpath('//link[contains(@rel,"schema")]') namespaces = {} for i in namespaces_nodes: - if strip_html5_whitespace(i.attrib['href']) in _URL_NAMESPACES: - namespaces.update({re.sub(r"schema\.", "", i.attrib['rel']): strip_html5_whitespace(i.attrib['href'])}) + url = strip_html5_whitespace(i.attrib['href']) + if url in _URL_NAMESPACES: + namespaces.update({re.sub(r"schema\.", "", i.attrib['rel']): url}) list_meta_node = 
document.xpath('//meta') for meta_node in list_meta_node: From bd1448fd917737f79cc39f707c725fe0d2988e67 Mon Sep 17 00:00:00 2001 From: Joaquin Date: Sun, 20 Sep 2020 17:19:13 -0500 Subject: [PATCH 09/11] Change shallow cpy to deep cpy, update extruct, readme. --- README.rst | 344 ++++++++++++++------------------------------ extruct/_extruct.py | 4 +- extruct/uniform.py | 6 +- 3 files changed, 116 insertions(+), 238 deletions(-) diff --git a/README.rst b/README.rst index 9f7ec438..1cfaca65 100644 --- a/README.rst +++ b/README.rst @@ -17,8 +17,6 @@ extruct *extruct* is a library for extracting embedded metadata from HTML markup. -It also has a built-in HTTP server to test its output as JSON. - Currently, *extruct* supports: - `W3C's HTML Microdata`_ @@ -64,16 +62,18 @@ Let's try this on a webpage that uses all the syntaxes supported (RDFa with `ogp First fetch the HTML using python-requests and then feed the response body to ``extruct``:: - >>> import extruct - >>> import requests - >>> import pprint - >>> from w3lib.html import get_base_url - >>> pp = pprint.PrettyPrinter(indent=2) - >>> r = requests.get('https://www.optimizesmart.com/how-to-use-open-graph-protocol/') - >>> base_url = get_base_url(r.text, r.url) - >>> data = extruct.extract(r.text, base_url=base_url) - >>> pp.pprint(data) - { 'dublincore': [ { 'elements': [ { 'URI': 'http://purl.org/dc/elements/1.1/description', + >>> import extruct + >>> import requests + >>> import pprint + >>> from w3lib.html import get_base_url + >>> + >>> pp = pprint.PrettyPrinter(indent=2) + >>> r = requests.get('https://www.optimizesmart.com/how-to-use-open-graph-protocol/') + >>> base_url = get_base_url(r.text, r.url) + >>> data = extruct.extract(r.text, base_url=base_url) + >>> + >>> pp.pprint(data) + { 'dublincore': [ { 'elements': [ { 'URI': 'http://purl.org/dc/elements/1.1/description', 'content': 'What is Open Graph Protocol ' 'and why you need it? 
Learn to ' 'implement Open Graph Protocol ' @@ -82,114 +82,100 @@ First fetch the HTML using python-requests and then feed the response body to `` 'name': 'description'}], 'namespaces': {}, 'terms': []}], - 'json-ld': [ { '@context': 'https://schema.org', - '@id': 'https://www.optimizesmart.com/#organization', - '@type': 'Organization', - 'logo': 'https://www.optimizesmart.com/wp-content/uploads/2016/03/optimize-smart-Twitter-logo.jpg', - 'name': 'Optimize Smart', - 'sameAs': [ 'https://www.facebook.com/optimizesmart/', - 'https://uk.linkedin.com/in/analyticsnerd', - 'https://www.youtube.com/user/optimizesmart', - 'https://twitter.com/analyticsnerd'], - 'url': 'https://www.optimizesmart.com/'}, - { '@context': 'http://schema.org', - '@id': '', - '@type': 'ProfessionalService', - 'address': { '@type': 'PostalAddress', - 'addressCountry': 'GB', - 'addressLocality': 'Southampton', - 'postalCode': '', - 'streetAddress': ''}, - 'image': 'https://www.optimizesmart.com/wp-content/themes/Sept17OptimizeSmartDEV/images/logo-small.png', - 'name': 'Optimize Smart', - 'openingHoursSpecification': { '@type': 'OpeningHoursSpecification', - 'closes': '23:59', - 'dayOfWeek': [ 'Monday', - 'Tuesday', - 'Wednesday', - 'Thursday', - 'Friday', - 'Saturday', - 'Sunday'], - 'opens': '00:00'}, - 'sameAs': [ 'https://www.facebook.com/optimizesmart/', - 'https://twitter.com/OptimizeSmart', - 'https://www.youtube.com/user/optimizesmart', - 'https://www.linkedin.com/in/analyticsnerd/'], - 'telephone': '', - 'url': 'https://www.optimizesmart.com'}], - 'microdata': [ { 'properties': {'headline': ''}, - 'type': 'http://schema.org/WPHeader'}], - 'microformat': [ { 'children': [ { 'properties': { 'category': [ 'facebook-tracking', - 'specialized-tracking']}, - 'type': ['h-entry']}], - 'properties': {}, - 'type': ['h-feed']}], - 'opengraph': [ { 'namespace': { 'article': 'http://ogp.me/ns/article#', - 'og': 'http://ogp.me/ns#'}, - 'properties': [ ('og:locale', 'en_US'), - ('og:type', 'article'), - 
( 'og:title', - 'Open Graph Protocol for Facebook ' - 'explained with examples'), - ( 'og:description', - 'What is Open Graph Protocol and why you ' - 'need it? Learn to implement Open Graph ' - 'Protocol for Facebook on your website. ' - 'Open Graph Protocol Meta Tags.'), - ( 'og:url', - 'https://www.optimizesmart.com/how-to-use-open-graph-protocol/'), - ('og:site_name', 'Optimize Smart'), - ( 'article:publisher', - 'https://www.facebook.com/optimizesmart/'), - ('article:section', 'Facebook Tracking'), - ( 'article:published_time', - '2017-02-02T18:57:23+00:00'), - ( 'article:modified_time', - '2019-01-11T10:49:01+00:00'), - ( 'og:updated_time', - '2019-01-11T10:49:01+00:00'), - ( 'og:image', - 'https://www.optimizesmart.com/wp-content/uploads/2010/07/open-graph-protocol.jpg'), - ( 'og:image:secure_url', - 'https://www.optimizesmart.com/wp-content/uploads/2010/07/open-graph-protocol.jpg'), - ('og:image:width', '711'), - ('og:image:height', '309')]}], - 'rdfa': [ { '@id': 'https://www.optimizesmart.com/how-to-use-open-graph-protocol/#header', - 'http://www.w3.org/1999/xhtml/vocab#role': [ { '@id': 'http://www.w3.org/1999/xhtml/vocab#banner'}]}, - { '@id': 'https://www.optimizesmart.com/how-to-use-open-graph-protocol/', - 'article:modified_time': [ { '@value': '2019-01-11T10:49:01+00:00'}], - 'article:published_time': [ { '@value': '2017-02-02T18:57:23+00:00'}], - 'article:publisher': [ { '@value': 'https://www.facebook.com/optimizesmart/'}], - 'article:section': [{'@value': 'Facebook Tracking'}], - 'http://ogp.me/ns#description': [ { '@value': 'What is Open ' - 'Graph Protocol ' - 'and why you need ' - 'it? Learn to ' - 'implement Open ' - 'Graph Protocol ' - 'for Facebook on ' - 'your website. 
' - 'Open Graph ' - 'Protocol Meta ' - 'Tags.'}], - 'http://ogp.me/ns#image': [ { '@value': 'https://www.optimizesmart.com/wp-content/uploads/2010/07/open-graph-protocol.jpg'}], - 'http://ogp.me/ns#image:height': [{'@value': '309'}], - 'http://ogp.me/ns#image:secure_url': [ { '@value': 'https://www.optimizesmart.com/wp-content/uploads/2010/07/open-graph-protocol.jpg'}], - 'http://ogp.me/ns#image:width': [{'@value': '711'}], - 'http://ogp.me/ns#locale': [{'@value': 'en_US'}], - 'http://ogp.me/ns#site_name': [{'@value': 'Optimize Smart'}], - 'http://ogp.me/ns#title': [ { '@value': 'Open Graph Protocol for ' - 'Facebook explained with ' - 'examples'}], - 'http://ogp.me/ns#type': [{'@value': 'article'}], - 'http://ogp.me/ns#updated_time': [ { '@value': '2019-01-11T10:49:01+00:00'}], - 'http://ogp.me/ns#url': [ { '@value': 'https://www.optimizesmart.com/how-to-use-open-graph-protocol/'}], - 'https://api.w.org/': [ { '@id': 'https://www.optimizesmart.com/wp-json/'}]}]} + + 'json-ld': [ { '@context': 'https://schema.org', + '@id': '#organization', + '@type': 'Organization', + 'logo': 'https://www.optimizesmart.com/wp-content/uploads/2016/03/optimize-smart-Twitter-logo.jpg', + 'name': 'Optimize Smart', + 'sameAs': [ 'https://www.facebook.com/optimizesmart/', + 'https://uk.linkedin.com/in/analyticsnerd', + 'https://www.youtube.com/user/optimizesmart', + 'https://twitter.com/analyticsnerd'], + 'url': 'https://www.optimizesmart.com/'}], + 'microdata': [ { 'properties': {'headline': ''}, + 'type': 'http://schema.org/WPHeader'}], + 'microformat': [ { 'children': [ { 'properties': { 'category': [ 'specialized-tracking'], + 'name': [ 'Open Graph ' + 'Protocol for ' + 'Facebook ' + 'explained with ' + 'examples\n' + '\n' + 'Specialized ' + 'Tracking\n' + '\n' + '\n' + (...) 
+ 'Follow ' + '@analyticsnerd\n' + '!function(d,s,id){var ' + "js,fjs=d.getElementsByTagName(s)[0],p=/^http:/.test(d.location)?'http':'https';if(!d.getElementById(id)){js=d.createElement(s);js.id=id;js.src=p+'://platform.twitter.com/widgets.js';fjs.parentNode.insertBefore(js,fjs);}}(document, " + "'script', " + "'twitter-wjs');"]}, + 'type': ['h-entry']}], + 'properties': { 'name': [ 'Open Graph Protocol for ' + 'Facebook explained with ' + 'examples\n' + (...) + 'Follow @analyticsnerd\n' + '!function(d,s,id){var ' + "js,fjs=d.getElementsByTagName(s)[0],p=/^http:/.test(d.location)?'http':'https';if(!d.getElementById(id)){js=d.createElement(s);js.id=id;js.src=p+'://platform.twitter.com/widgets.js';fjs.parentNode.insertBefore(js,fjs);}}(document, " + "'script', 'twitter-wjs');"]}, + 'type': ['h-feed']}], + 'opengraph': [ { 'namespace': {'og': 'http://ogp.me/ns#'}, + 'properties': [ ('og:locale', 'en_US'), + ('og:type', 'article'), + ( 'og:title', + 'Open Graph Protocol for Facebook ' + 'explained with examples'), + ( 'og:description', + 'What is Open Graph Protocol and why you ' + 'need it? Learn to implement Open Graph ' + 'Protocol for Facebook on your website. 
' + 'Open Graph Protocol Meta Tags.'), + ( 'og:url', + 'https://www.optimizesmart.com/how-to-use-open-graph-protocol/'), + ('og:site_name', 'Optimize Smart'), + ( 'og:updated_time', + '2018-03-09T16:26:35+00:00'), + ( 'og:image', + 'https://www.optimizesmart.com/wp-content/uploads/2010/07/open-graph-protocol.jpg'), + ( 'og:image:secure_url', + 'https://www.optimizesmart.com/wp-content/uploads/2010/07/open-graph-protocol.jpg')]}], + 'rdfa': [ { '@id': 'https://www.optimizesmart.com/how-to-use-open-graph-protocol/#header', + 'http://www.w3.org/1999/xhtml/vocab#role': [ { '@id': 'http://www.w3.org/1999/xhtml/vocab#banner'}]}, + { '@id': 'https://www.optimizesmart.com/how-to-use-open-graph-protocol/', + 'article:modified_time': [ { '@value': '2018-03-09T16:26:35+00:00'}], + 'article:published_time': [ { '@value': '2010-07-02T18:57:23+00:00'}], + 'article:publisher': [ { '@value': 'https://www.facebook.com/optimizesmart/'}], + 'article:section': [{'@value': 'Specialized Tracking'}], + 'http://ogp.me/ns#description': [ { '@value': 'What is Open ' + 'Graph Protocol ' + 'and why you need ' + 'it? Learn to ' + 'implement Open ' + 'Graph Protocol ' + 'for Facebook on ' + 'your website. 
' + 'Open Graph ' + 'Protocol Meta ' + 'Tags.'}], + 'http://ogp.me/ns#image': [ { '@value': 'https://www.optimizesmart.com/wp-content/uploads/2010/07/open-graph-protocol.jpg'}], + 'http://ogp.me/ns#image:secure_url': [ { '@value': 'https://www.optimizesmart.com/wp-content/uploads/2010/07/open-graph-protocol.jpg'}], + 'http://ogp.me/ns#locale': [{'@value': 'en_US'}], + 'http://ogp.me/ns#site_name': [{'@value': 'Optimize Smart'}], + 'http://ogp.me/ns#title': [ { '@value': 'Open Graph Protocol for ' + 'Facebook explained with ' + 'examples'}], + 'http://ogp.me/ns#type': [{'@value': 'article'}], + 'http://ogp.me/ns#updated_time': [ { '@value': '2018-03-09T16:26:35+00:00'}], + 'http://ogp.me/ns#url': [ { '@value': 'https://www.optimizesmart.com/how-to-use-open-graph-protocol/'}], + 'https://api.w.org/': [ { '@id': 'https://www.optimizesmart.com/wp-json/'}]}]} Select syntaxes +++++++++++++++ -It is possible to select which syntaxes to extract by passing a list with the desired ones to extract. Valid values: 'microdata', 'json-ld', 'opengraph', 'microformat', 'rdfa', 'dublincore'. If no list is passed all syntaxes will be extracted and returned:: +It is possible to select which syntaxes to extract by passing a list with the desired ones to extract. Valid values: 'microdata', 'json-ld', 'opengraph', 'microformat', 'rdfa' and 'dublincore'. 
If no list is passed all syntaxes will be extracted and returned:: >>> r = requests.get('http://www.songkick.com/artists/236156-elysian-fields') >>> base_url = get_base_url(r.text, r.url) @@ -235,7 +221,7 @@ Uniform +++++++ Another option is to uniform the output of microformat, opengraph, microdata, dublincore and json-ld syntaxes to the following structure: :: - {'@context': 'http://example.com', + {'@context': 'http://example.com', '@type': 'example_type', /* All other the properties in keys here */ } @@ -609,7 +595,7 @@ Microformat extraction ] } }] - + DublinCore extraction ++++++++++++++++++++++++++++++ :: @@ -621,8 +607,8 @@ DublinCore extraction ... Expressing Dublin Core in HTML/XHTML meta and link elements ... ... - ... - ... + ... + ... ... ... @@ -684,106 +670,6 @@ DublinCore extraction 'rel': 'DCTERMS.replaces'}]}] - -REST API service ----------------- - -*extruct* also ships with a REST API service to test its output from URLs. - -Dependencies -++++++++++++ - -* bottle_ (Web framework) -* gevent_ (Aysnc framework) -* requests_ - -.. _bottle: https://pypi.python.org/pypi/bottle -.. _gevent: http://www.gevent.org/ -.. _requests: http://docs.python-requests.org/ - -Usage -+++++ - -:: - - python -m extruct.service - -launches an HTTP server listening on port 10005. - -Methods supported -+++++++++++++++++ - -:: - - /extruct/ - method = GET - - - /extruct/batch - method = POST - params: - urls - a list of URLs separted by newlines - urlsfile - a file with one URL per line - -E.g. 
http://localhost:10005/extruct/http://www.sarenza.com/i-love-shoes-susket-s767163-p0000119412 - -will output something like this: - ->>> -{ 'json-ld': [ { '@context': 'http://schema.org', - '@id': 'FP', - '@type': 'Product', - 'brand': { '@type': 'Brand', - 'url': 'https://www.sarenza.com/i-love-shoes'}, - 'color': ['Lava', 'Black', 'Lt grey'], - 'image': [ 'https://cdn.sarenza.net/_img/productsv4/0000119412/MD_0000119412_223992_09.jpg?201509221045&v=20180313113923', - 'https://cdn.sarenza.net/_img/productsv4/0000119412/MD_0000119412_223992_02.jpg?201509291747&v=20180313113923', - 'https://cdn.sarenza.net/_img/productsv4/0000119412/MD_0000119412_223992_03.jpg?201509221045&v=20180313113923', - 'https://cdn.sarenza.net/_img/productsv4/0000119412/MD_0000119412_223992_04.jpg?201509221045&v=20180313113923', - 'https://cdn.sarenza.net/_img/productsv4/0000119412/MD_0000119412_223992_05.jpg?201509221045&v=20180313113923', - 'https://cdn.sarenza.net/_img/productsv4/0000119412/MD_0000119412_223992_06.jpg?201509221045&v=20180313113923', - 'https://cdn.sarenza.net/_img/productsv4/0000119412/MD_0000119412_223992_07.jpg?201509221045&v=20180313113923', - 'https://cdn.sarenza.net/_img/productsv4/0000119412/MD_0000119412_223992_08.jpg?201509221045&v=20180313113923'], - 'name': 'Susket', - 'offers': { '@type': 'AggregateOffer', - 'availability': 'InStock', - 'highPrice': '49.00', - 'lowPrice': '0.00', - 'price': '0.00', - 'priceCurrency': 'EUR'}}], - 'microdata': [ { 'properties': { 'average': '4.7', - 'best': '5', - 'itemreviewed': 'Sarenza', - 'rating': '4.7 / 5\n\t\t (4 066 avis)', - 'votes': '4 066'}, - 'type': 'http://data-vocabulary.org/Review-aggregate'}], - 'microformat': [], - 'opengraph': [ { 'namespace': {'og': 'http://ogp.me/ns#'}, - 'properties': [ ( 'og:title', - 'I Love Shoes Susket @sarenza.com'), - ( 'og:image', - 'https://cdn.sarenza.net/_img/productsv4/0000119412/MD_0000119412_223992_09.jpg?201509221045&v=20180313113923'), - ('og:site_name', 'sarenza.com'), - 
('og:type', 'product'), - ('og:description', '...'), - ( 'og:url', - 'https://www.sarenza.com/i-love-shoes-susket-s767163-p0000119412'), - ('og:country-name', 'FRA')]}], - 'rdfa': [ { '@id': 'https://www.sarenza.com/i-love-shoes-susket-s767163-p0000119412', - 'http://ogp.me/ns#country-name': [{'@value': 'FRA'}], - 'http://ogp.me/ns#description': [{'@value': '...'}], - 'http://ogp.me/ns#image': [ { '@value': 'https://cdn.sarenza.net/_img/productsv4/0000119412/MD_0000119412_223992_09.jpg?201509221045&v=20180313113923'}], - 'http://ogp.me/ns#site_name': [{'@value': 'sarenza.com'}], - 'http://ogp.me/ns#title': [ { '@value': 'I Love Shoes Susket ' - '@sarenza.com'}], - 'http://ogp.me/ns#type': [{'@value': 'product'}], - 'http://ogp.me/ns#url': [ { '@value': 'https://www.sarenza.com/i-love-shoes-susket-s767163-p0000119412'}], - 'http://ogp.me/ns/fb#admins': [{'@value': '100001934697625'}], - 'http://ogp.me/ns/fb#app_id': [{'@value': '148128758532914'}]}, - { '@id': '_:Ncf1962068aa142b29000813372db7841', - 'http://www.w3.org/1999/xhtml/vocab#role': [ { '@id': 'http://www.w3.org/1999/xhtml/vocab#navigation'}]}]} - - Command Line Tool ----------------- @@ -821,7 +707,7 @@ those, you can pass their individual names collected in a list through 'syntaxes For example, this command extracts only Microdata and JSON-LD metadata from "http://example.com":: - extruct "http://example.com" --syntaxes microdata json-ld + extruct "http://example.com" --syntaxes microdata json-ld NB syntaxes names passed must correspond to these: microdata, json-ld, rdfa, opengraph, microformat @@ -847,16 +733,4 @@ Use tox_ to run tests with different Python versions:: tox -.. _tox: https://testrun.org/tox/latest/ - - -Versioning ----------- - -Use bumpversion_ to conveniently change project version:: - - bumpversion patch # 0.0.0 -> 0.0.1 - bumpversion minor # 0.0.1 -> 0.1.0 - bumpversion major # 0.1.0 -> 1.0.0 - -.. _bumpversion: https://pypi.python.org/pypi/bumpversion +.. 
_tox: https://testrun.org/tox/latest/ \ No newline at end of file diff --git a/extruct/_extruct.py b/extruct/_extruct.py index 81a951f6..d5016901 100644 --- a/extruct/_extruct.py +++ b/extruct/_extruct.py @@ -147,7 +147,9 @@ def extract(htmlstring, for syntax, uniform, raw, schema_context in uniform_processors: try: - if syntax in ['opengraph', 'dublincore']: + if syntax == 'opengraph': + output[syntax] = uniform(raw, with_og_array=with_og_array) + elif syntax == 'dublincore': output[syntax] = uniform(raw) else: output[syntax] = uniform(raw, schema_context) diff --git a/extruct/uniform.py b/extruct/uniform.py index cf97717c..74f16587 100644 --- a/extruct/uniform.py +++ b/extruct/uniform.py @@ -1,3 +1,4 @@ +import copy from six.moves.urllib.parse import urlparse, urljoin from extruct.dublincore import get_lower_attrib @@ -26,11 +27,12 @@ def _umicrodata_microformat(extracted, schema_context): def _udublincore(extracted): out = [] - for obj in list(extracted): + extracted_cpy = copy.deepcopy(extracted) + for obj in extracted_cpy: context = obj.pop('namespaces', None) obj['@context'] = context elements = obj['elements'] - for element in list(elements): + for element in elements: for key, value in element.items(): if get_lower_attrib(value) == 'type': obj['@type'] = element['content'] From dee37e6dd649d2d6447da87b6b1824b28fdce63e Mon Sep 17 00:00:00 2001 From: Joaquin Date: Thu, 1 Oct 2020 23:07:05 -0500 Subject: [PATCH 10/11] update README.rst, normalize indentation --- README.rst | 2 +- tests/samples/songkick/tovestyrke.json | 25 +++++++++++++------------ 2 files changed, 14 insertions(+), 13 deletions(-) diff --git a/README.rst b/README.rst index 45d1f607..ee0ee2de 100644 --- a/README.rst +++ b/README.rst @@ -33,7 +33,7 @@ Currently, *extruct* supports: .. _Microformat: http://microformats.org/wiki/Main_Page .. _mf2py: https://github.com/microformats/mf2py .. _Facebook's Open Graph: http://ogp.me/ -.. 
_Dublin Core Metadata: http://dublincore.org/documents/dcq-html/ +.. _Dublin Core Metadata: https://www.dublincore.org/specifications/dublin-core/dcq-html/ The microdata algorithm is a revisit of `this Scrapinghub blog post`_ showing how to use EXSLT extensions. diff --git a/tests/samples/songkick/tovestyrke.json b/tests/samples/songkick/tovestyrke.json index 4e47acce..068df67a 100644 --- a/tests/samples/songkick/tovestyrke.json +++ b/tests/samples/songkick/tovestyrke.json @@ -189,17 +189,18 @@ } ], "microformat": [], - "dublincore": [ + "dublincore": [ { - "namespaces": { - - }, - "elements": [ - { - "name": "description", - "content": "Past concert. Tove Styrke concert with Geowulf at Hoxton Square Bar & Kitchen in London on 12 Jun 2017.", - "URI": "http://purl.org/dc/elements/1.1/description" - }], - "terms": [ - ]}] + "namespaces": {}, + "elements": [ + { + "name": "description", + "content": "Past concert. Tove Styrke concert with Geowulf at Hoxton Square Bar & Kitchen in London on 12 Jun 2017.", + "URI": "http://purl.org/dc/elements/1.1/description" + } + ], + "terms": [ + ] + } + ] } \ No newline at end of file From 043a4795235ea4ba79b39467297f8a298a5d5606 Mon Sep 17 00:00:00 2001 From: Joaquin Date: Sun, 4 Oct 2020 16:41:05 -0500 Subject: [PATCH 11/11] Specify DC version, update link. --- README.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.rst b/README.rst index ee0ee2de..5cbc809a 100644 --- a/README.rst +++ b/README.rst @@ -24,7 +24,7 @@ Currently, *extruct* supports: - `Microformat`_ via `mf2py`_ - `Facebook's Open Graph`_ - (experimental) `RDFa`_ via `rdflib`_ -- `Dublin Core Metadata`_ +- `Dublin Core Metadata (DC-HTML-2003)`_ .. _W3C's HTML Microdata: http://www.w3.org/TR/microdata/ .. _embedded JSON-LD: http://www.w3.org/TR/json-ld/#embedding-json-ld-in-html-documents @@ -33,7 +33,7 @@ Currently, *extruct* supports: .. _Microformat: http://microformats.org/wiki/Main_Page .. 
_mf2py: https://github.com/microformats/mf2py .. _Facebook's Open Graph: http://ogp.me/ -.. _Dublin Core Metadata: https://www.dublincore.org/specifications/dublin-core/dcq-html/ +.. _Dublin Core Metadata (DC-HTML-2003): https://www.dublincore.org/specifications/dublin-core/dcq-html/2003-11-30/ The microdata algorithm is a revisit of `this Scrapinghub blog post`_ showing how to use EXSLT extensions.