diff --git a/README.rst b/README.rst
index f6d92454..5cbc809a 100644
--- a/README.rst
+++ b/README.rst
@@ -24,6 +24,7 @@ Currently, *extruct* supports:
- `Microformat`_ via `mf2py`_
- `Facebook's Open Graph`_
- (experimental) `RDFa`_ via `rdflib`_
+- `Dublin Core Metadata (DC-HTML-2003)`_
.. _W3C's HTML Microdata: http://www.w3.org/TR/microdata/
.. _embedded JSON-LD: http://www.w3.org/TR/json-ld/#embedding-json-ld-in-html-documents
@@ -32,6 +33,7 @@ Currently, *extruct* supports:
.. _Microformat: http://microformats.org/wiki/Main_Page
.. _mf2py: https://github.com/microformats/mf2py
.. _Facebook's Open Graph: http://ogp.me/
+.. _Dublin Core Metadata (DC-HTML-2003): https://www.dublincore.org/specifications/dublin-core/dcq-html/2003-11-30/
The microdata algorithm is a revisit of `this Scrapinghub blog post`_ showing how to use EXSLT extensions.
@@ -71,7 +73,17 @@ First fetch the HTML using python-requests and then feed the response body to ``
>>> data = extruct.extract(r.text, base_url=base_url)
>>>
>>> pp.pprint(data)
- { 'json-ld': [ { '@context': 'https://schema.org',
+ { 'dublincore': [ { 'elements': [ { 'URI': 'http://purl.org/dc/elements/1.1/description',
+ 'content': 'What is Open Graph Protocol '
+ 'and why you need it? Learn to '
+ 'implement Open Graph Protocol '
+ 'for Facebook on your website. '
+ 'Open Graph Protocol Meta Tags.',
+ 'name': 'description'}],
+ 'namespaces': {},
+ 'terms': []}],
+
+ 'json-ld': [ { '@context': 'https://schema.org',
'@id': '#organization',
'@type': 'Organization',
'logo': 'https://www.optimizesmart.com/wp-content/uploads/2016/03/optimize-smart-Twitter-logo.jpg',
@@ -163,7 +175,7 @@ First fetch the HTML using python-requests and then feed the response body to ``
Select syntaxes
+++++++++++++++
-It is possible to select which syntaxes to extract by passing a list with the desired ones to extract. Valid values: 'microdata', 'json-ld', 'opengraph', 'microformat', 'rdfa'. If no list is passed all syntaxes will be extracted and returned::
+It is possible to select which syntaxes to extract by passing a list with the desired ones to extract. Valid values: 'microdata', 'json-ld', 'opengraph', 'microformat', 'rdfa' and 'dublincore'. If no list is passed all syntaxes will be extracted and returned::
>>> r = requests.get('http://www.songkick.com/artists/236156-elysian-fields')
>>> base_url = get_base_url(r.text, r.url)
@@ -207,9 +219,9 @@ It is possible to select which syntaxes to extract by passing a list with the de
Uniform
+++++++
-Another option is to uniform the output of microformat, opengraph, microdata and json-ld syntaxes to the following structure: ::
+Another option is to make the output of the microformat, opengraph, microdata, dublincore and json-ld syntaxes uniform, using the following structure: ::
- {'@context': 'http://example.com',
+ {'@context': 'http://example.com',
'@type': 'example_type',
/* All other the properties in keys here */
}
@@ -584,6 +596,80 @@ Microformat extraction
}
}]
+DublinCore extraction
+++++++++++++++++++++++++++++++
+::
+
+ >>> import pprint
+ >>> pp = pprint.PrettyPrinter(indent=2)
+ >>> from extruct.dublincore import DublinCoreExtractor
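+ >>> html = '''<head>
+ ... <title>Expressing Dublin Core in HTML/XHTML meta and link elements</title>
+ ... <link rel="schema.DC" href="http://purl.org/dc/elements/1.1/" />
+ ... <link rel="schema.DCTERMS" href="http://purl.org/dc/terms/" />
+ ... <meta name="DC.title" lang="en" content="Expressing Dublin Core
+ ... in HTML/XHTML meta and link elements" />
+ ... <meta name="DC.creator" content="Andy Powell, UKOLN, University of Bath" />
+ ... <meta name="DCTERMS.issued" scheme="DCTERMS.W3CDTF" content="2003-11-01" />
+ ... <meta name="DC.identifier" scheme="DCTERMS.URI" content="http://dublincore.org/documents/dcq-html/" />
+ ... <link rel="DCTERMS.replaces" hreflang="en" href="http://dublincore.org/documents/2000/08/15/dcq-html/" />
+ ... <meta name="DCTERMS.abstract" content="This document describes how
+ ... qualified Dublin Core metadata can be encoded
+ ... in HTML/XHTML elements" />
+ ... <meta name="DC.format" scheme="DCTERMS.IMT" content="text/html" />
+ ... <meta name="DC.type" scheme="DCTERMS.DCMIType" content="Text" />
+ ... <meta name="DC.Date.modified" content="2001-07-18" />
+ ... <meta name="DCTERMS.modified" content="2001-07-18" />
+ ... </head>'''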
+ >>> dublinlde = DublinCoreExtractor()
+ >>> data = dublinlde.extract(html)
+ >>> pp.pprint(data)
+ [ { 'elements': [ { 'URI': 'http://purl.org/dc/elements/1.1/title',
+ 'content': 'Expressing Dublin Core\n'
+ 'in HTML/XHTML meta and link elements',
+ 'lang': 'en',
+ 'name': 'DC.title'},
+ { 'URI': 'http://purl.org/dc/elements/1.1/creator',
+ 'content': 'Andy Powell, UKOLN, University of Bath',
+ 'name': 'DC.creator'},
+ { 'URI': 'http://purl.org/dc/elements/1.1/identifier',
+ 'content': 'http://dublincore.org/documents/dcq-html/',
+ 'name': 'DC.identifier',
+ 'scheme': 'DCTERMS.URI'},
+ { 'URI': 'http://purl.org/dc/elements/1.1/format',
+ 'content': 'text/html',
+ 'name': 'DC.format',
+ 'scheme': 'DCTERMS.IMT'},
+ { 'URI': 'http://purl.org/dc/elements/1.1/type',
+ 'content': 'Text',
+ 'name': 'DC.type',
+ 'scheme': 'DCTERMS.DCMIType'}],
+ 'namespaces': { 'DC': 'http://purl.org/dc/elements/1.1/',
+ 'DCTERMS': 'http://purl.org/dc/terms/'},
+ 'terms': [ { 'URI': 'http://purl.org/dc/terms/issued',
+ 'content': '2003-11-01',
+ 'name': 'DCTERMS.issued',
+ 'scheme': 'DCTERMS.W3CDTF'},
+ { 'URI': 'http://purl.org/dc/terms/abstract',
+ 'content': 'This document describes how\n'
+ 'qualified Dublin Core metadata can be encoded\n'
+ 'in HTML/XHTML elements',
+ 'name': 'DCTERMS.abstract'},
+ { 'URI': 'http://purl.org/dc/terms/modified',
+ 'content': '2001-07-18',
+ 'name': 'DC.Date.modified'},
+ { 'URI': 'http://purl.org/dc/terms/modified',
+ 'content': '2001-07-18',
+ 'name': 'DCTERMS.modified'},
+ { 'URI': 'http://purl.org/dc/terms/replaces',
+ 'href': 'http://dublincore.org/documents/2000/08/15/dcq-html/',
+ 'hreflang': 'en',
+ 'rel': 'DCTERMS.replaces'}]}]
+
+
Command Line Tool
-----------------
@@ -622,7 +708,7 @@ those, you can pass their individual names collected in a list through 'syntaxes
For example, this command extracts only Microdata and JSON-LD metadata from
"http://example.com"::
- extruct "http://example.com" --syntaxes microdata json-ld
+ extruct "http://example.com" --syntaxes microdata json-ld
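-NB syntaxes names passed must correspond to these: microdata, json-ld, rdfa, opengraph, microformat
+NB syntax names passed must correspond to these: microdata, json-ld, rdfa, opengraph, microformat, dublincore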
diff --git a/extruct/_extruct.py b/extruct/_extruct.py
index adbe7320..5bc247f8 100644
--- a/extruct/_extruct.py
+++ b/extruct/_extruct.py
@@ -6,11 +6,12 @@
from extruct.w3cmicrodata import MicrodataExtractor
from extruct.opengraph import OpenGraphExtractor
from extruct.microformat import MicroformatExtractor
-from extruct.uniform import _umicrodata_microformat, _uopengraph
+from extruct.dublincore import DublinCoreExtractor
+from extruct.uniform import _umicrodata_microformat, _uopengraph, _udublincore
from extruct.utils import parse_xmldom_html
logger = logging.getLogger(__name__)
-SYNTAXES = ['microdata', 'opengraph', 'json-ld', 'microformat', 'rdfa']
+SYNTAXES = ['microdata', 'opengraph', 'json-ld', 'microformat', 'rdfa', 'dublincore']
def extract(htmlstring,
@@ -96,6 +97,11 @@ def extract(htmlstring,
('rdfa', RDFaExtractor().extract_items,
tree,
))
+ if 'dublincore' in syntaxes:
+ processors.append(
+ ('dublincore', DublinCoreExtractor().extract_items,
+ tree,
+ ))
output = {}
for syntax, extract, document in processors:
try:
@@ -132,10 +138,20 @@ def extract(htmlstring,
output['opengraph'],
None,
))
+ if 'dublincore' in syntaxes:
+ uniform_processors.append(
+ ('dublincore',
+ _udublincore,
+ output['dublincore'],
+ None,
+ ))
+
for syntax, uniform, raw, schema_context in uniform_processors:
try:
if syntax == 'opengraph':
output[syntax] = uniform(raw, with_og_array=with_og_array)
+ elif syntax == 'dublincore':
+ output[syntax] = uniform(raw)
else:
output[syntax] = uniform(raw, schema_context)
except Exception as e:
diff --git a/extruct/dublincore.py b/extruct/dublincore.py
new file mode 100644
index 00000000..3e05209e
--- /dev/null
+++ b/extruct/dublincore.py
@@ -0,0 +1,156 @@
+import re
+
+from extruct.utils import parse_html
+from w3lib.html import strip_html5_whitespace
+
+# The fifteen elements of the Dublin Core Metadata Element Set, version 1.1
+# (http://dublincore.org/documents/dces/).  Keys are lower-case because they
+# are compared with the lower-cased last segment of a meta "name" / link
+# "rel" value; each value is the element's URI in the
+# http://purl.org/dc/elements/1.1/ namespace.
+_DC_ELEMENTS = {
+    'contributor': 'http://purl.org/dc/elements/1.1/contributor',
+    'coverage': 'http://purl.org/dc/elements/1.1/coverage',
+    'creator': 'http://purl.org/dc/elements/1.1/creator',
+    'date': 'http://purl.org/dc/elements/1.1/date',
+    'description': 'http://purl.org/dc/elements/1.1/description',
+    'format': 'http://purl.org/dc/elements/1.1/format',
+    'identifier': 'http://purl.org/dc/elements/1.1/identifier',
+    'language': 'http://purl.org/dc/elements/1.1/language',
+    'publisher': 'http://purl.org/dc/elements/1.1/publisher',
+    'relation': 'http://purl.org/dc/elements/1.1/relation',
+    'rights': 'http://purl.org/dc/elements/1.1/rights',
+    'source': 'http://purl.org/dc/elements/1.1/source',
+    'subject': 'http://purl.org/dc/elements/1.1/subject',
+    'title': 'http://purl.org/dc/elements/1.1/title',
+    'type': 'http://purl.org/dc/elements/1.1/type',
+}
+
+# DCMI Metadata Terms (http://dublincore.org/documents/2008/01/14/dcmi-terms/).
+# Every key MUST be lower-case: lookups use the lower-cased last segment of
+# a meta "name" / link "rel" value, so a key containing an upper-case letter
+# can never match.  The URI values keep the official capitalisation.
+# NOTE: names that also exist in _DC_ELEMENTS (title, date, rights, ...) are
+# reported from _DC_ELEMENTS, because the extractor checks that table first.
+_DC_TERMS = {
+    'abstract': 'http://purl.org/dc/terms/abstract',
+    'description': 'http://purl.org/dc/terms/description',
+    'accessrights': 'http://purl.org/dc/terms/accessRights',
+    'rights': 'http://purl.org/dc/terms/rights',
+    'rightsstatement': 'http://purl.org/dc/terms/RightsStatement',
+    'accrualmethod': 'http://purl.org/dc/terms/accrualMethod',
+    'collection': 'http://purl.org/dc/terms/Collection',
+    'methodofaccrual': 'http://purl.org/dc/terms/MethodOfAccrual',
+    'accrualperiodicity': 'http://purl.org/dc/terms/accrualPeriodicity',
+    'frequency': 'http://purl.org/dc/terms/Frequency',
+    'accrualpolicy': 'http://purl.org/dc/terms/accrualPolicy',
+    'policy': 'http://purl.org/dc/terms/Policy',
+    'alternative': 'http://purl.org/dc/terms/alternative',
+    'title': 'http://purl.org/dc/terms/title',
+    'audience': 'http://purl.org/dc/terms/audience',
+    'agentclass': 'http://purl.org/dc/terms/AgentClass',
+    'available': 'http://purl.org/dc/terms/available',
+    'date': 'http://purl.org/dc/terms/date',
+    'bibliographiccitation': 'http://purl.org/dc/terms/bibliographicCitation',
+    'identifier': 'http://purl.org/dc/terms/identifier',
+    'bibliographicresource': 'http://purl.org/dc/terms/BibliographicResource',
+    'conformsto': 'http://purl.org/dc/terms/conformsTo',
+    'relation': 'http://purl.org/dc/terms/relation',
+    'standard': 'http://purl.org/dc/terms/Standard',
+    'contributor': 'http://purl.org/dc/terms/contributor',
+    'agent': 'http://purl.org/dc/terms/Agent',
+    'coverage': 'http://purl.org/dc/terms/coverage',
+    'locationperiodorjurisdiction': 'http://purl.org/dc/terms/LocationPeriodOrJurisdiction',
+    'created': 'http://purl.org/dc/terms/created',
+    'creator': 'http://purl.org/dc/terms/creator',
+    'dateaccepted': 'http://purl.org/dc/terms/dateAccepted',
+    'datecopyrighted': 'http://purl.org/dc/terms/dateCopyrighted',
+    'datesubmitted': 'http://purl.org/dc/terms/dateSubmitted',
+    'educationlevel': 'http://purl.org/dc/terms/educationLevel',
+    'extent': 'http://purl.org/dc/terms/extent',
+    'format': 'http://purl.org/dc/terms/format',
+    'sizeorduration': 'http://purl.org/dc/terms/SizeOrDuration',
+    'mediatypeorextent': 'http://purl.org/dc/terms/MediaTypeOrExtent',
+    'hasformat': 'http://purl.org/dc/terms/hasFormat',
+    'haspart': 'http://purl.org/dc/terms/hasPart',
+    'hasversion': 'http://purl.org/dc/terms/hasVersion',
+    'instructionalmethod': 'http://purl.org/dc/terms/instructionalMethod',
+    'methodofinstruction': 'http://purl.org/dc/terms/MethodOfInstruction',
+    'isformatof': 'http://purl.org/dc/terms/isFormatOf',
+    'ispartof': 'http://purl.org/dc/terms/isPartOf',
+    'isreferencedby': 'http://purl.org/dc/terms/isReferencedBy',
+    'isreplacedby': 'http://purl.org/dc/terms/isReplacedBy',
+    'isrequiredby': 'http://purl.org/dc/terms/isRequiredBy',
+    'issued': 'http://purl.org/dc/terms/issued',
+    'isversionof': 'http://purl.org/dc/terms/isVersionOf',
+    'language': 'http://purl.org/dc/terms/language',
+    'linguisticsystem': 'http://purl.org/dc/terms/LinguisticSystem',
+    'license': 'http://purl.org/dc/terms/license',
+    'licensedocument': 'http://purl.org/dc/terms/LicenseDocument',
+    'mediator': 'http://purl.org/dc/terms/mediator',
+    'medium': 'http://purl.org/dc/terms/medium',
+    'physicalresource': 'http://purl.org/dc/terms/PhysicalResource',
+    'physicalmedium': 'http://purl.org/dc/terms/PhysicalMedium',
+    'modified': 'http://purl.org/dc/terms/modified',
+    'provenance': 'http://purl.org/dc/terms/provenance',
+    'provenancestatement': 'http://purl.org/dc/terms/ProvenanceStatement',
+    'publisher': 'http://purl.org/dc/terms/publisher',
+    'references': 'http://purl.org/dc/terms/references',
+    'replaces': 'http://purl.org/dc/terms/replaces',
+    'requires': 'http://purl.org/dc/terms/requires',
+    'rightsholder': 'http://purl.org/dc/terms/rightsHolder',
+    'source': 'http://purl.org/dc/terms/source',
+    'spatial': 'http://purl.org/dc/terms/spatial',
+    'location': 'http://purl.org/dc/terms/Location',
+    'subject': 'http://purl.org/dc/terms/subject',
+    'tableofcontents': 'http://purl.org/dc/terms/tableOfContents',
+    'temporal': 'http://purl.org/dc/terms/temporal',
+    'periodoftime': 'http://purl.org/dc/terms/PeriodOfTime',
+    'type': 'http://purl.org/dc/terms/type',
+    'valid': 'http://purl.org/dc/terms/valid',
+}
+
+# Namespace URIs that a <link rel="schema.PREFIX" href="..."> declaration
+# must point at for the prefix to be reported under 'namespaces'.
+_URL_NAMESPACES = ['http://purl.org/dc/terms/', 'http://purl.org/dc/elements/1.1/']
+
+
+def get_lower_attrib(name):
+    """Return the lower-cased tail of ``name`` after its last dot.
+
+    Everything up to and including the last ``.`` on a line is dropped and
+    the rest is lower-cased, giving the key to look up in ``_DC_TERMS`` or
+    ``_DC_ELEMENTS`` (``'DC.Date.modified'`` -> ``'modified'``).  A name
+    without a dot is only lower-cased.
+    """
+    last_segment = re.sub(r".*\.", "", name)
+    return last_segment.lower()
+
+
+class DublinCoreExtractor(object):
+    """DublinCore extractor following extruct API.
+
+    Collects ``<meta name=...>`` and ``<link rel=...>`` nodes whose name
+    (reduced by ``get_lower_attrib``) is a known Dublin Core element or
+    term, together with the ``schema.*`` namespace declarations.
+    """
+
+    def extract(self, htmlstring, base_url=None, encoding='UTF-8'):
+        """Parse ``htmlstring`` and return the extracted items as a list."""
+        tree = parse_html(htmlstring, encoding=encoding)
+        return list(self.extract_items(tree, base_url=base_url))
+
+    def extract_items(self, document, base_url=None):
+        """Yield a single dict describing the Dublin Core data of ``document``.
+
+        The dict has three keys: 'namespaces' (prefix -> namespace URL),
+        'elements' and 'terms' (lists with one dict per matching node: the
+        node's attributes plus a 'URI' entry).  ``base_url`` is accepted to
+        match the other extractors' signature and is not used.  ``document``
+        is only read, never modified.
+        """
+        elements = []
+        terms = []
+
+        def populate_results(node, main_attrib):
+            # File the node under elements or terms, keyed on main_attrib.
+            name = node.attrib.get(main_attrib)
+            if name is None:
+                return
+
+            lower_name = get_lower_attrib(name)
+            # _DC_ELEMENTS takes precedence for names present in both tables.
+            if lower_name in _DC_ELEMENTS:
+                uri, bucket = _DC_ELEMENTS[lower_name], elements
+            elif lower_name in _DC_TERMS:
+                uri, bucket = _DC_TERMS[lower_name], terms
+            else:
+                return
+
+            # Add 'URI' to a copy: updating node.attrib itself would write a
+            # new attribute into the parsed tree, which the caller may share
+            # with other extractors.
+            item = dict(node.attrib.items())
+            item['URI'] = uri
+            bucket.append(item)
+
+        namespaces = {}
+        for schema_link in document.xpath('//link[contains(@rel,"schema")]'):
+            href = schema_link.attrib.get('href')
+            if href is None:
+                # A schema link without href declares no namespace; skipping
+                # it keeps one malformed tag from aborting the extraction.
+                continue
+            url = strip_html5_whitespace(href)
+            if url in _URL_NAMESPACES:
+                prefix = re.sub(r"schema\.", "", schema_link.attrib['rel'])
+                namespaces[prefix] = url
+
+        for meta_node in document.xpath('//meta'):
+            populate_results(meta_node, 'name')
+
+        for link_node in document.xpath('//link'):
+            populate_results(link_node, 'rel')
+
+        yield {'namespaces': namespaces, 'elements': elements, 'terms': terms}
diff --git a/extruct/uniform.py b/extruct/uniform.py
index 1b5de7ed..0fac40ee 100644
--- a/extruct/uniform.py
+++ b/extruct/uniform.py
@@ -1,4 +1,6 @@
+import copy
from six.moves.urllib.parse import urlparse, urljoin
+from extruct.dublincore import get_lower_attrib
def _uopengraph(extracted, with_og_array=False):
@@ -42,6 +44,23 @@ def _umicrodata_microformat(extracted, schema_context):
return res
+def _udublincore(extracted):
+    """Return a uniform copy of Dublin Core extraction results.
+
+    For each extracted object the 'namespaces' mapping is moved to
+    '@context' (``None`` when the key is absent).  Every entry of
+    'elements' whose 'name' or 'rel', reduced by ``get_lower_attrib``,
+    equals ``'type'`` is taken out of the list and its 'content' (or
+    'href' when it has no 'content') is stored under '@type'; if several
+    entries qualify, the last one wins.  ``extracted`` itself is left
+    untouched because the work is done on a deep copy.
+    """
+    out = []
+    for obj in copy.deepcopy(extracted):
+        obj['@context'] = obj.pop('namespaces', None)
+        kept = []
+        for element in obj['elements']:
+            # Only the identifying attribute decides whether this is the
+            # "type" element; testing every value would also catch e.g. a
+            # title whose content happens to be "Type".
+            labels = (element.get('name'), element.get('rel'))
+            is_type = any(label is not None and get_lower_attrib(label) == 'type'
+                          for label in labels)
+            if is_type:
+                obj['@type'] = element.get('content', element.get('href'))
+            else:
+                # Build a new list rather than remove() from the one being
+                # iterated, which would skip the entry after each removal.
+                kept.append(element)
+        obj['elements'] = kept
+        out.append(obj)
+    return out
+
+
def _flatten(element, schema_context):
if isinstance(element, dict):
element = flatten_dict(element, schema_context, False)
diff --git a/tests/samples/misc/dublincore_test.html b/tests/samples/misc/dublincore_test.html
new file mode 100644
index 00000000..44a192ce
--- /dev/null
+++ b/tests/samples/misc/dublincore_test.html
@@ -0,0 +1,21 @@
+
+Expressing Dublin Core in HTML/XHTML meta and link elements
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/tests/samples/misc/dublincore_test.json b/tests/samples/misc/dublincore_test.json
new file mode 100644
index 00000000..7dbb9e5e
--- /dev/null
+++ b/tests/samples/misc/dublincore_test.json
@@ -0,0 +1,22 @@
+[
+ {
+ "namespaces": {
+ "DC": "http://purl.org/dc/elements/1.1/",
+ "DCTERMS": "http://purl.org/dc/terms/"
+ },
+ "elements": [
+ {"name": "DC.title", "lang": "en", "content": "Expressing Dublin Core\nin HTML/XHTML meta and link elements", "URI": "http://purl.org/dc/elements/1.1/title"},
+ {"name": "DC.creator", "content": "Andy Powell, UKOLN, University of Bath", "URI": "http://purl.org/dc/elements/1.1/creator"},
+ {"name": "DC.identifier", "scheme": "DCTERMS.URI", "content": "http://dublincore.org/documents/dcq-html/", "URI": "http://purl.org/dc/elements/1.1/identifier"},
+ {"name": "DC.format", "scheme": "DCTERMS.IMT", "content": "text/html", "URI": "http://purl.org/dc/elements/1.1/format"},
+ {"name": "DC.type", "scheme": "DCTERMS.DCMIType", "content": "Text", "URI": "http://purl.org/dc/elements/1.1/type"}
+ ],
+ "terms": [
+ {"name": "DCTERMS.issued", "scheme": "DCTERMS.W3CDTF", "content": "2003-11-01", "URI": "http://purl.org/dc/terms/issued"},
+ {"name": "DCTERMS.abstract", "content": "This document describes how\nqualified Dublin Core metadata can be encoded\nin HTML/XHTML elements", "URI": "http://purl.org/dc/terms/abstract"},
+ {"name": "DC.Date.modified", "content": "2001-07-18", "URI": "http://purl.org/dc/terms/modified"},
+ {"name": "DCTERMS.modified", "content": "2001-07-18", "URI": "http://purl.org/dc/terms/modified"},
+ {"rel": "DCTERMS.replaces", "hreflang": "en", "href": "http://dublincore.org/documents/2000/08/15/dcq-html/", "URI": "http://purl.org/dc/terms/replaces"}
+ ]
+ }
+]
diff --git a/tests/samples/songkick/elysianfields.json b/tests/samples/songkick/elysianfields.json
index 70b51c63..61dae29d 100644
--- a/tests/samples/songkick/elysianfields.json
+++ b/tests/samples/songkick/elysianfields.json
@@ -268,5 +268,20 @@
}
]
}
+ ],
+ "dublincore": [
+ {
+ "namespaces": {
+ },
+ "elements": [
+ {
+ "name": "description",
+ "content": "Buy tickets for an upcoming Elysian Fields concert near you. List of all Elysian Fields tickets and tour dates for 2017.",
+ "URI": "http://purl.org/dc/elements/1.1/description"
+ }
+ ],
+ "terms": [
+ ]
+ }
]
}
diff --git a/tests/samples/songkick/tovestyrke.json b/tests/samples/songkick/tovestyrke.json
index 7e17abdd..068df67a 100644
--- a/tests/samples/songkick/tovestyrke.json
+++ b/tests/samples/songkick/tovestyrke.json
@@ -188,5 +188,19 @@
]
}
],
- "microformat": []
+ "microformat": [],
+ "dublincore": [
+ {
+ "namespaces": {},
+ "elements": [
+ {
+ "name": "description",
+ "content": "Past concert. Tove Styrke concert with Geowulf at Hoxton Square Bar & Kitchen in London on 12 Jun 2017.",
+ "URI": "http://purl.org/dc/elements/1.1/description"
+ }
+ ],
+ "terms": [
+ ]
+ }
+ ]
}
\ No newline at end of file
diff --git a/tests/test_dublincore.py b/tests/test_dublincore.py
new file mode 100644
index 00000000..a11ce603
--- /dev/null
+++ b/tests/test_dublincore.py
@@ -0,0 +1,19 @@
+# -*- coding: utf-8 -*-
+import json
+import unittest
+
+from extruct.dublincore import DublinCoreExtractor
+from tests import get_testdata, jsonize_dict
+
+
+class TestDublincore(unittest.TestCase):
+    """Compares DublinCoreExtractor output with a stored JSON fixture."""
+
+    maxDiff = None
+
+    def test_dublincore(self):
+        # Raw (non-uniform) extraction of the sample page must equal the
+        # content of dublincore_test.json.
+        html = get_testdata('misc', 'dublincore_test.html')
+        raw_expected = get_testdata('misc', 'dublincore_test.json')
+        expected = json.loads(raw_expected.decode('UTF-8'))
+
+        extracted = DublinCoreExtractor().extract(html)
+        self.assertEqual(jsonize_dict(extracted), expected)
diff --git a/tests/test_uniform.py b/tests/test_uniform.py
index 7a9f29af..9b7acf7a 100644
--- a/tests/test_uniform.py
+++ b/tests/test_uniform.py
@@ -165,6 +165,47 @@ def test_umicrodata(self):
data = extruct.extract(body, syntaxes=['microdata'], uniform=True)
self.assertEqual(data['microdata'], expected)
+ def test_udublincore(self):
+ # Uniform Dublin Core output for the dublincore_test.html sample:
+ # the namespace prefixes are expected under '@context', the value
+ # 'Text' under '@type', and no DC.type entry is left in 'elements'.
+ expected = [{'elements': [{'name': 'DC.title',
+ 'lang': 'en',
+ 'content': 'Expressing Dublin Core\nin HTML/XHTML meta and link elements',
+ 'URI': 'http://purl.org/dc/elements/1.1/title'},
+ {'name': 'DC.creator',
+ 'content': 'Andy Powell, UKOLN, University of Bath',
+ 'URI': 'http://purl.org/dc/elements/1.1/creator'},
+ {'name': 'DC.identifier',
+ 'scheme': 'DCTERMS.URI',
+ 'content': 'http://dublincore.org/documents/dcq-html/',
+ 'URI': 'http://purl.org/dc/elements/1.1/identifier'},
+ {'name': 'DC.format',
+ 'scheme': 'DCTERMS.IMT',
+ 'content': 'text/html',
+ 'URI': 'http://purl.org/dc/elements/1.1/format'}],
+ 'terms': [{'name': 'DCTERMS.issued',
+ 'scheme': 'DCTERMS.W3CDTF',
+ 'content': '2003-11-01',
+ 'URI': 'http://purl.org/dc/terms/issued'},
+ {'name': 'DCTERMS.abstract',
+ 'content': 'This document describes how\nqualified Dublin Core metadata can be encoded\nin HTML/XHTML elements',
+ 'URI': 'http://purl.org/dc/terms/abstract'},
+ {'name': 'DC.Date.modified',
+ 'content': '2001-07-18',
+ 'URI': 'http://purl.org/dc/terms/modified'},
+ {'name': 'DCTERMS.modified',
+ 'content': '2001-07-18',
+ 'URI': 'http://purl.org/dc/terms/modified'},
+ {'rel': 'DCTERMS.replaces',
+ 'hreflang': 'en',
+ 'href': 'http://dublincore.org/documents/2000/08/15/dcq-html/',
+ 'URI': 'http://purl.org/dc/terms/replaces'}],
+ '@context': {'DC': 'http://purl.org/dc/elements/1.1/',
+ 'DCTERMS': 'http://purl.org/dc/terms/'},
+ '@type': 'Text'}]
+ # Only the dublincore syntax is requested, with uniform=True.
+ body = get_testdata('misc', 'dublincore_test.html')
+ data = extruct.extract(body, syntaxes=['dublincore'], uniform=True)
+ self.assertEqual(data['dublincore'], expected)
+
+
def test_infer_context(self):
context = 'http://schema.org/UsedCondition'