From d18a6d140a6fd35adb9d72b708e9e65a5e0d2fa1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9s=20Moreira?= Date: Sat, 25 Mar 2017 22:06:58 -0300 Subject: [PATCH] Add cloneNode and toxml as some pages fail when extracting RDFa --- extruct/rdfa.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/extruct/rdfa.py b/extruct/rdfa.py index 2298dca4..365c5eec 100644 --- a/extruct/rdfa.py +++ b/extruct/rdfa.py @@ -9,12 +9,14 @@ from xml.dom import Node from xml.dom.minidom import Attr, NamedNodeMap -from lxml.etree import ElementBase, _ElementStringResult, _ElementUnicodeResult, XPath +from lxml.etree import ElementBase, _ElementStringResult, _ElementUnicodeResult, XPath, tostring from lxml.html import fromstring, HTMLParser, HtmlElementClassLookup from rdflib import Graph, logger as rdflib_logger from rdflib.plugins.parsers.pyRdfa import pyRdfa as PyRdfa, Options, logger as pyrdfa_logger from rdflib.plugins.parsers.pyRdfa.initialcontext import initial_context +from copy import deepcopy, copy + # silence rdflib/PyRdfa INFO logs rdflib_logger.setLevel(logging.ERROR) pyrdfa_logger.setLevel(logging.ERROR) @@ -71,6 +73,7 @@ class DomHtmlMixin(object): TEXT_NODE = Node.TEXT_NODE _xp_childrennodes = XPath('child::node()') + @property def documentElement(self): return self.getroottree().getroot() @@ -101,6 +104,9 @@ def getAttribute(self, name): def setAttribute(self, name, value): self.set(name, value) + def cloneNode(self, deep): + return deepcopy(self) if deep else copy(self) + @property def attributes(self): attrs = {} @@ -152,6 +158,10 @@ def data(self): else: raise RuntimeError + def toxml(self, encoding=None): + return tostring(self, encoding=encoding) + + class DomHtmlElementClassLookup(HtmlElementClassLookup): _lookups = {}