-
Notifications
You must be signed in to change notification settings - Fork 26
/
_tupletree.py
512 lines (403 loc) · 19.4 KB
/
_tupletree.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
#
# (C) Copyright 2003,2004 Hewlett-Packard Development Company, L.P.
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
# Author: Martin Pool <mbp@hp.com>
# Author: Ross Peoples <ross.peoples@gmail.com>
#
"""
_tupletree - Convert XML DOM objects to and from tuple trees.
DOM is the standard in-memory representation of XML documents, but it
is very cumbersome for some types of processing where XML encodes
object structures rather than text documents. Direct mapping to Python
classes may not be a good match either.
tupletrees may be created from an in-memory DOM using
dom_to_tupletree(), or from a string using xml_to_tupletree().
Since the Python XML libraries deal mostly with Unicode strings they
are also returned here. If plain Strings are passed in they will be
converted by xmldom.
Each node of the tuple tree built by the SAX handler in this module is a
Python 3-tuple, corresponding to an XML Element (i.e. <tag>):
(NAME, ATTRS, CONTENTS)
The NAME is the name of the element.
The ATTRS are a name-value dictionary of element attributes, not preserving
order.
The CONTENTS is a list of child elements and character data strings,
preserving order.
Note: Earlier descriptions of this format list a reserved fourth element
(None); the SAX handler in this module does not add it.
"""
# NOTE: The original dom based parsers have been replaced with a sax parser
# in Nov. 2016. The dom tupletree code is in the file
# tests/unittest/pywbem/test_tupletree.py
from __future__ import absolute_import
import xml.sax
import re
import sys
import six
from ._exceptions import XMLParseError
from ._utils import _format, _ensure_bytes
__all__ = []
class CIMContentHandler(xml.sax.ContentHandler):
    """
    SAX content handler that turns CIM-XML into a tree of tuples.

    Each XML element is represented as a tuple ``(name, attrs, children)``
    where `name` is the element name, `attrs` is a dictionary of its
    attributes and `children` is a list holding child element tuples and
    character data strings in document order.

    While parsing, the handler keeps a stack of the currently open ancestor
    elements and appends each new element to the child list of its parent.

    After the document has been processed, the root element tuple is
    available in the `root` attribute.
    """

    def __init__(self):
        xml.sax.ContentHandler.__init__(self)
        self.root = None     # root element tuple, set by endDocument()
        self.elements = []   # stack of open ancestors of self.element
        self.element = []    # element being processed (empty before root)

    def startDocument(self):
        """Verify that no elements are left over from an earlier parse."""
        assert not self.elements

    def endDocument(self):
        """Publish the element that is current at the end as the root."""
        assert not self.elements
        self.root = self.element

    def startElement(self, name, attrs):
        """
        Open a new element and make it the current element.

        `attrs` is a SAX Attributes object. It implements a subset of the
        dictionary methods but does not preserve order, so the attribute
        order is already lost when this handler gets control and cannot be
        preserved here.
        """
        parent = self.element
        if parent:
            # Remember the parent so that endElement() can return to it
            self.elements.append(parent)
        node = (name, dict(attrs.items()), [])
        if parent:
            parent[2].append(node)
        self.element = node

    def endElement(self, name):
        """Close the current element by returning to its parent, if any."""
        if not self.elements:
            # The root element is being closed; it stays the current element
            return
        self.element = self.elements.pop()

    def characters(self, content):
        """
        Add character data to the children of the current element.

        Consecutive chunks of character data are merged into a single
        string node.
        """
        siblings = self.element[2]  # mutable child list of current element
        if not siblings or not isinstance(siblings[-1], six.text_type):
            # No preceding character node: start a new one
            siblings.append(content)
        else:
            # Extend the preceding character node
            siblings[-1] += content
def xml_to_tupletree_sax(xml_string, meaning, conn_id=None):
    """
    Parse an XML string with the SAX parser and return its tuple tree.

    The string is parsed using the class CIMContentHandler and the root
    element collected by that handler is returned. As a SAX parser it uses
    minimal memory.

    This is a replacement for the previous parser (xml_to_tuple) which used
    the dom parser.

    Parameters:

      xml_string (:term:`string`): A unicode string (when called for embedded
        objects) or UTF-8 encoded byte string (when called for CIM-XML
        replies) containing the XML to be parsed.

      meaning (:term:`string`):
        Short text with meaning of the XML string, for messages in exceptions.

      conn_id (:term:`connection id`): Connection ID to be used in any
        exceptions that may be raised.

    Returns:

      tupletree tuple with parsed XML tree

    Raises:

      pywbem.XMLParseError: Error detected by SAX parser or UTF-8/XML checkers
    """

    content_handler = CIMContentHandler()

    # The XML is always handed to the parser as a byte string, because
    # xml.sax.parseString() raises UnicodeEncodeError for unicode strings
    # that contain any non-ASCII characters (despite its Python 2.7
    # documentation which states that would be supported).
    xml_bytes = _ensure_bytes(xml_string)

    try:
        xml.sax.parseString(xml_bytes, content_handler, None)
    except xml.sax.SAXParseException as exc:
        # Only SAXParseException is caught: xml.sax.parse() is documented to
        # raise only that. Earlier versions of this code passed unicode
        # strings and saw UnicodeEncodeError from xml.sax.parseString(); that
        # is no longer done, so that exception is no longer caught. Any
        # other exception type is unexpected and will percolate upwards.

        # Keep the traceback of the SAX exception for the re-raise below
        orig_traceback = sys.exc_info()[2]

        # Try to produce a more specific error first: each of the check...()
        # functions raises XMLParseError if it finds a problem.
        decoded_xml = check_invalid_utf8_sequences(xml_bytes, meaning, conn_id)
        check_invalid_xml_chars(decoded_xml, meaning, conn_id)

        # Both checks passed, so report the SAX error itself, together with
        # the failing part of the XML string.
        lineno, colno, marker_colno, line = \
            get_failing_line(xml_bytes, str(exc))
        if lineno is None:
            # Position unknown: show the (truncated) XML string only
            xml_msg = _format(
                "XML string (as binary UTF-8 string):\n"
                "{0}",
                line)
        else:
            # Position known: show the line and a marker below the position
            marker_line = ' ' * (marker_colno - 1) + '^'
            xml_msg = _format(
                "Line {0} column {1} of XML string (as binary UTF-8 string):\n"
                "{2}\n"
                "{3}",
                lineno, colno, line, marker_line)

        parse_error = XMLParseError(
            _format("XML parsing error encountered in {0}: {1}\n{2}\n",
                    meaning, exc, xml_msg),
            conn_id=conn_id)

        # Raise the new exception with the traceback of the SAX exception
        # (ignore this call in traceback!)
        six.reraise(type(parse_error), parse_error, orig_traceback)

    return content_handler.root
def truncate_line(line, colno, max_before, max_after):
    """
    Shorten a line (binary string) around a column position.

    The line is cut so that at most `max_before` characters remain to the
    left of the 1-based `colno` position and at most `max_after` characters
    remain to the right of it. The result is then formatted for display, and
    '...' is added at each side where something was cut off.

    Returns a tuple (truncated line, new position), where the new position
    is the position in the returned line that corresponds to `colno`.
    """

    chars_before = colno - 1
    chars_after = len(line) - colno
    cut_before = chars_before > max_before
    cut_after = chars_after > max_after

    marker_col = colno

    # Cut the tail first, so that the offsets used for cutting the head
    # still refer to the original line start.
    if cut_after:
        line = line[:colno + max_after]
    if cut_before:
        dropped = chars_before - max_before
        line = line[dropped:]
        marker_col -= dropped

    line = _format("{0!A}", line)
    marker_col += 1  # the leading single quote

    if cut_before:
        line = '...' + line
        marker_col += 3
    if cut_after:
        line += '...'

    return line, marker_col
def get_failing_line(xml_string, exc_msg):
    """
    Extract the failing line from the XML string (a byte string), using the
    ':<line>:<column>:' position information in the exception message.

    Returns a tuple (lineno, colno, new_pos, line):

      * If the position could be determined, `line` is the failing line,
        truncated around the reported column, and `new_pos` is the position
        of that column within the truncated line.

      * Otherwise, `lineno`, `colno` and `new_pos` are None and `line` is the
        truncated begin of the whole XML string.
    """

    max_before = 500  # max characters before reported position
    max_after = 500  # max characters after reported position
    max_unknown = 1000  # max characters when position cannot be determined

    assert isinstance(xml_string, six.binary_type)

    match = re.search(r':(\d+):(\d+):', exc_msg)
    if match is None:
        # No position information in the message
        head, _ = truncate_line(xml_string, 1, 0, max_unknown - 1)
        return None, None, None, head

    lineno, colno = int(match.group(1)), int(match.group(2))

    if not xml_string.endswith(b'\n'):
        xml_string += b'\n'
    xml_lines = xml_string.splitlines()

    if lineno > len(xml_lines):
        # This really should not happen; it means the line parsing went wrong
        # or SAX reported incorrect lines. We do not particularly care for
        # this case and simply truncate the string.
        head, _ = truncate_line(xml_string, 1, 0, max_unknown - 1)
        return None, None, None, head

    failing_line, new_pos = truncate_line(
        xml_lines[lineno - 1], colno, max_before, max_after)
    return lineno, colno, new_pos, failing_line
# Pattern for check_invalid_utf8_sequences(): matches the 3-byte UTF-8
# sequences that encode the surrogate code points.
_ILL_FORMED_UTF8_RE = re.compile(
    b'(\xED[\xA0-\xBF][\x80-\xBF])')  # U+D800...U+DFFF


def check_invalid_utf8_sequences(utf8_string, meaning, conn_id=None):
    """
    Examine a UTF-8 encoded string and raise a `pywbem.XMLParseError` exception
    if the string contains invalid UTF-8 sequences (incorrectly encoded or
    ill-formed).

    This function works in both "wide" and "narrow" Unicode builds of Python
    and supports the full range of Unicode characters from U+0000 to U+10FFFF.

    This function is used to improve the error information raised from Python's
    `xml.dom.minidom` and `xml.sax` packages and should be called only after
    having caught an `ExpatError` from `xml.dom.minidom` or a
    `SAXParseException` from `xml.sax` .

    Parameters:

      utf8_string (:term:`byte string`):
        The UTF-8 encoded XML string to be examined.

      meaning (:term:`string`):
        Short text with meaning of the XML string, for messages in exceptions.

      conn_id (:term:`connection id`): Connection ID to be used in any
        exceptions that may be raised.

    Returns:

      :term:`unicode string`: The input string, converted to Unicode.

    Raises:

      TypeError: Invoked with incorrect Python object type for `utf8_string`.
      pywbem.XMLParseError: `utf8_string` contains invalid UTF-8 sequences.

    Notes on Unicode support in Python:

    (1) For internally representing Unicode characters in the unicode type, a
        "wide" Unicode build of Python uses UTF-32, while a "narrow" Unicode
        build uses UTF-16. The difference is visible to Python programs for
        Unicode characters assigned to code points above U+FFFF: The "narrow"
        build uses 2 characters (a surrogate pair) for them, while the "wide"
        build uses just 1 character. This affects all position- and
        length-oriented functions, such as `len()` or string slicing.

    (2) In a "wide" Unicode build of Python, the Unicode characters assigned to
        code points U+10000 to U+10FFFF are represented directly (using code
        points U+10000 to U+10FFFF) and the surrogate code points
        U+D800...U+DFFF are never used; in a "narrow" Unicode build of Python,
        the Unicode characters assigned to code points U+10000 to U+10FFFF are
        represented using pairs of the surrogate code points U+D800...U+DFFF.

    Notes on the Unicode code points U+D800...U+DFFF ("surrogate code points"):

    (1) These code points have no corresponding Unicode characters assigned,
        because they are reserved for surrogates in the UTF-16 encoding.

    (2) The UTF-8 encoding can technically represent the surrogate code points.
        ISO/IEC 10646 defines that a UTF-8 sequence containing the surrogate
        code points is ill-formed, but it is technically possible that such a
        sequence is in a UTF-8 encoded XML string.

    (3) The Python escapes ``\\u`` and ``\\U`` used in literal strings can
        represent the surrogate code points (as well as all other code points,
        regardless of whether they are assigned to Unicode characters).
        That is the case in both Python 2 and Python 3.

    (4) In Python 2, the `unicode.encode()` and `str.decode()` methods
        successfully translate surrogate code points back and forth for
        encoding UTF-8, tolerating invalid surrogate sequences.
        For example, ``b'\\xed\\xb0\\x80'.decode("utf-8") = u'\\udc00'``.
        In Python 3, the `str.encode()` and `bytes.decode()` methods raise
        UnicodeEncodeError / UnicodeDecodeError for invalid surrogate
        sequences. However, the `codecs.encode()` and `codecs.decode()`
        methods have an error handler 'surrogatepass' which tolerates
        invalid surrogate sequences.

    (5) Because Python 2 supports the encoding and decoding of UTF-8 sequences
        also for the surrogate code points, the "narrow" Unicode build of
        Python 2 can be (mis-)used to transport each surrogate unit separately
        encoded in (ill-formed) UTF-8.
        For example, code point U+10122 can be (illegally) created from a
        sequence of code points U+D800,U+DD22 represented in UTF-8:
        ``b'\\xED\\xA0\\x80\\xED\\xB4\\xA2'.decode("utf-8") = u'\\U00010122'``
        while the correct UTF-8 sequence for this code point is:
        ``u'\\U00010122'.encode("utf-8") = b'\\xf0\\x90\\x84\\xa2'``
    """

    context_before = 16  # number of chars to print before any bad chars
    context_after = 16  # number of chars to print after any bad chars

    # Validate the argument type with an explicit check rather than an
    # assert statement: asserts are removed when Python runs with -O, which
    # would silently disable the validation. `bytes` is the byte string type
    # in both Python 2 (where it is an alias of `str`) and Python 3.
    if not isinstance(utf8_string, bytes):
        raise TypeError(
            _format("utf8_string parameter is not a byte string, but has "
                    "type {0}", type(utf8_string)))

    # Check for ill-formed UTF-8 sequences. This needs to be done
    # before the str type gets decoded to unicode, because afterwards
    # surrogates produced from ill-formed UTF-8 cannot be distinguished from
    # legally produced surrogates (for code points above U+FFFF).
    ifs_list = [(m.start(1), m.group(1))
                for m in _ILL_FORMED_UTF8_RE.finditer(utf8_string)]
    if ifs_list:
        exc_txt = _format("Ill-formed (surrogate) UTF-8 Byte sequences found "
                          "in {0}:", meaning)
        for ifs_pos, ifs_seq in ifs_list:
            exc_txt += "\n At offset {0}:".format(ifs_pos)
            exc_txt += "".join(" 0x{0:02X}".format(ifs_ord)
                               for ifs_ord in six.iterbytes(ifs_seq))
            cpos1 = max(ifs_pos - context_before, 0)
            cpos2 = min(ifs_pos + context_after, len(utf8_string))
            exc_txt += _format(", CIM-XML snippet: {0!A}",
                               utf8_string[cpos1:cpos2])
        raise XMLParseError(exc_txt, conn_id=conn_id)

    # Check for incorrectly encoded UTF-8 sequences.
    try:
        return utf8_string.decode("utf-8")
    except UnicodeDecodeError as exc:
        # If more than one incorrectly encoded sequence is present, only
        # information about the first one is available in the exception
        # object. Also, the reason stated in the exception object is not
        # always correct, so it is not included in the message.
        bad_start = exc.start
        bad_end = exc.end
        exc_txt = "Incorrectly encoded UTF-8 Byte sequences found in {0}". \
            format(meaning)
        exc_txt += "\n At offset {0}:".format(bad_start)
        # NOTE(review): exc.end is already the index after the invalid data,
        # so this slice shows one additional byte - presumably to include
        # the byte that follows the invalid data; confirm that is intended.
        ies_seq = utf8_string[bad_start:bad_end + 1]
        exc_txt += "".join(" 0x{0:02X}".format(ies_ord)
                           for ies_ord in six.iterbytes(ies_seq))
        cpos1 = max(bad_start - context_before, 0)
        cpos2 = min(bad_end + context_after, len(utf8_string))
        exc_txt += _format(", CIM-XML snippet: {0!A}",
                           utf8_string[cpos1:cpos2])
        raise XMLParseError(exc_txt, conn_id=conn_id)
# Patterns for check_invalid_xml_chars()
#
# The length probe tells the two Unicode builds of Python apart: a character
# above U+FFFF occupies two characters (a surrogate pair) in a "narrow" build
# and one character in a "wide" build.
if len(u'\U00010122') == 2:
    # This is a "narrow" Unicode build of Python (the normal case).
    # The surrogate code points are not treated as illegal here, because in
    # this build they are the representation of characters above U+FFFF.
    _ILLEGAL_XML_CHARS_RE = re.compile(
        u'([\u0000-\u0008\u000B-\u000C\u000E-\u001F\uFFFE\uFFFF])')
else:
    # This is a "wide" Unicode build of Python.
    # The surrogate code points U+D800...U+DFFF are treated as illegal here,
    # in addition to the characters rejected in the "narrow" build.
    _ILLEGAL_XML_CHARS_RE = re.compile(
        u'([\u0000-\u0008\u000B-\u000C\u000E-\u001F\uD800-\uDFFF\uFFFE\uFFFF])')
def check_invalid_xml_chars(xml_string, meaning, conn_id=None):
    """
    Examine an XML string and raise a `pywbem.XMLParseError` exception if the
    string contains characters that cannot legally be represented as XML
    characters.

    This function is used to improve the error information raised from Python's
    `xml.dom.minidom` and `xml.sax` packages and should be called only after
    having caught an `ExpatError` from `xml.dom.minidom` or a
    `SAXParseException` from `xml.sax` .

    Parameters:

      xml_string (:term:`unicode string`):
        The XML string to be examined.

      meaning (:term:`string`):
        Short text with meaning of the XML string, for messages in exceptions.

      conn_id (:term:`connection id`): Connection ID to be used in any
        exceptions that may be raised.

    Returns:

      None, if the string contains no invalid XML characters.

    Raises:

      TypeError: Invoked with incorrect Python object type for `xml_string`.
      pywbem.XMLParseError: `xml_string` contains invalid XML characters.

    Notes on XML characters:

    (1) The legal XML characters are defined in W3C XML 1.0 (Fifth Edition):

        ::

            Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] |
                     [#x10000-#x10FFFF]

        These are the code points of Unicode characters using a non-surrogate
        representation.
    """

    context_before = 16  # number of chars to print before any bad chars
    context_after = 16  # number of chars to print after any bad chars

    # Validate the argument type with an explicit check rather than an
    # assert statement: asserts are removed when Python runs with -O, which
    # would silently disable the validation.
    if not isinstance(xml_string, six.text_type):
        raise TypeError(
            _format("xml_string parameter is not a unicode string, but has "
                    "type {0}", type(xml_string)))

    # Check for Unicode characters that cannot legally be represented as XML
    # characters.
    ixc_list = []
    last_ixc_pos = -2  # so that a match at position 0 is always reported
    for m in _ILLEGAL_XML_CHARS_RE.finditer(xml_string):
        ixc_pos = m.start(1)
        # Within a run of directly adjacent invalid characters, only the
        # first one is reported.
        if ixc_pos > last_ixc_pos + 1:
            ixc_list.append((ixc_pos, m.group(1)))
        last_ixc_pos = ixc_pos

    if ixc_list:
        exc_txt = "Invalid XML characters found in {0}:".format(meaning)
        for ixc_pos, ixc_char in ixc_list:
            cpos1 = max(ixc_pos - context_before, 0)
            cpos2 = min(ixc_pos + context_after, len(xml_string))
            exc_txt += _format("\n At offset {0}: U+{1:04X}, "
                               "CIM-XML snippet: {2!A}",
                               ixc_pos, ord(ixc_char), xml_string[cpos1:cpos2])
        raise XMLParseError(exc_txt, conn_id=conn_id)