/
parse.py
80 lines (62 loc) · 2.47 KB
/
parse.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
#!/usr/bin/env python
import re
import logging
logger = logging.getLogger("portal_proxy")
#from lxml import etree
from BeautifulSoup import BeautifulStoneSoup
def check_dupes(tag, data, res):
    """Store data under tag in a parent node without overwriting existing values.

    Any prefix ending in a colon is removed from the tag first (everything up
    to and including the last colon), so ``ns:Name`` is stored as ``Name``.
    If the resulting key is not yet present, the data is stored directly.
    If it is already present, the stored value is turned into a list (unless
    it already is one) and the new data is appended to that list.

    @param tag: name of element or attribute to check for.
    @param data: data contained in the element or attribute.
    @param res: parent node (dict) to check/modify; it is updated in place.
    @return: None.
    """
    tag = re.sub(r'^.*:', '', tag)
    if tag not in res:
        res[tag] = data
        return
    # This tag already exists: don't overwrite it, collect the values instead.
    if not isinstance(res[tag], list):
        # First duplicate seen: promote the single stored value to a list.
        res[tag] = [res[tag]]
    res[tag].append(data)
def _process(el, depth=0):
    """Recursively turn a parsed element into a plain data structure that
    should more-or-less accurately represent it.

    The element is expected to offer the BeautifulSoup interface used here
    (``contents``, ``name``, ``string`` and ``_getAttrMap()``), as the
    elements produced by BeautifulStoneSoup in this module do.

    Each named child is converted recursively. If a child has a non-empty
    ``string``, the stripped, UTF-8 encoded text becomes the child's value
    when the child has no other data, or is stored under the ``'text'`` key
    of the child's dict otherwise. The element's own attributes are then
    merged into the same result dict. All insertions go through
    check_dupes(), so repeated names are collected into lists.

    @param el: the element to process.
    @param depth: number of recursions. Not currently checked.
    @return: a dict representing the element's children and attributes.
    """
    res = {}
    for child in el.contents:
        # Children without a (non-empty) name contribute nothing on their own.
        if not getattr(child, 'name', None):
            continue
        data = _process(child, depth + 1)
        string = getattr(child, 'string', None)
        if string is not None:
            text = string.strip().encode('utf-8')
            if text:
                if data:
                    # Child also produced other data: keep it, add the text.
                    # NOTE(review): this replaces any existing 'text' entry.
                    data['text'] = text
                else:
                    # Nothing but text: represent the child by the text itself.
                    data = text
        check_dupes(child.name, data, res)
    if hasattr(el, 'name'):
        for key, value in el._getAttrMap().items():
            check_dupes(key, value, res)
    return res
def process(file, tag='Layer'):
    """Take XML text and search for a given tag, returning a data structure
    representing every match.

    The text is parsed with BeautifulStoneSoup, every element named ``tag``
    is looked up once, and each match is converted with _process().

    @param file: string containing xml to process.
    @param tag: tag name to search for.
    @return: list of dictionaries, one per tag found.
    """
    logger.debug("parse.process: tag=%s", tag)
    # 'boundingbox' is handed to the parser as a self-closing tag name.
    root = BeautifulStoneSoup(file, selfClosingTags=['boundingbox'])
    # Search once and reuse the result for both the debug log and the output.
    matches = root.findAll(tag)
    logger.debug(matches)
    return [_process(match) for match in matches]
if __name__ == '__main__':
    # Command-line use: parse.py <xml-file> [tag]
    # Prints the structure found for the tag as block-style YAML.
    from sys import argv
    import yaml

    if len(argv) < 2:
        raise SystemExit("usage: %s <xml-file> [tag]" % argv[0])
    # Read through a context manager so the file handle is always closed.
    with open(argv[1]) as source:
        xml = source.read()
    # argv[2:3] is empty when no tag is given, so process() uses its default.
    print(yaml.dump(process(xml, *argv[2:3]), default_flow_style=False))