In [1]:
import xml.etree.cElementTree as ET
import pprint
from datetime import datetime
import csv

In [2]:
# Path of the XML file parsed by the cells below; it is a relative path, so it
# is resolved against the notebook's current working directory.
FILENAME = "../files/map-8.xml"

In [3]:
def gather_element_counts_levels(root, level=0, tag_counts=None, tag_levels=None):
    """
    Recursively walk an XML element tree and collect tag statistics.

    Every element below ``root`` has the occurrences of its tag counted
    (``root`` itself is not counted). For every tag, including the one of
    ``root``, the level at which it was seen is recorded, with the element
    passed to the top-level call at level 1. If a tag occurs at several
    depths, the level of the occurrence visited last is kept.

    :param root: Element to start from
    :param level: Level of the parent of ``root`` (0 for the document root)
    :param tag_counts: Dictionary of tag -> occurrence count to update; a new
        dictionary is created when omitted
    :param tag_levels: Dictionary of tag -> level to update; a new dictionary
        is created when omitted

    :return: A list of three single-key dictionaries: ``{"root": <root tag>}``,
        ``{"tag_count": tag_counts}`` and ``{"tag_levels": tag_levels}``
    """
    # Fresh accumulators for each top-level call: a mutable default argument
    # is created only once and would keep growing across calls (for example
    # every time the calling cell is re-run).
    if tag_counts is None:
        tag_counts = {}
    if tag_levels is None:
        tag_levels = {}

    tag_levels[root.tag] = level + 1
    # Iterating an element yields its direct children; Element.getchildren()
    # is deprecated and missing from recent Python versions.
    for child in root:
        tag_counts[child.tag] = tag_counts.get(child.tag, 0) + 1
        gather_element_counts_levels(child, level + 1, tag_counts, tag_levels)
    return [{"root": root.tag}, {"tag_count": tag_counts}, {"tag_levels": tag_levels}]


def get_XML_stats(xmltree):
    """
    Collect basic statistics about a parsed XML document.

    The root element of ``xmltree`` is handed to
    ``gather_element_counts_levels`` and that helper's result is passed back
    unchanged.

    :param xmltree: Parsed tree object providing ``getroot()``
    :return: Whatever ``gather_element_counts_levels`` returns for the root
        element (root tag, tag counts and tag levels)
    """
    return gather_element_counts_levels(xmltree.getroot())


def print_tags(xmltree):
    """
    Count the ``k`` attribute of every ``<tag>`` element and print shop tags.

    Each ``<tag>`` whose key starts with "shop" (case-insensitively) is
    printed as ``key value`` while counting.

    :param xmltree: Iterable of ``(event, element)`` pairs, such as the
        iterator returned by ``ET.iterparse``
    :return: Dictionary mapping each tag key to the number of times it occurred
    """
    key_counts = {}
    for _, element in xmltree:
        if element.tag != "tag":
            continue
        key = element.attrib["k"]
        # dict.get replaces the `key in key_counts.keys()` test, which builds
        # a complete key list for every element under Python 2.
        key_counts[key] = key_counts.get(key, 0) + 1
        if key.lower().startswith("shop"):
            # A single pre-formatted argument prints the same text whether
            # print is a statement or a function.
            print("%s %s" % (key, element.attrib["v"]))
    return key_counts


In [4]:
#def test():
#    xmltree = ET.parse(FILENAME)
#    stats = get_XML_stats(xmltree)
#    pprint.pprint(stats)

    #xmltree = ET.iterparse(FILENAME)
    #tag_keys = print_tags(xmltree)
    #for key in tag_keys:
    #    print key, ' --- ', tag_keys.get(key)
    #print_tags(xmltree)

In [19]:
### Write data sets extracted from XML file to CSV files in DB like format


def write_csv(output_file, data, fields):
    """
    Write a sequence of dictionaries to a CSV file with a header row.

    :param output_file: Path of the CSV file to create or overwrite
    :param data: Iterable of dictionaries, one per output row
    :param fields: Column names, in output order; used for the header and to
        pick the values of each row (``csv.DictWriter`` rejects rows that
        contain keys not listed here)
    """
    with open(output_file, 'w') as f:
        writer = csv.DictWriter(f, fieldnames=fields)
        writer.writeheader()
        for rec in data:
            try:
                writer.writerow(rec)
            except UnicodeEncodeError:
                # Retry with UTF-8 encoded values. The encoded values go into
                # a new dictionary so the caller's record is left untouched
                # (previously the record was re-encoded in place).
                encoded = dict((key, rec[key].encode('utf-8')) for key in rec)
                writer.writerow(encoded)

In [6]:
# Parse the whole file into an in-memory tree, then print the root tag, tag
# counts and tag levels gathered by get_XML_stats.
xmltree = ET.parse(FILENAME)
stats = get_XML_stats(xmltree)
pprint.pprint(stats)

[{'root': 'osm'},
 {'tag_count': {'bounds': 1,
                'member': 22630,
                'meta': 1,
                'nd': 22210,
                'node': 20889,
                'note': 1,
                'relation': 273,
                'tag': 31897,
                'way': 2835}},
 {'tag_levels': {'bounds': 2,
                 'member': 3,
                 'meta': 2,
                 'nd': 3,
                 'node': 2,
                 'note': 2,
                 'osm': 1,
                 'relation': 2,
                 'tag': 3,
                 'way': 2}}]


In [7]:
# Stream through the XML file and split it into flat lists of records:
# one list per element kind plus one list per kind of child element.
xmltree = ET.iterparse(FILENAME)

nodes = []
node_tags = []

ways = []
way_nodes = []
way_tags = []

relations = []
relation_members = []
relation_tags = []

# NOTE(review): not used anywhere in this cell; kept in case other cells rely on it.
x = set()

for _, elem in xmltree:
    # iterparse reports only "end" events by default, so an element arrives
    # here once it is complete and its children can be iterated. An element
    # has exactly one tag, so a single if/elif chain is sufficient.
    if elem.tag == "node":
        # Store a copy of the attributes so the record is independent of the
        # element object that stays attached to the parsed tree.
        node = dict(elem.attrib)
        nodes.append(node)
        for child in elem:
            # Only <tag> children carry k/v attributes; other children are
            # skipped instead of raising KeyError, matching the way and
            # relation branches below.
            if child.tag == "tag":
                attribs = child.attrib
                node_tags.append({
                    "node_id": node["id"],
                    "key": attribs["k"],
                    "value": attribs["v"],
                })
    elif elem.tag == "way":
        way = dict(elem.attrib)
        ways.append(way)
        for child in elem:
            attribs = child.attrib
            if child.tag == "nd":
                # <nd ref="..."> links the way to one of its nodes.
                way_nodes.append({
                    "way_id": way["id"],
                    "node_id": attribs["ref"],
                })
            elif child.tag == "tag":
                way_tags.append({
                    "way_id": way["id"],
                    "key": attribs["k"],
                    "value": attribs["v"],
                })
    elif elem.tag == "relation":
        relation = dict(elem.attrib)
        relations.append(relation)
        for child in elem:
            attribs = child.attrib
            if child.tag == "member":
                # The member's "ref" attribute is stored under "type_id".
                relation_members.append({
                    "relation_id": relation["id"],
                    "type": attribs["type"],
                    "type_id": attribs["ref"],
                    "role": attribs["role"],
                })
            elif child.tag == "tag":
                relation_tags.append({
                    "relation_id": relation["id"],
                    "key": attribs["k"],
                    "value": attribs["v"],
                })


In [8]:
# Report how many records of each kind the extraction loop collected.
print 'Total Nodes(<node>) :: ', len(nodes)   
print 'Total Node Tags (<node><tag>) :: ', len(node_tags)

print 'Total ways (<way>):: ', len(ways)
print 'Total way nodes (<way><nd>) :: ', len(way_nodes)
print 'Total way tags (<way><tag>) ::', len(way_tags)

print 'Total relations (<relation>):: ', len(relations)
print 'Total relation members (<relation><member>) :: ', len(relation_members)
print 'Total relation tags (<relation><tag>) ::', len(relation_tags)

Total Nodes(<node>) ::  20889
Total Node Tags (<node><tag>) ::  20619
Total ways (<way>)::  2835
Total way nodes (<way><nd>) ::  22210
Total way tags (<way><tag>) :: 9658
Total relations (<relation>)::  273
Total relation members (<relation><member>) ::  22630
Total relation tags (<relation><tag>) :: 1620


In [9]:
# Print nodes
for node in nodes[:5]:
    print '-'*20, 'Nodes', '-'*20
    pprint.pprint(node)
    for node_tag in node_tags:
        if node["id"] == node_tag["node_id"]:
            print '-'*20, 'Node Tags', '-'*20
            pprint.pprint(node_tag)
        if node["id"] < node_tag["node_id"]:
            break

-------------------- Nodes --------------------
{'changeset': '15601001',
 'id': '108189',
 'lat': '51.4949655',
 'lon': '-0.0999827',
 'timestamp': '2013-04-03T21:34:53Z',
 'uid': '1016290',
 'user': 'ika-chan!',
 'version': '8'}
-------------------- Nodes --------------------
{'changeset': '35949272',
 'id': '108190',
 'lat': '51.4956757',
 'lon': '-0.1007205',
 'timestamp': '2015-12-14T16:29:27Z',
 'uid': '322039',
 'user': 'MacLondon',
 'version': '8'}
-------------------- Node Tags --------------------
{'key': 'button_operated', 'node_id': '108190', 'value': 'yes'}
-------------------- Node Tags --------------------
{'key': 'crossing', 'node_id': '108190', 'value': 'traffic_signals'}
-------------------- Node Tags --------------------
{'key': 'crossing_ref', 'node_id': '108190', 'value': 'countdown'}
-------------------- Node Tags --------------------
{'key': 'highway', 'node_id': '108190', 'value': 'crossing'}
-------------------- Nodes --------------------
{'changeset': '3920403

In [10]:
# Print ways
for way in ways[:2]:
    print '-'*20, 'Ways', '-'*20
    pprint.pprint(way)
    for way_node in way_nodes:
        if way["id"] == way_node["way_id"]:
            print '-'*20, 'Way Nodes', '-'*20
            pprint.pprint(way_node)
        if way["id"] < way_node["way_id"]:
            break
    for way_tag in way_tags:
        if way["id"] == way_tag["way_id"]:
            print '-'*20, 'Way Tags', '-'*20
            pprint.pprint(way_tag)
        if way["id"] < way_tag["way_id"]:
            break
    print '\n'

-------------------- Ways --------------------
{'changeset': '33451074',
 'id': '2700324',
 'timestamp': '2015-08-19T21:45:52Z',
 'uid': '322039',
 'user': 'MacLondon',
 'version': '9'}
-------------------- Way Nodes --------------------
{'node_id': '276508', 'way_id': '2700324'}
-------------------- Way Nodes --------------------
{'node_id': '3705603828', 'way_id': '2700324'}
-------------------- Way Nodes --------------------
{'node_id': '25499035', 'way_id': '2700324'}
-------------------- Way Nodes --------------------
{'node_id': '566223283', 'way_id': '2700324'}
-------------------- Way Nodes --------------------
{'node_id': '1999869627', 'way_id': '2700324'}
-------------------- Way Nodes --------------------
{'node_id': '364309', 'way_id': '2700324'}
-------------------- Way Nodes --------------------
{'node_id': '3213191693', 'way_id': '2700324'}
-------------------- Way Nodes --------------------
{'node_id': '25499003', 'way_id': '2700324'}
-------------------- Way Nodes ----

In [11]:
# Print relations
for relation in relations[:1]:
    print '-'*20, 'Relations', '-'*20
    pprint.pprint(relation)
    for relation_member in relation_members:
        if relation["id"] == relation_member["relation_id"]:
            print '-'*20, 'Relation Members', '-'*20
            pprint.pprint(relation_member)
        if relation["id"] < relation_member["relation_id"]:
            break
    for relation_tag in relation_tags:
        if relation["id"] == relation_tag["relation_id"]:
            print '-'*20, 'Relation Tags', '-'*20
            pprint.pprint(relation_tag)
        if relation["id"] < relation_tag["relation_id"]:
            break
    print '\n'

-------------------- Relations --------------------
{'changeset': '5464489',
 'id': '2171',
 'timestamp': '2010-08-11T16:28:04Z',
 'uid': '346',
 'user': 'Tom Chance',
 'version': '2'}
-------------------- Relation Members --------------------
{'relation_id': '2171', 'role': 'outer', 'type': 'way', 'type_id': '8119005'}
-------------------- Relation Members --------------------
{'relation_id': '2171', 'role': 'inner', 'type': 'way', 'type_id': '8171114'}
-------------------- Relation Tags --------------------
{'key': 'type', 'relation_id': '2171', 'value': 'multipolygon'}




In [16]:
# Display the first extracted node tag record to check its shape.
node_tags[0]

{'key': 'button_operated', 'node_id': '108190', 'value': 'yes'}

In [20]:
### Write data sets extracted from XML file to CSV files in DB like format

# Each entry is (target file, records to write, column order): the node
# records come first, followed by the tags attached to nodes.
for output_file, records, fields in (
    ('../output/nodes.csv', nodes,
     ['id', 'lat', 'lon', 'version', 'timestamp', 'changeset', 'uid', 'user']),
    ('../output/node_tags.csv', node_tags,
     ['node_id', 'key', 'value']),
):
    write_csv(output_file, records, fields)

In [22]:
# Each entry is (target file, records to write, column order): the way
# records, then the way -> node references, then the way tags.
for output_file, records, fields in (
    ('../output/ways.csv', ways,
     ['id', 'version', 'timestamp', 'changeset', 'uid', 'user']),
    ('../output/way_nodes.csv', way_nodes,
     ['way_id', 'node_id']),
    ('../output/way_tags.csv', way_tags,
     ['way_id', 'key', 'value']),
):
    write_csv(output_file, records, fields)


In [23]:
# Each entry is (target file, records to write, column order): the relation
# records, then the relation members, then the relation tags.
for output_file, records, fields in (
    ('../output/relations.csv', relations,
     ['id', 'version', 'timestamp', 'changeset', 'uid', 'user']),
    ('../output/relation_members.csv', relation_members,
     ['relation_id', 'type', 'type_id', 'role']),
    ('../output/relation_tags.csv', relation_tags,
     ['relation_id', 'key', 'value']),
):
    write_csv(output_file, records, fields)

In [210]:
#### Audit nodes data
# Pivot the node records into one list of values per attribute name.
data = dict(
    (name, [])
    for name in ('changeset', 'uid', 'timestamp', 'lon', 'version', 'user', 'lat', 'id')
)
for node in nodes:
    for key, value in node.items():
        if key == 'timestamp':
            # Timestamps are turned into datetime objects; every other
            # attribute is kept as the string read from the file.
            value = datetime.strptime(value, "%Y-%m-%dT%H:%M:%SZ")
        data[key].append(value)

