In [1]:
import csv
import codecs
import pprint
import re
import xml.etree.cElementTree as ET

import cerberus

import schema


In [32]:
# Input: the OpenStreetMap XML extract that process_map is run on.
# NOTE(review): all paths below are absolute Windows paths on drive G:.
OSM_PATH = "G:\\UdacityDataAnalyst\\shanghai_china.osm"

# Outputs: one CSV file per table, opened for writing by process_map.
NODES_PATH = "G:\\nodes.csv"
NODE_TAGS_PATH = "G:\\nodes_tags.csv"
WAYS_PATH = "G:\\ways.csv"
WAY_NODES_PATH = "G:\\ways_nodes.csv"
WAY_TAGS_PATH = "G:\\ways_tags.csv"

# Tag keys that begin with lowercase/underscore characters, a colon, and more
# lowercase/underscore characters (for example "addr:street").
LOWER_COLON = re.compile(r'^([a-z]|_)+:([a-z]|_)+')
# Any single character that disqualifies a tag key (punctuation, whitespace, ...).
PROBLEMCHARS = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')

# Schema taken from the local `schema` module; default schema of validate_element.
SCHEMA = schema.schema


In [5]:
# Make sure the fields order in the csvs matches the column order in the sql table schema
# Each list is passed as the `fieldnames` of the corresponding CSV writer in process_map.
# NODE_FIELDS and WAY_FIELDS are also the attribute names shape_element copies
# from each <node> / <way> element.
NODE_FIELDS = ['id', 'lat', 'lon', 'user', 'uid', 'version', 'changeset', 'timestamp']
NODE_TAGS_FIELDS = ['id', 'key', 'value', 'type']
WAY_FIELDS = ['id', 'user', 'uid', 'version', 'changeset', 'timestamp']
WAY_TAGS_FIELDS = ['id', 'key', 'value', 'type']
WAY_NODES_FIELDS = ['id', 'node_id', 'position']

In [3]:
# Abbreviation -> replacement table consumed by update_name.
# Key order is kept exactly as first written, because a sequential,
# order-dependent replacement walks the keys in insertion order.
mapping = {
    # Common English street-type abbreviations.
    "St.": "Street",
    "Ave": "Avenue",
    "Rd.": "Road",
    'Hwy.': "Highway",
    # Romanised "Lu" / "lu" is rewritten as "Road".
    "Lu": "Road",
    "lu": "Road",
    # Variants and misspellings of "Road", some with trailing punctuation
    # (the third key ends with a full-width closing parenthesis).
    "Rd)": "Road",
    "Rd.）": "Road",
    "Rode": "Road",
    "rd": "Road",
    "road": "Road",
    "Rd,": "Road,",
    # Stray line breaks inside values are removed.
    "\n": "",
    "\r\n": "",
}

In [6]:
# No child elements
def no_sub_tag(element):
    """Return True when `element` has no child elements, False otherwise.

    Args:
        element: an ElementTree element (e.g. a parsed <node> or <way>).

    Returns:
        bool: True if the element has zero direct children.
    """
    # len(element) counts direct children; Element.getchildren() was
    # deprecated and then removed in Python 3.9.
    return len(element) == 0

    
# fix the node tag into a node_tag dict
def fix_node_tag(element):
    """Build the list of tag rows for one element from its <tag> descendants.

    For every <tag>, the value is passed through update_name with the
    module-level `mapping`. The key then decides what happens:

    * key contains a PROBLEMCHARS character: the key is printed and the tag
      is skipped;
    * key matches LOWER_COLON: the text before the first colon becomes the
      row's 'type', the text after it becomes the row's 'key';
    * otherwise the key is kept whole and the 'type' is 'regular'.

    Args:
        element: the parent element; its 'id' attribute is copied to each row.

    Returns:
        list of dicts with the entries 'key', 'type', 'value' and 'id'.
    """
    shaped_tags = []

    for tag in element.iter('tag'):
        raw_key = tag.attrib['k']
        cleaned_value = update_name(tag.attrib['v'], mapping)

        if PROBLEMCHARS.search(raw_key):
            print("PROBLEMCHARS", raw_key)
            continue

        if LOWER_COLON.match(raw_key):
            # Split only at the first colon: "a:b:c" -> type "a", key "b:c".
            tag_type, tag_key = raw_key.split(':', 1)
        else:
            tag_type, tag_key = 'regular', raw_key

        shaped_tags.append({
            'key': tag_key,
            'type': tag_type,
            'value': cleaned_value,
            'id': element.attrib['id'],
        })

    return shaped_tags


# fix the name of street
# Cache of compiled patterns, keyed by the tuple of mapping keys.
_ABBREVIATION_PATTERNS = {}


def _abbreviation_pattern(mapping):
    """Return one compiled regex matching any non-empty key of `mapping` (or None)."""
    cache_key = tuple(mapping)
    if cache_key not in _ABBREVIATION_PATTERNS:
        # Longest keys first so "Rd.）" wins over "Rd." and "\r\n" over "\n".
        ordered_keys = sorted((k for k in cache_key if k), key=len, reverse=True)
        alternatives = []
        for key in ordered_keys:
            piece = re.escape(key)
            # A key that starts/ends with a letter must not be glued to another
            # ASCII letter, otherwise "Ave" would match inside "Avenue" and
            # "rd" inside "Garden".
            if key[0].isalpha():
                piece = r'(?<![A-Za-z])' + piece
            if key[-1].isalpha():
                piece = piece + r'(?![A-Za-z])'
            alternatives.append(piece)
        _ABBREVIATION_PATTERNS[cache_key] = (
            re.compile('|'.join(alternatives)) if alternatives else None
        )
    return _ABBREVIATION_PATTERNS[cache_key]


def update_name(name, mapping):
    """Replace the abbreviations listed in `mapping` inside `name`.

    Each key of `mapping` is replaced by its value, but only where the key
    stands on its own: a key that begins or ends with a letter is not
    replaced when it is directly attached to another ASCII letter. All
    replacements happen in a single left-to-right pass, so text produced by
    one replacement is never rewritten by another key and the result does
    not depend on the order of the keys.

    Args:
        name: the string to clean (a tag value).
        mapping: dict of abbreviation -> replacement. Matching is
            case-sensitive. Empty keys are ignored.

    Returns:
        The cleaned string; `name` unchanged when nothing matches or when
        `mapping` has no usable keys.
    """
    pattern = _abbreviation_pattern(mapping)
    if pattern is None:
        return name
    # The matched text is always exactly one of the mapping keys.
    return pattern.sub(lambda match: mapping[match.group(0)], name)


# return a way_nodes dict   
def fix_way_nodes(element):
    """Build the way-node rows for one way element.

    Args:
        element: the way element; its 'id' attribute is copied to every row.

    Returns:
        list of dicts, one per <nd> descendant in document order, each with
        'id' (the way id), 'node_id' (the nd's 'ref' attribute) and
        'position' (0-based index of the nd within the way).
    """
    return [
        {
            'id': element.attrib['id'],
            'node_id': nd.attrib['ref'],
            'position': index,
        }
        for index, nd in enumerate(element.iter('nd'))
    ]


def shape_element(element, node_attr_fields=NODE_FIELDS, way_attr_fields=WAY_FIELDS,
                  problem_chars=PROBLEMCHARS, default_tag_type='regular'):
    """Clean and shape a node or way XML element into a Python dict.

    Args:
        element: the XML element to shape.
        node_attr_fields: attribute names copied from a 'node' element.
        way_attr_fields: attribute names copied from a 'way' element.
        problem_chars: accepted but not used inside this function.
        default_tag_type: accepted but not used inside this function.

    Returns:
        For a 'node': {'node': attribs, 'node_tags': tags}.
        For a 'way': {'way': attribs, 'way_nodes': way_nodes, 'way_tags': tags}.
        For any other tag: None.

    Raises:
        KeyError: if the element lacks one of the requested attributes.
    """
    kind = element.tag
    if kind not in ('node', 'way'):
        return None

    # Child tags / node references are only collected when children exist.
    has_children = not no_sub_tag(element)

    if kind == 'node':
        node_attribs = {field: element.attrib[field] for field in node_attr_fields}
        tags = fix_node_tag(element) if has_children else []
        return {'node': node_attribs, 'node_tags': tags}

    way_attribs = {field: element.attrib[field] for field in way_attr_fields}
    tags = []
    way_nodes = []
    if has_children:
        # Secondary tags are handled the same way for nodes and ways.
        tags = fix_node_tag(element)
        way_nodes = fix_way_nodes(element)
    return {'way': way_attribs, 'way_nodes': way_nodes, 'way_tags': tags}


## Helper Functions

In [43]:
def get_element(osm_file, tags=('node', 'way', 'relation')):
    """Lazily yield every fully parsed element whose tag is listed in `tags`.

    Args:
        osm_file: path or file object accepted by ET.iterparse.
        tags: collection of element tag names to yield.

    Yields:
        Each matching element, once its 'end' event has been seen. After the
        consumer resumes the generator, the document root is cleared, so a
        yielded element should be fully used before asking for the next one.
    """
    parse_events = ET.iterparse(osm_file, events=('start', 'end'))
    # The very first event hands back the document root.
    _, root = next(parse_events)

    for event, elem in parse_events:
        if event != 'end' or elem.tag not in tags:
            continue
        yield elem
        # Drop what has been parsed so far from the root.
        root.clear()


def validate_element(element, validator, schema=SCHEMA):
    """Raise an Exception if element does not match schema.

    Args:
        element: the shaped element (dict) to check.
        validator: object exposing `validate(document, schema)` and, after a
            failed validation, a dict-like `errors` attribute.
        schema: the schema to validate against; defaults to SCHEMA.

    Raises:
        Exception: when validation fails. The message names the first
            offending field and pretty-prints its errors.
    """
    if validator.validate(element, schema) is not True:
        # dict.iteritems() does not exist on Python 3; iter(d.items()) works
        # on both Python 2 and Python 3.
        field, errors = next(iter(validator.errors.items()))
        message_string = "\nElement of type '{0}' has the following errors:\n{1}"
        error_string = pprint.pformat(errors)

        raise Exception(message_string.format(field, error_string))


class UnicodeDictWriter(csv.DictWriter, object):
    """csv.DictWriter subclass used for all CSV output in this notebook.

    Values are written exactly as given: writerow hands an unmodified
    shallow copy of the row to csv.DictWriter.
    """

    def writerow(self, row):
        """Write one row dict; returns None."""
        row_copy = dict(row.items())
        csv.DictWriter.writerow(self, row_copy)

    def writerows(self, rows):
        """Write every row dict in `rows`, one writerow call per row."""
        for record in rows:
            self.writerow(record)

In [44]:
def process_map(file_in, validate):
    """Stream the OSM file element by element and write the shaped rows to csv(s).

    The five output files (NODES_PATH, NODE_TAGS_PATH, WAYS_PATH,
    WAY_NODES_PATH, WAY_TAGS_PATH) are opened for writing as UTF-8, each
    gets a header row, and every 'node' / 'way' element from `file_in` is
    shaped with shape_element and appended to the matching files.

    Args:
        file_in: the OSM XML input handed to get_element.
        validate: accepted, but currently has no effect because the
            validate_element call is disabled in this function.
    """
    with codecs.open(NODES_PATH, 'w', encoding='utf-8') as nodes_out,\
        codecs.open(NODE_TAGS_PATH, 'w', encoding='utf-8') as node_tags_out,\
        codecs.open(WAYS_PATH, 'w', encoding='utf-8') as ways_out,\
        codecs.open(WAY_NODES_PATH, 'w', encoding='utf-8') as way_nodes_out,\
        codecs.open(WAY_TAGS_PATH, 'w', encoding='utf-8') as way_tags_out:

        nodes_writer = UnicodeDictWriter(nodes_out, NODE_FIELDS)
        node_tags_writer = UnicodeDictWriter(node_tags_out, NODE_TAGS_FIELDS)
        ways_writer = UnicodeDictWriter(ways_out, WAY_FIELDS)
        way_nodes_writer = UnicodeDictWriter(way_nodes_out, WAY_NODES_FIELDS)
        way_tags_writer = UnicodeDictWriter(way_tags_out, WAY_TAGS_FIELDS)

        # Header rows, in the same order the files were opened.
        for writer in (nodes_writer, node_tags_writer, ways_writer,
                       way_nodes_writer, way_tags_writer):
            writer.writeheader()

        # Created for validate_element; unused while validation is disabled.
        validator = cerberus.Validator()

        for element in get_element(file_in, tags=('node', 'way')):
            shaped = shape_element(element)
            if not shaped:
                continue

            # NOTE: validation is switched off here, so the `validate`
            # flag is not consulted.

            if element.tag == 'node':
                nodes_writer.writerow(shaped['node'])
                node_tags_writer.writerows(shaped['node_tags'])
            elif element.tag == 'way':
                ways_writer.writerow(shaped['way'])
                way_nodes_writer.writerows(shaped['way_nodes'])
                way_tags_writer.writerows(shaped['way_tags'])



In [45]:
if __name__ == '__main__':
    # Note: Validation is ~ 10X slower. For the project consider using a small
    # sample of the map when validating.
    # NOTE(review): the validate_element call inside process_map is commented
    # out, so passing validate=True does not run any validation at present.
    process_map(OSM_PATH, validate=True)


PROBLEMCHARS dss created
PROBLEMCHARS Sanzhi Jie
