# OpenStreetMap Data Case Study
### Map Area: Dallas, United States

https://mapzen.com/data/metro-extracts/metro/dallas_texas/

Having lived in the Dallas area for over 15 years, I am very interested in working with this region.

This file contains the code (mostly derived from the lesson examples) that converts the OSM XML extract into the CSV files to be imported into the SQL database.

In [1]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import csv
import codecs
import pprint
import re
import xml.etree.cElementTree as ET

import cerberus
import pandas as pd

import schema

# Validation schema for the dicts produced by shape_element.
# The helpers below return a fresh rule dict on every call so that no rule
# object is shared between two fields of the schema.
def _required(type_name, coerce=None):
    """Build the rule for a mandatory field of the given type."""
    rule = {'required': True, 'type': type_name}
    if coerce is not None:
        rule['coerce'] = coerce
    return rule


def _int_field():
    """Mandatory integer field, coerced with int()."""
    return _required('integer', int)


def _float_field():
    """Mandatory float field, coerced with float()."""
    return _required('float', float)


def _str_field():
    """Mandatory string field (no coercion)."""
    return _required('string')


def _list_of(row_schema):
    """Wrap a per-row schema as a list of dict rows."""
    return {'type': 'list', 'schema': {'type': 'dict', 'schema': row_schema}}


def _tag_row():
    """Row schema shared by the node_tags and way_tags lists."""
    return {
        'id': _int_field(),
        'key': _str_field(),
        'value': _str_field(),
        'type': _str_field(),
    }


SCHEMA = {
    'node': {
        'type': 'dict',
        'schema': {
            'id': _int_field(),
            'lat': _float_field(),
            'lon': _float_field(),
            'user': _str_field(),
            'uid': _int_field(),
            'version': _str_field(),
            'changeset': _int_field(),
            'timestamp': _str_field(),
        },
    },
    'node_tags': _list_of(_tag_row()),
    'way': {
        'type': 'dict',
        'schema': {
            'id': _int_field(),
            'user': _str_field(),
            'uid': _int_field(),
            'version': _str_field(),
            'changeset': _int_field(),
            'timestamp': _str_field(),
        },
    },
    'way_nodes': _list_of({
        'id': _int_field(),
        'node_id': _int_field(),
        'position': _int_field(),
    }),
    'way_tags': _list_of(_tag_row()),
}


# Full-size OSM extract. Raw strings keep the Windows backslashes literal: in a
# plain literal "\U" starts a unicode escape on Python 3 (a SyntaxError here)
# and the other backslash pairs are invalid escape sequences.
OSM_PATH = r"F:\OneDrive\Documents\Udacity\Data Analysis\OpenStreetMap\DallasOSM\dallas_texas.osm"

# CSV files written by process_map.
NODES_PATH = "nodes.csv"
NODE_TAGS_PATH = "nodes_tags.csv"
WAYS_PATH = "ways.csv"
WAY_NODES_PATH = "ways_nodes.csv"
WAY_TAGS_PATH = "ways_tags.csv"

# Tag keys shaped like "prefix:rest" in lowercase letters/underscores; for
# these shape_element uses the prefix as the tag type.
LOWER_COLON = re.compile(r'^([a-z]|_)+:([a-z]|_)+')
# Characters that make shape_element skip a tag key altogether.
PROBLEMCHARS = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')

# Alternative schema source, currently disabled:
#SCHEMA = schema.schema

# Make sure the fields order in the csvs matches the column order in the sql table schema
NODE_FIELDS = ['id', 'lat', 'lon', 'user', 'uid', 'version', 'changeset', 'timestamp']
NODE_TAGS_FIELDS = ['id', 'key', 'value', 'type']
WAY_FIELDS = ['id', 'user', 'uid', 'version', 'changeset', 'timestamp']
WAY_TAGS_FIELDS = ['id', 'key', 'value', 'type']
WAY_NODES_FIELDS = ['id', 'node_id', 'position']

# Input read by create_sample (same file as OSM_PATH) and the sample it writes.
OSM_FILE = OSM_PATH  # Replace this with your osm file
SAMPLE_FILE = "./sample.osm"



def get_element(osm_file, tags=('node', 'way', 'relation')):
    """Yield element if it is the right type of tag

    Parses ``osm_file`` incrementally and yields every element whose tag is
    listed in ``tags`` once its closing tag has been read.

    Args:
        osm_file: filename or file object accepted by ``ET.iterparse``.
        tags: tag names to yield.

    Reference:
    http://stackoverflow.com/questions/3095434/inserting-newlines-in-xml-file-generated-via-xml-etree-elementtree-in-python
    """
    # NOTE: this module defines get_element a second time further down; that
    # later definition replaces this one when the whole file is executed.
    # The parameter is osm_file; the previous body read an undefined name
    # (osm_path) and raised NameError on the first iteration.
    context = iter(ET.iterparse(osm_file, events=('start', 'end')))
    # The first event is the start of the root element.
    _, root = next(context)
    for event, elem in context:
        if event == 'end' and elem.tag in tags:
            yield elem
            # Drop the children parsed so far so the tree does not keep growing.
            root.clear()




def count_tags(filename):
    """Return a dict mapping each tag name in the XML file to its occurrence count.

    ``filename`` is handed straight to ``ET.iterparse``; every element
    reported by the parser contributes one to the count of its tag.
    """
    tally = {}

    for _, node in ET.iterparse(filename):
        tally[node.tag] = tally.get(node.tag, 0) + 1

    return tally

def _shape_tags(element, problem_chars, default_tag_type):
    """Return the list of tag dicts for the <tag> children of a node or way."""
    element_id = element.attrib['id']
    shaped = []
    for tag in element.iter("tag"):
        key = tag.attrib['k']
        if problem_chars.search(key):
            # Keys containing problematic characters are dropped entirely.
            continue
        if LOWER_COLON.search(key):
            # "addr:street" -> type "addr", key "street"; only the first
            # colon splits, later colons stay in the key.
            tag_type, _, tag_key = key.partition(":")
        else:
            tag_type, tag_key = default_tag_type, key
        shaped.append({
            'id': element_id,
            'type': tag_type,
            'key': tag_key,
            'value': tag.attrib['v'],
        })
    return shaped


def shape_element(element, node_attr_fields=NODE_FIELDS, way_attr_fields=WAY_FIELDS,
                  problem_chars=PROBLEMCHARS, default_tag_type='regular'):
    """Clean and shape node or way XML element to Python dict

    Args:
        element: parsed XML element; only 'node' and 'way' tags are handled.
        node_attr_fields: attribute names copied from a node element.
        way_attr_fields: attribute names copied from a way element.
        problem_chars: compiled regex; tag keys it matches are skipped.
        default_tag_type: tag type used for keys without a "prefix:" part.

    Returns:
        For a node: {'node': attribs, 'node_tags': [tag dicts]}.
        For a way: {'way': attribs, 'way_nodes': [node refs], 'way_tags': [tag dicts]}.
        None for any other element.

    Raises:
        KeyError: if a requested attribute is missing from the element.
    """
    if element.tag == 'node':
        node_attribs = {field: element.attrib[field] for field in node_attr_fields}
        node_tags = _shape_tags(element, problem_chars, default_tag_type)
        return {'node': node_attribs, 'node_tags': node_tags}

    if element.tag == 'way':
        way_attribs = {field: element.attrib[field] for field in way_attr_fields}
        way_id = element.attrib['id']
        # position is the 0-based index of the <nd> reference within the way.
        way_nodes = [
            {'id': way_id, 'node_id': nd.attrib['ref'], 'position': position}
            for position, nd in enumerate(element.iter("nd"))
        ]
        way_tags = _shape_tags(element, problem_chars, default_tag_type)
        return {'way': way_attribs, 'way_nodes': way_nodes, 'way_tags': way_tags}

    return None



# ================================================== #
#               Helper Functions                     #
# ================================================== #
def get_element(osm_file, tags=('node', 'way', 'relation')):
    """Stream the elements of ``osm_file`` whose tag name is listed in ``tags``.

    An element is yielded once its end event has been parsed; afterwards the
    root element is cleared before parsing continues.
    """
    parse_events = ET.iterparse(osm_file, events=('start', 'end'))
    # The very first event delivers the root element.
    root = next(parse_events)[1]
    for kind, node in parse_events:
        if kind != 'end' or node.tag not in tags:
            continue
        yield node
        root.clear()


def validate_element(element, validator, schema=SCHEMA):
    """Raise ValidationError if element does not match schema

    Args:
        element: shaped element dict to check.
        validator: object providing ``validate(document, schema)`` and an
            ``errors`` mapping (process_map passes a cerberus.Validator).
        schema: validation schema handed to the validator.

    Raises:
        cerberus.ValidationError: when validation fails; the message lists
            the errors of the first failing field only.
    """
    if validator.validate(element, schema) is not True:
        # .items() instead of .iteritems() so this runs on Python 2 and 3.
        field, errors = next(iter(validator.errors.items()))
        message_string = "\nElement of type '{0}' has the following errors:\n{1}"
        error_strings = (
            "{0}: {1}".format(k, v if isinstance(v, str) else ", ".join(v))
            for k, v in errors.items()
        )
        raise cerberus.ValidationError(
            message_string.format(field, "\n".join(error_strings))
        )


class UnicodeDictWriter(csv.DictWriter, object):
    """csv.DictWriter variant that UTF-8 encodes unicode values before writing"""

    def writerow(self, row):
        # Build the encoded copy first, then delegate to csv.DictWriter.
        encoded = {}
        for field, value in row.iteritems():
            if isinstance(value, unicode):
                value = value.encode('utf-8')
            encoded[field] = value
        super(UnicodeDictWriter, self).writerow(encoded)

    def writerows(self, rows):
        # Route every row through writerow so each one gets encoded.
        for record in rows:
            self.writerow(record)



# ================================================== #
#               Main Function                        #
# ================================================== #
def process_map(file_in, validate):
    """Iteratively process each XML element and write to csv(s)

    Args:
        file_in: OSM XML file passed to get_element.
        validate: when exactly True, every shaped element is checked with
            validate_element before it is written.

    Side effects:
        Overwrites NODES_PATH, NODE_TAGS_PATH, WAYS_PATH, WAY_NODES_PATH and
        WAY_TAGS_PATH, each with a header row followed by the shaped rows.
    """
    # Binary mode: the Python 2 csv module writes its own "\r\n" terminators,
    # and a text-mode handle on Windows would expand them to "\r\r\n" (blank
    # rows in the output). No codec is attached to the files because
    # UnicodeDictWriter already UTF-8 encodes unicode values.
    with codecs.open(NODES_PATH, 'wb') as nodes_file, \
         codecs.open(NODE_TAGS_PATH, 'wb') as nodes_tags_file, \
         codecs.open(WAYS_PATH, 'wb') as ways_file, \
         codecs.open(WAY_NODES_PATH, 'wb') as way_nodes_file, \
         codecs.open(WAY_TAGS_PATH, 'wb') as way_tags_file:

        nodes_writer = UnicodeDictWriter(nodes_file, NODE_FIELDS)
        node_tags_writer = UnicodeDictWriter(nodes_tags_file, NODE_TAGS_FIELDS)
        ways_writer = UnicodeDictWriter(ways_file, WAY_FIELDS)
        way_nodes_writer = UnicodeDictWriter(way_nodes_file, WAY_NODES_FIELDS)
        way_tags_writer = UnicodeDictWriter(way_tags_file, WAY_TAGS_FIELDS)

        nodes_writer.writeheader()
        node_tags_writer.writeheader()
        ways_writer.writeheader()
        way_nodes_writer.writeheader()
        way_tags_writer.writeheader()

        validator = cerberus.Validator()

        for element in get_element(file_in, tags=('node', 'way')):
            el = shape_element(element)
            if el:
                if validate is True:
                    validate_element(el, validator)

                if element.tag == 'node':
                    nodes_writer.writerow(el['node'])
                    node_tags_writer.writerows(el['node_tags'])
                elif element.tag == 'way':
                    ways_writer.writerow(el['way'])
                    way_nodes_writer.writerows(el['way_nodes'])
                    way_tags_writer.writerows(el['way_tags'])



def create_sample(k):
    """Write every k-th top-level element of OSM_FILE into SAMPLE_FILE.

    The selected elements are wrapped in an XML declaration and a bare
    <osm> root. Returns an empty tuple.
    """
    declaration = '<?xml version="1.0" encoding="UTF-8"?>\n'
    root_open = '<osm>\n  '
    root_close = '</osm>'

    with open(SAMPLE_FILE, 'wb') as sample:
        sample.write(declaration.encode())
        sample.write(root_open.encode())

        for index, element in enumerate(get_element(OSM_FILE)):
            # Skip everything that is not on a k-th position.
            if index % k:
                continue
            sample.write(ET.tostring(element, encoding='utf-8'))

        sample.write(root_close.encode())

    return ()

In [2]:
# Execute this cell to request a new sample file: it sets the
# create_sample_test flag to True.
create_sample_test = True

In [44]:
# Execute this cell in order to NOT create a new sample file (i.e. when
# troubleshooting): it sets the create_sample_test flag to False.
create_sample_test = False

In [3]:
if __name__ == '__main__':
    # Note: Validation is ~ 10X slower. For the project consider using a small
    # sample of the map when validating.

    # Test the create_sample_test flag, not the create_sample function: a
    # function object is always truthy, so the sample was being rebuilt on
    # every run regardless of the flag.
    if create_sample_test:
        create_sample(35)

    process_map(SAMPLE_FILE, validate=True)