In [13]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import xml.etree.ElementTree as ET  # Use cElementTree or lxml if too slow

# Input OSM XML file read by the sampling loop in this cell.
OSM_FILE = "dc-cp"  # Replace this with your osm file
# Output file that receives the sampled elements.
SAMPLE_FILE = "sample_k100.osm"

k = 100 # Parameter: take every k-th top level element

def get_element(osm_file, tags=('node', 'way', 'relation')):
    """Lazily yield each fully parsed element whose tag name is in `tags`.

    The first parse event supplies the root element. After every yielded
    element the root is cleared, dropping the children parsed so far, so
    each element should be consumed before the generator is advanced.

    Reference:
    http://stackoverflow.com/questions/3095434/inserting-newlines-in-xml-file-generated-via-xml-etree-elementtree-in-python
    """
    parse_events = iter(ET.iterparse(osm_file, events=('start', 'end')))
    root = next(parse_events)[1]
    for kind, node in parse_events:
        # Only closing events of the requested tag names are of interest.
        if kind != 'end' or node.tag not in tags:
            continue
        yield node
        root.clear()


# Build the sample file: an XML declaration, an <osm> wrapper, and every
# k-th top-level element taken from OSM_FILE.
with open(SAMPLE_FILE, 'wb') as sample_out:
    emit = sample_out.write
    emit('<?xml version="1.0" encoding="UTF-8"?>\n')
    emit('<osm>\n  ')

    for position, top_level in enumerate(get_element(OSM_FILE)):
        # Keep positions 0, k, 2k, ... and skip everything in between.
        if position % k != 0:
            continue
        emit(ET.tostring(top_level, encoding='utf-8'))

    emit('</osm>')

In [11]:
import xml.etree.cElementTree as ET
from collections import defaultdict
import re
import pprint

# Name of the sample OSM file for the street-name audit.
OSMFILE = "sample.osm"
# Matches a trailing run of non-whitespace characters (starting at a word
# boundary), i.e. the last token of a street name.
street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)


# Trailing tokens that audit_street_type() accepts without reporting.
expected = ["Street", "Avenue", "Boulevard", "Drive", "Court", "Place", "Square", "Lane", "Road", 
            "Trail", "Parkway", "Commons"]

# UPDATE THIS VARIABLE
# Replacement text for trailing tokens, looked up by update_name().
mapping = { "NW": "Northwest",
            "NE": "Northeast"
            }


def audit_street_type(street_types, street_name):
    """Record `street_name` under its trailing token when that token is unexpected.

    The trailing token is whatever `street_type_re` finds in the name.
    Names with no match, or whose token is listed in `expected`, are
    ignored; otherwise the name is added to `street_types[token]`.
    """
    found = street_type_re.search(street_name)
    if not found:
        return
    suffix = found.group()
    if suffix in expected:
        return
    street_types[suffix].add(street_name)


def is_street_name(elem):
    """Return True when the element's `k` attribute is exactly "addr:street"."""
    key = elem.attrib['k']
    return key == "addr:street"


def audit(osmfile):
    """Collect unexpected street-name endings from an OSM XML file.

    Args:
        osmfile: path of the OSM XML file to scan.

    Returns:
        A defaultdict(set) mapping each unexpected trailing token to the set
        of street names that end with it. The mapping is also printed.
    """
    street_types = defaultdict(set)
    # `with` closes the file even if parsing raises; binary mode lets the
    # XML parser handle the document encoding itself.
    with open(osmfile, "rb") as osm_file:
        # Listen for "end" events: iterparse only guarantees that an
        # element's children are present once its closing tag has been seen,
        # so on "start" events some <tag> children could be missing.
        for event, elem in ET.iterparse(osm_file, events=("end",)):
            if elem.tag == "node" or elem.tag == "way":
                for tag in elem.iter("tag"):
                    if is_street_name(tag):
                        audit_street_type(street_types, tag.attrib['v'])
    # Single pre-formatted argument: prints the same text under Python 2 and 3.
    print('this are street types: %s' % (street_types,))
    return street_types


def update_name(name, mapping):
    """Replace the trailing token of a street name using `mapping`.

    Args:
        name: street name to clean.
        mapping: dict from an unwanted trailing token to its replacement.

    Returns:
        `name` with its trailing token replaced when that token is not in
        `expected` and is a key of `mapping`; otherwise `name` unchanged
        (including when no trailing token is found, e.g. an empty string).
    """
    m = street_type_re.search(name)
    if m is None:
        # Nothing to inspect; previously this raised AttributeError.
        return name
    street_type = m.group()
    if street_type not in expected and street_type in mapping:
        # Splice by position so only the matched trailing token changes.
        # re.sub() with the token as a pattern would also rewrite earlier
        # occurrences and treat characters such as "." as regex syntax.
        name = name[:m.start()] + mapping[street_type] + name[m.end():]
    return name
# Run the street-name audit on sample.osm; the returned mapping is the cell output.
audit('sample.osm')

this are street types: defaultdict(<type 'set'>, {'Northeast': set(['Girard Street Northeast', 'D Street Northeast', 'Gentain Court Northeast', 'A Street Northeast', 'Randolph Place Northeast', 'L Street Northeast', 'I Street Northeast', 'Michigan Avenue Northeast', 'C Street Northeast', 'T Street Northeast', 'Lexington Place Northeast', 'Acker Place Northeast', 'U Street Northeast', 'Justice Court Northeast', 'East Capitol Street Northeast', 'Orleans Place Northeast', 'Evarts Street Northeast', '3rd Street Northeast', 'Todd Place Northeast', 'R Street Northeast', 'F Street Northeast', 'Morton Place Northeast', 'Summit Place Northeast', 'Abbey Place Northeast', 'M Street Northeast', 'Uhland Terrace Northeast', 'Ascot Place Northeast', 'Terrace Court Northeast', 'Eckington Place Northeast', 'Columbus Circle Northeast', 'Hawthorne Court Northeast', 'Maryland Avenue Northeast', 'Channing Street Northeast', 'Morris Place Northeast', 'North Capitol Street Northeast', '7th Street Northeast',

defaultdict(set,
            {'850': {'Connecticut Avenue Northwest Suite 850'},
             'NE': {'1st St NE', '6th St NE'},
             'NW': {'14th Street NW',
              '9th Street NW',
              'L St NW',
              'Vermont Ave NW'},
             'Northeast': {'1st Street Northeast',
              '2nd Street Northeast',
              '3rd Street Northeast',
              '4th Street Northeast',
              '5th Street Northeast',
              '6th Street Northeast',
              '7th Street Northeast',
              'A Street Northeast',
              'Abbey Place Northeast',
              'Acker Place Northeast',
              'Adams Street Northeast',
              'Ascot Place Northeast',
              'Bryant Street Northeast',
              'C Street Northeast',
              'Channing Street Northeast',
              'Columbus Circle Northeast',
              'Congress Street Northeast',
              'Constitution Avenue Northeast',
              'Cromw

In [25]:
# Matches a trailing run of non-whitespace characters (starting at a word
# boundary), i.e. the last token of a street name.
street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)

# Trailing tokens that are left alone.
expected = ["Street", "Avenue", "Boulevard", "Drive", "Court", "Place", "Square", "Lane", "Road",
            "Trail", "Parkway", "Commons"]

# UPDATE THIS VARIABLE
# Replacement text for unwanted trailing tokens ("850" is simply removed).
mapping = { "NW": "Northwest",
            "NE": "Northeast",
            "850": ""
            }

def update_name(name, mapping):
    """Replace the trailing token of a street name using `mapping`.

    Args:
        name: street name to clean.
        mapping: dict from an unwanted trailing token to its replacement.

    Returns:
        `name` with its trailing token replaced when that token is not in
        `expected` and is a key of `mapping`; otherwise `name` unchanged
        (including when no trailing token is found, e.g. an empty string).
    """
    m = street_type_re.search(name)
    if m is None:
        # Nothing to inspect; previously this raised AttributeError.
        return name
    street_type = m.group()
    if street_type not in expected and street_type in mapping:
        # Splice by position so only the matched trailing token changes.
        # re.sub() with the token as a pattern would also rewrite earlier
        # occurrences and treat characters such as "." as regex syntax.
        name = name[:m.start()] + mapping[street_type] + name[m.end():]
    return name
update_name('Connecticut Avenue Northwest Suite 850', mapping)

'Connecticut Avenue Northwest Suite '

The audit function above reports the last word of each street name. Entries that need to be updated include abbreviations such as NW and NE. In some cases a secondary address (a suite or floor number) was mistakenly included in the street name: for example, "Connecticut Avenue Northwest Suite 850" and "17th St NW, 9th floor". These need to be cleaned as well. 

In [17]:
import xml.etree.cElementTree as ET
from collections import defaultdict
import re
import pprint

# OSM file passed to audit() at the end of this cell.
OSMFILE = "dc-cp"

def audit_postcode(postcodes, postcode):
    """Add `postcode` to the set stored under the same key in `postcodes`.

    Every value is recorded regardless of its length.
    """
    bucket = postcodes[postcode]
    bucket.add(postcode)


def is_postcode(elem):
    """Return True when the element's `k` attribute is exactly "addr:postcode"."""
    key = elem.attrib['k']
    return key == "addr:postcode"


def audit(osmfile):
    """Collect every `addr:postcode` value found in an OSM XML file.

    Args:
        osmfile: path of the OSM XML file to scan.

    Returns:
        A defaultdict(set) keyed by postcode value, as filled in by
        audit_postcode(). The mapping is also printed.
    """
    postcodes = defaultdict(set)
    # `with` closes the file even if parsing raises; binary mode lets the
    # XML parser handle the document encoding itself.
    with open(osmfile, "rb") as osm_file:
        # Listen for "end" events: iterparse only guarantees that an
        # element's children are present once its closing tag has been seen,
        # so on "start" events some <tag> children could be missing.
        for event, elem in ET.iterparse(osm_file, events=("end",)):
            if elem.tag == "node" or elem.tag == "way":
                for tag in elem.iter("tag"):
                    if is_postcode(tag):
                        audit_postcode(postcodes, tag.attrib['v'])
    # NOTE(review): the label still says "street types" although postcodes
    # are printed; the text is kept unchanged so existing output matches.
    # Single pre-formatted argument: prints the same text under Python 2 and 3.
    print('this are street types: %s' % (postcodes,))
    return postcodes

# Run the postcode audit on OSMFILE; the returned mapping is the cell output.
audit(OSMFILE)

this are street types: defaultdict(<type 'set'>, {'20005-7700': set(['20005-7700']), '20910': set(['20910']), '20737': set(['20737']), '20912': set(['20912']), '20422': set(['20422']), '20011-6927': set(['20011-6927']), '20006': set(['20006']), '20710': set(['20710']), '20010': set(['20010']), '20011': set(['20011']), '20012': set(['20012']), '20542': set(['20542']), '20420': set(['20420']), '20017': set(['20017']), '20018': set(['20018']), '20019': set(['20019']), '20401': set(['20401']), '20548': set(['20548']), '20005-1019': set(['20005-1019']), '20005-1013': set(['20005-1013']), '207842': set(['207842']), '20052': set(['20052']), '20024': set(['20024']), '2011': set(['2011']), '20740': set(['20740']), '20903': set(['20903']), '20742': set(['20742']), '20901': set(['20901']), '20712': set(['20712']), '20781': set(['20781']), '20005': set(['20005']), '20783': set(['20783']), '20002': set(['20002']), '20001': set(['20001']), '20260': set(['20260']), 'DC 20002': set(['DC 20002']), '200

defaultdict(set,
            {'20001': {'20001'},
             '20002': {'20002'},
             '20005': {'20005'},
             '20005-1009': {'20005-1009'},
             '20005-1013': {'20005-1013'},
             '20005-1019': {'20005-1019'},
             '20005-4111': {'20005-4111'},
             '20005-7700': {'20005-7700'},
             '20006': {'20006'},
             '20008': {'20008'},
             '20009': {'20009'},
             '20010': {'20010'},
             '20011': {'20011'},
             '20011-6927': {'20011-6927'},
             '20012': {'20012'},
             '20017': {'20017'},
             '20018': {'20018'},
             '20019': {'20019'},
             '20024': {'20024'},
             '2005': {'2005'},
             '20052': {'20052'},
             '20060': {'20060'},
             '20064': {'20064'},
             '2011': {'2011'},
             '20242': {'20242'},
             '20260': {'20260'},
             '20401': {'20401'},
             '20420': {'20420'},
   

Problems encountered:

Erroneous zip codes: Some zip codes have 4 or 6 characters instead of 5. One example is 207842. By looking up the address on Google Maps, I found that the correct zip code for that address is 20784. Since there were only a few of these, I was able to look each one up on the map and find the correct zip code. The update function then looks for these known-bad zip codes and replaces each one using a dictionary that maps it to the correct zip code.

Zip codes prefixed with the state abbreviation: for example, DC 20002. These are mapped to the bare five-digit zip code.

Zip codes followed by dash: These are not necessarily wrong, but since we want them to be uniform, I removed the extra digits.

Note: because I ran the audit against the entire dataset (and not just the sample), I am confident that nothing was missed. Had I audited only the sample, problems that do not appear in it could have gone undetected.

In [12]:
# Corrections for the malformed postcodes found by the audit.
postcode_mapping = {'DC 20002': '20002',
                    '2011': '20011',
                    '2005': '20005',
                    '207842': '20784'}
def update_postcode(postcode):
    """Return a normalized form of `postcode`.

    Args:
        postcode: raw postcode string.

    Returns:
        The replacement from `postcode_mapping` when the value is a known
        bad one; otherwise the part before the first "-" when the value
        contains a dash; otherwise the value unchanged.
    """
    if postcode in postcode_mapping:
        return postcode_mapping[postcode]
    if '-' in postcode:
        # Split the argument itself: the previous code split an undefined
        # name `s`, so every hyphenated postcode raised NameError.
        return postcode.split('-')[0]
    return postcode

In [18]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-


import csv
import codecs
import pprint
import re
import xml.etree.cElementTree as ET

import cerberus

import schema

# OSM XML file that the __main__ block passes to process_map().
OSM_PATH = "dc-cp"

# Output CSV files written by process_map().
NODES_PATH = "nodes.csv"
NODE_TAGS_PATH = "nodes_tags.csv"
WAYS_PATH = "ways.csv"
WAY_NODES_PATH = "ways_nodes.csv"
WAY_TAGS_PATH = "ways_tags.csv"

# Tag keys that begin with lowercase letters/underscores, a colon, then more
# lowercase letters/underscores (for example "addr:street").
LOWER_COLON = re.compile(r'^([a-z]|_)+:([a-z]|_)+')
# Character class of punctuation and whitespace used to screen tag keys.
PROBLEMCHARS = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')

# Default schema used by validate_element().
SCHEMA = schema.schema

# Make sure the fields order in the csvs matches the column order in the sql table schema
NODE_FIELDS = ['id', 'lat', 'lon', 'user', 'uid', 'version', 'changeset', 'timestamp']
NODE_TAGS_FIELDS = ['id', 'key', 'value', 'type']
WAY_FIELDS = ['id', 'user', 'uid', 'version', 'changeset', 'timestamp']
WAY_TAGS_FIELDS = ['id', 'key', 'value', 'type']
WAY_NODES_FIELDS = ['id', 'node_id', 'position']


def _shape_tag(tag, element_id, problem_chars, default_tag_type):
    """Turn one <tag> child into a row dict for a tags CSV, or None to skip it."""
    key = tag.attrib["k"]
    # search(), not match(): a problematic character anywhere in the key
    # disqualifies the tag, not only one in the first position.
    if problem_chars.search(key):
        return None
    value = tag.attrib["v"]
    if LOWER_COLON.match(key):
        # Text before the first colon becomes the type; the remainder
        # (which may contain further colons) stays as the key.
        tag_type, key = key.split(":", 1)
        if key == 'postcode':
            value = update_postcode(value)
    else:
        tag_type = default_tag_type
    return {"id": element_id, "key": key, "value": value, "type": tag_type}


def shape_element(element, node_attr_fields=NODE_FIELDS, way_attr_fields=WAY_FIELDS,
                  problem_chars=PROBLEMCHARS, default_tag_type='regular'):
    """Clean and shape node or way XML element to Python dict

    Args:
        element: parsed XML element.
        node_attr_fields: attribute names copied for a node.
        way_attr_fields: attribute names copied for a way.
        problem_chars: compiled regex; tags whose key contains a match are dropped.
        default_tag_type: "type" given to tags whose key does not match LOWER_COLON.

    Returns:
        For a node: {'node': attribs, 'node_tags': [tag rows]}.
        For a way: {'way': attribs, 'way_nodes': [nd rows], 'way_tags': [tag rows]}.
        For any other element: None.

    Each tag row has "id", "key", "value" and "type"; postcode values are
    passed through update_postcode(). Each nd row has "id", "node_id" and
    "position".
    """
    tags = []  # Handle secondary tags the same way for both node and way elements

    if element.tag == 'node':
        node_attribs = {field: element.attrib[field]
                        for field in node_attr_fields if field in element.attrib}
        for child in element:
            if child.tag != "tag":
                continue
            row = _shape_tag(child, element.attrib["id"], problem_chars, default_tag_type)
            if row is not None:
                tags.append(row)
        return {'node': node_attribs, 'node_tags': tags}

    if element.tag == 'way':
        way_attribs = {field: element.attrib[field]
                       for field in way_attr_fields if field in element.attrib}
        way_nodes = []
        for child in element:
            if child.tag == "tag":
                # Every kept tag gets a "value"; colon-keyed way tags other
                # than postcode used to be appended without one.
                row = _shape_tag(child, element.attrib["id"], problem_chars, default_tag_type)
                if row is not None:
                    tags.append(row)
            elif child.tag == "nd":
                way_nodes.append({
                    "id": element.attrib["id"],
                    "node_id": child.attrib["ref"],
                    # Index among the <nd> children only, so interleaved
                    # <tag> children cannot leave gaps in the sequence.
                    "position": len(way_nodes),
                })
        return {'way': way_attribs, 'way_nodes': way_nodes, 'way_tags': tags}

    return None
    


# ================================================== #
#               Helper Functions                     #
# ================================================== #
def get_element(osm_file, tags=('node', 'way', 'relation')):
    """Generate every completed element whose tag name appears in `tags`"""

    events = ET.iterparse(osm_file, events=('start', 'end'))
    # The very first event carries the document root.
    root = next(events)[1]
    for phase, current in events:
        closed = phase == 'end'
        if closed and current.tag in tags:
            yield current
            # Once the consumer resumes, drop what has accumulated under the root.
            root.clear()


def validate_element(element, validator, schema=SCHEMA):
    """Raise an Exception describing one failing field if element does not match schema"""
    passed = validator.validate(element, schema)
    if passed is True:
        return

    # Report the first (field, errors) pair the validator exposes.
    field, errors = next(validator.errors.iteritems())
    raise Exception(
        "\nElement of type '{0}' has the following errors:\n{1}".format(
            field, pprint.pformat(errors)))


class UnicodeDictWriter(csv.DictWriter, object):
    """csv.DictWriter variant that UTF-8 encodes unicode values before writing"""

    def writerow(self, row):
        encoded = {}
        for field, value in row.iteritems():
            # Only unicode objects are encoded; every other value passes through.
            if isinstance(value, unicode):
                value = value.encode('utf-8')
            encoded[field] = value
        super(UnicodeDictWriter, self).writerow(encoded)

    def writerows(self, rows):
        # Route each row through writerow() so the encoding step is applied.
        for record in rows:
            self.writerow(record)


# ================================================== #
#               Main Function                        #
# ================================================== #
def process_map(file_in, validate):
    """Stream node and way elements from `file_in` into the five CSV files.

    Each element is shaped with shape_element(); when `validate` is True the
    shaped dict is checked with validate_element() before it is written.
    """

    with codecs.open(NODES_PATH, 'w') as nodes_out, \
         codecs.open(NODE_TAGS_PATH, 'w') as node_tags_out, \
         codecs.open(WAYS_PATH, 'w') as ways_out, \
         codecs.open(WAY_NODES_PATH, 'w') as way_nodes_out, \
         codecs.open(WAY_TAGS_PATH, 'w') as way_tags_out:

        node_rows = UnicodeDictWriter(nodes_out, NODE_FIELDS)
        node_tag_rows = UnicodeDictWriter(node_tags_out, NODE_TAGS_FIELDS)
        way_rows = UnicodeDictWriter(ways_out, WAY_FIELDS)
        way_node_rows = UnicodeDictWriter(way_nodes_out, WAY_NODES_FIELDS)
        way_tag_rows = UnicodeDictWriter(way_tags_out, WAY_TAGS_FIELDS)

        # Every CSV starts with its header row.
        for csv_writer in (node_rows, node_tag_rows, way_rows, way_node_rows, way_tag_rows):
            csv_writer.writeheader()

        checker = cerberus.Validator()

        for element in get_element(file_in, tags=('node', 'way')):
            shaped = shape_element(element)
            if not shaped:
                continue

            if validate is True:
                validate_element(shaped, checker)

            if element.tag == 'node':
                node_rows.writerow(shaped['node'])
                node_tag_rows.writerows(shaped['node_tags'])
            elif element.tag == 'way':
                way_rows.writerow(shaped['way'])
                way_node_rows.writerows(shaped['way_nodes'])
                way_tag_rows.writerows(shaped['way_tags'])


if __name__ == '__main__':
    # Note: Validation is ~ 10X slower. For the project consider using a small
    # sample of the map when validating.
    # Convert OSM_PATH to the CSV files with schema validation disabled.
    process_map(OSM_PATH, validate=False) #turned off on 7/16/17


In [None]:
def update_name(name):
    # Placeholder: no cleaning is implemented here; `name` is returned unchanged.
    # NOTE(review): running this cell rebinds update_name and replaces the
    # two-argument update_name(name, mapping) defined earlier in the notebook.

    # your code
    return name

def shape_element(element):
    # NOTE(review): template only, not runnable as written. `node` and `tag`
    # are never defined in this function, and the last two lines are indented
    # three spaces while the rest of the body uses four.
    # NOTE(review): running this cell would rebind shape_element and replace
    # the full implementation defined earlier in the notebook.
    #your code

    #last problem version
    node["address"]["street"] = tag.attrib['v']

    #project version with update_name
    node["address"]["street"] = update_name(tag.attrib['v'])
 
   #your code
   return node