## Part I. Iterative Parsing

In [1]:
import xml.etree.ElementTree as ET
import pprint
from collections import defaultdict

In [2]:
def count_tags(filename):
    """Tally how often each tag name occurs in an XML file.

    The document is streamed with ``ET.iterparse`` and every element the
    parser reports is counted once under its tag name.

    Args:
        filename: Path (or file object) of the XML document to scan.

    Returns:
        A ``defaultdict(int)`` mapping tag name to number of occurrences.
    """
    tally = defaultdict(int)
    for _event, element in ET.iterparse(filename):
        tally[element.tag] += 1
    return tally

In [4]:
def test():
    """Check count_tags against the known tag totals of 'example.osm.xml'."""
    expected_counts = {
        'bounds': 1,
        'member': 3,
        'nd': 4,
        'node': 20,
        'osm': 1,
        'relation': 1,
        'tag': 7,
        'way': 1,
    }
    tags = count_tags('example.osm.xml')
    pprint.pprint(tags)
    assert tags == expected_counts


if __name__ == "__main__":
    test()

defaultdict(<class 'int'>,
            {'bounds': 1,
             'member': 3,
             'nd': 4,
             'node': 20,
             'osm': 1,
             'relation': 1,
             'tag': 7,
             'way': 1})


## Part II. Tag Types

In [5]:
import xml.etree.ElementTree as ET
import pprint
import re

In [6]:
# Matches a string made up solely of lowercase ASCII letters and underscores
# (the empty string also matches).
lower = re.compile(r'^([a-z]|_)*$')
# Same alphabet as `lower`, but split into two parts by exactly one ':'.
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
# Matches any single one of the listed characters anywhere in the string:
# = + / & < > ; ' " ? % # $ @ , . space, tab, CR or LF.
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')

In [7]:
def key_type(element, keys):
    """Classify the 'k' attribute of a <tag> element and bump its counter.

    The key is tested against ``lower``, ``lower_colon`` and ``problemchars``
    in that order; the first pattern that matches decides the category, and
    a key matching none of them is counted as 'other'.  Elements whose tag
    is not "tag" leave ``keys`` untouched.

    Args:
        element: Parsed XML element.
        keys: Dict of counters with the entries 'lower', 'lower_colon',
            'problemchars' and 'other'; updated in place.

    Returns:
        The same ``keys`` dict that was passed in.
    """
    if element.tag != "tag":
        return keys

    k_value = element.attrib['k']
    if lower.search(k_value):
        category = 'lower'
    elif lower_colon.search(k_value):
        category = 'lower_colon'
    elif problemchars.search(k_value):
        category = "problemchars"
    else:
        category = 'other'

    keys[category] += 1
    return keys

In [8]:
def process_map(filename):
    """Count the tag keys of an XML file by category.

    Every element reported by ``ET.iterparse`` is handed to ``key_type``,
    which updates the running counters.

    Args:
        filename: Path (or file object) of the XML document to scan.

    Returns:
        Dict with the counters 'lower', 'lower_colon', 'problemchars' and
        'other'.
    """
    counts = dict.fromkeys(
        ("lower", "lower_colon", "problemchars", "other"), 0
    )
    for _event, elem in ET.iterparse(filename):
        counts = key_type(elem, counts)
    return counts

In [10]:
def test():
    """Print the key-category counts computed for 'example.osm.xml'."""
    # Another test file such as 'map.osm' can be used to look at the
    # solution, but the assertion below would then no longer apply.
    key_counts = process_map('example.osm.xml')
    pprint.pprint(key_counts)
    # Check left disabled, as in the original:
    # assert key_counts == {'lower': 5, 'lower_colon': 0, 'other': 1, 'problemchars': 1}


if __name__ == "__main__":
    test()

{'lower': 7, 'lower_colon': 0, 'other': 0, 'problemchars': 0}


## Part III. Exploring Users

In [11]:
import xml.etree.ElementTree as ET
import pprint
import re

In [12]:
def get_user(element):
    """Placeholder: ignores ``element`` and always returns ``None``."""
    return None

In [13]:
def process_map(filename):
    """Collect the distinct contributor ids found in an OSM XML file.

    Only ``node``, ``way`` and ``relation`` elements are inspected.  Elements
    of those kinds that carry no ``uid`` attribute are skipped rather than
    aborting the whole scan with a ``KeyError``.

    Args:
        filename: Path (or file object) of the OSM XML document.

    Returns:
        Set of the ``uid`` attribute values (strings) that were seen.
    """
    users = set()
    for _event, element in ET.iterparse(filename):
        if element.tag not in ("node", "way", "relation"):
            continue
        uid = element.attrib.get('uid')
        if uid is not None:
            users.add(uid)
    return users

In [15]:
def test():
    """Check that 'example.osm.xml' has exactly six distinct contributors."""
    unique_users = process_map('example.osm.xml')
    pprint.pprint(unique_users)
    user_total = len(unique_users)
    assert user_total == 6


if __name__ == "__main__":
    test()

{'26299', '1219059', '451048', '147510', '939355', '567034'}


## Part IV. Improving Street Names

In [16]:
import xml.etree.cElementTree as ET
from collections import defaultdict
import re
import pprint

In [18]:
# Name of the sample OSM extract.
OSMFILE = "example.osm.xml"

# Captures the final run of non-whitespace characters of a street name
# (starting at a word boundary, optionally ending in a period).
street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)


# Street types that are accepted as-is and therefore not reported by the audit.
expected = ["Street", "Avenue", "Boulevard", "Drive", "Court", "Place", "Square", "Lane", "Road",
            "Trail", "Parkway", "Commons"]

# Abbreviation -> full word.  Each abbreviation is listed both with and
# without its trailing period so that e.g. "W 9th St" and "W. 9th St." are
# expanded alike; "Ste" covers names such as 'S Tryon St Ste 105'.
mapping = {
    "St": "Street",
    "St.": "Street",
    "Ave": "Avenue",
    "Ave.": "Avenue",
    "Rd": "Road",
    "Rd.": "Road",
    "Ste": "Suite",
    "Ste.": "Suite",
    "N": "North",
    "N.": "North",
    "S": "South",
    "S.": "South",
    "E": "East",
    "E.": "East",
    "W": "West",
    "W.": "West",
}

In [19]:
def audit_street_type(street_types, street_name):
    """Record ``street_name`` under its street type when that type is unexpected.

    The street type is whatever ``street_type_re`` matches in the name.
    Names without a match, or whose type is listed in ``expected``, are
    ignored.

    Args:
        street_types: Mapping of street type -> set of street names; it is
            updated in place (a ``defaultdict(set)`` works).
        street_name: The street name to audit.
    """
    match = street_type_re.search(street_name)
    if not match:
        return
    suffix = match.group()
    if suffix in expected:
        return
    street_types[suffix].add(street_name)

In [20]:
def is_street_name(elem):
    """Return True when the element's 'k' attribute equals "addr:street".

    Raises:
        KeyError: If the element has no 'k' attribute.
    """
    key = elem.attrib['k']
    return key == "addr:street"

In [32]:
def audit(osmfile):
    """Collect the unexpected street types found in an OSM XML file.

    Every ``tag`` child of a ``node`` or ``way`` for which
    ``is_street_name`` is true has its 'v' value passed to
    ``audit_street_type``.

    Args:
        osmfile: Path of the OSM XML file to audit.

    Returns:
        A ``defaultdict(set)`` mapping each problematic street type to the
        set of street names that use it, for use with the update() name
        mapping.
    """
    street_types = defaultdict(set)
    # Binary mode lets the XML parser apply the document's own encoding, and
    # the with-block guarantees the handle is closed even if parsing fails.
    with open(osmfile, "rb") as osm_file:
        # "end" events are used because only then is an element guaranteed
        # to be fully populated; on "start" its children may still be missing.
        for _event, elem in ET.iterparse(osm_file, events=("end",)):
            if elem.tag in ("node", "way"):
                for tag in elem.iter("tag"):
                    if is_street_name(tag):
                        audit_street_type(street_types, tag.attrib['v'])
            # Only completed top-level elements are cleared: clearing a child
            # <tag> at its own "end" event would wipe its attributes before
            # the enclosing node/way gets to read them.
            if elem.tag in ("node", "way", "relation"):
                elem.clear()
    return street_types

In [29]:
def update(name, mapping):
    """Expand every abbreviated word of ``name`` using ``mapping``.

    The name is split on whitespace and each word that is a key of
    ``mapping`` is replaced by its value, except when the word directly
    follows 'Suite', 'Ste.' or 'Ste' (case-insensitive), so that e.g.
    'Suite E' is not turned into 'Suite East'.  The words are re-joined with
    single spaces.

    Note (kept from the original docstring): implemented in data.py.

    Args:
        name: Street name / address string to clean.
        mapping: Dict of abbreviation -> replacement word.

    Returns:
        The cleaned name.
    """
    suite_markers = ('suite', 'ste.', 'ste')
    words = name.split()
    for index, word in enumerate(words):
        if word not in mapping:
            continue
        # The first word has no predecessor; without the index check,
        # words[index - 1] would wrap around to the LAST word and wrongly
        # suppress the replacement for names ending in 'Suite'/'Ste'.
        if index > 0 and words[index - 1].lower() in suite_markers:
            continue
        words[index] = mapping[word]
    return " ".join(words)

In [30]:
def update_name(name, mapping):
    """Expand abbreviations in ``name``, depending on how the name ends.

    The final token is whatever ``street_type_re`` matches.

    * If that token is an integer, every preceding whitespace-separated word
      found in ``mapping`` is replaced and the number is appended again.
      Example: 'S Tryon St Ste 105' -> 'South Tryon Street Suite 105'
      (given suitable ``mapping`` entries).
    * Otherwise only the final token is replaced, if it is in ``mapping``.
      Example: 'This St.' -> 'This Street'.

    Args:
        name: Street name / address string to clean.
        mapping: Dict of abbreviation -> replacement word.

    Returns:
        The cleaned name; ``name`` unchanged when there is nothing to
        replace or the pattern does not match at all.
    """
    match = street_type_re.search(name)
    if match is None:
        # e.g. an empty string: nothing to inspect, nothing to fix.
        return name
    last = match.group()

    try:
        int(last)
    except ValueError:
        # Non-numeric ending: fix only the last substring.  match.start() is
        # used instead of name.index(last) because the same text can occur
        # earlier in the name (e.g. 'St. Charles St.'), and cutting at the
        # first occurrence would drop the middle of the name.
        if last in mapping:
            return name[:match.start()] + mapping[last]
        return name

    # Numeric ending: fix all substrings in front of the number.
    words = [mapping.get(word, word) for word in name.split()[:-1]]
    words.append(last)
    return " ".join(words)


In [34]:
def main_test():
    """Audit 'charlotte.osm' and spot-check the cleaned street names.

    Asserts that the audit reports 19 problematic street types, prints them,
    then prints every audited name next to its ``update`` result.

    NOTE: the 'S Tryon St Ste 105' check can only pass when ``mapping`` has
    entries for 'S' and 'Ste'.
    """
    st_types = audit("charlotte.osm")
    assert len(st_types) == 19
    pprint.pprint(dict(st_types))
    # dict.iteritems() exists only on Python 2; only the name sets are
    # needed here, so iterate over the values directly.
    for ways in st_types.values():
        for name in ways:
            better_name = update(name, mapping)
            print(name, "=>", better_name)
            if name == "West Stanly St.":
                assert better_name == "West Stanly Street"
            if name == "S Tryon St Ste 105":
                assert better_name == "South Tryon Street Suite 105"

In [39]:
def example_test():
    """Audit 'example.osm.xml' and spot-check the cleaned street names.

    Prints the audit result, then every audited name next to the value
    ``update`` produces for it.
    """
    # Known results that are verified whenever the name shows up in the audit.
    spot_checks = (
        ("Winthrop Ave", "Winthrop Avenue"),
        ("W 9th St", "West 9th Street"),
    )
    st_types = audit("example.osm.xml")
    #assert len(st_types) == 6
    pprint.pprint(dict(st_types))
    for _st_type, ways in st_types.items():
        for name in ways:
            better_name = update(name, mapping)
            print(name, "=>", better_name)
            for original, cleaned in spot_checks:
                if name == original:
                    assert better_name == cleaned


if __name__ == '__main__':
    example_test()

{}
