In [4]:
#https://classroom.udacity.com/courses/ud032/lessons/768058569/concepts/8443086480923
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Your task is to use the iterative parsing to process the map file and
find out not only what tags are there, but also how many, to get the
feeling on how much of which data you can expect to have in the map.
Fill out the count_tags function. It should return a dictionary with the 
tag name as the key and number of times this tag can be encountered in 
the map as value.

Note that your code will be tested with a different data file than the 'example.osm'
"""



#Downloaded OSM file, using ElementTree to parse through it
import xml.etree.ElementTree as ET  
import pprint
inputfile = 'sample.xml'
tags = {}
#Count number# of element Types
def count_tags(inputfile):
    #Reads XML file: sample.xml and count each XML tag within the document
    
    for _, elemt in ET.iterparse(inputfile):
        tag = elemt.tag
        if tag not in tags:
            tags[tag] = 1
        else:
            tags[tag] += 1
    #returns a Dictionary consist of tag_names:(counts as values)
        
    return tags    

pprint.pprint(count_tags(inputfile))

{'bounds': 1,
 'member': 23942,
 'meta': 1,
 'nd': 668786,
 'node': 556381,
 'note': 1,
 'osm': 1,
 'relation': 569,
 'tag': 210698,
 'way': 65070}


In [22]:
from PIL import Image
im = Image.open("Melbourne.png")
im.show()

In [5]:
#identifying unique users contribute the to the map area: Melbouren, FL. 
"""
Your task is to explore the data a bit more.
The first task is a fun one - find out how many unique users
have contributed to the map in this particular area!

The function process_map should return a set of unique user IDs ("uid")
"""

def process_map(inputfile):  
    users = set()
    for _, element in ET.iterparse(inputfile):
        for e in element:
            if 'uid' in e.attrib:
                users.add(e.attrib['uid'])

    return users
#the function process_map return a set of unique user "uid"
users = process_map(inputfile)  
print('Number of unique users:', len(users)) 

Number of unique users: 797


In [6]:
#https://classroom.udacity.com/courses/ud032/lessons/768058569/concepts/8402186170923
"""
Your task is to explore the data a bit more.
Before you process the data and add it into your database, you should check the
"k" value for each "<tag>" and see if there are any potential problems.

We have provided you with 3 regular expressions to check for certain patterns
in the tags. As we saw in the quiz earlier, we would like to change the data
model and expand the "addr:street" type of keys to a dictionary like this:
{"address": {"street": "Some value"}}
So, we have to see if we have such tags, and if we have any tags with
problematic characters.

Please complete the function 'key_type', such that we have a count of each of
four tag categories in a dictionary:
  "lower", for tags that contain only lowercase letters and are valid,
  "lower_colon", for otherwise valid tags with a colon in their names,
  "problemchars", for tags with problematic characters, and
  "other", for other tags that do not fall into the other three categories.
See the 'process_map' and 'test' functions for examples of the expected format.
"""

import re

lower_case = re.compile(r'^([a-z]|_)*$')  
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')  
prob_chars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')

def key_type(element, keys):
    #tag's attribute matches a regular express and counts # tags.
    if element.tag == "tag":
        if prob_chars.search(element.attrib['k']):
            keys['prob_chars'] +=1
            #print element.attrib['k']
        elif lower_colon.search(element.attrib['k']):
            keys['lower_colon'] +=1
        elif lower_case.search(element.attrib['k']):
            keys['lower_case'] +=1    
        else: 
            keys['other'] +=1
        
        
    return keys
   

def process_map(inputfile):  
    keys = {"lower_case": 0, "lower_colon": 0, "prob_chars": 0, "other": 0}
    # Iterates through an XML file and create a Dict of keys/count.

    for _, element in ET.iterparse(inputfile):
        keys = key_type(element, keys)

    return keys

keys = process_map(inputfile)  
pprint.pprint(keys)  

{'lower_case': 140764, 'lower_colon': 66274, 'other': 3660, 'prob_chars': 0}


In [7]:
from collections import defaultdict

street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)

expected_street = ["Street", "Avenue", "Boulevard", "Drive", "Court", "Place", "Square", "Lane", "Road",
            "Trail", "Parkway", "Commons", "Circle", "Gate", "Heights", "Park", "Way", "Wood", "Path", "Terrace"]


In [8]:
#https://classroom.udacity.com/courses/ud032/lessons/768058569/concepts/8755386140923
#https://docs.python.org/3/howto/regex.html
def audit_street_type(street_types, street_name, regex, expected_street):  
    m = regex.search(street_name)
    if m:
        street_type = m.group()
        if street_type not in expected_street:
            street_types[street_type].add(street_name)

In [9]:
def is_street_name(elem):  
    return (elem.attrib['k'] == "addr:street")

In [10]:
#https://classroom.udacity.com/courses/ud032/lessons/768058569/concepts/8755386140923
def audit(inputfile, regex):  
    inputfile = open(inputfile, "r")
    street_types = defaultdict(set)

    # iteratively parse the mapping xml
    for event, elem in ET.iterparse(inputfile, events=("start",)):
       #Filtering: the Tag property for this element object has to be "way"
        if elem.tag == "way":
            for tag in elem.iter("tag"):
                if is_street_name(tag):
                    audit_street_type(street_types, tag.attrib['v'], regex, expected_street)

    return street_types
street_types = audit(inputfile, street_type_re)

pprint.pprint(dict(street_types)) 

{'1': {'US 1'},
 'A1A': {'2055 Highway A1A',
         'Highway A1A',
         'N Hwy A1A',
         'North State Route A1A'},
 'Ave': {'Pineapple Ave', 'W New Haven Ave'},
 'BLVD': {'Bayside Lakes BLVD'},
 'Blvd': {'Caribbean Isle Blvd',
          'Hampton Greens Blvd',
          'Jamaica Blvd',
          'Lake Crest Blvd',
          'N Harbor City Blvd',
          'S Fiske Blvd',
          'Viera Blvd',
          'W Eau Gallie Blvd',
          'W NASA Blvd'},
 'Cir': {'Flower Tree Cir', 'Kenwood Cir'},
 'Ct': {'E Azalea Ct',
        'Ellen Ct',
        'Mimosa Ct',
        'Quayside Ct',
        'W Azalea Ct',
        'Webster Ct',
        'Wedge Ct',
        'Wentworth Ct',
        'Westfield Ct',
        'Westview Ct',
        'Westwind Ct',
        'Windover Ct',
        'Windsor Ct',
        'Wycliff Ct'},
 'D': {'E Avenue D'},
 'Dr': {'Central Park Dr',
        'College Wood Dr',
        'Cozumel Dr',
        'Croftwood Dr',
        'E Riverside Dr',
        'Eagle Dr',
        '

In [11]:
#https://docs.python.org/3/howto/regex.html
#Scan through a string, looking for any location where this RE matches.
def update_name(name, mapping, regex):  
    m = regex.search(name)
    if m:
        street_type = m.group()
        if street_type in mapping:
            name = re.sub(regex, mapping[street_type], name)

    return name

In [12]:
street_type_mapping = {'Ave'  : 'Avenue',  
                       'Blvd' : 'Boulevard',
                       'Dr'   : 'Drive',
                       'Ln'   : 'Lane',
                       'Pkwy' : 'Parkway',
                       'Rd'   : 'Road',
                       'St'   : 'Street'}
                       

In [13]:
street_type_re  = re.compile(r'\b\S+\.?$', re.IGNORECASE)  

In [14]:
for street_type, ways in street_types.items():  
    for name in ways:
        better_name = update_name(name, street_type_mapping, street_type_re)
        print(name, "=>", better_name)

Pineapple Ave => Pineapple Avenue
W New Haven Ave => W New Haven Avenue
Post Rd => Post Road
N Wickham Rd => N Wickham Road
Sarno Rd => Sarno Road
Jamaica Blvd => Jamaica Boulevard
Caribbean Isle Blvd => Caribbean Isle Boulevard
N Harbor City Blvd => N Harbor City Boulevard
W Eau Gallie Blvd => W Eau Gallie Boulevard
Lake Crest Blvd => Lake Crest Boulevard
W NASA Blvd => W NASA Boulevard
Hampton Greens Blvd => Hampton Greens Boulevard
Viera Blvd => Viera Boulevard
S Fiske Blvd => S Fiske Boulevard
Steeplechase Ln => Steeplechase Lane
Fire Fall Ln => Fire Fall Lane
Mollie Ln => Mollie Lane
Breeze Ln => Breeze Lane
Sanddune Ln => Sanddune Lane
Hampton Park Ln => Hampton Park Lane
Patty Ln => Patty Lane
Bogey Ln => Bogey Lane
Sand Trap Ln => Sand Trap Lane
Rhonda Ln => Rhonda Lane
Plantation Club Dr => Plantation Club Drive
Croftwood Dr => Croftwood Drive
Merion Dr => Merion Drive
E Riverside Dr => E Riverside Drive
College Wood Dr => College Wood Drive
Equinox Dr => Equinox Drive
Oakwood

In [34]:
street_type_pre = re.compile(r'^[NSEW]\b\.?', re.IGNORECASE)  

In [35]:
cardinal_directions = audit(inputfile, street_type_pre)

pprint.pprint(dict(cardinal_directions))  


{'E': {'E Eau Gallie Blvd', 'E Riverside Dr', 'E Avenue D', 'E Azalea Ct'},
 'N': {'N Babcock St',
       'N Harbor City Blvd',
       'N Highway A1A',
       'N Hwy A1A',
       'N Wickham Rd'},
 'S': {'S Fiske Blvd',
       'S Harbor City Blvd #328',
       'S Hwy A1A',
       'S Patrick Dr',
       'S Wickham Rd'},
 'W': {'W NASA Blvd', 'W Azalea Ct', 'W New Haven Ave'}}


In [38]:
cardinal_mapping = {'E'  : 'East',  
                    'E.' : 'East',
                    'N'  : 'North',
                    'N.' : 'North',
                    'S'  : 'South',
                    'S.' : 'South',
                    'W'  : 'West',
                    'W.' : 'West'}

In [41]:
for cardinal_direction, ways in cardinal_directions.items():  
    if cardinal_direction in cardinal_mapping:
        for name in ways:
            better_name = update_name(name, street_type_mapping, street_type_re)
            best_name   = update_name(better_name, cardinal_mapping, street_type_pre)
            print(name, "=>", better_name, "=>", best_name)

N Harbor City Blvd => N Harbor City Boulevard => North Harbor City Boulevard
N Hwy A1A => N Hwy A1A => North Hwy A1A
N Wickham Rd => N Wickham Road => North Wickham Road
N Highway A1A => N Highway A1A => North Highway A1A
N Babcock St => N Babcock Street => North Babcock Street
E Eau Gallie Blvd => E Eau Gallie Boulevard => East Eau Gallie Boulevard
E Riverside Dr => E Riverside Drive => East Riverside Drive
E Avenue D => E Avenue D => East Avenue D
E Azalea Ct => E Azalea Ct => East Azalea Ct
W NASA Blvd => W NASA Boulevard => West NASA Boulevard
W Azalea Ct => W Azalea Ct => West Azalea Ct
W New Haven Ave => W New Haven Avenue => West New Haven Avenue
S Harbor City Blvd #328 => S Harbor City Blvd #328 => South Harbor City Blvd #328
S Wickham Rd => S Wickham Road => South Wickham Road
S Fiske Blvd => S Fiske Boulevard => South Fiske Boulevard
S Hwy A1A => S Hwy A1A => South Hwy A1A
S Patrick Dr => S Patrick Drive => South Patrick Drive


In [51]:
"""
Your task is to wrangle the data and transform the shape of the data
into the model we mentioned earlier. The output should be a list of dictionaries
that look like this:

{
"id": "2406124091",
"type: "node",
"visible":"true",
"created": {
          "version":"2",
          "changeset":"17206049",
          "timestamp":"2013-08-03T16:43:42Z",
          "user":"linuxUser16",
          "uid":"1219059"
        },
"pos": [41.9757030, -87.6921867],
"address": {
          "housenumber": "5157",
          "postcode": "60625",
          "street": "North Lincoln Ave"
        },
"amenity": "restaurant",
"cuisine": "mexican",
"name": "La Cabana De Don Luis",
"phone": "1 (773)-271-5176"
}

You have to complete the function 'shape_element'.
We have provided a function that will parse the map file, and call the function with the element
as an argument. You should return a dictionary, containing the shaped data for that element.
We have also provided a way to save the data in a file, so that you could use
mongoimport later on to import the shaped data into MongoDB. 

Note that in this exercise we do not use the 'update street name' procedures
you worked on in the previous exercise. If you are using this code in your final
project, you are strongly encouraged to use the code from previous exercise to 
update the street names before you save them to JSON. 

In particular the following things should be done:
- you should process only 2 types of top level tags: "node" and "way"
- all attributes of "node" and "way" should be turned into regular key/value pairs, except:
    - attributes in the CREATED array should be added under a key "created"
    - attributes for latitude and longitude should be added to a "pos" array,
      for use in geospacial indexing. Make sure the values inside "pos" array are floats
      and not strings. 
- if the second level tag "k" value contains problematic characters, it should be ignored
- if the second level tag "k" value starts with "addr:", it should be added to a dictionary "address"
- if the second level tag "k" value does not start with "addr:", but contains ":", you can
  process it in a way that you feel is best. For example, you might split it into a two-level
  dictionary like with "addr:", or otherwise convert the ":" to create a valid key.
- if there is a second ":" that separates the type/direction of a street,
  the tag should be ignored, for example:

<tag k="addr:housenumber" v="5158"/>
<tag k="addr:street" v="North Lincoln Avenue"/>
<tag k="addr:street:name" v="Lincoln"/>
<tag k="addr:street:prefix" v="North"/>
<tag k="addr:street:type" v="Avenue"/>
<tag k="amenity" v="pharmacy"/>

  should be turned into:

{...
"address": {
    "housenumber": 5158,
    "street": "North Lincoln Avenue"
}
"amenity": "pharmacy",
...
}

- for "way" specifically:

  <nd ref="305896090"/>
  <nd ref="1719825889"/>

should be turned into
"node_refs": ["305896090", "1719825889"]
"""

from datetime import datetime
import json  
from bson import json_util

CREATED = ["version", "changeset", "timestamp", "user", "uid"]

def shape_element(element):  
    node = {}
    if element.tag == "node" or element.tag == "way" :
        node['type'] = element.tag

        
        for attrib in element.attrib:

            
            if attrib in CREATED:
                if 'created' not in node:
                    node['created'] = {}
                if attrib == 'timestamp':
                    node['created'][attrib] = datetime.strptime(element.attrib[attrib], '%Y-%m-%dT%H:%M:%SZ')
                else:
                    node['created'][attrib] = element.get(attrib)

            
            if attrib in ['lat', 'lon']:
                lat = float(element.attrib.get('lat'))
                lon = float(element.attrib.get('lon'))
                node['pos'] = [lat, lon]

            
            else:
                node[attrib] = element.attrib.get(attrib)

        
        for tag in element.iter('tag'):
            key   = tag.attrib['k']
            value = tag.attrib['v']
            if not prob_chars.search(key):

                
                if lower_colon.search(key) and key.find('addr') == 0:
                    if 'address' not in node:
                        node['address'] = {}
                    sub_attr = key.split(':')[1]
                    if is_street_name(tag):
                        # Do some cleaning
                        better_name = update_name(name, street_type_mapping, street_type_re)
                        best_name   = update_name(better_name, cardinal_mapping, street_type_pre)
                        node['address'][sub_attr] = best_name
                    else:
                        node['address'][sub_attr] = value

                
                elif not key.find('addr') == 0:
                    if key not in node:
                        node[key] = value
                else:
                    node["tag:" + key] = value

        
        for nd in element.iter('nd'):
            if 'node_refs' not in node:
                node['node_refs'] = []
            node['node_refs'].append(nd.attrib['ref'])

        return node
    else:
        return None


def process_map(inputfile, pretty = False):  
    file_out = "{0}.json".format(inputfile)
    with open(file_out, "w") as fo:
        for _, element in ET.iterparse(inputfile):
            el = shape_element(element)
            if el:
                if pretty:
                    fo.write(json.dumps(el, indent=2, default=json_util.default)+"\n")
                else:
                    fo.write(json.dumps(el, default=json_util.default) + "\n")
                    

process_map(inputfile)  

In [53]:
import os  
print('The downloaded file is {} MB'.format(os.path.getsize(inputfile)/1.0e6)) 


The downloaded file is 123.241036 MB


In [54]:
print('The json file is {} MB'.format(os.path.getsize(inputfile + ".json")/1.0e6))   

The json file is 211.0347 MB
