### Create sample file from initial OSM file download

In [85]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

#########################################
# Create a sample file for initial study
import xml.etree.ElementTree as ET  # Use cElementTree or lxml if too slow

OSM_FILE = "san-francisco_california.osm"  # Replace this with your osm file
SAMPLE_FILE = "sf_sample.osm"

k = 10 # Parameter: take every k-th top level element

def get_element(osm_file, tags=('node', 'way', 'relation')):
    """Yield element if it is the right type of tag

    Reference:
    http://stackoverflow.com/questions/3095434/inserting-newlines-in-xml-file-generated-via-xml-etree-elementtree-in-python
    """
    context = iter(ET.iterparse(osm_file, events=('start', 'end')))
    _, root = next(context)
    for event, elem in context:
        if event == 'end' and elem.tag in tags:
            yield elem
            root.clear()


with open(SAMPLE_FILE, 'wb') as output:
    output.write('<?xml version="1.0" encoding="UTF-8"?>\n')
    output.write('<osm>\n  ')

    # Write every kth top level element
    for i, element in enumerate(get_element(OSM_FILE)):
        if i % k == 0:
            output.write(ET.tostring(element, encoding='utf-8'))

    output.write('</osm>')

### Initial study of the OSM file

In [1]:
import xml.etree.cElementTree as ET
import pprint
from collections import defaultdict
import re
import string
import codecs
import json

#from sets import Set


    
###################################
# Study components of the OSM file

dist_tags = defaultdict(int)
# Count instance of distinct tags
def count_tags(elem,elem_tag):
    
    if elem.tag == elem_tag:
        
        for elements in elem.iter(None):
            #print elements.tag

            dist_tags[elements.tag] +=1
           
    return dist_tags




# return counts of values for tag attribute

lower = re.compile(r'^([a-z]|_)*$')
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')

alltags={}
alltags["lower"]= defaultdict(int)
alltags["lower_colon"]= defaultdict(int)
alltags["problemchars"]= defaultdict(int)
alltags["others"]= defaultdict(int)

def tag_values(event,elem,elem_tag):
    if elem.tag ==elem_tag:
        for elements in elem.iter('tag'): 
            if re.match(lower,elements.attrib['k']):
                alltags["lower"][elements.attrib['k']]+=1
            elif re.match(lower_colon,elements.attrib['k']):
                alltags["lower_colon"][elements.attrib['k']]+=1         
            elif re.match(problemchars,elements.attrib['k']):
                alltags["problemchars"][elements.attrib['k']]+=1 
            else:
                 alltags["others"][elements.attrib['k']]+=1
            

    return alltags






# return counts of values for tag attribute



zip_val= defaultdict(set)

def zip_values(event,elem,elem_tag):
    if elem.tag == elem_tag:
        for elements in elem.iter('tag'):
            if 'zip' in elements.attrib['k'] and len(elements.attrib['v']) > 6:
                zip_val[elements.attrib['k']].add( elements.attrib['v']  )         
    return zip_val


    
# Check Address values
expected = ["Street", "Avenue", "Boulevard", "Drive", "Court", "Place", "Square", "Lane", "Road", 
            "Trail", "Parkway", "Commons", "Real", "Terrace", "Way", "Circle","Highway"]

pattern = re.compile(r'\b\S+\.?$', re.IGNORECASE)

address = defaultdict(set)

def address_values(elem,elem_tag):
    #addr= defaultdict(set)
    if elem.tag == elem_tag:
        for elements in elem.iter('tag'): 
            if elements.attrib['k'] == 'addr:street':
                m = pattern.search(elements.attrib['v'])
                if m:
                    addr_type = m.group()                
                    if addr_type not in expected:
                        address[addr_type].add(elements.attrib['v'] ) 
    #print address
    return address
        
        




# sort by value and print top 20 sorted list
def print_sorted_list(d, message):
    a = sorted(d.iteritems(), key = lambda (k,v): (-v,k)) 
    for i,key in enumerate(a):
        if i<21:
             print i,key

    


    
# Read and Parse the osm file 
counter=0
n_tags={}
ktag_values = {}
address_val = {}
county_list = {}



def read_file(filename,elem_tag):
    global counter 
    global n_tags 
    global ktag_values 
    global address_val 
    global county_list
    for event, elem in ET.iterparse(filename):        

        n_tags =count_tags(elem,elem_tag)  
        ktag_values = tag_values(event,elem,elem_tag)
        address_val = address_values(elem,elem_tag)
        zip_val= zip_values(event,elem,elem_tag)
        
         # Check distinct county names and determine any inconsistensies
     

        # Check county values
        if counter==0:
            county_list=defaultdict(set)

        if elem.tag == elem_tag:       
            for elements in elem.iter('tag'):
                if ':county' in elements.attrib['k']:            

                    county_list[elements.attrib['k']].add(elements.attrib['v'])
        
        
            counter+=1       
 
    
    

    
        

    
if __name__ == "__main__":
     read_file('sf_sample.osm','way')

### Check resulting value from initial study

In [3]:
# Print Distinct upper level tags       
print
print "Ct Distinct upper-level tag : ", n_tags
    
    


Ct Distinct upper-level tag :  defaultdict(<type 'int'>, {'tag': 161093, 'nd': 667859, 'way': 68095})


In [7]:
# Count of different types of keys
print
for k in ktag_values.keys():
    print '--------------'
    print 'values of keys that are ', k
    print_sorted_list(ktag_values[k], 'Top 20 k values')
 


--------------
values of keys that are  problemchars
--------------
values of keys that are  lower
0 ('highway', 2585)
1 ('name', 1771)
2 ('created_by', 1280)
3 ('amenity', 1275)
4 ('source', 791)
5 ('crossing', 744)
6 ('shop', 483)
7 ('natural', 400)
8 ('taxon', 297)
9 ('ele', 258)
10 ('power', 246)
11 ('cuisine', 244)
12 ('operator', 228)
13 ('railway', 224)
14 ('website', 218)
15 ('phone', 170)
16 ('leaf_cycle', 164)
17 ('leaf_type', 152)
18 ('barrier', 140)
19 ('traffic_calming', 140)
20 ('emergency', 130)
--------------
values of keys that are  lower_colon
0 ('addr:housenumber', 2430)
1 ('addr:street', 2192)
2 ('addr:city', 1804)
3 ('addr:postcode', 740)
4 ('addr:state', 630)
5 ('addr:country', 396)
6 ('species:en', 291)
7 ('redwood_city_ca:addr_id', 288)
8 ('gnis:feature_id', 222)
9 ('gnis:created', 192)
10 ('gnis:county_id', 182)
11 ('gnis:state_id', 182)
12 ('seamark:type', 59)
13 ('gnis:county_name', 41)
14 ('survey:date', 38)
15 ('traffic_signals:sound', 33)
16 ('gnis:import

In [8]:
  # print Address types with address values as arrays
print    
print "Address Types: " 
pprint.pprint(address_val) 



Address Types: 
defaultdict(<type 'set'>, {'Bridgeway': set(['Bridgeway']), 'St.': set(['Webster St.']), 'Cres': set(['Wellesley Cres']), 'Rd': set(['Ascot Rd']), 'Pulgas': set(['Alameda de Las Pulgas', 'Alamed de las Pulgas']), 'East': set(['Francisco Boulevard East']), 'Alameda': set(['The Alameda']), 'D': set(['Marina Boulevard Building D']), 'avenue': set(['Santa Cruz avenue']), 'Plz': set(['Woodside Plz']), 'Embarcadero': set(['The Embarcadero']), '730': set(['Sansome Street Ste 730']), 'way': set(['Orinda way']), 'Path': set(['Indian Rock Path', 'Parnassus Path']), 'Post': set(['Post']), 'Building': set(['Multi Use Building']), 'Center': set(['Westlake Center', 'South Shore Center', 'Bon Air Center']), 'Plaza': set(['Manor Plaza', 'Mint Plaza', 'Civic Center Plaza']), 'St': set(['Park St', '24th St']), 'Ave': set(['Floribunda Ave', 'Geneva Ave', 'Thorton Ave']), '100': set(['Woodside Road, Suite 100']), 'Bay': set(['Bay']), 'Blvd.': set(['East Francisco Blvd.']), '39': set(['Pie

In [93]:
# print county list
    
print "Unique values for counties"
print county_list

Unique values for counties
defaultdict(<type 'set'>, {'addr:county': set(['Contra Costa', 'Alameda']), 'gnis:county_name': set(['Marin', 'San Francisco', 'Contra Costa', 'Alameda', 'San Mateo']), 'gnis:county_id': set(['075', '081', '085', '001', '013', '041'])})


In [120]:
# print distinct keys for zip and their values
    
print "Zip Values"
pprint.pprint(dict(zip_val))

Zip Values
{'tiger:zip_left': set(['94014; 94112',
                        '94037; 94044',
                        '94109;94115',
                        '94112; 94131',
                        '94112;94127',
                        '94115; 94123',
                        '94526; 94507',
                        '94530:94801',
                        '94605:94621',
                        '94605; 94619',
                        '94607; 94608',
                        '94703:94709',
                        '94704; 94609',
                        '94925; 94920',
                        '94941:94965']),
 'tiger:zip_right': set(['94014; 94112',
                         '94109;94115',
                         '94526; 94507',
                         '94530:94801',
                         '94605:94621',
                         '94607; 94608',
                         '94703:94709',
                         '94704; 94609',
                         '94925; 94920',
                         '94

### Apply function to clean the data (where problems are detected)

In [9]:
# Functions to clean the data

# update Address values   
mapping = { "St": "Street",
            "St.": "Street",
            "Ave":"Avenue",
            "Rd":"Road",
            "Pl":"Place",
            "Blvd":"Boulevard",
            "Ave":"Avenue",
            "avenue":"Avenue"
            }


def update_address(name):    

    for k,v in mapping.iteritems():
        m = pattern.search(name)
        #print 'm.group() -->',m.group()
        #print 'Old Name -->',name
        if m:
            if m.group() == k:
                name = string.replace(name, k, v, 1)
                #print 'Map -->',k,':',v,'| ', 'Changed Name -->',name
                break
            

    
    return name


# Update county values
def update_county(county):
    
    # remove , CA from county names if exists    
    if ',' in county:
        pos=county.index(",")
        county=county[:pos]
    
    return county
    
    

# Update zip values: If zipcodeis longer than 6 characters, keep first 6 characters only 
def update_zip(zip):
    
    if len(zip)>6:
        zip = zip[:5]
    
    return zip


### Clean the data where needed and save the results in JSON file format

In [10]:

CREATED = [ "version", "changeset", "timestamp", "user", "uid"]



pattern_county = re.compile(r':county[_name]*$')
pattern_zip = re.compile(r'zip|postcode')
pattern_digit = re.compile(r'^\d*\d$')

def shape_element(element):
    node = {}
                    
    if element.tag == "node" or element.tag == "way" :
        
        '''
        - all attributes of "node" and "way" should be turned into regular key/value pairs, except:
            - attributes in the CREATED array should be added under a key "created"
            - attributes for latitude and longitude should be added to a "pos" array,
              for use in geospacial indexing. Make sure the values inside "pos" array are floats
              and not strings. 
        '''
        node["created"] = {}
        node["pos"] = []       
        node["type"] = element.tag
        
        for key, value in element.attrib.items():
                
                
            if key in CREATED:
                node["created"][key] = value

            elif key in ["lat","lon"]:
                node["pos"].insert(0,float(value))
                
            else:
                node[key]  = value
                
                
            

        ''' 
        - if the second level tag "k" value contains problematic characters, it should be ignored
        - if the second level tag "k" value starts with "addr:", it should be added to a dictionary "address"
        - if the second level tag "k" value does not start with "addr:", but contains ":", you can
          process it in a way that you feel is best. For example, you might split it into a two-level
          dictionary like with "addr:", or otherwise convert the ":" to create a valid key.

        '''
        
        create_addr = 0
        for tag in element.iter("tag"):
            
            # if the values are not problematic, start populating the dictionary
            if not(re.search(problemchars, tag.attrib["k"])):
                
                '''or 'zip' in element.attrib["k"]:'''
                # Initialize dict address
                if (tag.attrib["k"][:5]=="addr:" and tag.attrib["k"].count(':') == 1) or pattern_county.search(tag.attrib["k"]) or pattern_zip.search(tag.attrib["k"]):
                    if create_addr==0:
                        node["address"]={}
                    create_addr = 1                   


                # if tag attribute is :addr
                if tag.attrib["k"][:5]=="addr:":
                    
                    
                    # Initialize dict address
                    #if tag.attrib["k"].count(':') == 1:
                        #if create_addr==0:
                            #node["address"]={}
                        #create_addr = 1            
                        
                    # Update county values and populate
                    if pattern_county.search(tag.attrib["k"]):
                        node["address"]["county"] = update_county(tag.attrib["v"])
                        
                    # Update zip values and populate  
                    elif  pattern_zip.search(tag.attrib["k"]):
                        node["address"]["zip"] = update_zip(tag.attrib["v"])                    

                    # Update street values and populate  
                    elif tag.attrib["k"][5:] == 'street':
                        node["address"]["street"] = update_address(tag.attrib["v"])

                    # Remaining
                    else:
                        node["address"][tag.attrib["k"][5:]] = tag.attrib["v"]

                # if tag attribute is not :addr
                elif tag.attrib["k"][:5] != 'addr:':
                   
                    # Update county values
                    if pattern_county.search(tag.attrib["k"]):
                        node["address"]["county"] = update_county(tag.attrib["v"])
                        
                    # Update zip values and populate  
                    elif  pattern_zip.search(tag.attrib["k"]):
                        node["address"]["zip"] = update_zip(tag.attrib["v"])  
                        
                    # Remove non-integer characters from building-levels
                    elif tag.attrib["k"] =="building:levels":
                        if pattern_digit.search(tag.attrib["v"]):
                            node[tag.attrib["k"]] =  int(tag.attrib["v"])
                        
                    else:
                        node[tag.attrib["k"]] =  tag.attrib["v"]
  
  
        create_node_ref=0
        if element.tag == "way":
            for nd in element.iter("nd"):
                if  create_node_ref==0:
                    node["node_refs"] = []
                create_node_ref=1    
                node["node_refs"].append(nd.attrib["ref"])
                

        #pprint.pprint(dict(node))
        return node
        
    else:
        return None
    #print node


def process_map(file_in, pretty = False):
    # You do not need to change this file
    file_out = "{0}.json".format(file_in)
    data = []
    with codecs.open(file_out, "w") as fo:
        counter_nodes=0
        for _, element in ET.iterparse(file_in):
            el = shape_element(element)
            if el:
                data.append(el)
                if pretty:
                    fo.write(json.dumps(el, indent=2)+"\n")
                else:
                    fo.write(json.dumps(el) + "\n")

    return data

# NOTE: if you are running this code on your computer, with a larger dataset, 
# call the process_map procedure with pretty=False. The pretty=True option adds 
# additional spaces to the output, making it significantly larger.
data = process_map('sf_sample.osm', False)
    

   


### Visually check that the cleaning was successful

In [11]:
# Visually Check address keys
allzip = defaultdict(int)
count=0
for d in data:
    #print v
    for k in d:
        if k =="address":
            if count<30:
                print d[k]
                count+=1
    
    
    #print data[]["address"]["zip"]
    #allzip[v]+=1
    #print v

#print allzip 
#print type(data)

{'city': 'San Francisco', 'state': 'CA', 'street': 'Haight Street', 'housenumber': '1398', 'zip': '94117'}
{'street': 'Sacramento Street', 'housenumber': '500'}
{'city': 'Berkeley', 'state': 'CA', 'street': 'University Avenue', 'housenumber': '2000', 'country': 'US'}
{'street': '16th Street', 'housenumber': '3121'}
{'city': 'Alameda', 'street': 'Park Street', 'housenumber': '1223', 'zip': '94501'}
{'city': 'Berkeley', 'street': 'Shattuck Avenue', 'housenumber': '1988'}
{'street': '3rd Street', 'housenumber': '590', 'zip': '94107'}
{'street': 'Euclid Avenue'}
{'street': 'Euclid Avenue'}
{'county': 'San Mateo', 'state': 'CA'}
{'street': 'Stockton Street', 'housenumber': '1556'}
{'city': 'San Francisco', 'state': 'CA', 'street': 'Sacramento Street', 'housenumber': '3233', 'zip': '94115'}
{'street': 'Kearny Street', 'housenumber': '1260'}
{'street': 'Floribunda Avenue'}
{'city': 'Berkeley', 'street': 'Solano Avenue', 'housenumber': '1892'}
{'city': 'Albany', 'state': 'CA', 'street': 'Solan

In [12]:
# Visually Check building values
building = defaultdict(set)
count=0
for d in data:
    
    #print v
    for k in d:
        if "building" in k:
                building[k].add(d[k])
                
for k in building:
    count1=0
    print 'Distinct Value in for building key :',k
    print '----------------------------------------'
    for a in building[k]:        
        if count1<20:            
            print a
            count1+=1
            
    print '----------------------------------------'
    
    
    #print data[]["address"]["zip"]
    #allzip[v]+=1
    #print v

#print allzip 
#print type(data)

Distinct Value in for building key : building
----------------------------------------
shop
shed
industrial
portable
office
apartments
house
condominiums
mixed_use
medical
guardhouse
college
greenhouse
church
yes
portables
hangar
kindergarten
stands
no
----------------------------------------
Distinct Value in for building key : building:height
----------------------------------------
126
----------------------------------------
Distinct Value in for building key : building:part
----------------------------------------
cathedral
yes
----------------------------------------
Distinct Value in for building key : building:colour
----------------------------------------
gray
----------------------------------------
Distinct Value in for building key : building:levels
----------------------------------------
0
1
2
3
4
5
6
7
8
9
10
11
12
14
15
18
20
21
22
23
----------------------------------------
Distinct Value in for building key : building:material
----------------------------------------

In [13]:
# Most common zip-codes
allzip = defaultdict(int)

for d in data:
    #print v
    for k in d:
        if k =="address":
                for j in d[k]:
                    if j == 'zip':
                            allzip[d[k][j]]+=1
#print allzip
print_sorted_list(allzip, 'Top 20 zip values')
    

0 ('94122', 525)
1 ('94611', 344)
2 ('94116', 258)
3 ('94610', 172)
4 ('94117', 151)
5 ('94133', 150)
6 ('94118', 140)
7 ('94080', 111)
8 ('94127', 105)
9 ('94541', 103)
10 ('94103', 95)
11 ('94587', 94)
12 ('94546', 83)
13 ('94010', 82)
14 ('94605', 82)
15 ('94063', 81)
16 ('94501', 78)
17 ('94544', 78)
18 ('94560', 75)
19 ('94555', 72)
20 ('94110', 71)


#### Load cleaned JSON data into MongoDB instance

In [16]:

# Load data into MongoDB instance

from pymongo import MongoClient
client = MongoClient("mongodb://localhost:27017")

# Drop database if exists
#db.osm_col.drop()

db = client.osm_col
#osm_col = db.osm_col
db.osm_col.insert_many(data)


print db.osm_col.count()



1265388
