In [1]:
import xml.etree.cElementTree as ET
import pprint
import re
import codecs
import json

# Path of the OpenStreetMap XML extract that this notebook processes.
FILENAME = 'minneapolis-saint-paul_minnesota.osm'

# Matches strings consisting only of lowercase ASCII letters and underscores
# (the empty string matches as well). Used by is_correct_tag.
lower = re.compile(r'^([a-z]|_)*$')

# Matches a trailing run of non-whitespace characters, optionally ending in a
# period, at the end of a string. Not referenced by any of the visible cells.
street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)

# Abbreviated street types and the full word update_name substitutes for each.
mapping = { "St": "Street",
            "St.": "Street",
            "Rd": "Road",
            "Rd.": "Road",
            "Ave": "Avenue",
            "Ave.": "Avenue"
            }

# Element attributes that dealing_created copies into the "created" sub-document.
CREATED = [ "version", "changeset", "timestamp", "user", "uid"]

In [2]:
# Reports whether a value can be converted to a float.
def isfloat(value):
    """Return True if ``float(value)`` succeeds, False otherwise.

    Only the exceptions that signal "this is not a number" are caught, so
    interpreter-level exceptions such as KeyboardInterrupt still propagate.
    """
    try:
        float(value)
    except (TypeError, ValueError, OverflowError):
        # TypeError: unsupported type (e.g. None); ValueError: unparsable text;
        # OverflowError: an integer too large to represent as a float.
        return False
    return True

In [3]:
# Expands an abbreviated street type (e.g. "St", "Ave.") inside a street name.
def update_name(name, mapping):
    """Return ``name`` with its street-type abbreviation written out in full.

    Args:
        name: street name as found in the data, e.g. "University Ave SE".
        mapping: dict from abbreviation to full word, e.g. {"Ave": "Avenue"}.

    Returns:
        The name with the last abbreviation found in ``mapping`` replaced by
        its full form. Words after the abbreviation (such as a directional
        suffix) are kept, runs of whitespace are collapsed to single spaces,
        and no trailing space is added. Names without a known abbreviation
        are returned unchanged apart from that whitespace normalisation.
    """
    words = name.split()
    # Scan from the end: the street type is the last abbreviation in the name.
    for index in range(len(words) - 1, -1, -1):
        if index == 0 and len(words) > 1:
            # A leading "St"/"St." in a multi-word name is part of the name
            # itself ("St Anthony Avenue"), not the street type.
            break
        if words[index] in mapping:
            words[index] = mapping[words[index]]
            break
    return " ".join(words)

In [4]:
# Collects the node references of an element; shape_element only calls this for "way" elements.
def dealing_node_refs(element):
    """Return the ``ref`` attribute of every ``nd`` element found under ``element``.

    The references are returned as a list of strings in document order; the
    list is empty when there are no ``nd`` elements.
    """
    return [child.attrib["ref"] for child in element.iter("nd")]

In [5]:
# Gathers the attributes listed in CREATED from an element.
def dealing_created(element):
    """Return a dict of the CREATED attributes that ``element`` actually has.

    Keys follow the order of the CREATED list; attributes missing from the
    element are simply left out, so the result may be empty.
    """
    attributes = element.attrib
    return {field: attributes[field] for field in CREATED if field in attributes}

In [6]:
# Checks that a tag name, once lowercased, contains only letters and underscores.
def is_correct_tag(tag_name):
    """Return True if the lowercased ``tag_name`` matches the ``lower`` pattern.

    The name is lowercased before matching, so uppercase letters are accepted;
    digits, colons and other characters make the check fail.
    """
    return lower.search(tag_name.lower()) is not None

In [7]:
def shape_element(element):
    """Convert a ``node`` or ``way`` element into a plain dictionary.

    Returns None for any other element. For a node or way the dictionary holds,
    in this order and only when available:

    * ``id``, ``type`` (the element's tag name) and ``visible``;
    * ``created``: the attributes gathered by dealing_created;
    * ``pos``: ``[lat, lon]`` as floats, when both attributes are present and
      both parse as floats;
    * one entry per child ``tag`` whose key has no colon and passes
      is_correct_tag (the key is stored lowercased);
    * ``address``: entries from ``addr:<field>`` tags, with ``addr:street``
      passed through update_name; keys with more than one colon are ignored;
    * ``node_refs``: for ways, the list from dealing_node_refs when non-empty.

    NOTE(review): plain tags are written into the same dictionary as the
    structural fields, so a tag whose key is e.g. "type" replaces the value
    set from the element's tag name.
    """
    if element.tag not in ("node", "way"):
        return None

    attributes = element.attrib
    shaped = {}

    if "id" in attributes:
        shaped["id"] = attributes["id"]

    shaped["type"] = element.tag

    if "visible" in attributes:
        shaped["visible"] = attributes["visible"]

    created_info = dealing_created(element)
    if created_info:
        shaped["created"] = created_info

    # A position is only recorded when both coordinates exist and are numeric.
    if "lat" in attributes and "lon" in attributes:
        latitude, longitude = attributes["lat"], attributes["lon"]
        if isfloat(latitude) and isfloat(longitude):
            shaped["pos"] = [float(latitude), float(longitude)]

    address_fields = {}
    for child in element.iter("tag"):
        key = child.attrib['k']
        if key == "addr:street":
            address_fields["street"] = update_name(child.attrib['v'], mapping)
            continue

        parts = key.split(":")
        if len(parts) == 1:
            # Plain key: stored at the top level under its lowercased name.
            if is_correct_tag(parts[0]):
                shaped[parts[0].lower()] = child.attrib['v']
        elif len(parts) == 2 and parts[0] == 'addr' and is_correct_tag(parts[1]):
            # "addr:<field>": stored in the address sub-document.
            address_fields[parts[1].lower()] = child.attrib['v']
        # Anything else (a second ":" or a non-addr prefix) is skipped.

    if address_fields:
        shaped["address"] = address_fields

    if element.tag == "way":
        references = dealing_node_refs(element)
        if references:
            shaped["node_refs"] = references

    return shaped


In [19]:
def process_map(file_in, pretty=False, max_elements=10000):
    """Stream an OSM XML file and write its nodes and ways as a JSON array.

    The output is written to ``<file_in>.json``. Each element is shaped by
    shape_element; elements for which it returns nothing are skipped.

    Args:
        file_in: path of the OSM XML file to read.
        pretty: when true, each document is indented by two spaces; otherwise
            documents are written compactly. Both modes produce a valid JSON
            array with documents separated by a comma and a newline.
        max_elements: maximum number of documents to write. Pass None to
            convert the whole file.

    Returns:
        None. The result is the file written to disk.
    """
    file_out = "{0}.json".format(file_in)
    indent = 2 if pretty else None
    count = 0
    with codecs.open(file_out, "w") as fo:
        fo.write("[")
        for _, element in ET.iterparse(file_in):
            if max_elements is not None and count >= max_elements:
                break
            el = shape_element(element)
            if el:
                # Separators go before every document except the first, so
                # the array never ends with a dangling comma.
                if count:
                    fo.write("," + "\n")
                fo.write(json.dumps(el, indent=indent))
                count += 1
                # The node/way has been fully converted; drop its children
                # and attributes so memory does not grow with the file size.
                # Child elements are not cleared on their own end events
                # because the parent still needs them.
                element.clear()
        fo.write("]")

In [20]:
# Convert the OSM extract to compact (non-pretty) JSON; the output path is FILENAME + ".json".
process_map(FILENAME, False)

In [26]:
import json
from pymongo import MongoClient
# Client for the MongoDB server expected at localhost:27017.
client = MongoClient("mongodb://localhost:27017")
# Database that receives the converted documents.
db = client.examples

In [27]:
# Read the JSON array produced by process_map; the file is closed before any
# database work starts.
with open('minneapolis-saint-paul_minnesota.osm.json') as f:
    data = json.load(f)

# Insert all documents into the `saint` collection in one bulk call instead of
# one round trip per document. insert_many rejects an empty list, hence the guard.
if data:
    db.saint.insert_many(data)

ServerSelectionTimeoutError: localhost:27017: [Errno 10061] 