# 清洗数据 并存入Json文件

In [53]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import xml.etree.cElementTree as ET
import pprint
import re
import codecs
import json
"""
Your task is to wrangle the data and transform the shape of the data
into the model we mentioned earlier. The output should be a list of dictionaries
that look like this:

{
"id": "2406124091",
"type": "node",
"visible":"true",
"created": {
          "version":"2",
          "changeset":"17206049",
          "timestamp":"2013-08-03T16:43:42Z",
          "user":"linuxUser16",
          "uid":"1219059"
        },
"pos": [41.9757030, -87.6921867],
"address": {
          "housenumber": "5157",
          "postcode": "60625",
          "street": "North Lincoln Ave"
        },
"amenity": "restaurant",
"cuisine": "mexican",
"name": "La Cabana De Don Luis",
"phone": "1 (773)-271-5176"
}

You have to complete the function 'shape_element'.
We have provided a function that will parse the map file, and call the function with the element
as an argument. You should return a dictionary, containing the shaped data for that element.
We have also provided a way to save the data in a file, so that you could use
mongoimport later on to import the shaped data into MongoDB. 

Note that in this exercise we do not use the 'update street name' procedures
you worked on in the previous exercise. If you are using this code in your final
project, you are strongly encouraged to use the code from previous exercise to 
update the street names before you save them to JSON. 

In particular the following things should be done:
- you should process only 2 types of top level tags: "node" and "way"
- all attributes of "node" and "way" should be turned into regular key/value pairs, except:
    - attributes in the CREATED array should be added under a key "created"
    - attributes for latitude and longitude should be added to a "pos" array,
      for use in geospatial indexing. Make sure the values inside "pos" array are floats
      and not strings. 
- if the second level tag "k" value contains problematic characters, it should be ignored
- if the second level tag "k" value starts with "addr:", it should be added to a dictionary "address"
- if the second level tag "k" value does not start with "addr:", but contains ":", you can
  process it in a way that you feel is best. For example, you might split it into a two-level
  dictionary like with "addr:", or otherwise convert the ":" to create a valid key.
- if there is a second ":" that separates the type/direction of a street,
  the tag should be ignored, for example:

<tag k="addr:housenumber" v="5158"/>
<tag k="addr:street" v="North Lincoln Avenue"/>
<tag k="addr:street:name" v="Lincoln"/>
<tag k="addr:street:prefix" v="North"/>
<tag k="addr:street:type" v="Avenue"/>
<tag k="amenity" v="pharmacy"/>

  should be turned into:

{...
"address": {
    "housenumber": 5158,
    "street": "North Lincoln Avenue"
}
"amenity": "pharmacy",
...
}

- for "way" specifically:

  <nd ref="305896090"/>
  <nd ref="1719825889"/>

should be turned into
"node_refs": ["305896090", "1719825889"]
"""

# Path of the full OpenStreetMap extract; test() passes it to process_map().
OSM_FILE = "./shenzhen_china.osm/shenzhen_china.osm"
# Name of a smaller extract; not referenced by the code visible in this file.
sample_file = "sample.osm"

# Coordinate attributes. shape_element() skips them when copying attributes
# because it stores latitude/longitude separately under "pos".
POS = ["lon","lat"]

# Attributes that shape_element() groups under the "created" sub-dictionary.
CREATED = [ "version", "changeset", "timestamp", "user", "uid"]

# Matches the run of non-whitespace characters at the very end of a string
# (starting at a word boundary, optional trailing period); audit_street_type()
# treats that match as the street type.
street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)


# Street types that audit_street_type() accepts without rewriting the name.
expected = ["Street", "Avenue", "Boulevard", "Drive", "Court", "Place", "Square", "Lane", "Road", 
            "Trail", "Parkway", "Commons"]

# UPDATE THIS VARIABLE
# Abbreviation -> full street type; handed to update_name() by audit_street_type().
mapping = { "St": "Street",
            "St.": "Street",
            "Rd.": "Road",
            "Ave": "Avenue",
            "Rd": "Road",
            "S.": "Street"
            }
# Update a street name
def update_name(name, mapping):
    """Expand an abbreviated street type at the end of *name*.

    Only the last space-separated word is considered, and it is replaced only
    when it is exactly one of the keys of *mapping*.  Names whose last word is
    not a known abbreviation are returned unchanged apart from stripping
    surrounding whitespace.

    Args:
        name: The street name to normalise.
        mapping: Dictionary from abbreviation (e.g. ``"Rd"``) to the full
            street type (e.g. ``"Road"``).

    Returns:
        The street name with its trailing abbreviation expanded.
    """
    words = name.strip().split(" ")
    last_word = words[-1]
    # Compare the whole last word, not a substring of the name: a substring
    # test would treat the "St" in "Stanley Ave" as an abbreviation and would
    # overwrite an unrelated last word such as a trailing postcode.
    if last_word in mapping:
        words[-1] = mapping[last_word]
    return " ".join(words)

# Normalise a street name
def audit_street_type(street_name):
    """Return *street_name*, rewritten when its street type is not an expected one.

    The street type is whatever ``street_type_re`` matches in the name.  When
    there is no match, or the matched type is listed in ``expected``, the name
    is returned untouched.  Otherwise the original name is printed and the
    result of ``update_name(street_name, mapping)`` is returned.
    """
    match = street_type_re.search(street_name)
    if not match or match.group() in expected:
        return street_name

    # Report every name that is about to be rewritten.
    print(street_name)
    return update_name(street_name, mapping)
    
            
def shape_element(element):
    """Convert a parsed OSM ``node`` or ``way`` element into a plain dictionary.

    Args:
        element: An ElementTree element read from the OSM file.

    Returns:
        ``None`` when the element is neither a ``node`` nor a ``way``, or when
        one of its ``addr:postcode`` tags contains the letter ``"D"``.
        Otherwise a dictionary holding:

        * ``"type"``: the element tag;
        * every attribute not listed in ``POS`` or ``CREATED`` as a top-level
          key, and the ``CREATED`` attributes grouped under ``"created"``;
        * ``"pos"``: ``[lat, lon]`` as floats, when both attributes are
          present and parse as numbers;
        * ``"address"``: ``housenumber``, ``postcode`` and ``street`` taken
          from the matching ``addr:*`` tags (the street is passed through
          ``audit_street_type``);
        * ``"amenity"``, ``"name"``, ``"cuisine"``, ``"phone"``: values of the
          tags with those keys;
        * ``"node_refs"``: the ``ref`` of every ``nd`` child, in order.

        Empty groups (``created``, ``pos``, ``address``, ``node_refs``) are
        omitted.
    """
    if element.tag not in ("node", "way"):
        return None

    node = {'type': element.tag}
    address = {}
    created = {}
    node_refs = []
    pos = []

    attrib = element.attrib
    if 'lat' in attrib and 'lon' in attrib:
        try:
            pos = [float(attrib['lat']), float(attrib['lon'])]
        except ValueError:
            # Unparseable coordinates: leave "pos" out instead of failing.
            pos = []

    for key, value in attrib.items():
        # Latitude and longitude were handled above.
        if key in POS:
            continue
        if key in CREATED:
            created[key] = value
        else:
            node[key] = value

    # Tags copied verbatim to the top level of the document.  "phone" is
    # matched on its own key; it used to be filled from the "cuisine" tag.
    plain_tags = ('amenity', 'name', 'cuisine', 'phone')

    for child in element:
        if child.tag == "nd":
            node_refs.append(child.attrib['ref'])
        elif child.tag == "tag":
            key = child.attrib['k']
            if key in plain_tags:
                node[key] = child.attrib['v']
            elif key == 'addr:housenumber':
                address['housenumber'] = child.attrib['v']
            elif key == 'addr:postcode':
                # A postcode containing "D" discards the whole element.
                if 'D' in child.attrib['v']:
                    return None
                address['postcode'] = child.attrib['v']
            elif key == 'addr:street':
                address['street'] = audit_street_type(child.attrib['v'])

    if created:
        node['created'] = created
    if pos:
        node['pos'] = pos
    if address:
        node['address'] = address
    if node_refs:
        node['node_refs'] = node_refs
    return node


def process_map(file_in, pretty = False):
    """Shape every element of the OSM file *file_in* and write the results as JSON.

    Each element yielded by ``ET.iterparse`` is passed to ``shape_element``;
    every truthy result is written as one JSON document followed by a newline
    to ``<file_in>.json`` and collected in the returned list.

    Args:
        file_in: Path of the OSM XML file to read.
        pretty: When true, documents are written with ``indent=2``;
            otherwise in the default compact form.

    Returns:
        The list of shaped documents, in file order.
    """
    file_out = "{0}.json".format(file_in)
    # indent=None is json.dumps' default, i.e. the compact form.
    indent = 2 if pretty else None
    data = []
    with codecs.open(file_out, "w") as fo:
        for _, element in ET.iterparse(file_in):
            el = shape_element(element)
            if not el:
                continue
            data.append(el)
            fo.write(json.dumps(el, indent=indent) + "\n")
    return data

def test():
    """Convert the full OSM extract (``OSM_FILE``) to JSON in compact form."""
    # NOTE: if you are running this code on your computer, with a larger dataset,
    # call the process_map procedure with pretty=False. The pretty=True option adds
    # additional spaces to the output, making it significantly larger.
    process_map(OSM_FILE, pretty=False)


if __name__ == "__main__":
    test()

中兴路 Zhongxing Rd
湖贝路 Hubei Rd
Shenyan Lu
1002 Huaqiang N Rd, HuaQiang Bei, Futian Qu, Shenzhen Shi, Guangdong Sheng, China, 518000
Hua Fa Bei Lu
Tai zi
Hai yue Rd
kamtin
Castle Peak Road - Yuen Long
青山公路 - 元朗段 Castle Peak Road - Yuen Long
Castle Peak Road - Yuen Long
福华路 Fuhua road
福华路 Fuhua road
Gong ye 7th Rd
Gaoxin S.
Castle Peak Road - Yuen Long
Ko Tong
Ko Tong
Ko Tong
Ko Tong
Ko Tong
Ko Tong
Ko Tong
Ko Tong
Ko Tong
Ko Tong
Ko Tong
Ko Tong
Ko Tong
Ko Tong
Ko Tong
Ko Tong
Ko Tong
Ko Tong
Ko Tong
Ko Tong
Ko Tong
Ko Tong
Ko Tong
Ko Tong
Ko Tong
Ko Tong
Ko Tong
Ko Tong
Ko Tong
Ko Tong
Ko Tong
Ko Tong
Ko Tong
Ko Tong
Ko Tong
Ko Tong
Ko Tong
Ko Tong
Ko Tong
Ko Tong
Ko Tong
Ko Tong
Ko Tong
Ko Tong
Ko Tong
Ko Tong
Ko Tong
Ko Tong
Ko Tong
Ko Tong
Ko Tong
北环大道 North Ring Ave
Guanlan Longhua New District
民田路 Mintian road
坂雪岗大道 Bǎnxuě Gǎng Av
罗芳路 Luofang Rd
罗芳路 Luofang Rd
罗芳路 Luofang Rd
新秀路 Xinxiu Rd
延芳路 Yanfang Rd
延芳路 Yanfang Rd
文锦中路 Wenjin Middle Rd
文锦中路 Wenjin Middle Rd
Zhenghua Rd
广深公路 Gua

# 存入Mongodb 数据库

In [54]:
import json

# Insert into the MongoDB database
def insert_data(data, db):
    """Insert the single document *data* into the ``ShenZhenOpen`` collection of *db*."""
    collection = db.ShenZhenOpen
    collection.insert_one(data)
    


if __name__ == "__main__":

    # Connect to the MongoDB server
    from pymongo import MongoClient
    client = MongoClient("mongodb://localhost:27017")
    db = client.osm

    # Open the JSON file (one document per line) and insert each document into MongoDB
    with open('./shenzhen_china.osm/shenzhen_china.osm.json') as json_lines:
        for line in json_lines:
            data = json.loads(line)
            insert_data(data, db)

# 所有文档数

In [55]:
def print_doc_num(query):
    """Print the number of documents in ``db.ShenZhenOpen`` that match *query*.

    Args:
        query: A MongoDB filter document; ``{}`` matches every document.
    """
    # A single-argument print(...) behaves the same as a Python 2 statement
    # and as a Python 3 function call.
    # NOTE(review): Cursor.count() is deprecated since PyMongo 3.7 and removed
    # in 4.0; Collection.count_documents(query) is its replacement there.
    print(db.ShenZhenOpen.find(query).count())

print_doc_num({})

730333


# 所有节点数

In [56]:
def print_doc_num(query):
    """Print the number of documents in ``db.ShenZhenOpen`` that match *query*.

    Args:
        query: A MongoDB filter document.
    """
    # A single-argument print(...) behaves the same as a Python 2 statement
    # and as a Python 3 function call.
    # NOTE(review): Cursor.count() is deprecated since PyMongo 3.7 and removed
    # in 4.0; Collection.count_documents(query) is its replacement there.
    print(db.ShenZhenOpen.find(query).count())

print_doc_num({"type":"node"})

658996


# 所有道路数

In [123]:
def print_doc_num(query):
    """Print the number of documents in ``db.ShenZhenOpen`` that match *query*.

    Args:
        query: A MongoDB filter document.
    """
    # A single-argument print(...) behaves the same as a Python 2 statement
    # and as a Python 3 function call.
    # NOTE(review): Cursor.count() is deprecated since PyMongo 3.7 and removed
    # in 4.0; Collection.count_documents(query) is its replacement there.
    print(db.ShenZhenOpen.find(query).count())

print_doc_num({"type":"way"})

71337


# 所有用户数

In [58]:
# Distinct values of "created.user" across the collection.
result = db.ShenZhenOpen.distinct("created.user")
# Single-argument print(...) works as a Python 2 statement and a Python 3 call.
print(len(result))

798


# 前100位用户总贡献数

In [59]:
import pprint


def print_doc_aggregate(query):
    """Run the aggregation pipeline *query* on ``db.ShenZhenOpen`` and return its cursor.

    Despite the name, nothing is printed; the caller iterates the result.
    """
    return db.ShenZhenOpen.aggregate(query)


# Documents per user, largest counts first, limited to 100 users.
last = print_doc_aggregate([
    {"$group": {"_id": "$created.user", "count": {"$sum": 1}}},
    {"$sort": {"count": -1}},
    {"$limit": 100},
])

# Total number of contributions made by those users
x = sum(int(e["count"]) for e in last)
print(x)


697548


# 贡献最少的590位用户贡献总数

In [60]:
import pprint


def print_doc_aggregate(query):
    """Run the aggregation pipeline *query* on ``db.ShenZhenOpen`` and return its cursor.

    Despite the name, nothing is printed; the caller iterates the result.
    """
    return db.ShenZhenOpen.aggregate(query)


# Documents per user, smallest counts first (ascending sort), limited to 590
# users, i.e. the 590 users with the fewest documents.
last = print_doc_aggregate([
    {"$group": {"_id": "$created.user", "count": {"$sum": 1}}},
    {"$sort": {"count": 1}},
    {"$limit": 590},
])

# Total number of contributions made by those users
x = sum(int(e["count"]) for e in last)

print(x)

7563


In [7]:
def print_doc_aggregate(query):
    """Run the aggregation pipeline *query* on ``db.ShenZhenOpen`` and pretty-print every result."""
    pprint.pprint(list(db.ShenZhenOpen.aggregate(query)))


# Count documents per user, keep users with exactly one document, then count
# how many such users there are.
print_doc_aggregate([
    {"$group": {"_id": "$created.user", "count": {"$sum": 1}}},
    {"$match": {"count": 1}},
    {"$group": {"_id": "$count", "num_users": {"$sum": 1}}},
])



[{u'_id': 1, u'num_users': 175}]


# 前10个最多的便利设施

In [61]:
def print_doc_aggregate(query):
    """Run the aggregation pipeline *query* on ``db.ShenZhenOpen`` and pretty-print every result."""
    pprint.pprint(list(db.ShenZhenOpen.aggregate(query)))


# Documents that have an "amenity" field, counted per amenity value;
# the ten largest counts are shown.
print_doc_aggregate([
    {'$match': {'amenity': {'$exists': 1}}},
    {'$group': {'_id': '$amenity', 'count': {'$sum': 1}}},
    {'$sort': {'count': -1}},
    {'$limit': 10},
])

[{u'_id': u'parking', u'count': 543},
 {u'_id': u'toilets', u'count': 418},
 {u'_id': u'school', u'count': 339},
 {u'_id': u'restaurant', u'count': 237},
 {u'_id': u'shelter', u'count': 225},
 {u'_id': u'post_box', u'count': 166},
 {u'_id': u'place_of_worship', u'count': 140},
 {u'_id': u'bus_station', u'count': 123},
 {u'_id': u'bank', u'count': 122},
 {u'_id': u'fuel', u'count': 110}]


# 前5个 最多的餐厅

In [122]:


def print_doc_aggregate(query):
    """Run the aggregation pipeline *query* on ``db.ShenZhenOpen`` and return the results as a list.

    Despite the name, nothing is printed here; the caller prints the rows.
    """
    return list(db.ShenZhenOpen.aggregate(query))


# Named restaurants, counted per name; the five largest counts are kept.
# Matching amenity == 'restaurant' already implies the field exists, so one
# $match stage covers what used to be three.
y = print_doc_aggregate([
    {'$match': {'amenity': 'restaurant', 'name': {'$exists': 1}}},
    {'$group': {'_id': '$name', 'count': {'$sum': 1}}},
    {'$sort': {'count': -1}},
    {'$limit': 5},
])

# Print every field of every row as "key value" (names may be Chinese).
# The former isinstance(value, str) / .decode("ascii") branch was dropped:
# it raises on Python 3, where str has no decode(), and the recorded
# "key value" output shows it was not taken when this cell was run.
for e in y:
    for key, value in e.items():
        print("%s %s" % (key, value))


count 3
_id Pizza Hut
count 2
_id 沙县小吃
count 2
_id 譚仔三哥
count 1
_id 实验餐厅
count 1
_id 荔山餐厅


# 数量最多的银行

In [124]:

def print_doc_aggregate(query):
    """Run the aggregation pipeline *query* on ``db.ShenZhenOpen`` and return the results as a list.

    Despite the name, nothing is printed here; the caller prints the rows.
    """
    return list(db.ShenZhenOpen.aggregate(query))


# Named banks, counted per name; only the largest count is kept.
# Matching amenity == 'bank' already implies the field exists, so one
# $match stage covers what used to be three.
y = print_doc_aggregate([
    {'$match': {'amenity': 'bank', 'name': {'$exists': 1}}},
    {'$group': {'_id': '$name', 'count': {'$sum': 1}}},
    {'$sort': {'count': -1}},
    {'$limit': 1},
])

# Print every field of every row as "key value" (names may be Chinese).
# The former isinstance(value, str) / .decode("ascii") branch was dropped:
# it raises on Python 3, where str has no decode(), and the recorded
# "key value" output shows it was not taken when this cell was run.
for e in y:
    for key, value in e.items():
        print("%s %s" % (key, value))


count 28
_id 工商银行


# 数量最多的宗教设施

In [117]:
def print_doc_aggregate(query):
    """Run the aggregation pipeline *query* on ``db.ShenZhenOpen`` and return the results as a list.

    Despite the name, nothing is printed here; the caller prints the rows.
    """
    return list(db.ShenZhenOpen.aggregate(query))


# Named places of worship, counted per name; the three largest counts are kept.
# Matching amenity == 'place_of_worship' already implies the field exists, so
# one $match stage covers what used to be three.
y = print_doc_aggregate([
    {'$match': {'amenity': 'place_of_worship', 'name': {'$exists': 1}}},
    {'$group': {'_id': '$name', 'count': {'$sum': 1}}},
    {'$sort': {'count': -1}},
    {'$limit': 3},
])

# Print every field of every row as "key value" (names may be Chinese).
# The former isinstance(value, str) / .decode("ascii") branch was dropped:
# it raises on Python 3, where str has no decode(), and the recorded
# "key value" output shows it was not taken when this cell was run.
for e in y:
    for key, value in e.items():
        print("%s %s" % (key, value))

count 3
_id 天后廟 Tin Hau Temple
count 3
_id 天后宮 Tin Hau Temple
count 2
_id 天后古廟 Tin Hau Temple
