In [1]:
import xml.etree.cElementTree as cET
import re
import collections
import pprint
import json
import simplejson

# finding the unique tag on the xml

In [2]:
# Collect every distinct element name, in order of first appearance.
tags = []
for _event, element in cET.iterparse('singapore.osm', ('start',)):
    name = element.tag
    if name in tags:
        continue
    tags.append(name)
print(tags)

['osm', 'bounds', 'node', 'tag', 'way', 'nd', 'relation', 'member']


# unique tag count

In [3]:
# unique tag count
# A Counter needs no hard-coded key list, so an element type that was not
# listed in advance is counted instead of raising KeyError.
tag_count = dict(collections.Counter(
    elem.tag for _, elem in cET.iterparse('singapore.osm', events=('start',))))
print(tag_count)

{'node': 1426354, 'member': 94175, 'tag': 663789, 'osm': 1, 'way': 220418, 'relation': 2551, 'nd': 1765632, 'bounds': 1}


# tag description
1) node - a single point, given by its lat and lng  
2) tag - describes the element that encloses it (e.g. a tag inside a node describes that node); it has two attributes, k and v  
3) way - consists of nd tags, each referencing a node, which together form a polyline or polygon, e.g. a highway or road  
4) relation - A relation is a multi-purpose data structure that documents a relationship between two or more data elements (nodes, ways, and/or other relations)  
5) bounds - gives the bounding lat and lng of the dataset  
6) osm - gives the version and some metadata of this XML file  

# total tag count

In [4]:
# Count the <tag> children of every node and way.
# 'end' events are used because on a 'start' event an element's children are
# not guaranteed to have been parsed yet, which can silently under-count.
count = 0
for _, elem in cET.iterparse('singapore.osm', events=('end',)):
    if elem.tag == 'node' or elem.tag == 'way':
        count += sum(1 for _tag in elem.iter('tag'))
    if elem.tag in ('node', 'way', 'relation'):
        # The element is complete and already counted; drop its children.
        elem.clear()
print(count)

642365


# auditing
Finding the interesting street names, i.e. those that do not contain one of our expected street types.

In [35]:
# Street-type keywords treated as "expected": audit_street skips any street
# name that contains one of them (case-insensitive substring match) and only
# records the last word of the remaining names.
expecteds = ['avenue','street','center','central','drive','crescent','boulevard','jalan','gardens','hill','link','park',
           'place','road','terrace','view','walk','way','close','circle','lane','garden']
# True when the <tag> element carries a street name
def is_street(elem):
    """Return True if the element's ``k`` attribute is exactly 'addr:street'."""
    return elem.attrib['k'] == 'addr:street'
# records streets that contain none of our expected street types
def audit_street(street_types, val):
    """Record ``val`` under its last word unless it contains an expected type.

    street_types: mapping of last word -> set of street names; it must create
        missing entries on access (e.g. ``collections.defaultdict(set)``).
    val: the street name to audit.

    A name is skipped when any entry of ``expecteds`` occurs anywhere in it
    (case-insensitive substring match, so 'Broadway' counts as containing
    'road'). Names without whitespace before a final word are not recorded.
    """
    if any(re.search(expected, val, re.IGNORECASE) for expected in expecteds):
        return
    # Raw string: '\s' and '\w' are regex escapes, not Python string escapes.
    last_word = re.search(r'\s(\w+)$', val)
    if last_word is not None:
        street_types[last_word.group(1)].add(val)
# finding inconsistent streets
def inconsistent_streets(filename):
    """Audit every 'addr:street' tag on the nodes and ways of an OSM XML file.

    filename: path of the OSM XML file to scan.
    Returns a ``defaultdict(set)`` filled in by ``audit_street``.
    """
    street_types = collections.defaultdict(set)
    # 'end' events: on 'start' the <tag> children of a node/way are not
    # guaranteed to be parsed yet, so some streets could be missed.
    for _, elem in cET.iterparse(filename, events=('end',)):
        if elem.tag == 'node' or elem.tag == 'way':
            for tag in elem.iter('tag'):
                if is_street(tag):
                    audit_street(street_types, tag.attrib['v'])
        if elem.tag in ('node', 'way', 'relation'):
            # Finished with this top-level element; drop its children.
            elem.clear()
    return street_types


# Audit the full extract: `demo` maps each unexpected last word to the set of street names ending in it.
demo=inconsistent_streets('singapore.osm')

In [40]:
# changing the inconsistent street words to their correct names
# Misspelled or abbreviated street words (lower-cased) -> canonical spelling.
_STREET_WORD_FIXES = {
    'rd': 'Road',
    'avebue': 'Avenue',
    'ave': 'Avenue',
    'blvd': 'Boulevard',
    'blok': 'Block',
    'bno': 'No',
    'terrance': 'Terrace',
    'st': 'Street',
}
# Whole words only (\b ... \b): a plain substring match would also rewrite the
# "ave" in "Avenue", the "st" in "East" and the "rd" in "Orchard".
_STREET_WORD_RE = re.compile(
    r'\b(' + '|'.join(sorted(_STREET_WORD_FIXES, key=len, reverse=True)) + r')\b',
    re.IGNORECASE)


def inconsistent_to_correct_name(name):
    """Return ``name`` with known abbreviations and misspellings expanded.

    Matching is case-insensitive and restricted to whole words, so names that
    are already correct are returned unchanged.
    NOTE(review): a standalone "St" meaning "Saint" is still expanded to
    "Street" - confirm whether the dataset needs that case handled.
    """
    def _fix(match):
        # Fall back to the matched text if case-folding yields an unknown key.
        return _STREET_WORD_FIXES.get(match.group(1).lower(), match.group(0))
    return _STREET_WORD_RE.sub(_fix, name)
# build a corrected copy of the audited street dict
def convert_street(mydict):
    """Return a ``defaultdict(set)`` with every street name in ``mydict`` corrected.

    mydict maps a key to an iterable of street names; each name is passed
    through ``inconsistent_to_correct_name`` and stored under the same key.
    """
    corrected = collections.defaultdict(set)
    fixed_pairs = (
        (street_type, inconsistent_to_correct_name(street))
        for street_type in mydict
        for street in mydict[street_type]
    )
    for street_type, fixed in fixed_pairs:
        corrected[street_type].add(fixed)
    return corrected
# Apply the name corrections to every street collected in `demo`.
data=convert_street(demo)
#pprint.pprint(dict(data),width=10)

# further auditing
- postal codes in Singapore are normally 6 digits long,           
  so I'm removing the unwanted (invalid) postal codes 
- 'addr:*' keys are not very readable, so those keys are changed to more readable ones

In [26]:
def is_postcode(elem):
    """Return True if the element's ``k`` attribute is exactly 'addr:postcode'."""
    return elem.attrib['k'] == 'addr:postcode'
def is_valid_postcode(elem):
    """Return True if the element's ``v`` attribute is exactly six digits.

    A length check alone would also accept values such as 'ABCDEF' or
    '12 345', so the value must consist of six ASCII digits and nothing else.
    """
    return re.match(r'[0-9]{6}\Z', elem.attrib['v']) is not None
def is_address(elem):
    """Return True if the element's ``k`` attribute starts with 'addr' (any case)."""
    key = elem.attrib['k']
    return re.match('addr', key, re.IGNORECASE) is not None
def correct_address(val):
    """Return the readable part of an 'addr:*' key, e.g. 'addr:city' -> 'city'.

    Keys that do not start with 'addr' (any case) are returned unchanged. So
    are keys that start with 'addr' but contain no colon (e.g. 'address'),
    which previously raised IndexError. For keys with several colons only the
    second component is returned.
    """
    if re.match('addr', val, re.IGNORECASE) is None:
        return val
    parts = val.split(':')
    if len(parts) < 2:
        return val
    return parts[1]

# converting into dict for further storing dict to json

In [31]:
# convert the nodes into dicts so that they can be converted into json
def node_to_dict(filename):
    """Build one JSON-ready dict per <node> element of an OSM XML file.

    filename: path of the OSM XML file to read.
    Returns a list of dicts with the keys ``uniqueId`` (int), ``loc``
    ([lat, lon] as floats), ``version`` (int) and ``user``. A ``tag`` list of
    ``{'k': ..., 'v': ...}`` dicts is added when at least one tag is kept.

    Tag handling: street names are corrected and stored under 'street'; valid
    postcodes are stored under 'postcode' and invalid ones are dropped; other
    'addr*' keys are shortened with ``correct_address``; everything else is
    copied unchanged.
    """
    data = []
    # 'end' events: on 'start' the <tag> children of a node are not guaranteed
    # to be parsed yet, so tags could be silently lost.
    for _, elem in cET.iterparse(filename, events=('end',)):
        if elem.tag == 'node':
            node = {}
            node['uniqueId'] = int(elem.attrib['id'])
            node['loc'] = [float(elem.attrib['lat']), float(elem.attrib['lon'])]
            node['version'] = int(elem.attrib['version'])
            node['user'] = elem.attrib['user']
            tags = []
            for tag in elem.iter('tag'):
                key = tag.attrib['k']
                value = tag.attrib['v']
                if is_street(tag):
                    key = 'street'
                    value = inconsistent_to_correct_name(value)
                elif is_postcode(tag):
                    if not is_valid_postcode(tag):
                        # Drop the invalid postcode instead of storing an empty {}.
                        continue
                    key = 'postcode'
                elif is_address(tag):
                    key = correct_address(key)
                tags.append({'k': key, 'v': value})
            if tags:
                node['tag'] = tags
            data.append(node)
        if elem.tag in ('node', 'way', 'relation'):
            # Finished with this top-level element; drop its children.
            elem.clear()
    return data
# Convert every node of the Singapore extract into a dict.
node_data=node_to_dict('singapore.osm')

In [37]:
# convert the ways into dicts so that they can be converted into json
def way_to_dict(filename):
    """Build one JSON-ready dict per <way> element of an OSM XML file.

    filename: path of the OSM XML file to read (previously ignored in favour
        of a hard-coded 'singapore.osm').
    Returns a list of dicts with the keys ``uniqueId`` (int), ``uid`` (int)
    and ``nd`` (list of referenced node ids as ints). A ``tag`` list of
    ``{'k': ..., 'v': ...}`` dicts is added when at least one tag is kept.

    Tag handling: street names are corrected and stored under 'street'; valid
    postcodes are stored under 'postcode' and invalid ones are dropped; other
    'addr*' keys are shortened with ``correct_address``; everything else is
    copied unchanged.
    """
    data = []
    # 'end' events: on 'start' the <nd>/<tag> children of a way are not
    # guaranteed to be parsed yet, so references and tags could be lost.
    for _, elem in cET.iterparse(filename, events=('end',)):
        if elem.tag == 'way':
            way = {}
            way['uniqueId'] = int(elem.attrib['id'])
            way['uid'] = int(elem.attrib['uid'])
            way['nd'] = [int(nd.attrib['ref']) for nd in elem.iter('nd')]
            tags = []
            for tag in elem.iter('tag'):
                key = tag.attrib['k']
                value = tag.attrib['v']
                if is_street(tag):
                    key = 'street'
                    value = inconsistent_to_correct_name(value)
                elif is_postcode(tag):
                    if not is_valid_postcode(tag):
                        # Drop the invalid postcode instead of storing an empty {}.
                        continue
                    key = 'postcode'
                elif is_address(tag):
                    key = correct_address(key)
                tags.append({'k': key, 'v': value})
            if tags:
                way['tag'] = tags
            data.append(way)
        if elem.tag in ('node', 'way', 'relation'):
            # Finished with this top-level element; drop its children.
            elem.clear()
    return data
# Convert every way of the Singapore extract into a dict.
way_data = way_to_dict('singapore.osm')
            

In [39]:
# Spot-check one converted node and one converted way.
node_data[20],way_data[20]

({'loc': [1.2954578, 103.8740371],
  'tag': [{'k': 'ref', 'v': '14A'},
   {'k': 'highway', 'v': 'motorway_junction'}],
  'uniqueId': 25455287,
  'user': 'cboothroyd',
  'version': 5},
 {'nd': [133662042,
   1781548512,
   1747994844,
   1531189549,
   1747994845,
   1747994847,
   133662046,
   133662048,
   133662050,
   3553412996,
   133662051,
   1781710553,
   133662053,
   1781710555,
   133662054,
   1781710557],
  'tag': [{'k': 'highway', 'v': 'motorway'},
   {'k': 'lanes', 'v': '2'},
   {'k': 'maxspeed', 'v': '90'},
   {'k': 'name', 'v': 'TPE'}],
  'uid': 4240913,
  'uniqueId': 14058412})

# writing file

In [13]:
# Serialise all node dicts to node.json with 4-space indentation.
# The whole JSON string is built in memory before it is written.
with open('node.json', 'w') as fp:
    fp.write(simplejson.dumps(node_data,indent=4))

In [14]:
# Serialise all way dicts to way.json with 4-space indentation.
# The whole JSON string is built in memory before it is written.
with open('way.json', 'w') as fp:
    fp.write(simplejson.dumps(way_data,indent=4))

In [13]:
# Pretty-print the first converted way.
# NOTE(review): the recorded output below has string ids and extra
# changeset/version/user keys that way_to_dict as written does not emit;
# it looks like it came from an earlier run - re-run this cell to refresh it.
print json.dumps(way_data[0],indent=4)

{
    "changeset": "46097651", 
    "uid": "5084650", 
    "nd": [
        "26778964", 
        "247749632", 
        "1275309736", 
        "1275309696", 
        "462263980", 
        "473019059", 
        "4486796339", 
        "1278204303", 
        "3689717007", 
        "246494174"
    ], 
    "version": "24", 
    "user": "woodennature", 
    "uniqueId": "4386520", 
    "tag": [
        {
            "k": "highway", 
            "v": "trunk"
        }, 
        {
            "k": "name", 
            "v": "Orchard Road"
        }, 
        {
            "k": "oneway", 
            "v": "yes"
        }
    ]
}


# importing to mongo
After writing the JSON files, I imported each one into the `test` database using:         
mongoimport --db test --collection node --file node.json --jsonArray                        
mongoimport --db test --collection way --file way.json --jsonArray
# importing mongo client for further analysis

In [7]:
from pymongo import MongoClient
# No arguments: connects with pymongo's defaults - presumably a local mongod; confirm before running.
client = MongoClient()

In [8]:
# Use the database named 'test'; the queries below read its `node` and `way` collections.
db = client.test

# top 10 contributers 

In [26]:
# Top 10 contributors by number of node documents:
# group by user, count documents per user, sort by count descending, keep 10.
# Only the `node` collection is counted; ways are not included.
list(db.node.aggregate([{
    '$group':{"_id":"$user","count":{"$sum":1}}
},
{
    '$sort':{'count':-1}
},
{
    '$limit':10
}]))

[{u'_id': u'JaLooNz', u'count': 320058},
 {u'_id': u'berjaya', u'count': 103080},
 {u'_id': u'rene78', u'count': 73724},
 {u'_id': u'cboothroyd', u'count': 68382},
 {u'_id': u'lmum', u'count': 39556},
 {u'_id': u'kingrollo', u'count': 36584},
 {u'_id': u'Luis36995', u'count': 35363},
 {u'_id': u'ridixcr', u'count': 31911},
 {u'_id': u'Sihabul Milah', u'count': 30405},
 {u'_id': u'calfarome', u'count': 29240}]

# number of node and ways

In [25]:
# Number of documents in each collection (ways first, then nodes).
print db.way.find({}).count()
print db.node.find({}).count()

220418
1426354


# find the number of areas like hospital museum and garden

In [22]:
# Nodes with any tag value containing 'hospital' (case-insensitive); this matches every tag key, not only amenity.
db.node.find({'tag.v':{'$regex':'hospital','$options':'i'}}).count()

77

In [23]:
# Nodes with any tag value containing 'museum' (case-insensitive); this matches every tag key.
db.node.find({'tag.v':{'$regex':'museum','$options':'i'}}).count()

29

In [24]:
# Nodes with any tag value containing 'garden' (case-insensitive); this matches every tag key.
db.node.find({'tag.v':{'$regex':'garden','$options':'i'}}).count()

165

# finding unique user count

In [41]:
# Count distinct users among the nodes: the first $group yields one document
# per user, the second $group folds them all under a constant _id and counts them.
list(db.node.aggregate([
    {
        '$group':{'_id':'$user'}
    },
   { '$group': { '_id': 1, 'count': { '$sum': 1 } } }
]))

[{u'_id': 1, u'count': 1647}]

# some improvement about dataset
The text in the Singapore data is a mix of Malay, Mandarin, Tamil and English, so distinguishing                       
streets is difficult; if the names were translated properly, it would be easier for foreign travellers to use 
the map.
For example, I found that avenue is declared as Aenue.
# solution    
## benefit   
-  it helps foreign travellers to read the map
## issue
- Singapore is a diverse nation, so finding the right contributors to 
  solve this will be difficult                                              
    <br>
    <br>
### Singapore has restructured its postal code system [https://en.wikipedia.org/wiki/Postal_codes_in_Singapore]     
# solution 
## benefit
- it'll be easy for travellers to navigate in Singapore
## issue
- it'll be difficult for locals to navigate because they are familiar with the old
  system


'Sd'