### City: Pune, India

The city of Pune has been chosen because it is one of the fastest-growing cities in India and is rapidly becoming a hub for MNCs and IT corporations. Also, a lot of places are named in English (unlike other cities where local languages are used), and the streets and roads are well organized.

>Reference :https://mapzen.com/data/metro-extracts/metro/pune_india/

### Issues to be Solved:

1.The streets are abbreviated incorrectly.

2.Pincodes (Zipcodes) are incorrect.


### 1. Street Names

In the following code, some of the street types and places (such as shops and parks) which have to be corrected are identified.

In [3]:

import xml.etree.cElementTree as ET
from collections import defaultdict
import re
import pprint

# Path of the OSM XML extract that the audit cells read.
osm_file = "pune_india.osm"
# Matches the last whitespace-delimited word of a string, including an
# optional trailing "." (e.g. "Rd.").
street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)
# Maps each unexpected last word to the set of street names ending with it.
street_types = defaultdict(set)

# Last words that are accepted as-is by the audit (each listed once; the
# original list contained "Centre" twice).
expected = [
    "Street", "Avenue", "Gate", "Town", "Block", "Marg", "Drive", "Place",
    "Square", "Lane", "Road", "Path", "Trail", "Park", "Commons", "Phase",
    "World", "Lake", "Nagar", "Circle", "Centre", "Society", "Colony",
    "Mall", "Bazaar", "Plaza", "Stop", "Stage", "Station", "Bunk", "Area",
    "Annexe", "City", "Ridge", "Apartment",
]

def audit_street_type(street_types, street_name):
    """Record street_name under its last word when that word is unexpected.

    street_types: mapping of last word -> set of street names; it is
        updated in place.
    street_name: the street name string to inspect.
    """
    match = street_type_re.search(street_name)
    if match is None:
        # No last word could be extracted, so there is nothing to record.
        return
    last_word = match.group()
    if last_word in expected:
        return
    street_types[last_word].add(street_name)
    
def is_street_name(elem):
    """Return True when elem's 'k' attribute is exactly "addr:street".

    An element without a 'k' attribute yields False instead of raising
    KeyError.
    """
    return elem.attrib.get('k') == "addr:street"


# Collect, for every node and way, the street names whose last word is not
# in the expected list.
street_types = defaultdict(set)
# "end" events are used because on a "start" event ElementTree does not
# guarantee that the element's children have been parsed yet, so <tag>
# children could be missed.
for event, elem in ET.iterparse(osm_file, events=("end",)):
    if elem.tag == "node" or elem.tag == "way":
        for tag in elem.iter("tag"):
            if is_street_name(tag):
                audit_street_type(street_types, tag.attrib['v'])

In [29]:
#Last words of street names which were not present in the expected List. (Most of the names in Local Language)
for i in street_types:
    print i

Pashan,
Gymkhana
udyog
Karvenagar
Peth
nagar
Aundh
Rd
pedestrian
chaulk
Narhe
Vitthalwadi
Bypass
Trillium
BRTS
Wakdewadi
Dhanakwadi
Kharadi
Magarpatta
26
apartment
Hadapsar
Road,
Kothrud
Chinchwadgaon
Vasti,Bavdhan
Katraj
Chambers
1
Deccan
raod
2
5
4
7
Vihar
Manikbaug
Vidyanagar
Sheri
J13
Saudagar
Lavale
Highway
road
Pune
Padal
Thergaon
Bridge
Gokhalenagar
University
Ramp
Satellite
Flyover
Tingrenagar
Thakarenagar
Pashan
lavale
Yerwada
Chowk
Nilakh
Erandawane,
Hinjawadi
Swargate,
marketyard,pune
Roseicon
Warje,
Estate
cross
Exchange
Shivajinagar
Campus
411052
MIDC
Balewadi
Akurdi
chowk
Wanwadi
10
kharadi
34/2
Wakad
Market
8


### 2. Post Codes
Pune is a city in the district of the same name. Pincodes which are out of the range of city limits are filtered out.

In [4]:
import xml.etree.cElementTree as ET
from collections import defaultdict
import re
import pprint
pin_code_outside=[] #List of the Pincodes not in the city limits. 
pin_code_inside=[] #List of pincodes which are within the city limits, ie. from 411001 to 411053.
i=0
white_space=re.compile(r'\S+\s+\S+') 
for event, elem in ET.iterparse(osm_file):
    if elem.tag == "node" or elem.tag == "way": 
        for tag in elem.iter("tag"):
            if tag.attrib['k'] == "postal_code" or tag.attrib['k'] == "addr:postcode": 
                #some postalcodes are wrongly entred as a string
                if tag.attrib['v']=='Paschimanagari' or tag.attrib['v'] == 'spine Road':
                    i=i+1
                    #print tag.attrib['v']
                    continue
                #finding number of postal code have white space in between ie.. "411 012"
                elif white_space.search(tag.attrib['v']):
                    #print tag.attrib['v']
                    i=i+1
                    continue
                elif int(tag.attrib['v'].strip())<411001 or int(tag.attrib['v'].strip())>411053:                  
                    pin_code_outside.append(tag.attrib['v'])
                elif int(tag.attrib['v'].strip())>411001 or int(tag.attrib['v'].strip())<411053:                  
                    pin_code_inside.append(tag.attrib['v'])
print "Number of postal codes wrongly entered :",i                    
print "Number of Postal codes which line outside the city : ",len(pin_code_outside)
print "Number of Postal codes which belong to city limits 411001-411053 :",len(pin_code_inside)

Number of postal codes wrongly entered : 14
Number of Postal codes which line outside the city :  37
Number of Postal codes which belong to city limits 411001-411053 : 647



from the given data(uncleaned)

<b>14</b> postal codes were wrongly specified, i.e. they contain white-space characters or non-numeric characters

<b>37</b> postal codes are lying outside the city pincode range(411001-411053) 

<b>647</b> postal codes lie inside the city pincode range

> Note: uncomment the print statements to check the wrongly entered postal code values from now on

## Auditing

#### 1.Street Name Cleaning

In [16]:
#Updating the wrongly entred street names
from collections import defaultdict
import re
import pprint
import xml.etree.cElementTree as ET
#Let us create a dictionary to correct the wrongly entered street names.
# Replacement for each wrongly entered last word of a street name.
# An empty replacement means the word is dropped.
mapping = {
    "St": "Saint",
    "udyog": "Udyog",
    "pedestrian": "Pedestrian",
    "chaulk": "Chowk",
    "St.": "Street",
    "Ave": "Avenue",
    "chowk": "Chowk",
    "J13": '',
    "Rd": "Road",
    "cross": "Cross",
    "Rd.": "Road",
    "nagar": "Nagar",
    "road": "Road",
    "raod": "Road",
    "apartment": "Apartment",
    "no.": "",
    " , Pune": "",
}
# The words that have a correction in the mapping dictionary.
keys = mapping.keys()
#print keys
# Matches the last whitespace-delimited word of a string, including an
# optional trailing "." (e.g. "Rd.").
street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)
# Regular expression to identify problematic characters in a string. The
# previous pattern contained "&-<", which inside a character class is a range
# that also matched digits, ":" and "."; the characters are now listed
# individually.
PROBLEMCHARS = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')
LOWER_COLON = re.compile(r'^([a-z]|_)+:([a-z]|_)+') #regular expression to match or search "abc:def"

def update_street_name(name, mapping):
    """Return name with a wrongly entered last word corrected.

    name: the street name to clean.
    mapping: dict of wrongly entered last word -> replacement text.

    Rules, applied to the last word of name:
    * it is a key of mapping: it is replaced by the mapped value;
    * it is a number (e.g. "Phase 2"): it is removed;
    * otherwise: commas and the "No."/"no." markers are removed from the
      whole name.
    Trailing white space left by a removal is stripped. A name in which no
    last word can be found is returned unchanged.
    """
    m = street_type_re.search(name)
    if m is None:
        return name
    street_type = m.group()
    # Slice at the position of the matched last word; searching the name for
    # the word text would find an earlier occurrence of the same text.
    prefix = name[:m.start()]
    if street_type in mapping:
        return (prefix + mapping[street_type]).rstrip()
    try:
        int(street_type)
    except ValueError:
        # The last word is neither mapped nor a number.
        return name.replace(", "," ").replace(" ,"," ").replace("No. 34/2","").replace(" No.","").replace(" no.","").replace(",","")
    return prefix.rstrip()

In [None]:
#Street names after cleaning
for event, elem in ET.iterparse(osm_file, events=("start",)):
    if elem.tag == "node" or elem.tag == "way": 
        for tag in elem.iter("tag"):
            if is_street_name(tag):#checking if it is a street name
                #print "Before:",tag.attrib['v']
                #print "After:",update_street_name(tag.attrib['v'], mapping)            

#### 2. Postal Code cleaning

In [23]:
#Correcting wrongly entered pincodes.
# These two patterns are no longer used by update_pincode; they stay defined
# so that code referring to these names keeps working.
white_space=re.compile(r'\S+\s+\S+')
COLON= re.compile(r'^([a-z]|_)+:')
def update_pincode(pincode, low=411001, high=411053):
    """Return the cleaned pincode, or None when it is not usable.

    pincode: the raw postal code string.
    low, high: inclusive bounds of the accepted pincode range.

    All white space is removed first ("411 012" -> "411012"). The result is
    returned only when it consists of digits and lies within [low, high].
    Everything else gives None: text such as 'Paschimanagari' or
    'spine Road', values such as "en:Talegaon railway station", and codes
    outside the range.
    """
    compact = "".join(pincode.split())
    if not compact.isdigit():
        return None
    try:
        code = int(compact)
    except ValueError:
        # Digit-like characters that int() cannot convert.
        return None
    if low <= code <= high:
        return compact
    return None

In [25]:
#Postal codes after cleaning
for event, elem in ET.iterparse(osm_file):
    if elem.tag == "node" or elem.tag == "way":
        for tag in elem.iter("tag"):
            if tag.attrib['k'] == "postal_code" or tag.attrib['k'] == "addr:postcode":
                #print "Before :",tag.attrib['v']
                #print "After :",update_pincode(tag.attrib['v'])



## Data Base for SQL

In [49]:
#After auditing is complete the next step is to prepare the data to be inserted into a SQL database.
#The code in this cell will update the street names and postal codes and convert them from XML to CSV 
#These csv files can then easily be imported to a SQL database as tables.
import xml.etree.cElementTree as ET
from collections import defaultdict
import re
import pprint
import codecs
import csv
import schema
import cerberus
# Schema object taken from the imported schema module; it is the default
# schema argument of validate_element.
SCHEMA = schema.schema
# Using the sample as the OSM file.
OSMFILE = 'sample.osm'
# Output CSV file names.
NODES_PATH = "nodes.csv"
NODE_TAGS_PATH = "nodes_tags.csv"
WAYS_PATH = "ways.csv"
WAY_NODES_PATH = "ways_nodes.csv"
WAY_TAGS_PATH = "ways_tags.csv"

# Matches keys of the form "abc:def" (lower-case letters and underscores).
LOWER_COLON = re.compile(r'^([a-z]|_)+:([a-z]|_)+')
# Matches any problematic character inside a key.
PROBLEMCHARS = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')

# Make sure the fields order in the csv matches the column order in the sql table schema
NODE_FIELDS = [
    'id', 'lat', 'lon', 'user', 'uid', 'version', 'changeset', 'timestamp',
]
NODE_TAGS_FIELDS = ['id', 'key', 'value', 'type']
WAY_FIELDS = [
    'id', 'user', 'uid', 'version', 'changeset', 'timestamp',
]
WAY_TAGS_FIELDS = ['id', 'key', 'value', 'type']
WAY_NODES_FIELDS = ['id', 'node_id', 'position']

#This function update the street names and postal codes by calling update_street_names and update_pincode functions
def _shape_tags(element, default_tag_type):
    """Return one row dict per usable <tag> child of element.

    Each row has the keys 'id', 'key', 'value' and 'type'. A fresh dict is
    built for every tag, so rows never share state.
    """
    rows = []
    for child in element:
        if child.tag != 'tag':
            continue
        raw_key = child.attrib['k']
        value = child.attrib['v']
        # Tags whose key contains problematic characters are dropped.
        if PROBLEMCHARS.search(raw_key):
            continue
        if raw_key == 'addr:street':
            cleaned = update_street_name(value, mapping)
            # Keep the raw name if the cleaner could not produce a result.
            if cleaned is not None:
                value = cleaned
        elif raw_key in ('addr:postcode', 'postal_code', 'post_code'):
            value = update_pincode(value)
            # Wrongly entered or out-of-range postal codes are dropped.
            if not value:
                continue
        if LOWER_COLON.match(raw_key):
            # "abc:def" -> type "abc", key "def" (split at the first colon).
            tag_type, tag_key = raw_key.split(":", 1)
        else:
            tag_type, tag_key = default_tag_type, raw_key
        rows.append({
            'id': element.attrib['id'],
            'key': tag_key,
            'value': value,
            'type': tag_type,
        })
    return rows


def shape_element(element, default_tag_type='regular'):
    """Turn a node or way XML element into dicts ready for CSV writing.

    element: the XML element to convert.
    default_tag_type: the 'type' given to tags whose key has no
        "abc:def" prefix.

    Returns
    * for a node: {'node': attributes, 'node_tags': tag rows};
    * for a way: {'way': attributes, 'way_nodes': node references,
      'way_tags': tag rows};
    * for any other element: None.

    Street names are cleaned with update_street_name and postal codes with
    update_pincode; tags with an unusable postal code or a problematic key
    are left out.
    """
    if element.tag == 'node':
        node_attribs = dict(
            (field, element.attrib[field])
            for field in element.attrib if field in NODE_FIELDS
        )
        return {'node': node_attribs,
                'node_tags': _shape_tags(element, default_tag_type)}
    if element.tag == 'way':
        way_attribs = dict(
            (field, element.attrib[field])
            for field in element.attrib if field in WAY_FIELDS
        )
        # 'position' is the index of each <nd> reference within the way.
        way_nodes = [
            {'id': element.attrib['id'],
             'node_id': nd.attrib['ref'],
             'position': position}
            for position, nd in enumerate(element.iter("nd"))
        ]
        return {'way': way_attribs,
                'way_nodes': way_nodes,
                'way_tags': _shape_tags(element, default_tag_type)}
    return None
    
#THE FOLLOWING CODE IS TO CONVERT THE XML INTO CSV 
# ================================================== #
#               Helper Functions                     #
# ================================================== #
def get_element(osm_file, tags=('node', 'way', 'relation')):
    """Yield each fully parsed element whose tag is listed in tags.

    After the consumer is done with a yielded element, the root element
    is cleared.
    """
    events = ET.iterparse(osm_file, events=('start', 'end'))
    # The first event carries the root element.
    root = next(events)[1]
    for kind, node in events:
        if kind != 'end' or node.tag not in tags:
            continue
        yield node
        root.clear()


def validate_element(element, validator, schema=SCHEMA):
    """Raise an Exception describing the errors if element fails validation.

    element: the shaped element to check.
    validator: object providing validate(element, schema) and errors.
    schema: the schema handed to the validator.
    """
    if validator.validate(element, schema) is True:
        return
    # Report the first field listed in the validator's errors.
    field, errors = next(validator.errors.iteritems())
    details = pprint.pformat(errors)
    template = "\nElement of type '{0}' has the following errors:\n{1}"
    raise Exception(template.format(field, details))


class UnicodeDictWriter(csv.DictWriter, object):
    """csv.DictWriter that UTF-8 encodes unicode values before writing."""

    def writerow(self, row):
        """Write row after encoding each unicode value as UTF-8."""
        encoded = {}
        for field, value in row.iteritems():
            if isinstance(value, unicode):
                value = value.encode('utf-8')
            encoded[field] = value
        super(UnicodeDictWriter, self).writerow(encoded)

    def writerows(self, rows):
        """Write every row in rows through writerow."""
        for entry in rows:
            self.writerow(entry)


# ================================================== #
#               Main Function                        #
# ================================================== #
def process_map(OSMFILE, validate):
    """Process each node and way of the OSM file and write the CSV files.

    OSMFILE: the OSM XML file to read.
    validate: when exactly True, every shaped element is checked with
        validate_element before it is written.
    """
    with codecs.open(NODES_PATH, 'wb') as nodes_out, \
         codecs.open(NODE_TAGS_PATH, 'wb') as node_tags_out, \
         codecs.open(WAYS_PATH, 'wb') as ways_out, \
         codecs.open(WAY_NODES_PATH, 'wb') as way_nodes_out, \
         codecs.open(WAY_TAGS_PATH, 'wb') as way_tags_out:

        nodes_csv = UnicodeDictWriter(nodes_out, NODE_FIELDS)
        node_tags_csv = UnicodeDictWriter(node_tags_out, NODE_TAGS_FIELDS)
        ways_csv = UnicodeDictWriter(ways_out, WAY_FIELDS)
        way_nodes_csv = UnicodeDictWriter(way_nodes_out, WAY_NODES_FIELDS)
        way_tags_csv = UnicodeDictWriter(way_tags_out, WAY_TAGS_FIELDS)

        # Every CSV file starts with its header row.
        for writer in (nodes_csv, node_tags_csv, ways_csv,
                       way_nodes_csv, way_tags_csv):
            writer.writeheader()

        checker = cerberus.Validator()

        for element in get_element(OSMFILE, tags=('node', 'way')):
            shaped = shape_element(element)
            if not shaped:
                continue
            if validate is True:
                validate_element(shaped, checker)

            if element.tag == 'node':
                nodes_csv.writerow(shaped['node'])
                node_tags_csv.writerows(shaped['node_tags'])
            elif element.tag == 'way':
                ways_csv.writerow(shaped['way'])
                way_nodes_csv.writerows(shaped['way_nodes'])
                way_tags_csv.writerows(shaped['way_tags'])


if __name__ == "__main__":
    # Validation is roughly ten times slower, so for the project consider
    # validating only a small sample of the map.
    # NOTE(review): OSMFILE ('sample.osm') is defined in this cell, but the
    # call below passes osm_file - confirm which file is intended.
    process_map(OSMFILE=osm_file, validate=False)