In [14]:
'''
In order to start the data wrangling process, the following modules in python will be essential.
'''

import xml.etree.cElementTree as ET
from collections import defaultdict
import re
import pprint

# opening file in filename
filename = open("ahmedabad.osm", "r")

In [15]:
# count the number of unique element types

'''
First we will create an empty dictionary and then parse through the tags to count them.
'''

tags = {}

for event, elem in ET.iterparse(filename):
    if elem.tag in tags: 
        tags[elem.tag] += 1
    else:
        tags[elem.tag] = 1
        
pprint.pprint(tags)

{'bounds': 1,
 'member': 2288,
 'nd': 639811,
 'node': 550820,
 'osm': 1,
 'relation': 510,
 'tag': 99409,
 'way': 82308}


In [17]:
# lets look at number of unique users having edited the map for Phoenix Arizona

'''
By creating the process_map function, we start by creating an empty set, then parse through elements in the file. 
If the element attribute "uid" is found, it is added to the set. All we have to do then is to call the length of the set in order
to find the number of unique users.
'''

filename = open("ahmedabad.osm", "r")

def process_map(filename):
    users = set()
    for i, element in ET.iterparse(filename):
        for elem in element:
            if 'uid' in elem.attrib:
                users.add(elem.attrib['uid'])
    return users

users = process_map(filename)
len(users)

368

In [18]:
import xml.etree.cElementTree as ET
from collections import defaultdict
import re
import pprint

OSMFILE_sample = "ahmedabad.osm"
regex = re.compile(r'\b\S+\.?', re.IGNORECASE)

expected = ["Ahmedabad", "Road", "NR", "Avenue", "SBK", "Gandhi", "Bridge", "Society"] #expected names in the dataset

mapping = {"ahmedabad": "Ahmedabad",
           "Ahmadabad": "Ahmedabad",
           "Ahamadabad": "Ahmedabad",
           "Nr.": "NR",
           "Ave.": "Avenue",
           "sbk": "SBK",
           "gandhi": "Gandhi",
           "bridge": "Bridge",
           "road": "Road",
           "Ft.": "Feet",
           "ft": "Feet",
           "Rd": "Road",
           "Rd.": "Road",
           "rasta": "Road",
           "Roads": "Road",
           "society": "Society",
           "soc.": "Society",
           "Socity": "Society",
           "Gujarat.": "Gujarat,"
            }

# Search string for the regex. If it is matched and not in the expected list then add this as a key to the set.
def audit_street(street_types, street_name): 
    m = regex.search(street_name)
    if m:
        street_type = m.group()
        if street_type not in expected:
            street_types[street_type].add(street_name)

def is_street_name(elem): # Check if it is a street name
    return (elem.attrib['k'] == "addr:street")

def audit(osmfile): # return the list that satify the above two functions
    osm_file = open(osmfile, "r")
    street_types = defaultdict(set)
    for event, elem in ET.iterparse(osm_file, events=("start",)):

        if elem.tag == "node" or elem.tag == "way":
            for tag in elem.iter("tag"):
                if is_street_name(tag):
                    audit_street(street_types, tag.attrib['v'])

    return street_types

pprint.pprint(dict(audit(OSMFILE_sample))) # print the existing names

def string_case(s): # change string into titleCase except for UpperCase
    if s.isupper():
        return s
    else:
        return s.title()

# return the updated names
def update_name(name, mapping):
    name = name.split(' ')
    for i in range(len(name)):
        if name[i] in mapping:
            name[i] = mapping[name[i]]
            name[i] = string_case(name[i])
        else:
            name[i] = string_case(name[i])
    
    name = ' '.join(name)
   

    return name

update_street = audit(OSMFILE_sample) 

# print the updated names
for street_type, ways in update_street.iteritems():
    for name in ways:
        better_name = update_name(name, mapping)
        print name, "=>", better_name 

{'100': set(['100 Ft. Road', '100 ft Road']),
 '101,': set(['101, Paritosh Building, Usmanpura, Ashram Road']),
 '120': set(['120 Feet Ring Road']),
 '132': set(['132 Ft. Ring Road']),
 '2nd': set(['2nd Floor, Aditya Complex, Opp kasturi Dining Hall, Paldi Cross Road, Paldi',
             '2nd Floor, Dev Arc Mall, Satellite Rd, Off S.G Highway, Near Iskcon Bridge']),
 '8': set(['8 aasiyana society jivrajpark nr 132 ring road ahmedabad']),
 'ACHER': set(['ACHER ROAD']),
 'ASarwa': set(['ASarwa']),
 'Adalaj': set(['Adalaj -sarkhej road']),
 'Ahmedabad-Kalol': set(['Ahmedabad-Kalol Highway']),
 'Airport': set(['Airport Road']),
 'Akhabarnagar': set(['Akhabarnagar Circle']),
 'Akshar': set(['Akshar Avenue']),
 'Anand': set(['Anand nagar road']),
 'Ashram': set(['Ashram Road']),
 'Asmita': set(['Asmita Society']),
 'B/H': set(['B/H AryaVilla']),
 'BRTS': set(['BRTS Route', 'BRTS Station']),
 'Balol': set(['Balol Nagar Road']),
 'BalolNagar': set(['BalolNagar Cross Road']),
 'Balvantrai': se

In [21]:
filename = open("ahmedabad.osm", "r")

mapping = {"ahmedabad": "Ahmedabad",
           "Ahmadabad": "Ahmedabad",
           "Ahamadabad": "Ahmedabad",
           "Nr.": "NR",
           "Ave.": "Avenue",
           "sbk": "SBK",
           "gandhi": "Gandhi",
           "bridge": "Bridge",
           "road": "Road",
           "Ft.": "Feet",
           "ft": "Feet",
           "Rd": "Road",
           "Rd.": "Road",
           "rasta": "Road",
           "Roads": "Road",
           "society": "Society",
           "soc.": "Society",
           "Socity": "Society",
           "Gujarat.": "Gujarat,"
            }


In [22]:
mapping2 =  {"E"  : "East",
             "E." : "East",
             "N"  : "North",
             "N." : "North",
             "S"  : "South",
             "S." : "South",
             "W"  : "West",
             "W." : "West"}

In [23]:
'''
The update name function implements the change. If a street name has the defined string which is defined in the two mapping
dictionaries, then the change is made as defined.
'''


def update_name(name, mapping, regex):
    m = regex.search(name)
    if m:
        street_type = m.group()
        if street_type in mapping:
            name = re.sub(regex, mapping[street_type], name)
    
    return name

In [24]:
# Search string for the regex. If it is matched and not in the expected list then add this as a key to the set.
def audit_street(street_types, street_name): 
    m = regex.search(street_name)
    if m:
        street_type = m.group()
        if street_type not in expected:
            street_types[street_type].add(street_name)

def is_street_name(elem): # Check if it is a street name
    return (elem.attrib['k'] == "addr:street")

def audit(osmfile): # return the list that satify the above two functions
    osm_file = open(osmfile, "r")
    street_types = defaultdict(set)
    for event, elem in ET.iterparse(osm_file, events=("start",)):

        if elem.tag == "node" or elem.tag == "way":
            for tag in elem.iter("tag"):
                if is_street_name(tag):
                    audit_street(street_types, tag.attrib['v'])
                    

    return street_types

pprint.pprint(dict(audit(OSMFILE_sample))) # print the existing names

{'100': set(['100 Ft. Road', '100 ft Road']),
 '101,': set(['101, Paritosh Building, Usmanpura, Ashram Road']),
 '120': set(['120 Feet Ring Road']),
 '132': set(['132 Ft. Ring Road']),
 '2nd': set(['2nd Floor, Aditya Complex, Opp kasturi Dining Hall, Paldi Cross Road, Paldi',
             '2nd Floor, Dev Arc Mall, Satellite Rd, Off S.G Highway, Near Iskcon Bridge']),
 '8': set(['8 aasiyana society jivrajpark nr 132 ring road ahmedabad']),
 'ACHER': set(['ACHER ROAD']),
 'ASarwa': set(['ASarwa']),
 'Adalaj': set(['Adalaj -sarkhej road']),
 'Ahmedabad-Kalol': set(['Ahmedabad-Kalol Highway']),
 'Airport': set(['Airport Road']),
 'Akhabarnagar': set(['Akhabarnagar Circle']),
 'Akshar': set(['Akshar Avenue']),
 'Anand': set(['Anand nagar road']),
 'Ashram': set(['Ashram Road']),
 'Asmita': set(['Asmita Society']),
 'B/H': set(['B/H AryaVilla']),
 'BRTS': set(['BRTS Route', 'BRTS Station']),
 'Balol': set(['Balol Nagar Road']),
 'BalolNagar': set(['BalolNagar Cross Road']),
 'Balvantrai': se

In [25]:
street_type_re  = re.compile(r'\b\S+\.?$', re.IGNORECASE)
street_type_pre = re.compile(r'^[NSEW]\b\.?', re.IGNORECASE)

In [26]:
def string_case(s): # change string into titleCase except for UpperCase
    if s.isupper():
        return s
    else:
        return s.title()

In [27]:
# return the updated names
def update_name(name, mapping):
    name = name.split(' ')
    for i in range(len(name)):
        if name[i] in mapping:
            name[i] = mapping[name[i]]
            name[i] = string_case(name[i])
        else:
            name[i] = string_case(name[i])
    
    name = ' '.join(name)
   

    return name

update_street = audit(OSMFILE_sample) 

In [28]:
# print the updated names
for street_type, ways in update_street.iteritems():
    for name in ways:
        better_name = update_name(name, mapping)
        print name, "=>", better_name  

GVMM => GVMM
Service road - Sarkhej Gandhinagar Highway => Service Road - Sarkhej Gandhinagar Highway
Danapith Road => Danapith Road
132 Ft. Ring Road => 132 Feet Ring Road
Netaji Subhash Chandra Road => Netaji Subhash Chandra Road
Kh-0 => Kh-0
Nehrunagar => Nehrunagar
Khokhra Road => Khokhra Road
Odhav Road => Odhav Road
SH-41 => SH-41
GST Crossing Road, Ranip => GST Crossing Road, Ranip
GST Crossing => GST Crossing
GST Road => GST Road
GST Crossing Road, New Ranip => GST Crossing Road, New Ranip
GST Crossing, New Ranip => GST Crossing, New Ranip
Swastik Cross Road => Swastik Cross Road
Swastik Society Road => Swastik Society Road
Nav-sarjan School Road => Nav-Sarjan School Road
Ellisbridge => Ellisbridge
Vrajraiji Colony => Vrajraiji Colony
Manek Baug Road => Manek Baug Road
Gandhinagar-Ahmedabad Highway => Gandhinagar-Ahmedabad Highway
8 aasiyana society jivrajpark nr 132 ring road ahmedabad => 8 Aasiyana Society Jivrajpark Nr 132 Ring Road Ahmedabad
kankariya gate no.3 => Kankariya

## Problem with the Data - Postal Codes 

In [29]:
filename = open("ahmedabad.osm", "r")

zip_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)

zip_types = defaultdict(set)

expected_zip = {}

def audit_zip_codes(zip_types, zip_name, regex, expected_zip):
    m = regex.search(zip_name)
    if m:
        zip_type = m.group()
        if zip_type not in expected_zip:
             zip_types[zip_type].add(zip_name)

def is_zip_name(elem):
    return (elem.attrib['k'] == "addr:postcode")


def audit(filename, regex):
    for event, elem in ET.iterparse(filename, events=("start",)):
        if elem.tag == "way" or elem.tag == "node":
            for tag in elem.iter("tag"):
                if is_zip_name(tag):
                    audit_zip_codes(zip_types, tag.attrib['v'], regex, expected_zip)
    pprint.pprint(dict(zip_types))

audit(filename, zip_type_re)

{'061': set(['380 061']),
 '3': set(['3']),
 '33026': set(['33026']),
 '380001': set(['380001']),
 '380003': set(['380003']),
 '380004': set(['380004']),
 '380005': set(['380005']),
 '380006': set(['380006']),
 '380007': set(['380007']),
 '380008': set(['380008']),
 '380009': set(['380009']),
 '380013': set(['380013']),
 '380014': set(['380014']),
 '380015': set(['380015']),
 '380021': set(['380021']),
 '380023': set(['380023']),
 '380024': set(['380024']),
 '380026': set(['380026']),
 '380027': set(['380027']),
 '380028': set(['380028']),
 '380043': set(['380043']),
 '380051': set(['380051']),
 '380052': set(['380052']),
 '380054': set(['380054']),
 '380058': set(['380058']),
 '380059': set(['380059']),
 '380060': set(['380060']),
 '380061': set(['380061']),
 '380063': set(['380063']),
 '382006': set(['382006']),
 '382007': set(['382007']),
 '382009': set(['382009']),
 '382210': set(['382210']),
 '382345': set(['382345']),
 '382350': set(['382350']),
 '382405': set(['382405']),
 '3824

In [30]:
for zip_type, ways in zip_types.iteritems(): 
        for name in ways:
            if "-" in name:
                name = name.split("-")[0].strip()
            if "AZ" in name:
                name = name.split("AZ")[1].strip('AZ ')
            print name

380013
380014
380015
382418
382481
382009
382475
382007
382006
380007
380006
3
382345
380061
382480
380005
380004
380003
380001
380043
380009
380008
380 061
382421
380028
382424
380024
380027
380026
380021
380023
382445
382440
382405
382210
380060
382350
380058
380059
380054
33026
380051
380052
380063


## Problem with the Data - Phone Numbers

In [31]:
filename = open("ahmedabad.osm", "r")

phone_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)

phone_types = defaultdict(set)

expected_zip = {}

def audit_phone_num(phone_types, phone_num, regex, expected_phone):
    m = regex.search(phone_num)
    if m:
        phone_type = m.group()
        if phone_type not in expected_zip:
             phone_types[phone_type].add(phone_num)

def is_phone_num(elem):
    return (elem.attrib['k'] == "phone")


def audit(filename, regex):
    for event, elem in ET.iterparse(filename, events=("start",)):
        if elem.tag == "way" or elem.tag == "node":
            for tag in elem.iter("tag"):
                if is_phone_num(tag):
                    audit_phone_num(phone_types, tag.attrib['v'], regex, expected_zip)
    pprint.pprint(dict(phone_types))

audit(filename, phone_type_re)

{'0200': set(['+91793013 0200']),
 '0201': set(['079 6619 0201']),
 '0228': set(['0792740 0228']),
 '0500/05/06/07/08/09': set(['+91 79 6190 0500/05/06/07/08/09']),
 '079)39830036/37': set(['(079)39830036/37']),
 '07922720605': set(['07922720605']),
 '07922912990': set(['07922912990']),
 '07925500007': set(['07925500007']),
 '07926304000': set(['07926304000']),
 '07926306752': set(['07926306752']),
 '07926582130': set(['07926582130']),
 '07927641100': set(['07927641100']),
 '07965422223': set(['07965422223']),
 '09016861000': set(['09016861000']),
 '19151': set(['+91 93776 19151']),
 '22239': set(['099099 22239']),
 '2386': set(['079 2687 2386']),
 '25556767': set(['+91 79 25556767']),
 '26920057': set(['079 26920057']),
 '29705588': set(['+91 79 29705588']),
 '30912345': set(['+91 79 30912345']),
 '38242': set(['093270 38242']),
 '4032-7226': set(['(+91-79) 4032-7226']),
 '41132': set(['+91 98250 41132']),
 '5050': set(['079 4050 5050']),
 '5151': set(['+91 79 6651 5151']),
 '5222': s

In [32]:
for phone_type, ways in phone_types.iteritems():
    for name in ways:
        if "+1 " in name:
            name = name.split("+1 ")[1].strip('+1 ')
        if "+" in name:
            name = name.split("+")[1].strip('+')
        if ";" in name:
            name = name.split(";")[0].strip()
        if name.startswith ("1-"): 
            name = name.strip("1-")
        if name.startswith ("1 "):
            name = name.strip("1 ")
        if "-" in name:
            name = name.replace("-", " ")
        if "(" in name:
            name = name.replace("(", "")
        if ")" in name:
            name = name.replace(")", "")
        if "." in name:
            name = name.replace(".", " ")
        if name.startswith("01"):
            name = name.strip("01")
        if name.startswith("Phone number "):
            name = name.strip("Phone number")
        if name.startswith("1 "):
            name = name.strip("1 ")
        if len(name) < 12:
            only_numbers = re.sub(r'\D', "", name)
            name = only_numbers[0:3] + " " + only_numbers[3:6] + " " + only_numbers[6:]
        if name.startswith(" "):
            name = name.replace(" ", "")
        if "x1" in name:
            name = name.strip("x1")
    
        print name

0792740 0228
917922864345
917922700585
917965469992
099099 22239
090 168 61000
079 263 06752
91 79 40502232
079 654 22223
792 662 0059
079 229 12990
91 79 6651 5151
917927472043
91 99 98 264810
917801949128
937 577 6800
079 4050 5050
709 680 5450
079 265 82130
91 79 6190 0500/05/06/07/08/09
919375565533
91 79 26401554
9179 2657 8369
91 79 4032 7226
91 79 2657 7621
91 79 25556767
91793013 0200
079 2687 2386
91 94262 84715
91 93776 19151
917926314000
91 8758637922
91 79 29705588
917923224006
079 6619 0201
079 255 00007
079 227 20605
919879566257
07939830036/37
91 98250 41132
91 9054876866
917927506819
917927550875
093270 38242
990 900 5694
915 752 790
079 276 41100
91 79 2657 5741
91 79 2656 5222
079 263 04000
855 553 4767
91 79 2646 6464
917922167530
91 79 2589 4542 / 
919099958936
91 79 2550 7181
079 26920057
91 79 30912345


## Problematic Tags

In [33]:
filename = open("ahmedabad.osm", "r")

lower = re.compile(r'^([a-z]|_)*$')
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')


def key_type(element, keys):
    if element.tag == "tag":
        k = element.attrib['k']
        if re.search(lower, k):
            keys["lower"] += 1
        elif re.search(lower_colon, k):
            keys["lower_colon"] += 1
        elif re.search(problemchars, k):
            keys["problemchars"] += 1
        else:
            keys["other"] += 1
            
    return keys

def process_map(filename):
    keys = {"lower": 0, "lower_colon": 0, "problemchars": 0, "other": 0}
    for _, element in ET.iterparse(filename):
        keys = key_type(element, keys)

    return keys

process_map(filename)


{'lower': 97364, 'lower_colon': 2004, 'other': 34, 'problemchars': 7}