In [1]:
import xml.etree.cElementTree as ET
import pprint
from datetime import datetime
import csv
import requests
import sys
import os
import time

### Part 1 : Getting the dataset 

Download the map dataset using Overpass API. We will use python request module to query overpass API to download the map dataset.

In [43]:
START_LAT = 51.54
END_LAT = 51.55

START_LON = -0.05
END_LON = 0.05

URL = 'http://overpass-api.de/api/map?bbox={},{},{},{}'.format(START_LON, START_LAT, END_LON, END_LAT)

In [44]:
DIR = '/Users/sourabh/Desktop/Udacity/DAND/P4 - Data Wrangling/project/P4_OSM_Data_Wrangling'
filename='map_data_sample.osm'
FILENAME = os.path.join(DIR, filename)

In [45]:
def download_osm_data(URL, filename, stream=True, timeout=30):
    r = requests.get(URL, stream=stream, timeout=timeout)
    
    print "Requesting URL :: ", r.url 

    if r.status_code == 200:
        print 'Request successful !!!'
        with open(filename, 'wb') as f:
            print "Downloading data to file."
            i = 0
            for chunk in r.iter_content(chunk_size=1024*1024):
                if chunk:
                    i += 1
                    f.write(chunk)
                    sys.stdout.write("\r%s" % ( ('.' * i) ) )    
                    sys.stdout.flush()

        print '\nDownload finished. \n{} is ready.'.format(filename)
        print 'File Size :: {} MB'.format(round(os.path.getsize(FILENAME) / (1024.0 * 1024), 3) )
    else:
        print "Bad Request...\n\n", r.content


In [46]:
download_osm_data(URL=URL, filename=FILENAME)

Requesting URL ::  http://overpass-api.de/api/map?bbox=-0.05,51.54,0.05,51.55
Request successful !!!
Downloading data to file.
.....................................................................................................................
Download finished. 
/Users/sourabh/Desktop/Udacity/DAND/P4 - Data Wrangling/project/P4_OSM_Data_Wrangling/map_data_sample.osm is ready.
File Size :: 8.148 MB


In [47]:
def sort(data, reverse = False):
    """
    This function will sort a dictionary contents according to values in ASC or DESC order.
    """
    for key, value in data.items():
        return { key: [(kk, vv) for vv, kk in sorted([(v, k) for k, v in value.items()], reverse=reverse)] }
        

def gather_element_counts_levels(root, level=0, tag_counts={}, tag_levels={}, attrib_counts={}):
    """
    This is a utility function that will recursively get the root element of the
    XML file and will find all the tags being used in the XML. It will
    find the level of each tag(root tag being at level 1), and also count
    the number of times the tags appeared.

    :param root: Root element of the XML
    :param level: Level of the root
    :param tags: A dictionary object representing tag counts
    :param levels: A dictionary object representing level of each tag

    :return: A list object containing the root element, tag counts & levels
    """
    tag_levels[root.tag] = level+1
    for child in root.getchildren():
        tag = child.tag
        for attrib in child.attrib:
            attrib = tag + ":" + attrib
            attrib_counts[attrib] = attrib_counts.get(attrib, 0) + 1
        #if tag in tag_counts:
        #    tag_counts[tag] += 1
        #else:
        #    tag_counts[tag] = 1
        tag_counts[tag] = tag_counts.get(tag, 0) + 1
        gather_element_counts_levels(child, level+1, tag_counts, tag_levels, attrib_counts)
    return [{"root": root.tag}, {"tag_counts": tag_counts}, {"tag_levels": tag_levels}, {"attrib_counts": attrib_counts}]


def get_XML_stats(xmltree):
    """
    This function will gather basic statistics for the input XML file. This includes
    getting the root tag, and finding all the tags being used with the levels & counts.
    :param filename: Input OSM XML file to parse
    :return: dictionary containing basic statistics like tag counts, root tag etc.
    """
    start = time.time()
    root = xmltree.getroot()
    data = gather_element_counts_levels(root)
    
    data[1] = sort(data[1], reverse=True)
    data[2] = sort(data[2])
    data[3] = sort(data[3], reverse=True)
    
    print 'Run time to extract statistics :: {} seconds'.format(round(time.time() - start), 3)
    return data


In [104]:
start = time.time()
xmltree = ET.parse(FILENAME)
print 'Time to read the XML file :: {} seconds'.format(round(time.time() - start), 4)

Time to read the XML file :: 0.0 seconds


In [105]:
stats = get_XML_stats(xmltree)

Run time to extract statistics :: 1.0 seconds


In [106]:
pprint.pprint(stats)

[{'root': 'osm'},
 {'tag_counts': [('nd', 73782),
                 ('node', 56894),
                 ('tag', 56720),
                 ('member', 54842),
                 ('way', 9378),
                 ('relation', 560),
                 ('note', 2),
                 ('meta', 2),
                 ('bounds', 2)]},
 {'tag_levels': [('osm', 1),
                 ('bounds', 2),
                 ('meta', 2),
                 ('node', 2),
                 ('note', 2),
                 ('relation', 2),
                 ('way', 2),
                 ('member', 3),
                 ('nd', 3),
                 ('tag', 3)]},
 {'attrib_counts': [('nd:ref', 73782),
                    ('node:version', 56894),
                    ('node:user', 56894),
                    ('node:uid', 56894),
                    ('node:timestamp', 56894),
                    ('node:lon', 56894),
                    ('node:lat', 56894),
                    ('node:id', 56894),
                    ('node:changeset', 56894

### Audit the data

##### Verify latitude & longitudes are within bound

In [107]:
# Check the lat & lon boundaries
print xmltree.find("bounds").attrib.values()
print [START_LAT, END_LON, START_LON, END_LAT]

['51.54', '0.05', '-0.05', '51.55']
[51.54, 0.05, -0.05, 51.55]


In [130]:
def almost_within(num, lower, upper, tolerance=0.005):
    return (num >= lower-tolerance and num <= upper+tolerance)

start = time.time()
node_IDs_to_remove = []
count = 0
for node in xmltree.findall("node"):
    attribs = node.attrib
    lat, lon = float(attribs["lat"]), float(attribs["lon"])
    if not (almost_within(lat, START_LAT, END_LAT) or almost_within(lon, START_LON, END_LON) ):
        ID = attribs["id"]
        print "Outside Latitude & Longitude boundary --- {}, [{}, {}]".format(ID, lat, lon)
        
        for child in node:
            print child.tag, ' ---> ', child.attrib
        
        node_IDs_to_remove.append(ID)
        
        #xpath_expression_nd = ".//nd[@ref='{}']".format(ID)
        #xpath_expression_nd_parent = ".//nd[@ref='{}']/..".format(ID)
        #xpath_expression_member = ".//member[@ref='{}']".format(ID)
        #xpath_expression_member_parent = ".//member[@ref='{}']/..".format(ID)
        
        #print xmltree.find(xpath_expression_nd)
        #print xmltree.find(xpath_expression_nd_parent)
        #print xmltree.find(xpath_expression_member)
        #print xmltree.find(xpath_expression_member_parent)
        
        count += 1

print count
print 'Time take :: {} seconds'.format(round(time.time() - start, 3) )

Outside Latitude & Longitude boundary --- 26244916, [51.5269312, 0.1371486]
Outside Latitude & Longitude boundary --- 129650696, [51.531186, 0.1023248]
tag  --->  {'k': 'name', 'v': 'CTRL Wayside Ventilation Shaft'}
tag  --->  {'k': 'railway', 'v': 'ventilation_shaft'}
Outside Latitude & Longitude boundary --- 368002355, [51.5211088, -0.0787646]
Outside Latitude & Longitude boundary --- 803879504, [51.5330165, 0.0952219]
Outside Latitude & Longitude boundary --- 803879507, [51.5344717, 0.0911331]
Outside Latitude & Longitude boundary --- 1541138323, [51.5272155, 0.134524]
Outside Latitude & Longitude boundary --- 1637627153, [51.5188379, -0.0818524]
Outside Latitude & Longitude boundary --- 1637627159, [51.5228322, -0.0761866]
Outside Latitude & Longitude boundary --- 1637627160, [51.5231808, -0.0755819]
Outside Latitude & Longitude boundary --- 1637627172, [51.5248124, -0.0719266]
Outside Latitude & Longitude boundary --- 1637627179, [51.5252803, -0.0709441]
Outside Latitude & Longitu

In [131]:
node_IDs_to_remove

['26244916',
 '129650696',
 '368002355',
 '803879504',
 '803879507',
 '1541138323',
 '1637627153',
 '1637627159',
 '1637627160',
 '1637627172',
 '1637627179',
 '1637627181',
 '1637627182',
 '1637627184',
 '1655439982',
 '1655439988',
 '1725575333',
 '1725576189',
 '1725576194',
 '1725576214',
 '1725576218',
 '1725576223',
 '1725576227',
 '1725576231',
 '1725576234',
 '1725576239',
 '1725576241',
 '1725576243',
 '1725576247',
 '1725576277',
 '1725576281',
 '1725576285',
 '1725576289',
 '1725576291',
 '2082501636',
 '2130595604',
 '2130595606',
 '2130595610',
 '2130595611',
 '2130595612',
 '2130595616',
 '2130595617',
 '2130595618',
 '2130595623',
 '2130595625',
 '2130595626',
 '2130595629',
 '2130595631',
 '2130595632',
 '2130595635',
 '2130595691',
 '2130595699',
 '2130595700',
 '2130595705',
 '2130595738',
 '2130595740',
 '2130595745',
 '2130595751',
 '2130595757',
 '2130595759',
 '2130595763',
 '2130595766',
 '2130595793',
 '2130595797',
 '2130595807',
 '2130595809',
 '2130595813',
 

In [133]:
for _, elem in ET.iterparse(FILENAME):
    if elem.tag == "node":
        if elem.attrib["id"] in node_IDs_to_remove:
            elem.remove()

{'changeset': '41755612', 'uid': '550203', 'timestamp': '2016-08-28T13:09:32Z', 'lon': '0.0060879', 'version': '3', 'user': 'RoverPuppy', 'lat': '51.5463538', 'id': '107284'}


In [135]:
from lxml import etree as le

In [139]:
doc = le.parse(FILENAME)

In [146]:
for elem in doc.iter():
    if elem.tag == "node":
        print elem
        if elem.attrib["id"] == "3942447676":
            print elem
    break

In [147]:
for elem in doc.iter():
    if elem.tag == "node":
        print elem
        if elem.attrib["id"] == "3942447676":
            print elem
    break