### mapparser.py

In [4]:
import xml.etree.cElementTree as ET
import pprint
from collections import defaultdict


def count_tags(filename):
    osm_file = open(filename, "r")
    tags = defaultdict(int)
    for event, elem in ET.iterparse(osm_file):
        tag = elem.tag
        tags[tag] += 1
    return tags


tags = count_tags('cincinnati_ohio.osm')
pprint.pprint(dict(tags))

{'bounds': 1,
 'member': 14855,
 'nd': 2450908,
 'node': 1985255,
 'osm': 1,
 'relation': 1176,
 'tag': 765119,
 'way': 270492}


In [2]:
import xml.etree.cElementTree as ET
import pprint
from collections import defaultdict

def node_tag_type(filename):
    node_tags = defaultdict(int)
    for _, element in ET.iterparse(filename):
        if element.tag == "node":
            for child in element:
                if child.tag == "tag":
                    tag = child.attrib["k"]
                    node_tags[tag] += 1
    return node_tags

node_tags = node_tag_type('cincinnati_ohio.osm')
print "Number of distinct node tags: ", len(node_tags)
pprint.pprint(dict(node_tags))

Number of distinct node tags:  316
{'FIXME': 20,
 'USEPA': 1,
 'abandoned:railway': 1,
 'access': 132,
 'addr:city': 153,
 'addr:country': 35,
 'addr:floor': 1,
 'addr:full': 11,
 'addr:housename': 7,
 'addr:housenumber': 221,
 'addr:postcode': 173,
 'addr:state': 287,
 'addr:street': 231,
 'addr:suite': 1,
 'addr:unit': 3,
 'aeroway': 35,
 'aeroway:historic': 7,
 'alt_name': 50,
 'alt_name:es': 1,
 'alt_name:zh': 1,
 'amenity': 2856,
 'amenity:coming_soon': 1,
 'amenity:historic': 399,
 'artwork:type': 1,
 'artwork_type': 4,
 'atm': 8,
 'attribution': 7,
 'backrest': 41,
 'barrier': 297,
 'bell:diameter': 1,
 'bell:service_times': 1,
 'bell:weight': 1,
 'bench': 1,
 'bicycle': 12,
 'bicycle_parking': 18,
 'bollard': 6,
 'brand': 19,
 'building': 100,
 'building:historic': 5,
 'building:levels': 3,
 'building:part': 2,
 'bus': 1,
 'canoe': 2,
 'capacity': 80,
 'cargo': 2,
 'census:population': 91,
 'clothes': 1,
 'club': 1,
 'collection_times': 1,
 'colour': 5,
 'communication:mobile_p

In [5]:
def way_tag_type(filename):
    way_tags = defaultdict(int)
    for _, element in ET.iterparse(filename):
        if element.tag == "way":
            for child in element:
                if child.tag == "tag":
                    tag = child.attrib["k"]
                    way_tags[tag] += 1
    return way_tags

way_tags = way_tag_type('cincinnati_ohio.osm')
print "Number of distinct way tags: ", len(way_tags)
pprint.pprint(dict(way_tags))

Number of distinct way tags:  460
{'FIXME': 49,
 'FIXME:railway': 1,
 'FIXME:ref': 9,
 'HFCS': 1,
 'NHS': 293,
 'abandoned': 1,
 'abandoned:highway': 6,
 'abandoned:service': 1,
 'abutters': 65,
 'access': 4386,
 'access_control': 112,
 'accuracy:meters': 1,
 'addr:city': 7217,
 'addr:country': 3826,
 'addr:county': 228,
 'addr:door': 1,
 'addr:full': 12,
 'addr:housename': 28,
 'addr:housenumber': 974,
 'addr:interpolation': 11,
 'addr:postcode': 6842,
 'addr:state': 6293,
 'addr:street': 7540,
 'addr:subdistrict': 1,
 'admin_level': 765,
 'advertising': 1,
 'aeroway': 269,
 'aeroway:disused': 2,
 'aeroway:historic': 3,
 'agricultural': 1,
 'alt': 1,
 'alt_name': 1213,
 'alt_name:en': 2,
 'alt_name:es': 1,
 'alt_name:historic': 2,
 'alt_name:vi': 1,
 'altccess': 1,
 'amenity': 5409,
 'amenity:historic': 52,
 'animal': 1,
 'area': 394,
 'atm': 28,
 'attraction': 58,
 'attribution': 37,
 'automated': 2,
 'barrier': 582,
 'baseball': 1,
 'basin': 2,
 'bicycle': 1765,
 'boat': 254,
 'bord

In [7]:
def relation_tag_type(filename):
    rel_tags = defaultdict(int)
    for _, element in ET.iterparse(filename):
        if element.tag == "relation":
            for child in element:
                if child.tag == "tag":
                    tag = child.attrib["k"]
                    rel_tags[tag] += 1
    return rel_tags

rel_tags = relation_tag_type('cincinnati_ohio.osm')
print "Number of distinct relation tags: ", len(rel_tags)
pprint.pprint(dict(rel_tags))

#from collections import Counter
#pprint.pprint(dict(Counter(rel_tags).most_common(25)))

Number of distinct relation tags:  428
{'FIXME': 11,
 'ISO3166-1': 1,
 'ISO3166-1:alpha2': 1,
 'ISO3166-1:alpha3': 1,
 'ISO3166-1:numeric': 1,
 'ISO3166-2': 3,
 'NHS': 1,
 'access': 5,
 'addr:city': 15,
 'addr:country': 6,
 'addr:housenumber': 13,
 'addr:postcode': 14,
 'addr:state': 12,
 'addr:street': 13,
 'admin_level': 157,
 'alt_name': 16,
 'alt_name:es': 1,
 'alt_name:vi': 3,
 'amenity': 93,
 'area': 12,
 'attribution': 9,
 'boat': 2,
 'border_type': 149,
 'born': 1,
 'boundary': 184,
 'brand': 1,
 'building': 263,
 'building:colour': 1,
 'building:height': 9,
 'building:height:underground': 1,
 'building:levels': 21,
 'building:levels:underground': 1,
 'building:material': 6,
 'building:part': 1,
 'capacity': 1,
 'census:population': 1,
 'colour': 21,
 'construction': 1,
 'created_by': 18,
 'cycle_network': 11,
 'denomination': 12,
 'description': 3,
 'destination': 1,
 'died': 1,
 'ele': 41,
 'emergency': 1,
 'etymology': 3,
 'fee': 2,
 'flag': 1,
 'foot': 5,
 'from': 1,
 'gnis

### users.py

In [36]:
import xml.etree.cElementTree as ET
import pprint
from collections import defaultdict

def get_user(element):
    return element.attrib["uid"]

def process_map(filename):
    users = defaultdict(int)
    for _, element in ET.iterparse(filename):
        if element.tag == "node" or element.tag == "way":
            user = get_user(element)
            users[user] += 1
    return users


users = process_map('cincinnati_ohio.osm')
user_count = len(users)
print "Number of distinct users: ", user_count
pprint.pprint(dict(users))

Number of distinct users:  565
{'1007528': 6,
 '1012362': 444,
 '103253': 8,
 '1047195': 17,
 '104962': 27,
 '1051550': 34,
 '105946': 3,
 '10786': 18,
 '1081635': 46,
 '1087647': 2,
 '108781': 772,
 '1096567': 2,
 '109661': 117,
 '110263': 1,
 '1108251': 237463,
 '11126': 1,
 '1126944': 81,
 '1149057': 17,
 '115918': 1,
 '118021': 6,
 '118856': 1,
 '1195104': 8,
 '1198074': 4,
 '119881': 4,
 '120146': 11835,
 '1203657': 1,
 '120468': 69,
 '1205642': 8,
 '1207672': 2643,
 '1209932': 204,
 '121241': 1144,
 '1214881': 44,
 '1215563': 58,
 '1219059': 432,
 '1219875': 5,
 '1224467': 1,
 '123537': 4,
 '1236137': 4,
 '1240849': 140,
 '12434': 2,
 '1246157': 3,
 '1250514': 1,
 '1260280': 18,
 '1261813': 4,
 '128017': 22,
 '129255': 2,
 '130065': 5,
 '130472': 42,
 '1306': 10,
 '131059': 16,
 '1329572': 18,
 '1330847': 4,
 '135163': 7,
 '1352438': 7,
 '1376118': 2,
 '137673': 1,
 '13832': 10,
 '13834': 97,
 '1408522': 22,
 '1410107': 5,
 '1436654': 1,
 '1436909': 16,
 '14390': 9,
 '1442206': 2

### tags.py

In [13]:
import xml.etree.cElementTree as ET
import pprint
import re

lower = re.compile(r'^([a-z]|_)*$')
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')


# check the "k" value for each "<tag>" and see if they can be 
# valid keys in MongoDB, as well as see if there are any other 
# potential problems.
def key_type(element, keys):
    if element.tag == "tag":
        k_value = element.attrib['k']
        if lower.search(k_value):
            keys["lower"] += 1
        elif lower_colon.search(k_value):
            keys["lower_colon"] += 1
        elif problemchars.search(k_value):
            keys["problemchars"] += 1
        else:
            keys["other"] += 1        
        
    return keys



def process_map(filename):
    keys = {"lower": 0, "lower_colon": 0, "problemchars": 0, "other": 0}
    for _, element in ET.iterparse(filename):
        keys = key_type(element, keys)
    return keys


keys = process_map('cincinnati_ohio.osm')
pprint.pprint(keys)

{'lower': 458445, 'lower_colon': 294301, 'other': 12373, 'problemchars': 0}


### audit.py

In [56]:
import xml.etree.cElementTree as ET
from collections import defaultdict
import re
import pprint

OSMFILE = "cincinnati_ohio.osm"
street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)
# matches strings at the end ($)
# s.t. the position is at a word boundary (\b),
# does not includ whitespace chars (\S),
# and there may or may not be a '.' at the end


expected_st_types = ["Alley", "Avenue", "Boulevard", "Circle", "Court", "Drive",
            "Lane", "Pike", "Parkway", "Place", "Point", "Road", "Run",
            "Street", "Terrace", "Trail", "Way"]

def audit_street_type(street_types, street_name):
    m = street_type_re.search(street_name)
    if m:
        street_type = m.group()
        if street_type not in expected_st_types:
            street_types[street_type].add(street_name)


def audit(osmfile):
    osm_file = open(osmfile, "r")
    street_types = defaultdict(set)
    street_names = defaultdict(int)
    states = defaultdict(int)
    postcodes = defaultdict(int)
    cities = defaultdict(int)
    
    for event, elem in ET.iterparse(osm_file, events=("start",)):

        if elem.tag == "node" or elem.tag == "way":
            for tag in elem.iter("tag"):
                if tag.attrib['k'] == "addr:street":
                    audit_street_type(street_types, tag.attrib['v'])
                    name = tag.attrib['v']
                    street_names[name] += 1
                    
                if tag.attrib['k'] == "addr:state":
                    state = tag.attrib['v']
                    states[state] += 1
                    
                if tag.attrib['k'] == "addr:postcode":
                    postcode = tag.attrib['v']
                    postcodes[postcode] += 1
                
                if tag.attrib['k'] == "addr:city":
                    city = tag.attrib['v']
                    cities[city] += 1
                    
    return street_types, street_names, states, postcodes, cities


st_types, st_names, states, postcodes, cities = audit(OSMFILE)

In [57]:
pprint.pprint(dict(st_types))
pprint.pprint(dict(st_names))

{'276': set(['State Route 276']),
 '28': set(['Old State Route 28', 'State Route 28']),
 '3': set(['West United States Highway 22 and 3']),
 '42': set(['Route 42', 'State Route 42', 'U.S. Highway 42']),
 '48': set(['Ohio 48']),
 '50': set(['US 50']),
 '63': set(['State Route 63']),
 'A': set(['US HWY  42 #A']),
 'Ashley': set(['Ashley']),
 'Ave': set(['Kenard Ave', 'W Clifton Ave', 'Whetsel Ave']),
 'Avene': set(['Ludlow Avene']),
 'Avnue': set(['South Fort Thomas Avnue']),
 'Bend': set(['River Bend']),
 'Birch': set(['Birch']),
 'Breezes': set(['Soaring Breezes']),
 'Cove': set(['Timber Cove']),
 'Crest': set(['Fairway Crest']),
 'Crossing': set(['Courtyard Crossing', 'North Bend Crossing']),
 'Dr': set(['Beckett Center Dr',
            'Cincinnati Mills Dr',
            'Clough Woods Dr',
            'Genntown Dr']),
 'Edge': set(['Waters Edge']),
 'Farm': set(['Stonebridge Farm']),
 'Highway': set(['Dixie Highway']),
 'Hill': set(['Liberty Hill']),
 'Hollow': set(['Windy Hollow']),


In [58]:
print "STATES:"
pprint.pprint(dict(states))
print "Postal Codes:"
pprint.pprint(dict(postcodes))
print "CITIES:"
pprint.pprint(dict(cities))

STATES:
{'FL': 1,
 'IN': 4,
 'KY': 87,
 'Kentucky': 1,
 'OH': 6365,
 'OH - Ohio': 1,
 'OHIO': 1,
 'Oh': 2,
 'Ohio': 8,
 'Ohop': 1,
 'ky': 2,
 'ohio': 1}
Postal Codes:
{'33321': 1,
 '41005': 10,
 '41011': 8,
 '41014': 2,
 '41015': 22,
 '41017': 14,
 '41018': 26,
 '41042': 18,
 '41051': 3,
 '41071': 4,
 '41073': 6,
 '41075': 7,
 '41076': 3,
 '41080': 1,
 '41091': 3,
 '41094': 2,
 '45001': 1,
 '45002': 47,
 '45011': 5,
 '45014-4108': 3,
 '45036': 3,
 '45039': 6,
 '45040': 15,
 '45067': 3,
 '45069': 8,
 '45101': 1,
 '45102': 1,
 '45103': 8,
 '45103-9707': 1,
 '45140': 5773,
 '4515': 1,
 '45150': 48,
 '45160': 9,
 '45202': 377,
 '45203': 4,
 '45204': 3,
 '45205': 3,
 '45206': 11,
 '45208': 58,
 '45208-2017': 1,
 '45208-2101': 2,
 '45209': 12,
 '45211': 90,
 '45212': 1,
 '45214': 1,
 '45215': 8,
 '45217': 1,
 '45218': 1,
 '45219': 58,
 '45220': 17,
 '45220-1405': 1,
 '45223': 15,
 '45223-1806': 1,
 '45224': 2,
 '45225': 3,
 '45226': 2,
 '45227': 4,
 '45229': 1,
 '45230': 1,
 '45231': 1,
 '45

In [59]:
street_mapping = {"Ave": "Avenue",
           "Avene": "Avenue",
           "Avnue": "Avenue",
           "avenue": "Avenue",
           "State" : "Avenue",
           "Dr" : "Drive",
           "dr." : "Drive",
           "meadow" : "Meadow Drive",
           "Rd.": "Road",
           "Rd": "Road",
           "St": "Street",
           "street" : "Street",
           "ter" : "Terrace"}

def update_name(name, mapping):
    for key in mapping:
        if name.endswith(key):
            type_start = name.find(key)
            name = name[:type_start] + mapping[key]
    return name

In [60]:
def test():
    #st_types, st_names, states, postcodes, cities = audit(OSMFILE)
    #pprint.pprint(dict(st_types))
    print "UPDATED NAMES:"

    for st_type, ways in st_types.iteritems():
        for name in ways:
            better_name = update_name(name, street_mapping)
            if better_name != name:
                print name, "=>", better_name
            
test()

UPDATED NAMES:
Kemper meadow => Kemper Meadow Drive
Princeton Rd => Princeton Road
Bridgetown Rd => Bridgetown Road
Ritchie State => Ritchie Avenue
firshade ter => firshade Terrace
Montgomery Rd. => Montgomery Road
Hamilton avenue => Hamilton Avenue
Beckett Center Dr => Beckett Center Drive
Clough Woods Dr => Clough Woods Drive
Cincinnati Mills Dr => Cincinnati Mills Drive
Genntown Dr => Genntown Drive
215 Calhoun St => 215 Calhoun Street
Vine St => Vine Street
Clark St => Clark Street
Greenup St => Greenup Street
Chickasaw St => Chickasaw Street
South Fort Thomas Avnue => South Fort Thomas Avenue
Ludlow Avene => Ludlow Avenue
vine street => vine Street
Whetsel Ave => Whetsel Avenue
W Clifton Ave => W Clifton Avenue
Kenard Ave => Kenard Avenue
Kemper Meadow dr. => Kemper Meadow Drive


In [61]:
state_mapping = {'Oh': "OH", 
                 'oh': "OH", 
                 'OHIO': "OH", 
                 'Ohop': "OH", 
                 'ohio': "OH", 
                 'Ohio': "OH", 
                 'OH - Ohio': "OH", 
                 'Kentucky': "KY", 
                 'ky': "KY"}

def update_state(name, mapping):
    # change state name to 2-letter code
    for key in mapping:
        if name == key:
            name = mapping[key]
    return name

city_mapping = { 'CIncinnati': "Cincinnati", 
                 'cincinnati': "Cincinnati", 
                 'Cincinnati, OH': "Cincinnati", 
                 'Cincinnati, Ohio': "Cincinnati", 
                 'Cincinnati, Oh': "Cincinnati", 
                 'Cincinnati, oh': "Cincinnati", 
                 'Cincinnati; Symmes': "Symmes Township",
                 'Cincinnati;Blue Ash': "Blue Ash", 
                 'Cincinnati;Symmes': "Montgomery",
                 'Newport, Kentucky': "Newport",
                 'forest Park': "Forest Park",
                 'fort thomas': "Fort Thomas" }


def update_city(name, mapping):
    # fix inconsistent or incorrect city identifiers
    for key in mapping:
        if name == key:
            name = mapping[key]
    return name

def update_postcode(code):
    # strip suffix of postal code if present
    if len(code) > 5:
        return code[:5]
    else:
        return code

## clean_cinci.py

In [None]:
import xml.etree.cElementTree as ET
import pprint
import re
import codecs
import json

lower = re.compile(r'^([a-z]|_)*$')
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')

CREATED = [ "version", "changeset", "timestamp", "user", "uid"]

    
def has_addr(element):
    for child in element:
        if child.tag == "tag":
            k = child.attrib["k"]
            if k.find("addr") != -1:
                return True
    return False


def shape_element(element):
    node = {}
    if element.tag == "node" or element.tag == "way" :
        attributes = element.attrib
        node["type"] = element.tag
        node["created"] = {}
        
        if "lat" in attributes and "lon" in attributes:
            node["pos"] = [ float(attributes["lat"]), float(attributes["lon"]) ]
        if has_addr(element):
            node["address"] = {}
        
        for key in attributes:
            if key in CREATED:
                node["created"][key] = attributes[key]
            elif key != "lat" and key != "lon":
                node[key] = attributes[key]
        
        if element.findall("tag"):
            for child in element.iter("tag"):
                k = child.attrib["k"]
                v = child.attrib["v"]
                
                if k.find("addr") != -1:
                    new_key = k[5:]
                    if new_key.find(":") != -1:
                        pass
                    elif new_key == "street":
                        v = update_name(v, street_mapping)
                    elif new_key == "state":
                        v = update_state(v, state_mapping) 
                    elif new_key == "postcode":
                        v = update_postcode(v)
                    elif new_key == "city":
                        comma = v.find(",")
                        if comma != -1:
                            state = v[comma + 2:]
                            state = update_state(state, state_mapping)
                            node["address"]["state"] = state
                        v = update_city(v, city_mapping)
                    node["address"][new_key] = v
                elif problemchars.search(k):
                    pass
                else:
                    node[k] = v
        
        if element.tag == "way":
            node["node_refs"] = []
            if element.findall("nd"):
                for child in element.iter("nd"):
                    value = child.attrib["ref"]
                    node["node_refs"].append(value)
                    
        return node
    else:
        return None



def process_map(file_in, pretty = False):
    file_out = "{0}.json".format(file_in)
    data = []
    with codecs.open(file_out, "w") as fo:
        # get an iterable & turn it into an iterator
        context = ET.iterparse(file_in, events=("start", "end"))
        context = iter(context)
        # get the root element
        event, root = context.next()

        for event, element in context:
            el = shape_element(element)
            if event == "end" and el:
                data.append(el)
                if pretty:
                    fo.write(json.dumps(el, indent=2)+"\n")
                else:
                    fo.write(json.dumps(el) + "\n")
                root.clear()
    return data

def test():
    # NOTE: if you are running this code on your computer, with a larger dataset, 
    # call the process_map procedure with pretty=False. The pretty=True option adds 
    # additional spaces to the output, making it significantly larger.
    data = process_map('sample.osm', pretty=False)
    pprint.pprint(data)

if __name__ == "__main__":
    test()

[{'created': {'changeset': '12642091',
              'timestamp': '2012-08-07T08:15:12Z',
              'uid': '33757',
              'user': 'Minh Nguyen',
              'version': '17'},
  'id': '60513020',
  'pos': [39.4307416, -84.341689],
  'type': 'node'},
 {'created': {'changeset': '757389',
              'timestamp': '2009-03-08T10:26:19Z',
              'uid': '47647',
              'user': 'drewp',
              'version': '18'},
  'id': '60532208',
  'pos': [39.3083765, -84.436355],
  'type': 'node'},
 {'created': {'changeset': '961717',
              'timestamp': '2009-04-25T23:27:07Z',
              'uid': '33757',
              'user': 'Minh Nguyen',
              'version': '28'},
  'id': '60532283',
  'pos': [39.1563554, -84.5367628],
  'type': 'node'},
 {'created': {'changeset': '27784422',
              'timestamp': '2014-12-29T19:08:09Z',
              'uid': '1108251',
              'user': 'Nate_Wessel',
              'version': '4'},
  'id': '75422682',
  'pos': [

In [32]:
from pymongo import MongoClient
client = MongoClient('localhost:27017')
db = client.course

docs = db.cinci.find().count()
nodes = db.cinci.find({ "type" : "node" }).count()
ways = db.cinci.find({ "type" : "way" }).count()

print "Number of documents: ", docs
print "Number of nodes: ", nodes
print "Number of  ways: ", ways

Number of documents:  2255747
Number of nodes:  1985219
Number of  ways:  270488


In [26]:
user_list = db.cinci.distinct("created.user")
print "Number of distinct users:", len(user_list)

Number of distinct users: 565


###Exploration of tags

In [54]:
pipeline = [
    { "$match" : { "highway" : { "$exists" : 1 } } },
    { "$group" : { "_id" : "$highway",
                   "count" : { "$sum" : 1 } } },
    { "$sort" : { "count" : -1 } },
    { "$limit" : 15 }
]
highway= db.cinci.aggregate(pipeline)

for item in highway:
    pprint.pprint(item)

{u'_id': u'residential', u'count': 29517}
{u'_id': u'service', u'count': 25200}
{u'_id': u'turning_circle', u'count': 11144}
{u'_id': u'crossing', u'count': 9872}
{u'_id': u'footway', u'count': 5854}
{u'_id': u'bus_stop', u'count': 3158}
{u'_id': u'tertiary', u'count': 2311}
{u'_id': u'traffic_signals', u'count': 2104}
{u'_id': u'track', u'count': 1124}
{u'_id': u'motorway_link', u'count': 1026}
{u'_id': u'secondary', u'count': 1015}
{u'_id': u'path', u'count': 989}
{u'_id': u'motorway', u'count': 898}
{u'_id': u'primary', u'count': 622}
{u'_id': u'unclassified', u'count': 593}


In [45]:
pipeline = [
    { "$match" : { "service" : { "$exists" : 1 } } },
    { "$group" : { "_id" : "$service",
                   "count" : { "$sum" : 1 } } },
    { "$sort" : { "count" : -1 } }
]
services = db.cinci.aggregate(pipeline)

for item in services:
    pprint.pprint(item)

{u'_id': u'driveway', u'count': 5249}
{u'_id': u'parking_aisle', u'count': 3313}
{u'_id': u'alley', u'count': 1399}
{u'_id': u'spur', u'count': 365}
{u'_id': u'yard', u'count': 352}
{u'_id': u'drive-through', u'count': 91}
{u'_id': u'siding', u'count': 58}
{u'_id': u'parking', u'count': 7}
{u'_id': u'emergency_access', u'count': 5}
{u'_id': u'ramp', u'count': 1}
{u'_id': u'loading', u'count': 1}
{u'_id': u'drive_thru', u'count': 1}
{u'_id': u'2', u'count': 1}
{u'_id': u'drive_through', u'count': 1}
{u'_id': u'weigh_station', u'count': 1}


In [55]:
pipeline = [
    { "$match" : { "amenity" : { "$exists" : 1 } } },
    { "$group" : { "_id" : "$amenity",
                   "count" : { "$sum" : 1 } } },
    { "$sort" : { "count" : -1 } },
    { "$limit" : 20 }
]
amenities = db.cinci.aggregate(pipeline)

for item in amenities:
    pprint.pprint(item)

{u'_id': u'parking', u'count': 2346}
{u'_id': u'place_of_worship', u'count': 1296}
{u'_id': u'school', u'count': 721}
{u'_id': u'fuel', u'count': 613}
{u'_id': u'restaurant', u'count': 504}
{u'_id': u'fast_food', u'count': 406}
{u'_id': u'bench', u'count': 341}
{u'_id': u'bank', u'count': 245}
{u'_id': u'grave_yard', u'count': 146}
{u'_id': u'drinking_water', u'count': 117}
{u'_id': u'fire_station', u'count': 106}
{u'_id': u'toilets', u'count': 102}
{u'_id': u'cafe', u'count': 98}
{u'_id': u'post_office', u'count': 87}
{u'_id': u'pharmacy', u'count': 83}
{u'_id': u'bicycle_parking', u'count': 79}
{u'_id': u'library', u'count': 76}
{u'_id': u'shelter', u'count': 76}
{u'_id': u'car_wash', u'count': 65}
{u'_id': u'fountain', u'count': 51}


In [51]:
pipeline = [
    { "$match" : { "religion" : { "$exists" : 1 } } },
    { "$group" : { "_id" : "$religion",
                   "count" : { "$sum" : 1 } } },
    { "$sort" : { "count" : -1 } }
]
religions = db.cinci.aggregate(pipeline)

for item in religions:
    pprint.pprint(item)

{u'_id': u'christian', u'count': 1507}
{u'_id': u'jewish', u'count': 21}
{u'_id': u'muslim', u'count': 4}
{u'_id': u'scientologist', u'count': 1}


In [52]:
pipeline = [
    { "$match" : { "denomination" : { "$exists" : 1 } } },
    { "$group" : { "_id" : "$denomination",
                   "count" : { "$sum" : 1 } } },
    { "$sort" : { "count" : -1 } }
]
denominations = db.cinci.aggregate(pipeline)

for item in denominations:
    pprint.pprint(item)

{u'_id': u'baptist', u'count': 256}
{u'_id': u'catholic', u'count': 205}
{u'_id': u'methodist', u'count': 121}
{u'_id': u'presbyterian', u'count': 67}
{u'_id': u'roman_catholic', u'count': 45}
{u'_id': u'lutheran', u'count': 37}
{u'_id': u'pentecostal', u'count': 17}
{u'_id': u'protestant', u'count': 17}
{u'_id': u'jehovahs_witness', u'count': 8}
{u'_id': u'mormon', u'count': 6}
{u'_id': u'episcopal', u'count': 6}
{u'_id': u'orthodox', u'count': 3}
{u'_id': u'christ_scientist', u'count': 3}
{u'_id': u'evangelical', u'count': 3}
{u'_id': u'church_of_christ', u'count': 2}
{u'_id': u'salvation_army', u'count': 2}
{u'_id': u'methodist; episcopal', u'count': 1}
{u'_id': u'unitarian', u'count': 1}
{u'_id': u'hasidic', u'count': 1}
{u'_id': u'assemblies_of_god', u'count': 1}
{u'_id': u'wesleyan', u'count': 1}
{u'_id': u'swedenborgian', u'count': 1}
{u'_id': u'presbyterian;episcopal', u'count': 1}
{u'_id': u'seventh_day_adventist', u'count': 1}
{u'_id': u'methodist_episcopal', u'count': 1}
{u'

### Exploration of user participation

In [25]:
import pprint

pipeline = [
    { "$group" : { "_id" : "$created.user",
                   "count" : { "$sum" : 1 } } },
    { "$sort" : { "count" : -1 } }
]
users = db.cinci.aggregate(pipeline)

top_users = 0
top_user_edits = 0
for item in users:
    if item['count'] > 1000:
        pprint.pprint(item)
        top_users += 1
        top_user_edits += item['count']

{u'_id': u'Minh Nguyen', u'count': 899093}
{u'_id': u'lrhill', u'count': 614469}
{u'_id': u'woodpeck_fixbot', u'count': 344876}
{u'_id': u'Nate_Wessel', u'count': 237463}
{u'_id': u'bot-mode', u'count': 13464}
{u'_id': u'drewp', u'count': 12134}
{u'_id': u'Matt Currie', u'count': 12044}
{u'_id': u'TIGERcnl', u'count': 11835}
{u'_id': u'gmensch', u'count': 10499}
{u'_id': u'MichaelGSmith', u'count': 10211}
{u'_id': u'lightbulbsrwarm', u'count': 8282}
{u'_id': u'reportingsjr', u'count': 6271}
{u'_id': u'Chris Davis', u'count': 5881}
{u'_id': u'mikecanann', u'count': 5173}
{u'_id': u'errorcode', u'count': 5076}
{u'_id': u'MountainAddict', u'count': 3781}
{u'_id': u'Jeffrey Jakucyk', u'count': 3722}
{u'_id': u'nickvet419', u'count': 3650}
{u'_id': u'samely', u'count': 2732}
{u'_id': u'aa9yh', u'count': 2643}
{u'_id': u'NE2', u'count': 2328}
{u'_id': u'silentcarto', u'count': 2089}
{u'_id': u'Joeveralls', u'count': 1857}
{u'_id': u'maxerickson', u'count': 1644}
{u'_id': u'itspip', u'count':

In [31]:
print "Number of users w/ more than 1000 edits: ", top_users
print "Total edits made by such users: ", top_user_edits

top_user_edits_perc = (top_user_edits * 100.0) / (nodes + ways)
top_users_perc = (top_users * 100.0) / len(user_list)
print top_user_edits_perc, "percent of all edits were made by", top_users_perc, "percent of users"

Number of users w/ more than 1000 edits:  30
Total edits made by such users:  2228782
98.8063609325 percent of all edits were made by 5.30973451327 percent of users
