### OSM Data ###



* I am using OpenStreetMap data of Mountain View city downloaded from [mapzen](https://mapzen.com/data/metro-extracts/). The date of downloading the dataset is March 27, 2017 at 10:21 AM.
* The format of datafile is in XML format, and we can find the description og Open Street XML format [here](http://wiki.openstreetmap.org/wiki/OSM_XML).

### Issues in Mountiav View OSM data ###

* There are some inconsistencies in the names of streets, some are incorrect and abbreviated.


### Overview of Mountain View OSM data ###

The dataset description is given as


#### Size of data file ####
* MountainView.osm(The original downloaded OpenStreetMap in xml format): 209MB
* MountainView.osm.json(The processed OpenStreetMap in json format): 346MB

#### Summary statistics of dataset ####

* Number of documents: 5754659
* Number of unique users: 880
* Number of nodes: 5136303
* Number of ways: 618301

### References ###

1. [Undacity Sample Data Wrangling Project](https://docs.google.com/document/d/1F0Vs14oNEs2idFJR3C_OPxwS6L0HPliOii-QpbmrMo4/pub)

### Code and Results ###
There several queries generated for look deeep insight of data.





<b>Import Libraries</b>

In [1]:
# load libraries
import os
import xml.etree.cElementTree as cET
from collections import defaultdict
import pprint
import re
import codecs
import json
import string
from pymongo import MongoClient

In [2]:
# set up map file path
filename = "MountainView.osm" # osm filename
# filename = "sample200.osm" # Sample osm filename
path = "/Users/seemamishra/Desktop/Udacity/Data analyst nano degree/Data Wrangling/P3_Data" # directory contain the osm file
MountainViewosm = os.path.join(path, filename)

# MountainViewosm = "MountainView.osm" # osm filename
# path = "d:\GithubRepos\Udacity\P3" # directory contain the osm file
lower = re.compile(r'^([a-z]|_)*$') 
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')
street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)
# initial version of expected street names
expected = ["Street", "Avenue", "Boulevard", "Drive", "Court", "Place", "Square", "Lane",
            "Road", "Trail", "Parkway", "Commons"]
MountainViewosm

'/Users/seemamishra/Desktop/Udacity/Data analyst nano degree/Data Wrangling/P3_Data/MountainView.osm'

#### Count the number of Tags ###

In [3]:
# Iterative parsing
def count_tags(filename):
    
    # make empty defaultdict
#     from collections import defaultdict
    tags_dict = defaultdict(int)
    
    # use the iterparse method to find all the tags
    for event, element in cET.iterparse(filename, events=("start", "end")):
#         print event
        tags_dict[element.tag] += 1
        
    # return your results 
    return tags_dict

if __name__ == "__main__":
    print count_tags(MountainViewosm)

defaultdict(<type 'int'>, {'node': 2048376, 'nd': 2318540, 'bounds': 2, 'member': 10530, 'tag': 835590, 'osm': 2, 'way': 246602, 'relation': 2532})


####  Tags types ###

In [4]:
# Tag types
def key_type(element, keys):
    if element.tag == "tag":
    
        k = element.attrib['k']
#         print k
        # serach k to see if it matches each regular expression
        if lower.search(k):
            keys['lower'] += 1
        elif lower_colon.search(k):
            keys['lower_colon'] += 1
        elif problemchars.search(k):
            keys['problemchars'] += 1
        else:
            keys['other'] += 1
           
    return keys



def process_map(filename):
    keys = {"lower": 0, "lower_colon": 0, "problemchars": 0, "other": 0}
    for _, element in cET.iterparse(filename):
        keys = key_type(element, keys)

    return keys


if __name__ == "__main__":
    print process_map(MountainViewosm)

{'problemchars': 25, 'lower': 226487, 'other': 4962, 'lower_colon': 186321}


#### Audit the street names ###

In [5]:
def audit_street_type(street_types, street_name):
    # add unexpected street name to a list
    m = street_type_re.search(street_name)
#     print m
    if m:
        street_type = m.group()
#         street_type
        if street_type not in expected:
            street_types[street_type].add(street_name)
            
def is_street_name(elem):
    # determine whether a element is a street name
    return (elem.attrib['k'] == "addr:street")

def audit_street(osmfile):
    # iter through all street name tag under node or way and audit the street name value
    osm_file = open(osmfile, "r")
    street_types = defaultdict(set)
    for event, elem in cET.iterparse(osm_file, events=("start","end")):
        if elem.tag == "node" or elem.tag == "way":
            for tag in elem.iter("tag"):
                if is_street_name(tag):
                    audit_street_type(street_types, tag.attrib['v'])
    return street_types
if __name__ == '__main__':
    st_types = audit_street(MountainViewosm)
    # print out unexpected street names
    pprint.pprint(dict(st_types))




{'114': set(['West Evelyn Avenue Suite #114']),
 '140': set(['Hamilton Ave #140']),
 '2': set(['Showers Drive STE 2']),
 '7': set(['Showers Drive STE 7']),
 '9': set(['East Charleston Road APT 9']),
 'AA': set(['Showers Drive BLDG AA']),
 'Alley': set(['Jackson Alley']),
 'Ave': set(['California Ave',
             'E Duane Ave',
             'El Monte Ave',
             'Hollenbeck Ave',
             'Portage Ave',
             'S California Ave',
             'University Ave',
             'W Maude Ave',
             'W Washington Ave']),
 'Ave.': set(['Menalto Ave.']),
 'B': set(['Leghorn Street #B']),
 'Bruno': set(['Serra San Bruno']),
 'C': set(['Plymouth Street #C']),
 'Calle': set(['La Calle']),
 'Central': set(['Plaza Central']),
 'Circle': set(['Bobolink Circle',
                'Carlson Circle',
                'Comstock Circle',
                'Continental Circle',
                'Distel Circle',
                'Duluth Circle',
                'East Meadow Circle',
      

#### Update the street name ###

In [6]:
# Street name updatation
# creating a dictionary for correcting street names
mapping = { "AA" :"Aberdeen Athletic Center",
            "Ct": "Court",
            "Ct.": "Court",
            "St.": "Street",
            "St,": "Street",
            "ST": "Street",
            "street": "Street",
            "STE": "Street",
            "Ave": "Avenue",
            "Ave.": "Avenue",
            "ave": "Avenue",
            "Rd.": "Road",   
            "rd.": "Road",
            "Rd": "Road",    
            "Hwy": "Highway",
            "HIghway": "Highway",
            "BLDG": "Building",
            "APT": "Apartment",
           "West Evelyn Avenue Suite #114":"West Evelyn Avenue",
           "Showers Drive STE 2": "Showers Drive Street",
           "Showers Drive STE 7": "Showers Drive Street",
           "East Charleston Road APT 9": "East Charleston Road Apartment",
           "Leghorn Street #B": "Leghorn Street",
           "Plymouth Street #C": "Plymouth Street",
           "Hamilton Ave #140": "Hamilton Ave"
           }
           
                     
# function that corrects incorrect street names
def update_name(name, mapping):    
    for key in mapping:
        if key in name:
            name = string.replace(name,key,mapping[key])
    return name
if __name__ == '__main__':
    for st_type, ways in st_types.iteritems():
        for name in ways:
            better_name = update_name(name, mapping)
            print name, "=>", better_name

Villa Vista => Villa Vista
Roble Ridge => Roble Ridge
Vanderbilt Court West => Vanderbilt Court West
Wolfe Rd => Wolfe Road
Homestead Rd => Homestead Road
E Middlefield Rd => E Middlefield Road
Embarcadero Rd => Embarcadero Road
West Evelyn Avenue Suite #114 => West Evelyn Avenuenue
Serra San Bruno => Serra San Bruno
Devonshire Way => Devonshire Way
Aspen Way => Aspen Way
Flicker Way => Flicker Way
Asbury Way => Asbury Way
Davenport Way => DAvenuenuenport Way
Madera Way => Madera Way
Wintergreen Way => Wintergreen Way
La Jennifer Way => La Jennifer Way
Hansen Way => Hansen Way
Murray Way => Murray Way
Elbridge Way => Elbridge Way
Enderby Way => Enderby Way
Acacia Way => Acacia Way
Alley Way => Alley Way
Brahms Way => Brahms Way
Primrose Way => Primrose Way
Bond Way => Bond Way
Hudson Way => Hudson Way
Dunnock Way => Dunnock Way
Golden Way => Golden Way
Forge Way => Forge Way
Anaconda Way => Anaconda Way
Prince Edward Way => Prince Edward Way
Old Middlefield Way => Old Middlefield Way
A

#### Process OSM XML file to JSON ###

In [17]:

CREATED = [ "version", "changeset", "timestamp", "user", "uid"]
def shape_element(element):
    node = {}
    node["created"]={}
    node["address"]={}
    node["pos"]=[]
#     node["amenity"] ={}
#     node["cuisine"] = {}
    refs=[]
    
    # we only process the node and way tags
    if element.tag == "node" or element.tag == "way" :
        if "id" in element.attrib:
            node["id"]=element.attrib["id"]
        node["type"]=element.tag

        if "visible" in element.attrib.keys():
            node["visible"]=element.attrib["visible"]
      
        # the key-value pairs with attributes in the CREATED list are added under key "created"
        for elem in CREATED:
            if elem in element.attrib:
                node["created"][elem]=element.attrib[elem]
                
        # attributes for latitude and longitude are added to a "pos" array
        # include latitude value        
        if "lat" in element.attrib:
            node["pos"].append(float(element.attrib["lat"]))
        # include longitude value    
        if "lon" in element.attrib:
            node["pos"].append(float(element.attrib["lon"]))

        
        for tag in element.iter("tag"):
            if not(problemchars.search(tag.attrib['k'])):
                if tag.attrib['k'] == "addr:housenumber":
                    node["address"]["housenumber"]=tag.attrib['v']
                    
                if tag.attrib['k'] == "addr:postcode":
                    node["address"]["postcode"]=tag.attrib['v']
                
                # handling the street attribute, update incorrect names using the strategy developed before   
                if tag.attrib['k'] == "addr:street":
                    node["address"]["street"]=tag.attrib['v']
                    node["address"]["street"] = update_name(node["address"]["street"], mapping)

                if tag.attrib['k'].find("addr")==-1:
                    node[tag.attrib['k']]=tag.attrib['v']
                    
        for nd in element.iter("nd"):
             refs.append(nd.attrib["ref"])
                
        if node["address"] =={}:
            node.pop("address", None)

        if refs != []:
           node["node_refs"]=refs
            
        return node
    else:
        return None



def process_map(file_in, pretty = False):
    file_out = "{0}.json".format(file_in)
    data = []
    with codecs.open(file_out, "w") as fo:
        for _, element in cET.iterparse(file_in):
            el = shape_element(element)
            if el:
                data.append(el)
                if pretty:
                    fo.write(json.dumps(el, indent=2)+"\n")
                else:
                    fo.write(json.dumps(el) + "\n")
    return data

In [18]:
# process the file
data = process_map(MountainViewosm, True)
print data


#### Insert the JSON data into MongoDB Database ####

In [10]:
client = MongoClient()
db = client.MountainViewosm
collection = db.MountainViewMAP
collection.insert(data)



[ObjectId('58e7bf031750d201a7d2d059'),
 ObjectId('58e7bf031750d201a7d2d05a'),
 ObjectId('58e7bf031750d201a7d2d05b'),
 ObjectId('58e7bf031750d201a7d2d05c'),
 ObjectId('58e7bf031750d201a7d2d05d'),
 ObjectId('58e7bf031750d201a7d2d05e'),
 ObjectId('58e7bf031750d201a7d2d05f'),
 ObjectId('58e7bf031750d201a7d2d060'),
 ObjectId('58e7bf031750d201a7d2d061'),
 ObjectId('58e7bf031750d201a7d2d062'),
 ObjectId('58e7bf031750d201a7d2d063'),
 ObjectId('58e7bf031750d201a7d2d064'),
 ObjectId('58e7bf031750d201a7d2d065'),
 ObjectId('58e7bf031750d201a7d2d066'),
 ObjectId('58e7bf031750d201a7d2d067'),
 ObjectId('58e7bf031750d201a7d2d068'),
 ObjectId('58e7bf031750d201a7d2d069'),
 ObjectId('58e7bf031750d201a7d2d06a'),
 ObjectId('58e7bf031750d201a7d2d06b'),
 ObjectId('58e7bf031750d201a7d2d06c'),
 ObjectId('58e7bf031750d201a7d2d06d'),
 ObjectId('58e7bf031750d201a7d2d06e'),
 ObjectId('58e7bf031750d201a7d2d06f'),
 ObjectId('58e7bf031750d201a7d2d070'),
 ObjectId('58e7bf031750d201a7d2d071'),
 ObjectId('58e7bf031750d2

In [12]:
collection

Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), u'MountainViewosm'), u'MountainViewMAP')

#### Size of original XML file ####

In [13]:

os.path.getsize(os.path.join(path, "MountainView.osm"))/1024/1024

209

#### Size of processed JSON  file ####

In [14]:

os.path.getsize(os.path.join(path, "MountainView.osm.json"))/1024/1024

346

#### Number of documents ####

In [15]:
collection.find().count()


5754659

#### Number of unique users ####

In [19]:
# Number of unique users
len(collection.group(["created.uid"], {}, {"count":0}, "function(o, p){p.count++}"))


880

#### Number of nodes ####

In [20]:
# Number of nodes
collection.find({"type":"node"}).count()

5136303

#### Number of ways ####

In [21]:
collection.find({"type":"way"}).count()

618301

#### Top 10 methods used to create data entry ####

In [38]:

pipeline = [{"$group":{"_id": "$created_by",
                       "count": {"$sum": 1}}},
                     {"$sort": {"count": -1}},
                    {"$limit": 10}]
           
result = collection.aggregate(pipeline)
for r in result:
    print r
# assert len(result['result'])

# print(len(result['result']))
# print result[result]

{u'count': 5746814, u'_id': None}
{u'count': 4363, u'_id': u'JOSM'}
{u'count': 1490, u'_id': u'Potlatch 0.10f'}
{u'count': 896, u'_id': u'Potlatch 0.9c'}
{u'count': 290, u'_id': u'Potlatch 0.10b'}
{u'count': 243, u'_id': u'Potlatch 0.10'}
{u'count': 215, u'_id': u'Potlatch 0.8c'}
{u'count': 105, u'_id': u'Potlatch 0.9a'}
{u'count': 98, u'_id': u'Potlatch 0.10e'}
{u'count': 80, u'_id': u'OSMPointy v0.4 iPhone'}


#### Top 5 users contributions ####

In [23]:
# top three users with most contributions
pipeline = [{"$group":{"_id": "$created.user",
                       "count": {"$sum": 1}}},
            {"$sort": {"count": -1}},
            {"$limit": 5}]
result = collection.aggregate(pipeline)
for r in result:
    print r

{u'count': 760464, u'_id': u'RichRico'}
{u'count': 707471, u'_id': u'ediyes'}
{u'count': 601207, u'_id': u'samely'}
{u'count': 582349, u'_id': u'karitotp'}
{u'count': 439029, u'_id': u'calfarome'}


#### Most popular fast food resturant ####

In [24]:
# Most popular cuisines
pipeline = [{"$match":{"amenity":{"$exists":1}, "amenity":"restaurant", "cuisine":{"$exists":1}}}, 
            {"$group":{"_id":"$cuisine", "count":{"$sum":1}}},        
            {"$sort":{"count":-1}}, 
            {"$limit":10}]
result = collection.aggregate(pipeline)
for r in result:
    print r

{u'count': 376, u'_id': {}}
{u'count': 190, u'_id': u'mexican'}
{u'count': 175, u'_id': u'chinese'}
{u'count': 170, u'_id': u'japanese'}
{u'count': 145, u'_id': u'indian'}
{u'count': 130, u'_id': u'pizza'}
{u'count': 80, u'_id': u'thai'}
{u'count': 75, u'_id': u'italian'}
{u'count': 65, u'_id': u'american'}
{u'count': 65, u'_id': u'vietnamese'}


#### Name of Universities ####

In [25]:
# University
pipeline = [{"$match":{"amenity":{"$exists":1}, "amenity": "university", "name":{"$exists":1}}},
            {"$group":{"_id":"$name", "count":{"$sum":1}}},
            {"$sort":{"count":-1}}]
result = collection.aggregate(pipeline)
for r in result:
    print r


{u'count': 5, u'_id': u'Stanford University'}
{u'count': 5, u'_id': u'Carnegie Mellon University Silicon Valley'}
{u'count': 5, u'_id': u'Singularity University Classroom '}
{u'count': 5, u'_id': u'Singularity University'}
{u'count': 5, u'_id': u'20'}
{u'count': 5, u'_id': u'Nine Star University of Health Sciences'}


#### Places for worship ####

In [26]:
pipeline = [{"$match":{"amenity":{"$exists":1}, "amenity": "place_of_worship", "name":{"$exists":1}}},
            {"$group":{"_id":"$name", "count":{"$sum":1}}},
            {"$sort":{"count":-1}}]
result = collection.aggregate(pipeline)
for r in result:
    print r

{u'count': 10, u'_id': u'The Church of Jesus Christ of Latter-day Saints'}
{u'count': 10, u'_id': u'First Church of Christ Scientist'}
{u'count': 10, u'_id': u'Trinity United Methodist Church'}
{u'count': 10, u'_id': u'First United Methodist Church'}
{u'count': 10, u'_id': u'Seventh Day Adventist Church'}
{u'count': 10, u'_id': u'Mountain View Chinese Christian Church'}
{u'count': 10, u'_id': u'Peninsula Bible Church'}
{u'count': 5, u'_id': u'Unity Missionary Baptist Church'}
{u'count': 5, u'_id': u'Cornerstone Community Church'}
{u'count': 5, u'_id': u"Saint Mark's Missionary Baptist Church"}
{u'count': 5, u'_id': u'Foothill Baptist Church'}
{u'count': 5, u'_id': u'Kannon Do Zen Meditation Center'}
{u'count': 5, u'_id': u'New Hope International'}
{u'count': 5, u'_id': u'Holy Korean Martyrs Catholic Church'}
{u'count': 5, u'_id': u'Ch\xf9a Gi\xe1c Minh'}
{u'count': 5, u'_id': u'Open Door Church of God in Christ'}
{u'count': 5, u'_id': u'Saint Athanasius Catholic Church'}
{u'count': 5, 

In [None]:
#### Gas stations ####

In [27]:
pipeline = [{"$match":{"amenity":{"$exists":1}, "amenity": "fuel", "name":{"$exists":1}}},
            {"$group":{"_id":"$name", "count":{"$sum":1}}},
            {"$sort":{"count":-1}}]
result = collection.aggregate(pipeline)
for r in result:
    print r

{u'count': 50, u'_id': u'Shell'}
{u'count': 40, u'_id': u'Valero'}
{u'count': 35, u'_id': u'Chevron'}
{u'count': 30, u'_id': u'76'}
{u'count': 30, u'_id': u'Arco'}
{u'count': 10, u'_id': u'valero'}
{u'count': 10, u'_id': u'ARCO'}
{u'count': 5, u'_id': u'World Oil'}
{u'count': 5, u'_id': u'Fair Oaks 76'}
{u'count': 5, u'_id': u'Willow Cove Gas'}
{u'count': 5, u'_id': u'Alliance Gasoline'}
{u'count': 5, u'_id': u'Westmore Chevron'}
{u'count': 5, u'_id': u'Shell Gas'}
{u'count': 5, u'_id': u'Conoco Phillips 76'}
{u'count': 5, u'_id': u'Union 76'}
{u'count': 5, u'_id': u'SAP Vehicles Network Demo - Gas Station'}
{u'count': 5, u'_id': u"Ranier's Service Station"}
{u'count': 5, u'_id': u'Alliance'}
{u'count': 5, u'_id': u'Rotten Robbie'}


In [None]:
#### Most popular Fast food cuisines ####

In [28]:
pipeline = [{"$match":{"amenity":{"$exists":1}, "amenity": "fast_food", "name":{"$exists":1}}},
            {"$group":{"_id":"$name", "count":{"$sum":1}}},
            {"$sort":{"count":-1}}]
result = collection.aggregate(pipeline)
for r in result:
    print r

{u'count': 70, u'_id': u'Subway'}
{u'count': 35, u'_id': u"McDonald's"}
{u'count': 25, u'_id': u"Togo's"}
{u'count': 25, u'_id': u'Taco Bell'}
{u'count': 15, u'_id': u'Burger King'}
{u'count': 15, u'_id': u'Round Table Pizza'}
{u'count': 15, u'_id': u'KFC'}
{u'count': 10, u'_id': u'Jack In The Box'}
{u'count': 10, u'_id': u'In-N-Out Burger'}
{u'count': 10, u'_id': u'Jack in the Box'}
{u'count': 10, u'_id': u'Jamba Juice'}
{u'count': 10, u'_id': u"Wendy's"}
{u'count': 5, u'_id': u'Falafel Stop'}
{u'count': 5, u'_id': u'Peninsula Creamery'}
{u'count': 5, u'_id': u"Domino's Pizza"}
{u'count': 5, u'_id': u'Chick-Fil-A'}
{u'count': 5, u'_id': u"Carls' Jr."}
{u'count': 5, u'_id': u'Burger Town'}
{u'count': 5, u'_id': u'Tacobell-KFC'}
{u'count': 5, u'_id': u'Adamsons French Dipp'}
{u'count': 5, u'_id': u'El Pollo Loco'}
{u'count': 5, u'_id': u"Arby's"}
{u'count': 5, u'_id': u"In'n Out Burger"}
{u'count': 5, u'_id': u'zpizza'}
{u'count': 5, u'_id': u'Zume Pizza'}
{u'count': 5, u'_id': u'Cosmic

In [None]:
#### Number of hospitals ####

In [29]:
pipeline = [{"$match":{"amenity":{"$exists":1}, "amenity": "hospital", "name":{"$exists":1}}},
            {"$group":{"_id":"$name", "count":{"$sum":1}}},
            {"$sort":{"count":-1}}]
result = collection.aggregate(pipeline)
for r in result:
    print r

{u'count': 5, u'_id': u'Grant Cuesta Sub Acute Rehabilitation Center'}
{u'count': 5, u'_id': u'VA Palo Alto Health Care System'}
{u'count': 5, u'_id': u'VA Medical Center Menlo Park'}
{u'count': 5, u'_id': u'El Camino Hospital'}
{u'count': 5, u'_id': u'PAMF Menlo Park Surgical Hospital'}
{u'count': 5, u'_id': u'Kaiser Permanente'}
{u'count': 5, u'_id': u'Kaiser Permanente Santa Clara Medical Center'}
{u'count': 5, u'_id': u'Camino Medical Group'}
{u'count': 5, u'_id': u'Palo Alto Medical Foundation'}
{u'count': 5, u'_id': u'Health Services'}


In [None]:
#### Beauty Salon ####

In [30]:
pipeline = [{"$match":{"amenity":{"$exists":1}, "amenity": "beauty", "name":{"$exists":1}}},
            {"$group":{"_id":"$name", "count":{"$sum":1}}},
            {"$sort":{"count":-1}}]
result = collection.aggregate(pipeline)
for r in result:
    print r

{u'count': 8, u'_id': u'Salon Elizabeth'}
{u'count': 5, u'_id': u'Salon 121'}


In [None]:
#### Libraries ####

In [31]:
pipeline = [{"$match":{"amenity":{"$exists":1}, "amenity": "public_bookcase", "name":{"$exists":1}}},
            {"$group":{"_id":"$name", "count":{"$sum":1}}},
            {"$sort":{"count":-1}}]
result = collection.aggregate(pipeline)
for r in result:
    print r

{u'count': 13, u'_id': u'Little Free Library'}


In [None]:
#### Schools ####

In [32]:
pipeline = [{"$match":{"amenity":{"$exists":1}, "amenity": "school", "name":{"$exists":1}}},
            {"$group":{"_id":"$name", "count":{"$sum":1}}},
            {"$sort":{"count":-1}}]
result = collection.aggregate(pipeline)
for r in result:
    print r

{u'count': 10, u'_id': u'Lucille M Nixon Elementary School'}
{u'count': 10, u'_id': u'Palo Verde Elementary School'}
{u'count': 10, u'_id': u'Ohlone Elementary School'}
{u'count': 10, u'_id': u'Pinewood School'}
{u'count': 10, u'_id': u'Union Academy'}
{u'count': 10, u'_id': u'Stratford School'}
{u'count': 10, u'_id': u'Laurelwood Elementary School'}
{u'count': 10, u'_id': u'Athena Academy'}
{u'count': 10, u'_id': u'Kumon'}
{u'count': 8, u'_id': u'Jane Lathrop Stanford Middle School'}
{u'count': 5, u'_id': u'Bullis Charter School - South Campus'}
{u'count': 5, u'_id': u'Saint Elizabeth Ann Seton School'}
{u'count': 5, u'_id': u'Egan Junior High School'}
{u'count': 5, u'_id': u'Mountain View Los Altos Adult Education'}
{u'count': 5, u'_id': u"Mountain View-Los Altos Montessori Children's Centerter"}
{u'count': 5, u'_id': u'Terman Middle School'}
{u'count': 5, u'_id': u'Saint Simon Elemntary School'}
{u'count': 5, u'_id': u'Fairmeadow Elementary School'}
{u'count': 5, u'_id': u'Edison Br

In [None]:
#### Parkings ####

In [33]:
pipeline = [{"$match":{"amenity":{"$exists":1}, "amenity": "parking", "name":{"$exists":1}}},
            {"$group":{"_id":"$name", "count":{"$sum":1}}},
            {"$sort":{"count":-1}}]
result = collection.aggregate(pipeline)
for r in result:
    print r

{u'count': 65, u'_id': u"Visitor's Parking"}
{u'count': 53, u'_id': u'Apartment Visitor Parking'}
{u'count': 15, u'_id': u'Customer Parking'}
{u'count': 15, u'_id': u'Employee Parking'}
{u'count': 10, u'_id': u'Lot 1'}
{u'count': 10, u'_id': u'Lot 4'}
{u'count': 10, u'_id': u'Sunnyvale Caltrain Station'}
{u'count': 10, u'_id': u'Lot 8'}
{u'count': 10, u'_id': u'Lot 6'}
{u'count': 10, u'_id': u'Lot 7'}
{u'count': 5, u'_id': u'Parking Lot A'}
{u'count': 5, u'_id': u'San Antonio Park and Ride'}
{u'count': 5, u'_id': u'SAP Vehicles Network Demo - Parking'}
{u'count': 5, u'_id': u'Civic Center Parking'}
{u'count': 5, u'_id': u'Car parking'}
{u'count': 5, u'_id': u'Car Parking'}
{u'count': 5, u'_id': u'Whisman'}
{u'count': 5, u'_id': u'San Antonio'}
{u'count': 5, u'_id': u'California Avenue'}
{u'count': 5, u'_id': u'Apple Mathilda 4'}
{u'count': 5, u'_id': u'Ted Thompson Parking Garage'}
{u'count': 5, u'_id': u'Lot 3'}
{u'count': 5, u'_id': u'parking lot'}
{u'count': 5, u'_id': u'Post office

In [None]:
#### Car wash ####

In [34]:
pipeline = [{"$match":{"amenity":{"$exists":1}, "amenity": "car_wash", "name":{"$exists":1}}},
            {"$group":{"_id":"$name", "count":{"$sum":1}}},
            {"$sort":{"count":-1}}]
result = collection.aggregate(pipeline)
for r in result:
    print r

{u'count': 5, u'_id': u'SV Express'}
{u'count': 5, u'_id': u'Clear Water Car Wash'}
{u'count': 5, u'_id': u'Thrifty'}
{u'count': 5, u'_id': u'Lozano Brushless Car Wash'}
{u'count': 5, u'_id': u"Lozano's Car Wash"}
{u'count': 5, u'_id': u'Car Wash'}
{u'count': 5, u'_id': u'Shell'}
{u'count': 5, u'_id': u'Bubbles Hand Wash'}


In [None]:
#### Post officess ####

In [35]:
pipeline = [{"$match":{"amenity":{"$exists":1}, "amenity": "post_box", "name":{"$exists":1}}},
            {"$group":{"_id":"$name", "count":{"$sum":1}}},
            {"$sort":{"count":-1}}]
result = collection.aggregate(pipeline)
for r in result:
    print r

{u'count': 5, u'_id': u'Mailbox'}


In [None]:
#### Coffe shops ####

In [36]:
pipeline = [{"$match":{"amenity":{"$exists":1}, "amenity": "cafe", "name":{"$exists":1}}},
            {"$group":{"_id":"$name", "count":{"$sum":1}}},
            {"$sort":{"count":-1}}]
result = collection.aggregate(pipeline)
for r in result:
    print r

{u'count': 105, u'_id': u'Starbucks'}
{u'count': 15, u'_id': u'Starbucks Coffee'}
{u'count': 15, u'_id': u'Tea Era'}
{u'count': 10, u'_id': u'Philz Coffee'}
{u'count': 10, u'_id': u'Cloud Cafe'}
{u'count': 10, u'_id': u"Peet's Coffee"}
{u'count': 10, u'_id': u"Peet's Coffee & Tea"}
{u'count': 10, u'_id': u'Peets Coffee'}
{u'count': 5, u'_id': u'Happy Donuts'}
{u'count': 5, u'_id': u'Island Cafe'}
{u'count': 5, u'_id': u'Dana Street Roasting Company'}
{u'count': 5, u'_id': u'Maverick'}
{u'count': 5, u'_id': u'Badaal'}
{u'count': 5, u'_id': u'MiXiT Charleston'}
{u'count': 5, u'_id': u'Threads'}
{u'count': 5, u'_id': u'WAN'}
{u'count': 5, u'_id': u'California Mochi Mini'}
{u'count': 5, u'_id': u'Big Bites Vietnamese Eatery'}
{u'count': 5, u'_id': u'Taj Cafe'}
{u'count': 5, u'_id': u'Big Table'}
{u'count': 5, u'_id': u"Charlie's cafe"}
{u'count': 5, u'_id': u'The Lunch Box'}
{u'count': 5, u'_id': u'moma2'}
{u'count': 5, u'_id': u'Portal Cafe'}
{u'count': 5, u'_id': u'Backyard'}
{u'count': 