Skip to content

Commit

Permalink
Set up presentations repo...
Browse files Browse the repository at this point in the history
  • Loading branch information
behackett committed Apr 19, 2011
0 parents commit 286709d
Show file tree
Hide file tree
Showing 8 changed files with 29,712 additions and 0 deletions.
1 change: 1 addition & 0 deletions README
@@ -0,0 +1 @@
Presentations from various conferences.
Binary file added where2_2011/04-19-11-where20.key
Binary file not shown.
3 changes: 3 additions & 0 deletions where2_2011/README
@@ -0,0 +1,3 @@
To load the datasets and geo-index them, first start mongod, then run:
$ ./load_data.sh
$ python geo_index_data.py
141 changes: 141 additions & 0 deletions where2_2011/geo_examples.py
@@ -0,0 +1,141 @@
# Examples of doing various Geo queries using pymongo.
# These are just simple examples. No attempt was made
# to optimize the performance of the queries.

import math

import bson
import pymongo

# Approximate radius of Earth, in miles, according to Google calculator
RADIUS_MILES = 3963
# Radians in one degree.
_RADIANS_PER_DEGREE = math.pi / 180
# Very rough scale multiplier for distance on Earth (radius times
# radians-per-degree). Used for standard $near queries; spherical
# queries just use the radius.
DISTANCE_MULTIPLIER = RADIUS_MILES * _RADIANS_PER_DEGREE

# Connect to mongod
connection = pymongo.Connection()

# Some examples require a newer mongod version...
# Keep only the "major.minor" part of the reported version. Splitting on
# dots (rather than slicing the first three characters) stays correct when
# a component has more than one digit, e.g. "1.10.0" -> (1, 10).
version_string = '.'.join(
    connection.server_info()['version'].split('.')[:2])
# (major, minor) as integers, compared against tuples such as (1, 8).
version = tuple([int(num) for num in version_string.split('.')])

def bart_foreign_cinema(num_stops=1):
    """Print the name of the BART station(s) closest to the
    Foreign Cinema in San Francisco.

    :Parameters:
      - `num_stops`: How many stations to list.
    """
    foreign_cinema = [-122.419088, 37.75689]
    stops = connection.bart.stops
    nearest = stops.find({'stop_geo': {'$near': foreign_cinema}})
    for station in nearest.limit(num_stops):
        print(station['stop_name'])

# Using geoNear we can get an approximate or spherical
# distance and scale it to our needs
def bart_foreign_cinema_geonear(spherical=False):
    """Print how far away the closest BART station is from the
    Foreign Cinema in San Francisco.

    :Parameters:
      - `spherical`: Should we do a spherical query?
    """
    # Spherical results are scaled by the radius alone; standard
    # results by the rough distance multiplier.
    scale = RADIUS_MILES if spherical else DISTANCE_MULTIPLIER
    # Build the command as an ordered SON document.
    command = bson.son.SON([('geoNear', 'stops'),
                            ('near', [-122.419088, 37.75689]),
                            ('distanceMultiplier', scale)])
    if spherical:
        command['spherical'] = True
    # Only the first entry of the result list is reported.
    closest = connection.bart.command(command)['results'][0]
    print("Distance to %s: %r miles" % (closest['obj']['stop_name'],
                                        closest['dis']))

# JavaScript map function: emits each document's 'pop' keyed by its 'state'.
map_func = "function(){ emit(this.state, this.pop); }"
# JavaScript reduce function: sums the values emitted for one key.
reduce_func = "function(key, values){ return Array.sum(values); }"

# Calculate the entire population of the zipcode dataset.
# Inline map reduce just returns the result set instead
# of storing it in the database.
def calculate_population():
    """Use inline map reduce to calculate the entire
    population from the zips dataset and print it.

    Prints a notice and returns without querying when the
    connected mongod is older than 1.8.
    """
    if version < (1, 8):
        print("inline map reduce requires mongod >= 1.8")
        # Stop here rather than running the map/reduce anyway; the other
        # version-gated examples in this file also skip their query.
        return
    db = connection.geo
    # map_func emits one population value per state; add them all up.
    per_state = db.zips.inline_map_reduce(map_func, reduce_func)
    print(sum([res['value'] for res in per_state]))

# How many people live within 100 miles of the
# Empire State Building (based on our dataset)?

range_in_miles = 100.0
# Convert miles to the units of a standard (non-spherical) query
# using the rough distance multiplier.
max_distance = range_in_miles / DISTANCE_MULTIPLIER
# Ordered document: '$near' first, then '$maxDistance'.
nearq = bson.son.SON()
nearq['$near'] = [-73.985656, 40.748433]
nearq['$maxDistance'] = max_distance
# Standard $near queries are limited to a result set of 100 documents.
# We use $within to get around that limitation.
withinq = {'$within': {'$center': [[-73.985656, 40.748433], max_distance]}}

def empire_state_find():
    """Print how many people live within 100 miles of the Empire State
    Building (according to our dataset). The answer is calculated twice:
    once with $within, once with $near.
    """
    zips = connection.geo.zips
    # Only the 'pop' field matters here. The second argument of find()
    # tells mongod which fields to return; '_id' always comes back
    # unless it is explicitly excluded.
    only_pop = {'pop': True, '_id': False}

    within_total = 0
    for doc in zips.find({'loc': withinq}, only_pop):
        within_total += doc['pop']
    print('$within: %d' % (within_total,))

    near_total = 0
    for doc in zips.find({'loc': nearq}, only_pop).limit(60000):
        near_total += doc['pop']
    print('$near: %d' % (near_total,))

def empire_state_spherical():
    """Print how many people live within 100 miles of the Empire State
    Building (according to our dataset). This calculates the answer using
    $nearSphere so the distance calculation should be accurate.
    """
    db = connection.geo
    q = bson.son.SON({'$nearSphere': [-73.985656, 40.748433]})
    # Spherical queries scale distance by the Earth's radius alone. Use the
    # shared module constants instead of repeating 100.0 / 3963 so this
    # query stays in step with the $near / $within examples.
    q.update({'$maxDistance': range_in_miles / RADIUS_MILES})
    cursor = db.zips.find({'loc': q}).limit(60000)
    print('$nearSphere: %d' % (sum([doc['pop'] for doc in cursor]),))

# Using map/reduce or group with GEO queries requires MongoDB 1.9.
# ----------------------------------------------------------------------------
# Same result using map/reduce.
def empire_state_map_reduce():
    """Run the same $within query as above through inline map/reduce
    and print the summed population.
    """
    if version < (1, 9):
        print("map/reduce with geo requires mongod >= 1.9")
        return
    zips = connection.geo.zips
    # Restrict the map/reduce input to documents matching the geo query.
    totals = zips.inline_map_reduce(map_func, reduce_func,
                                    query={'loc': withinq})
    print(sum([entry['value'] for entry in totals]))

# Same result using group.
def empire_state_group():
    """Run the same $within query once more using group
    and print the summed population.
    """
    if version < (1, 9):
        print("group with geo requires mongod >= 1.9")
        return
    # Reducer that accumulates each document's 'pop' into the group's 'sum'.
    pop_reduce = "function(obj, prev){ prev.sum += obj.pop; }"
    groups = connection.geo.zips.group(
        ['state'], {'loc': withinq}, {'sum': 0}, pop_reduce)
    print(sum([group['sum'] for group in groups]))
# ----------------------------------------------------------------------------

44 changes: 44 additions & 0 deletions where2_2011/geo_index_data.py
@@ -0,0 +1,44 @@
#!/usr/bin/python
#
# GeoIndexer for MongoDB Where2.0 Datasets
#
# Original script by:
# Brendan W. McAdams <bmcadams@evilmonkeylabs.com>
#
# Quick and dirty script which creates Geo Indices in MongoDB.
#
# Assumes you already loaded it with the provided shell script.
#
# Needs PyMongo 1.6 or greater

import pymongo
from pymongo import Connection

connection = Connection()
db = connection['bart']
print("Indexing the Stops Data.")
stops = db.stops
for stop in stops.find():
    # Store the point as a [longitude, latitude] pair for the 2d index.
    stop['stop_geo'] = [stop['stop_lon'], stop['stop_lat']]
    stops.save(stop)

stops.ensure_index([('stop_geo', pymongo.GEO2D)])
print("Reindexed stops with Geospatial data.")

print("Indexing the Shapes data")
for row in db.shapes.find():
    # Store the point as an ordered [longitude, latitude] list, the same
    # layout used for the other collections in this script. A plain dict
    # gives no guarantee which of 'lon' / 'lat' is written first.
    # NOTE(review): this changes shape_pt_geo from a sub-document to an
    # array -- confirm nothing reads shape_pt_geo.lon / shape_pt_geo.lat.
    row['shape_pt_geo'] = [row['shape_pt_lon'], row['shape_pt_lat']]
    db.shapes.save(row)

db.shapes.ensure_index([('shape_pt_geo', pymongo.GEO2D)])
print("Reindexed shapes with Geospatial data.")

db = connection['geo']
print("Indexing the Zips Data.")
for row in db.zips.find():
    loc = row['loc']
    # Only rows still in the original {'x': ..., 'y': ...} form are
    # converted. Rows already rewritten to a list (e.g. by an earlier run
    # of this script) are left alone, so re-running neither crashes on
    # loc['x'] nor negates the first coordinate a second time.
    if isinstance(loc, dict):
        row['loc'] = [-(loc['x']), loc['y']]
        db.zips.save(row)

db.zips.ensure_index([('loc', pymongo.GEO2D)])
print("Reindexed zips with Geospatial data.")

print("Done.")
Binary file added where2_2011/google_transit.zip
Binary file not shown.
53 changes: 53 additions & 0 deletions where2_2011/load_data.sh
@@ -0,0 +1,53 @@
#!/bin/sh
#
# Quick & dirty script to load the BART Transit
# Data to mongodb. Assumes mongod is running.
#
# Data file is from: http://www.bart.gov/schedules/developers/gtfs.aspx
# and freely available/redistributable.
#
# Spec on the data format is available at:
# http://code.google.com/transit/spec/transit_feed_specification.html
#
# Original script developed by Brendan W. McAdams

# Path to mongoimport. Defaults to the original hard-coded location but can
# be overridden by setting MONGO_IMPORT in the environment.
MONGO_IMPORT=${MONGO_IMPORT:-/opt/mongo/bin/mongoimport}
DOS2UNIX=
IMPORT_CMD="$MONGO_IMPORT -d bart --type csv --headerline --drop --ignoreBlanks"
BART_ZIP="google_transit.zip"

# load_csv MESSAGE NAME
# Print MESSAGE, normalize the line endings of NAME.txt in place, then
# import NAME.txt into the NAME collection of the bart database.
load_csv() {
    echo "$1"
    perl -pi -e 's/\r\n|\n|\r/\n/g' "$2.txt"
    # IMPORT_CMD is deliberately unquoted so it splits into command + flags.
    $IMPORT_CMD -c "$2" "$2.txt"
}

echo "Unzipping BART File ($BART_ZIP)"

unzip -o "$BART_ZIP"

# Agency is malformed in current file release... Fix it

# add the missing endline character.
# printf is used because echo's treatment of '\n' differs between shells
# (some print a literal backslash-n, others print two newlines).
printf '\n' >> agency.txt

load_csv "Loading Agency file..." agency
load_csv "Loading Stops file..." stops
load_csv "Loading Routes file..." routes
load_csv "Loading Trips file..." trips
load_csv "Loading Stop Times file..." stop_times
load_csv "Loading Calendar file..." calendar
load_csv "Loading Calendar Dates File..." calendar_dates
# For line drawing - might be usable by GeoMongo
load_csv "Loading Shapes file..." shapes

echo "Loading zips.json to geo..."
$MONGO_IMPORT zips.json -d geo -c zips --drop

0 comments on commit 286709d

Please sign in to comment.