In [61]:
#!/usr/bin/python
import json
import logging
import os
import csv
import keytree

from urllib import urlopen
from xml.etree import ElementTree
from pyzipcode import ZipCodeDatabase
from shapely.geometry import Point, shape
from geopy.geocoders import Nominatim

class ZipGeoCoder(object):
    """Resolve a latitude/longitude pair to a zip code using KML polygons.

    The boundary CSV is read once, at construction time. Each row supplies a
    zip code in its ``ZCTA5CE10`` column and a KML fragment in its
    ``geometry`` column; every ``Polygon`` element found in that fragment is
    recorded against the row's zip code in ``self.elems_to_zip_code``.
    """

    def __init__(self, zip_code_boundary_csv_file='data/bay_area_zip_code_boundaries.csv'):
        """Load polygon boundaries from ``zip_code_boundary_csv_file``.

        Args:
            zip_code_boundary_csv_file: Path to a CSV file with at least the
                columns ``ZCTA5CE10`` and ``geometry``.
        """
        # Kept per instance: a class-level dict would be shared, so each new
        # ZipGeoCoder would keep adding polygons to every other instance.
        self.elems_to_zip_code = {}
        with open(zip_code_boundary_csv_file) as f:
            for row in csv.DictReader(f):
                zip_code = row['ZCTA5CE10']
                # Wrap the fragment in a root element that declares the KML
                # namespace so it parses as a single namespaced document.
                geometry = "<kml xmlns='http://www.opengis.net/kml/2.2'>" + row['geometry'] + "</kml>"
                tree = ElementTree.fromstring(geometry)
                # The root tag looks like "{namespace}kml"; pull the namespace out.
                kmlns = tree.tag.split('}')[0][1:]
                for e in tree.findall(".//{%s}Polygon" % kmlns):
                    self.elems_to_zip_code[e] = zip_code

    def zip_code(self, latitude, longitude):
        """Return the zip code of the first polygon containing the point.

        Args:
            latitude: Latitude of the point of interest.
            longitude: Longitude of the point of interest.

        Returns:
            The zip code recorded for the first matching polygon, or ``None``
            when no loaded polygon contains the point.
        """
        # Longitude is passed first, as the x coordinate of the point.
        p = Point(longitude, latitude)
        # Stop at the first hit instead of testing every remaining polygon;
        # this also avoids depending on filter() returning a list.
        for elem, zip_code in self.elems_to_zip_code.items():
            if shape(keytree.geometry(elem)).contains(p):
                return zip_code
        return None

class AddessGeoCoder(object):
    """Reverse-geocode coordinates into an address dictionary via Nominatim."""

    # A single geocoder client, created once and shared by all instances.
    geolocator = Nominatim()

    def address(self, latitude, longitude):
        """Return the ``address`` entry of the reverse-geocoding result.

        Args:
            latitude: Latitude of the point to look up.
            longitude: Longitude of the point to look up.

        Returns:
            The value stored under ``"address"`` in the raw response, or
            ``None`` when the lookup returns nothing or the raw response has
            no ``"address"`` entry.
        """
        query = "%f, %f" % (latitude, longitude)
        result = self.geolocator.reverse(query)
        if not result:
            return None
        if "address" not in result.raw:
            return None
        return result.raw["address"]
        

# Shared helpers used by the cells below.
# Polygon-based zip lookup, loaded from the default boundary CSV path.
geo_coder = ZipGeoCoder()
# pyzipcode database; indexed by zip code to read attributes such as .city.
zcdb = ZipCodeDatabase()
# Reverse geocoder that returns the address dict for a coordinate pair.
address_geo_coder = AddessGeoCoder()

In [31]:
# Spot-check the polygon lookup with one fixed coordinate, then show the
# city the zip code database reports for the resulting zip code.
zipcode = geo_coder.zip_code(37.268082, -121.908918)
print(zipcode)
print(zcdb[zipcode].city)


95124
San Jose


In [89]:
# Reverse-geocode one fixed coordinate and dump the raw response to see
# which fields it carries.
from geopy.geocoders import Nominatim

geolocator = Nominatim()
location = geolocator.reverse("%f, %f" % (36.911882, -121.744652))
print(location.raw)

{u'display_name': u'Riverside Road, Freedom, Santa Cruz County, California, 95076, United States of America', u'place_id': u'56760417', u'lon': u'-121.7448889', u'boundingbox': [u'36.9124747', u'36.9126522', u'-121.7454528', u'-121.7448889'], u'osm_type': u'way', u'licence': u'Data \xa9 OpenStreetMap contributors, ODbL 1.0. http://www.openstreetmap.org/copyright', u'osm_id': u'10554240', u'lat': u'36.9126522', u'address': {u'country': u'United States of America', u'county': u'Santa Cruz County', u'suburb': u'Freedom', u'state': u'California', u'postcode': u'95076', u'country_code': u'us', u'road': u'Riverside Road'}}


In [101]:
# Read the rental listings from disk.
with open('data/bay_area_rentals.json') as listings_file:
    properties = json.load(listings_file)
print("%d properties found." % len(properties))

710 properties found.


In [102]:
# Collapse duplicate listings. Two properties count as the same listing when
# their title, bathroom count, bedroom count, and price all match; the last
# occurrence wins.
dedupe_property_map = {}
for prop in properties:
    key_parts = [prop.get("title")]
    for field in ("bathrooms", "bedrooms", "price"):
        key_parts.append(str(prop.get(field)))
    key = ":".join(key_parts)
    dedupe_property_map[key] = prop
properties = list(dedupe_property_map.values())
print("%d properties deduped." % len(properties))

536 properties deduped.


In [103]:
# Enrich each property with a zip code and city by reverse geocoding its
# coordinates. Properties whose latitude or longitude is missing or falsy are
# skipped silently; properties whose lookup yields no address, postcode, or
# place name are reported and skipped.
updated_properties = 0
properties_updated = []
for prop in properties:
    lat, lon = prop.get("latitude"), prop.get("longitude")
    if not (lat and lon):
        continue

    address = address_geo_coder.address(lat, lon)
    if not address:
        print("Couldn't find address for: %f, %f" % (lat, lon))
        continue

    zipcode = address.get("postcode")
    if not zipcode:
        print("Couldn't find zip code for: %f, %f" % (lat, lon))
        continue

    # Use the first populated place-name field, tried in this order.
    city = None
    for place_key in ("city", "town", "village", "suburb", "hamlet"):
        city = address.get(place_key)
        if city:
            break
    if not city:
        print("Couldn't find city for: %f, %f" % (lat, lon))
        continue

    prop["zipcode"] = zipcode
    prop["city"] = city
    properties_updated.append(prop)
    updated_properties += 1
print("%d properties updated." % len(properties_updated))


493 properties updated.


In [104]:
# Save the geocoded properties as a JSON array, one property object per line.
with open('data/bay_area_rentals_geo_coded.json', 'w') as f_out:
    encoded_rows = map(json.dumps, properties_updated)
    json_str = "[" + ",\n".join(encoded_rows) + "]"
    f_out.write(json_str)

In [106]:
# Export the geocoded properties as CSV: a header row followed by one row per
# property, with columns in the order given by `fieldnames`.
with open('data/bay_area_rentals_geo_coded.csv', 'w') as csvfile:
    fieldnames = [
        'craigslist_id',
        'title',
        'price',
        'city',
        'zipcode',
        'bedrooms',
        'bathrooms',
        'building_size',
        'link',
        'latitude',
        'longitude',
        'posting_date'
    ]
    writer = csv.writer(csvfile)
    writer.writerow(fieldnames)
    for prop in properties_updated:
        row = []
        for column in fieldnames:
            if column == 'title':
                # No default here: a property without a title raises on .encode.
                value = prop.get('title').encode('utf-8')
            elif column == 'city':
                value = prop.get('city', u'').encode('utf-8')
            else:
                value = prop.get(column)
            row.append(value)
        writer.writerow(row)