In [1]:
from xml.etree import ElementTree
import keytree
import shapely
from shapely.geometry import Polygon, Point, shape

In [2]:
doc = open("/Users/damoncrockett/vosd.org/215-opendsd/Final.kml").read()
tree = ElementTree.fromstring(doc)
# kml namespace
kmlns = tree.tag.split('}')[0][1:]
# find all placemarks
placemks = tree.findall(".//{%s}Placemark" % kmlns)
# filter out those without polygon elements
placemks_with_polygons = []

for p in placemks:
    if p.findall(".//{%s}Polygon" % kmlns):
        placemks_with_polygons.append(p)

In [3]:
# extract kml LinearRings, convert to shapely LinearRings, make dict with cpa names

def coords_names(placemks):
  coords_names_dict = {}
  for placemk in placemks:
    name = placemk.getchildren()[0].text
    coord_text = placemk.findtext(".//{%s}coordinates" % kmlns)
    coords = []
    for elems in coord_text.split():
      points = elems.split(",")
      coords.append((float(points[0]), float(points[1])))
    coords_names_dict[Polygon(coords)] = name
  return coords_names_dict

In [4]:
import pandas as pd

cpa_polygons = pd.DataFrame(coords_names(placemks_with_polygons).items(),
            columns=['Polygon','CPA'])

In [5]:
# import Flickr from SD

d = pd.read_csv('/Users/damoncrockett/Desktop/FISP/Flickr/SD.csv',low_memory=False)

In [7]:
import numpy as np

np.mean(d[d.isnull()])

id       NaN
lon      NaN
lat      NaN
dl_url   NaN
dtype: float64

In [11]:
# collect all lat lon points from dataframe

n = len(d.index)
locations = []
for i in range(n):
    point = Point(d.lon[i],d.lat[i])
    locations.append(point)

In [12]:
# crucial step: build spatial index

from rtree import index
idx = index.Index()
count = -1
for item in cpa_polygons.Polygon:
    count +=1
    idx.insert(count, item.bounds)

In [13]:
# assign a cpa to each point

m = len(locations)
hoods = []
for i in range(m):
    tmp = 'nan'
    for j in idx.intersection((d.lon[i],d.lat[i])):
        if locations[i].within(cpa_polygons.Polygon.loc[j]):
            tmp = cpa_polygons.CPA[j]
            break
    hoods.append(tmp)

In [14]:
d['CPA'] = hoods

In [15]:
d.CPA.value_counts()

DOWNTOWN                               66566
nan                                    58329
BALBOA PARK                            30520
MISSION BAY PARK                       13553
LA JOLLA                                8607
UNIVERSITY                              8405
SAN PASQUAL                             7336
UPTOWN                                  7204
MISSION VALLEY                          5293
PENINSULA                               5224
TIJUANA RIVER VALLEY                    3344
MIRA MESA                               3327
CLAIREMONT MESA                         3047
MILITARY FACILITIES                     3036
OTAY MESA-NESTOR                        2870
PACIFIC BEACH                           2712
OCEAN BEACH                             2368
OLD TOWN SAN DIEGO                      2364
GREATER NORTH PARK                      2277
KEARNY MESA                             1647
RESERVE                                 1584
MIDWAY-PACIFIC HIGHWAY                  1536
NAVAJO    

In [16]:
d.to_csv('/Users/damoncrockett/Desktop/FISP/Flickr/SD_CPA.csv', index=False, encoding='utf-8')