-
Notifications
You must be signed in to change notification settings - Fork 491
/
linkedin__geocode.py
111 lines (80 loc) · 3.44 KB
/
linkedin__geocode.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
# -*- coding: utf-8 -*-
import os
import sys
import cPickle
from urllib2 import HTTPError
from geopy import geocoders
from cluster import KMeansClustering, centroid
# A very uninteresting helper function to build up an XML tree
from linkedin__kml_utility import createKML
K = int(sys.argv[1])
# Use your own API key here if you use a geocoding service
# such as Google or Yahoo!
CONNECTIONS_DATA = sys.argv[2]
if len(sys.argv) > 3:
GEOCODING_API_KEY = sys.argv[3]
OUT = "clusters.kmeans.kml"
# Open up your saved connections with extended profile information
extended_connections = cPickle.load(open(CONNECTIONS_DATA))
locations = [ec['location']['name'] for ec in extended_connections]
g = geocoders.GoogleV3()
# Some basic transforms may be necessary for geocoding services to function properly
# Here are a few examples that seem to cause problems for Yahoo. You'll probably need
# to add your own.
transforms = [('Greater ', ''), (' Area', ''), ('San Francisco Bay',
'San Francisco')]
# Tally the frequency of each location
coords_freqs = {}
for location in locations:
# Avoid unnecessary I/O
if coords_freqs.has_key(location):
coords_freqs[location][1] += 1
continue
transformed_location = location
for transform in transforms:
transformed_location = transformed_location.replace(*transform)
results = []
while True:
num_errors = 0
try:
# This call returns a generator
results = g.geocode(transformed_location, exactly_one=False)
break
except HTTPError, e:
num_errors += 1
if num_errors >= 3:
sys.exit()
print >> sys.stderr, e
print >> sys.stderr, 'Encountered an urllib2 error. Trying again...'
for result in results:
# Each result is of the form ("Description", (X,Y))
coords_freqs[location] = [result[1], 1]
break
# Here, you could optionally segment locations by continent
# country so as to avoid potentially finding a mean in the middle of the ocean
# The k-means algorithm will expect distinct points for each contact so build out
# an expanded list to pass it
expanded_coords = []
for label in coords_freqs:
((lat, lon), f) = coords_freqs[label]
expanded_coords.append((label, [(lon, lat)] * f)) # Flip lat/lon for Google Earth
# No need to clutter the map with unnecessary placemarks...
kml_items = [{'label': label, 'coords': '%s,%s' % coords[0]} for (label,
coords) in expanded_coords]
# It could also be interesting to include names of your contacts on the map for display
for item in kml_items:
item['contacts'] = '\n'.join(['%s %s.' % (ec.first_name, ec.last_name[0])
for ec in extended_connections if ec['location']
== item['label']])
cl = KMeansClustering([coords for (label, coords_list) in expanded_coords
for coords in coords_list])
centroids = [{'label': 'CENTROID', 'coords': '%s,%s' % centroid(c)} for c in
cl.getclusters(K)]
kml_items.extend(centroids)
kml = createKML(kml_items)
if not os.path.isdir('out'):
os.mkdir('out')
f = open("out/" + OUT, 'w')
f.write(kml)
f.close()
print >> sys.stderr, 'Data pickled to out/' + OUT