In [23]:
import numpy as np
import pandas as pd
import sklearn

In [93]:
%%time

data_path = '/Users/smmaurer/Dropbox/Data/Twitter/Westcoast-tweets-processed/'

tw = pd.read_hdf(data_path + 'westcoast-201510.h5', 'tweets') \
     .append(pd.read_hdf(data_path + 'westcoast-201511.h5', 'tweets')) \
     .append(pd.read_hdf(data_path + 'westcoast-201512.h5', 'tweets'))

CPU times: user 1min 9s, sys: 7.85 s, total: 1min 17s
Wall time: 1min 19s


In [5]:
print tw.describe()

                 id           lat           lng       user_id
count  2.061033e+06  2.061033e+06  2.061033e+06  2.061033e+06
mean   6.553610e+17  3.817756e+01 -1.195588e+02  7.251115e+08
std    3.227231e+15  5.348577e+00  3.506524e+00  1.029895e+09
min    6.497704e+17 -3.779492e+01 -1.300296e+02  2.200000e+01
25%    6.526076e+17  3.405220e+01 -1.223310e+02  3.412892e+07
50%    6.554068e+17  3.660024e+01 -1.185519e+02  1.847794e+08
75%    6.580685e+17  3.924222e+01 -1.178956e+02  9.803579e+08
max    6.609889e+17  6.047916e+01  1.512035e+02  4.095923e+09


In [94]:
# Limit to Bay Area

ba = tw.loc[(tw.lat > 36.8) & (tw.lat < 38.9) & 
            (tw.lng > -123.6) & (tw.lng < -121.2)].copy()

print ba.describe()

                 id           lat           lng       user_id
count  1.292488e+06  1.292488e+06  1.292488e+06  1.292488e+06
mean   6.661746e+17  3.779279e+01 -1.222459e+02  1.007188e+09
std    9.553609e+15  3.797599e-01  4.677256e-01  1.320650e+09
min    6.497705e+17  3.680020e+01 -1.235879e+02  2.200000e+01
25%    6.578715e+17  3.761561e+01 -1.224353e+02  2.582908e+07
50%    6.660134e+17  3.776159e+01 -1.222741e+02  1.948950e+08
75%    6.743473e+17  3.783363e+01 -1.219552e+02  2.251424e+09
max    6.830747e+17  3.889966e+01 -1.212000e+02  4.691486e+09


In [95]:
# Scale data to kilometers (approx)

ba['x'] = (ba.lng - ba.lng.mean()) * 89.7
ba['y'] = (ba.lat - ba.lat.mean()) * 112.7

print ba.loc[:,['x','y']].describe()

                  x             y
count  1.292488e+06  1.292488e+06
mean  -5.212992e-12 -1.068724e-12
std    4.195499e+01  4.279894e+01
min   -1.203796e+02 -1.118638e+02
25%   -1.698765e+01 -1.996787e+01
50%   -2.534975e+00 -3.515573e+00
75%    2.606981e+01  4.602996e+00
max    9.381319e+01  1.247445e+02


-122.226987699


In [None]:
'''
Generate quick and dirty clusters of similar spatial extent,
to get a sense of where the top activity destinations are

'''

In [104]:
%%time

k = 500

mbkm = sklearn.cluster.MiniBatchKMeans(n_clusters=k, batch_size=5000, n_init=20)
mbkm.fit(ba.loc[:,['x','y']])

CPU times: user 1min 31s, sys: 2.8 s, total: 1min 34s
Wall time: 23.7 s


In [105]:
# Save cluster assignments and centroids

ba['label'] = mbkm.labels_

clusters = pd.DataFrame(mbkm.cluster_centers_)
clusters.columns = ['x','y']

clusters['lng'] = (clusters.x / 89.7) + ba.lng.mean()
clusters['lat'] = (clusters.y / 112.7) + ba.lat.mean()

print clusters.describe()

                x           y         lng         lat
count  500.000000  500.000000  500.000000  500.000000
mean     7.175216    1.804841 -122.165878   37.808800
std     39.637224   44.774778    0.441887    0.397292
min    -68.741284 -100.228287 -123.012215   36.903448
25%    -15.567125  -25.147111 -122.419415   37.569652
50%      0.512130   -3.101525 -122.240160   37.765265
75%     29.441204   13.042138 -121.917650   37.908510
max     91.364741  118.191708 -121.227310   38.841514


In [106]:
# Add point counts

clusters['n'] = ba.label.value_counts(sort=False)

In [107]:
# Calculate distance from each point to its cluster centroid,
# and report max radius for each cluster

_d = ba.loc[:,['x','y','label']]  # columns needed from data table
_k = clusters.loc[:,['x','y']]  # columns needed from cluster table
_k.columns = ['k_x','k_y']

_d = _d.merge(_k, left_on='label', right_index=True)

_points = _d[['x','y']].as_matrix()
_centroids = _d[['k_x','k_y']].as_matrix()

_d['distance'] = np.sqrt(np.square(_points - _centroids).sum(axis=1))

clusters['radius'] = _d[['label','distance']].groupby('label').max()

In [108]:
print clusters[['n','radius']].sort_values(by='n', ascending=False)

           n     radius
387  46643.0   0.068138
445  42910.0   0.071662
3    38037.0   0.040008
294  34891.0   0.061898
291  34533.0   0.053341
273  30366.0   0.506923
359  28973.0   0.055152
43   26618.0   4.258852
398  22938.0   0.647672
331  20537.0   0.310242
90   14899.0   1.543123
163  13523.0   0.661175
50   11790.0   0.559472
124  10980.0   3.756637
33   10664.0   0.427373
40    9561.0  12.449860
216   9298.0  11.138663
417   9063.0   0.847600
424   8722.0   1.472339
74    8360.0   0.481726
73    8138.0   7.053729
83    8062.0   9.340627
10    7394.0   5.931216
102   7077.0  15.969930
128   7057.0   5.617037
430   6992.0  71.138104
172   6495.0  14.066700
85    6472.0   2.008422
489   6339.0  22.667013
112   6207.0   6.159380
..       ...        ...
150      NaN        NaN
158      NaN        NaN
161      NaN        NaN
178      NaN        NaN
197      NaN        NaN
226      NaN        NaN
252      NaN        NaN
255      NaN        NaN
257      NaN        NaN
268      NaN    

In [103]:
# Save the centroids for visualization

out = clusters[['lng','lat','n']]

out.to_csv('clusters_all.csv')
out.loc[out.n > 5000].to_csv('clusters_top.csv')