In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load the raw check-in records into a DataFrame.
data = pd.read_csv('checkins.csv')

In [3]:
# Summarise the columns, their dtypes and non-null counts.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 396634 entries, 0 to 396633
Data columns (total 5 columns):
user_id       396634 non-null int64
venue_id      396634 non-null int64
latitude      396634 non-null float64
longitude     396634 non-null float64
created_at    396634 non-null object
dtypes: float64(2), int64(2), object(1)
memory usage: 15.1+ MB


In [4]:
# Keep only the coordinate columns. Selecting them (instead of dropping the
# other columns in place) makes this cell safe to re-run: an in-place drop
# raises KeyError the second time because the columns are already gone.
# .copy() gives an independent frame so a later column assignment on `data`
# does not act on a view of the original.
data = data[['latitude', 'longitude']].copy()
data.head()

Unnamed: 0,latitude,longitude
0,38.895112,-77.036366
1,33.800745,-84.41052
2,45.523452,-122.676207
3,40.764462,-111.904565
4,33.448377,-112.074037


In [5]:
from sklearn.cluster import MeanShift

In [6]:
# Cluster on the coordinate columns only. A later cell adds a `label` column
# to `data`; selecting the features explicitly keeps a re-run of this cell
# from feeding those labels back into the clustering as a third feature.
clustering = MeanShift(bandwidth=0.1).fit(data[['latitude', 'longitude']])

In [36]:
# Attach to every row the cluster label assigned by the fitted model.
data['label'] = clustering.labels_
data.head()

Unnamed: 0,latitude,longitude,label
0,38.895112,-77.036366,4
1,33.800745,-84.41052,7
2,45.523452,-122.676207,29
3,40.764462,-111.904565,92
4,33.448377,-112.074037,1


In [8]:
# Distinct cluster labels and the number of rows assigned to each of them.
unique, counts = np.unique(clustering.labels_, return_counts=True)
# Labels of the clusters that contain at least 15 rows.
unique[counts >= 15]

array([   0,    1,    2, ..., 3588, 4013, 4816])

In [9]:
# Rows that belong to a cluster with at least 15 members.
big_enough = counts >= 15
result = data.loc[data['label'].isin(unique[big_enough])]

In [10]:
from scipy.spatial.distance import euclidean

In [12]:
# Centre coordinates of the clusters found by the fitted MeanShift model.
cluster_centers = clustering.cluster_centers_
cluster_centers

array([[  40.71748459,  -73.98935883],
       [  33.44969472, -112.0025631 ],
       [  41.87816455,  -87.6298271 ],
       ...,
       [ -41.8675416 ,  147.9641979 ],
       [ -43.3659797 ,  170.192856  ],
       [ -45.0311622 ,  168.6626435 ]])

In [13]:
# Office locations, one (latitude, longitude) row per office, in the same
# column order as the cluster centres they are compared against.
office_coordinates = np.array([
    (33.751277, -118.188740),
    (25.867736, -80.324116),
    (51.503016, -0.075479),
    (52.378894, 4.885084),
    (39.366487, 117.036146),
    (-33.868457, 151.205134),
])
office_coordinates

array([[ 3.37512770e+01, -1.18188740e+02],
       [ 2.58677360e+01, -8.03241160e+01],
       [ 5.15030160e+01, -7.54790000e-02],
       [ 5.23788940e+01,  4.88508400e+00],
       [ 3.93664870e+01,  1.17036146e+02],
       [-3.38684570e+01,  1.51205134e+02]])

In [14]:
# Only clusters with at least 15 rows are candidates. Without this filter the
# size check computed earlier is ignored and a tiny cluster could be picked.
# `unique`/`counts` come from np.unique over the MeanShift labels; each label
# is the row index of its centre in `cluster_centers`.
popular_clusters = unique[counts >= 15]

# offices[i] holds (cluster_idx, distance) pairs from office i to every
# candidate cluster centre. Distance is plain Euclidean in degree space, which
# only approximates geographic distance.
offices = []
for office in office_coordinates:
    distances = [
        (int(cluster_idx), euclidean(office, cluster_centers[cluster_idx]))
        for cluster_idx in popular_clusters
    ]
    offices.append(distances)

In [21]:
# Closest (cluster_idx, distance) pair across all offices. A single min() pass
# replaces sorting every office's full distance list just to read its first
# element; ties still resolve to the earliest pair in office order.
answer = min(
    (pair for office in offices for pair in office),
    key=lambda pair: pair[1],
)

In [22]:
# Save the winning cluster centre as space-separated coordinates.
with open('answer.txt', 'w') as answer_file:
    winning_center = cluster_centers[answer[0]]
    answer_file.write(' '.join(str(coordinate) for coordinate in winning_center))