### Problem statement

    - Which store has the farthest distance to its nearest neighboring store?
    - What % of stores are within 1km, 3km and 5km of another store?
    - Which store has the most other stores within 10km?

In [41]:
# load libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
# load data
df = pd.read_csv('store_locations.csv')

In [3]:
# sample
df.sample(5)

Unnamed: 0,Store,lon,lat
150,2826,-0.12554,51.5182
187,551,-0.035699,51.3064
199,1662,-0.718167,52.0512
19,369,-7.64188,54.4021
80,1992,-5.75129,56.458


In [4]:
# describe
df.describe()

Unnamed: 0,Store,lon,lat
count,300.0,300.0,300.0
mean,1394.86,-2.124941,53.008898
std,872.449717,1.952026,1.916977
min,6.0,-7.87909,49.9556
25%,612.0,-3.293575,51.494975
50%,1308.0,-1.97174,52.4544
75%,2120.0,-0.33213,54.080725
max,2940.0,1.30192,57.9084


In [5]:
# sample solution - finding nearest latitude-longitude 

from math import cos, asin, sqrt

def distance(lat1, lon1, lat2, lon2):
    p = 0.017453292519943295
    a = 0.5 - cos((lat2-lat1)*p)/2 + cos(lat1*p)*cos(lat2*p) * (1-cos((lon2-lon1)*p)) / 2
    return 12742 * asin(sqrt(a))

def closest(data, v):
    return min(data, key=lambda p: distance(v['lat'],v['lon'],p['lat'],p['lon']))

tempDataList = [{'lat': 39.7612992, 'lon': -86.1519681}, 
                {'lat': 39.762241,  'lon': -86.158436 }, 
                {'lat': 39.7622292, 'lon': -86.1578917}]

v = {'lat': 39.7622290, 'lon': -86.1519750}
print(closest(tempDataList, v))

{'lat': 39.7612992, 'lon': -86.1519681}


In [6]:
# adding point column to df
df['point'] = [(x, y) for x,y in zip(df['lat'], df['lon'])]

In [39]:
df.tail(7)

Unnamed: 0,Store,lon,lat,point
293,2565,-2.89261,52.897,"(52.897, -2.89261)"
294,1230,-3.8668,57.5185,"(57.5185, -3.8668)"
295,1089,-0.28735,51.5607,"(51.5607, -0.28735)"
296,538,0.861843,51.8544,"(51.8544, 0.861843)"
297,973,-4.25407,55.8459,"(55.8459, -4.25407)"
298,1066,-0.61772,51.1864,"(51.1864, -0.6177199999999999)"
299,2203,-0.1333,51.1167,"(51.1167, -0.1333)"


In [8]:
from scipy.spatial.distance import cdist

In [13]:
def closest_point(point, points):
    """ Find closest point from a list of points. """
    return points[cdist([point], points).argmin()]

def match_value(df, col1, x, col2):
    """ Match value x from col1 row to value in col2. """
    return df[df[col1] == x][col2].values[0]


In [36]:
tl = list(df['point'])
tl[:5]

[(51.2797, -0.836673),
 (55.8214, -4.08744),
 (51.3476, -0.799592),
 (51.4664, -2.6021799999999997),
 (53.5689, -2.42275)]

In [46]:
# using knn

from sklearn.neighbors import NearestNeighbors

def distance(p1, p2):
    """
    Calculate the great circle distance between two points
    on the earth (specified in decimal degrees)
    """
    lon1, lat1 = p1
    lon2, lat2 = p2
    # convert decimal degrees to radians
    lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])
    # haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    c = 2 * np.arcsin(np.sqrt(a))
    km = 6367 * c
    return km

points = tl

nbrs = NearestNeighbors(n_neighbors=2, metric=distance).fit(points)

distances, indices = nbrs.kneighbors(points)

result = distances[:, 1]

In [47]:
len(result)

300

In [48]:
result

array([  8.59656755,   6.1049544 ,   8.59656755,   2.43362356,
         4.47198077,   5.48712867,   2.89998426,  17.52605129,
         2.6684596 ,   7.93533518,   2.31342682,  20.93197162,
         3.27043196,   1.45798444,  17.52605129,  50.24568219,
        31.66771248,  10.78946304,   1.48620414,  26.83390248,
        39.83600758,   7.44481885,  22.35199435,  15.33237904,
        11.07917243,  14.76446136,  13.28009768,  14.63925675,
        21.28456305,   7.77873533,   4.82121392,  44.53777362,
        22.36417796,  43.02586313,   4.50080855,  18.70944083,
         4.64084257,  43.3661976 ,   4.01026994,  36.09520344,
         3.62270254,  24.73676751,   1.98076628,   7.46031905,
         1.48620414,   3.40594056,  69.34099116,  42.18862094,
        12.88940191,  40.07025531,  30.09809845,  29.40559247,
         5.14404015,  10.78946304,   9.88272321,  11.26323347,
        45.54666684,  44.77682794,   4.04616361,  17.56663102,
         5.14404015,  11.55644564,  10.14200074,  21.96

In [57]:
tl = list(df['point'].values)
tl[:5]

tlx = []
for x in tl:
    tlx.append([x[0],x[1]])

tlx[:5]

[[51.2797, -0.836673],
 [55.8214, -4.08744],
 [51.3476, -0.799592],
 [51.4664, -2.6021799999999997],
 [53.5689, -2.42275]]

In [59]:
# simple approach

fl = []

# closest node
def closest_node(node, nodes):
    closest_index = cdist([node], nodes).argmin()
    return nodes[closest_index]

closest_node(tlx[0], tlx[1:])

[51.3476, -0.799592]