In [2]:
import pandas as pd
import numpy as np
import sys
import math
from ipywidgets import FloatProgress
from IPython.display import display

In [2]:
train_businesses = pd.read_csv("yelp_business.csv")

In [3]:
from sklearn.neighbors import BallTree
import numpy as np
RADIANT_TO_KM_CONSTANT = 6367
class BallTreeIndex:
    def __init__(self,lat_longs):
        self.lat_longs = np.radians(lat_longs)
        self.ball_tree_index = BallTree(self.lat_longs, metric='haversine')
        
    def query_radius(self,query):
        radius_km = 20.1168 # 12.5 miles to km
        radius_radian = radius_km / RADIANT_TO_KM_CONSTANT 
        query = np.radians(np.array([query]))
        indices = self.ball_tree_index.query_radius(query,r=radius_radian,count_only=True)     
        return indices

In [4]:
train_businesses.drop(train_businesses.columns.difference(['business_id','latitude','longitude','state']), 1, inplace=True)

In [5]:
train_businesses['lat_long'] = [[] for i in range(len(train_businesses))]

In [6]:
train_businesses.head()

Unnamed: 0,business_id,state,latitude,longitude,lat_long
0,FYWN1wneV18bWNgQjJ2GNg,AZ,33.33069,-111.978599,[]
1,He-G7vWjzVUysIKrfNbPUQ,PA,40.291685,-80.1049,[]
2,KQPW8lFf1y5BT2MxiSZ3QA,AZ,33.524903,-112.11531,[]
3,8DShNS-LuFqpEWIp0HxijA,AZ,33.383147,-111.964725,[]
4,PfOCPjBrlQAnz__NXj9h_w,OH,41.119535,-81.47569,[]


In [7]:
for index, row in train_businesses.iterrows():
    train_businesses.set_value(index, 'lat_long', [row['latitude'], row['longitude']])

In [8]:
train_businesses.drop(['latitude', 'longitude'], axis=1)

Unnamed: 0,business_id,state,lat_long
0,FYWN1wneV18bWNgQjJ2GNg,AZ,"[33.3306902, -111.9785992]"
1,He-G7vWjzVUysIKrfNbPUQ,PA,"[40.2916853, -80.1048999]"
2,KQPW8lFf1y5BT2MxiSZ3QA,AZ,"[33.5249025, -112.1153098]"
3,8DShNS-LuFqpEWIp0HxijA,AZ,"[33.3831468, -111.96472539999999]"
4,PfOCPjBrlQAnz__NXj9h_w,OH,"[41.1195346, -81.4756898]"
5,o9eMRCWt5PkpLDE0gOPtcQ,BW,"[48.7272, 9.14795]"
6,kCoE3jvEtg6UVz5SOD3GVw,NV,"[36.20743, -115.26846]"
7,OD2hnuuTJI9uotcKycxg1A,NV,"[36.1974844, -115.2496601]"
8,EsMcGiZaQuG1OOvL9iUFug,PA,"[40.6151022445, -80.0913487465]"
9,TGWhGNusxyMaA4kQVBNeew,NV,"[36.0558252127, -115.04635039]"


In [15]:
train_businesses['proximity'] = 0
train_businesses['proximity_percent'] = 0
train_businesses.head()

Unnamed: 0,business_id,state,latitude,longitude,lat_long,proximity,proximity_percent
0,FYWN1wneV18bWNgQjJ2GNg,AZ,33.33069,-111.978599,"[33.3306902, -111.9785992]",0,0
1,He-G7vWjzVUysIKrfNbPUQ,PA,40.291685,-80.1049,"[40.2916853, -80.1048999]",0,0
2,KQPW8lFf1y5BT2MxiSZ3QA,AZ,33.524903,-112.11531,"[33.5249025, -112.1153098]",0,0
3,8DShNS-LuFqpEWIp0HxijA,AZ,33.383147,-111.964725,"[33.3831468, -111.96472539999999]",0,0
4,PfOCPjBrlQAnz__NXj9h_w,OH,41.119535,-81.47569,"[41.1195346, -81.4756898]",0,0


In [10]:
dfs = []
for g, df in train_businesses.groupby(['state']):
    dfs.append(df)

In [12]:
print(len(dfs))

67


In [17]:
np.where(np.isnan(train_businesses['latitude']))

(array([136097], dtype=int64),)

In [19]:
print(train_businesses.ix[136097])

business_id          W1x0rlzGUrMBbK3Hq5bk2Q
state                                    ON
latitude                                NaN
longitude                               NaN
lat_long                         [nan, nan]
proximity                                 0
proximity_percent                         0
Name: 136097, dtype: object


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.


In [21]:
f = FloatProgress(min=0, max=len(dfs))
display(f)

for df in dfs:
    state_count = len(df)
    btree = BallTreeIndex(df['lat_long'].tolist())
    
    for index, row in df.iterrows():  
        try:
            nearby_count = btree.query_radius(row['lat_long'])
            train_businesses.set_value(index, 'proximity', nearby_count)
            train_businesses.set_value(index, 'proximity_percent', nearby_count/state_count * 100)
        except ValueError:
            print("Found business {} with invalid lat_long {}".format(index, row['lat_long']))
            pass
    
    f.value += 1

A Jupyter Widget

Found business 136097 with invalid lat_long [nan, nan]


In [24]:
train_businesses.ix[6]

business_id          kCoE3jvEtg6UVz5SOD3GVw
state                                    NV
latitude                            36.2074
longitude                          -115.268
lat_long             [36.20743, -115.26846]
proximity                             25406
proximity_percent                        76
Name: 6, dtype: object

In [25]:
train_businesses.head()

Unnamed: 0,business_id,state,latitude,longitude,lat_long,proximity,proximity_percent
0,FYWN1wneV18bWNgQjJ2GNg,AZ,33.33069,-111.978599,"[33.3306902, -111.9785992]",21976,42
1,He-G7vWjzVUysIKrfNbPUQ,PA,40.291685,-80.1049,"[40.2916853, -80.1048999]",4201,41
2,KQPW8lFf1y5BT2MxiSZ3QA,AZ,33.524903,-112.11531,"[33.5249025, -112.1153098]",25498,48
3,8DShNS-LuFqpEWIp0HxijA,AZ,33.383147,-111.964725,"[33.3831468, -111.96472539999999]",26003,49
4,PfOCPjBrlQAnz__NXj9h_w,OH,41.119535,-81.47569,"[41.1195346, -81.4756898]",1427,11


In [26]:
train_businesses.drop(['latitude', 'longitude', 'lat_long'], axis=1)

Unnamed: 0,business_id,state,proximity,proximity_percent
0,FYWN1wneV18bWNgQjJ2GNg,AZ,21976,42
1,He-G7vWjzVUysIKrfNbPUQ,PA,4201,41
2,KQPW8lFf1y5BT2MxiSZ3QA,AZ,25498,48
3,8DShNS-LuFqpEWIp0HxijA,AZ,26003,49
4,PfOCPjBrlQAnz__NXj9h_w,OH,1427,11
5,o9eMRCWt5PkpLDE0gOPtcQ,BW,3065,98
6,kCoE3jvEtg6UVz5SOD3GVw,NV,25406,76
7,OD2hnuuTJI9uotcKycxg1A,NV,26586,80
8,EsMcGiZaQuG1OOvL9iUFug,PA,2819,27
9,TGWhGNusxyMaA4kQVBNeew,NV,23025,69


In [27]:
train_businesses.to_csv('./businesses_proximity.csv', index=False)

In [6]:
businesses = pd.read_csv('businesses_proximity.csv')

In [7]:
businesses.head()

Unnamed: 0,business_id,state,latitude,longitude,lat_long,proximity,proximity_percent
0,FYWN1wneV18bWNgQjJ2GNg,AZ,33.33069,-111.978599,"[33.3306902, -111.9785992]",21976,42
1,He-G7vWjzVUysIKrfNbPUQ,PA,40.291685,-80.1049,"[40.2916853, -80.1048999]",4201,41
2,KQPW8lFf1y5BT2MxiSZ3QA,AZ,33.524903,-112.11531,"[33.5249025, -112.1153098]",25498,48
3,8DShNS-LuFqpEWIp0HxijA,AZ,33.383147,-111.964725,"[33.3831468, -111.96472539999999]",26003,49
4,PfOCPjBrlQAnz__NXj9h_w,OH,41.119535,-81.47569,"[41.1195346, -81.4756898]",1427,11


In [8]:
np.count_nonzero(businesses['proximity'])

174565

In [9]:
len(businesses)

174567

In [8]:
import find_nearby
from multiprocessing import Process, Queue, Pool

In [9]:
output = Queue()

dfs = []
for g, df in train_businesses[0:8].groupby(np.arange(len(train_businesses[0:8])) // 4): # 14548
    print(df.head())
    print("df size {}".format(df.shape))
    dfs.append((df, output, train_businesses))

results = []
processes = []
for df in dfs:
    p = Process(target=find_nearby.find_nearby, args=df)
    processes.append(p)
    p.start()

for p in processes:
    result = output.get(p)
    results.append(result)
    print("\n result: \n{}".format(result))

              business_id state   latitude   longitude
0  FYWN1wneV18bWNgQjJ2GNg    AZ  33.330690 -111.978599
1  He-G7vWjzVUysIKrfNbPUQ    PA  40.291685  -80.104900
2  KQPW8lFf1y5BT2MxiSZ3QA    AZ  33.524903 -112.115310
3  8DShNS-LuFqpEWIp0HxijA    AZ  33.383147 -111.964725
df size (4, 4)
              business_id state   latitude  longitude
4  PfOCPjBrlQAnz__NXj9h_w    OH  41.119535  -81.47569
5  o9eMRCWt5PkpLDE0gOPtcQ    BW  48.727200    9.14795
6  kCoE3jvEtg6UVz5SOD3GVw    NV  36.207430 -115.26846
7  OD2hnuuTJI9uotcKycxg1A    NV  36.197484 -115.24966
df size (4, 4)

 result: 
              business_id state   latitude  longitude  proximity  \
4  PfOCPjBrlQAnz__NXj9h_w    OH  41.119535  -81.47569     1359.0   
5  o9eMRCWt5PkpLDE0gOPtcQ    BW  48.727200    9.14795     2983.0   
6  kCoE3jvEtg6UVz5SOD3GVw    NV  36.207430 -115.26846    23363.0   
7  OD2hnuuTJI9uotcKycxg1A    NV  36.197484 -115.24966    25037.0   

   proximity_percent  
4           0.107780  
5           0.956703  
6   