In [1]:
#%%% imports
import cProfile
import time

import numpy as np
import pandas as pd

In [3]:
#%% read in the data
# Load the clinics workbook into a DataFrame and show a plain-text preview.
# NOTE(review): the path is absolute (looks like a Colab upload location) --
# confirm it exists in the target environment before running.
df = pd.read_excel(io="/content/clinics.xls")
print(df.head(n=5))

   bizID   bizCat                  bizCatSub  \
0      1  Clinics                    Clinics   
1      2  Clinics                    Clinics   
2      3  Clinics  Clinics & Medical Centers   
3      4  Clinics  Clinics & Medical Centers   
4      5  Clinics  Clinics & Medical Centers   

                                     bizName  \
0                           Hino Ronald H MD   
1                         Farmer Joesph F Md   
2                             Najjar Fadi Md   
3  Kittson Memorial Upper Level Nursing Home   
4                       Thompson Robert B Md   

                             bizAddr      bizCity bizState  bizZip  \
0  98-151 Pali Momi Street Suite 142         Aiea       HI   96701   
1            1225 Breckenridge Drive  Little Rock       AR   72205   
2     1155 West Linda Avenue Suite B    Hermiston       OR   97838   
3            1010 South Birch Avenue      Hallock       MN   56728   
4        100 North Eagle Creek Drive    Lexington       KY   40509   

 

In [4]:
#%% define the distance computation function
# Define a basic Haversine distance formula
def haversine(lat1, lon1, lat2, lon2):
    """Return the great-circle distance in miles between two points.

    Applies the haversine formula on a sphere of radius 3959 miles. Every
    argument is passed straight to NumPy ufuncs, so scalars, NumPy arrays
    and pandas Series are all accepted and broadcast against each other.

    Parameters
    ----------
    lat1, lon1 : scalar or array-like
        Latitude and longitude of the first point(s), in decimal degrees.
    lat2, lon2 : scalar or array-like
        Latitude and longitude of the second point(s), in decimal degrees.

    Returns
    -------
    scalar or array-like
        Distance(s) in miles, shaped like the broadcast inputs.
    """
    MILES = 3959  # sphere radius in miles (approximately Earth's mean radius)
    lat1, lon1, lat2, lon2 = map(np.deg2rad, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # (near-)antipodal points; arcsin(sqrt(a)) would then return NaN.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arcsin(np.sqrt(a))
    total_miles = MILES * c
    return total_miles

In [6]:
#%% define a function to compute distance, using a for loop
# Define a function to manually loop over all rows and return a series of distances
def haversine_looping(df):
    """Compute the distance for every row of ``df`` with an explicit Python loop.

    Each row is fetched by position with ``df.iloc`` and its ``locLat`` /
    ``locLong`` values are passed to ``haversine`` together with the fixed
    origin (40.671, -73.985).

    Returns a list with one distance per row, in row order.
    """
    origin_lat, origin_lon = 40.671, -73.985
    return [
        haversine(origin_lat, origin_lon, df.iloc[pos]['locLat'], df.iloc[pos]['locLong'])
        for pos in range(len(df))
    ]
# Profile the loop-based baseline. The profiled statement also stores the
# resulting list in df['distance'].
cProfile.run("df['distance'] = haversine_looping(df)")

         10114 function calls (9926 primitive calls) in 0.014 seconds

   Ordered by: standard name

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.000    0.000    0.000    0.000 <frozen _collections_abc>:262(__subclasshook__)
        1    0.000    0.000    0.000    0.000 <frozen _collections_abc>:78(_check_methods)
        1    0.000    0.000    0.000    0.000 <frozen abc>:117(__instancecheck__)
        1    0.000    0.000    0.000    0.000 <frozen abc>:121(__subclasscheck__)
       30    0.001    0.000    0.001    0.000 <ipython-input-4-a33fdf68bf52>:3(haversine)
        1    0.000    0.000    0.012    0.012 <ipython-input-6-546d603f3255>:3(haversine_looping)
        1    0.000    0.000    0.014    0.014 <string>:1(<module>)
        1    0.000    0.000    0.001    0.001 __init__.py:225(compile)
        1    0.000    0.000    0.001    0.001 __init__.py:272(_compile)
        3    0.000    0.000    0.000    0.000 __init__.py:34(using_copy_on_write

In [7]:
#%%% iterate over rows with iterrows (still row-by-row, not yet vectorized)
# Haversine applied on rows via iteration
# Row-wise baseline: iterrows() yields one (index, row) pair per record, so
# haversine is still called once per row.
# `time` is imported in the notebook's import cell; perf_counter() is the
# clock intended for timing short intervals.
start = time.perf_counter()
haversine_series = []
for index, row in df.iterrows():
    haversine_series.append(haversine(40.671, -73.985, row['locLat'], row['locLong']))
df['distance'] = haversine_series
print("Execution Time: ", time.perf_counter() - start)

Execution Time:  0.0038170814514160156


In [8]:
#%%% Optimize further

# Time DataFrame.apply: with axis=1 each row is passed to the lambda, so
# haversine is still invoked once per row.
# perf_counter() is the clock intended for timing short intervals.
start = time.perf_counter()
df['distance'] = df.apply(lambda row: haversine(40.671, -73.985, row['locLat'], row['locLong']), axis=1)
print("Execution Time: ", time.perf_counter() - start)

Execution Time:  0.003109455108642578


In [9]:
# Vectorized Haversine on pandas Series: haversine is called once with whole
# columns and the NumPy ufuncs inside it operate element-wise.
# perf_counter() is the clock intended for timing short intervals.
start = time.perf_counter()
df['distance'] = haversine(40.671, -73.985, df['locLat'], df['locLong'])
print("Execution Time: ", time.perf_counter() - start)

Execution Time:  0.0020990371704101562


In [10]:
# Vectorized Haversine on NumPy arrays: `.values` passes the columns'
# underlying arrays to haversine instead of Series objects.
# perf_counter() is the clock intended for timing short intervals.
start = time.perf_counter()
df['distance'] = haversine(40.671, -73.985, df['locLat'].values, df['locLong'].values)
print("Execution Time: ", time.perf_counter() - start)

# Profile the same array-based call for comparison with the loop profile above.
cProfile.run("df['distance'] = haversine(40.671, -73.985, df['locLat'].values, df['locLong'].values)")

Execution Time:  0.0005898475646972656
         231 function calls (225 primitive calls) in 0.001 seconds

   Ordered by: standard name

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.000    0.000    0.000    0.000 <ipython-input-4-a33fdf68bf52>:3(haversine)
        1    0.000    0.000    0.001    0.001 <string>:1(<module>)
        9    0.000    0.000    0.000    0.000 __init__.py:34(using_copy_on_write)
        3    0.000    0.000    0.000    0.000 __init__.py:42(warn_copy_on_write)
        3    0.000    0.000    0.000    0.000 base.py:3777(get_loc)
        3    0.000    0.000    0.000    0.000 base.py:5323(__contains__)
        4    0.000    0.000    0.000    0.000 base.py:5373(__getitem__)
        3    0.000    0.000    0.000    0.000 base.py:6672(_maybe_cast_indexer)
        1    0.000    0.000    0.000    0.000 base.py:82(shape)
        3    0.000    0.000    0.000    0.000 base.py:84(<genexpr>)
        1    0.000    0.000    0.000    0.000 