In [152]:
# import libraries which needed
import pandas as pd
import numpy as np
import time

In [154]:
from scipy.spatial.distance import cdist

In [156]:
# Load the dataset
dat = pd.read_excel("clinics.xls")

In [158]:
#1. Tabulate the execution times of each of the individual approaches for computing distance in Python (i.e., run the shared code on your computer, note the times, and tabulate them).

In [160]:
def haversine(lat1, lon1, lat2, lon2):
    miles_constant = 3959
    lat1, lon1, lat2, lon2 = map(np.deg2rad, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1 
    dlon = lon2 - lon1 
    
    a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    c = 2 * np.arcsin(np.sqrt(a)) 
    mi = miles_constant * c
    return mi

In [164]:
%%timeit

# Haversine applied on rows via the iteration
haversine_seriess = []
for index, row in df.iterrows():
    haversine_seriess.append(haversine(40.671, -73.985,\
                                      row['locLat'], row['locLong']))
df['distance'] = haversine_seriess

794 μs ± 15 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [166]:
%timeit df['distance'] =\
df.apply(lambda row: haversine(40.671, -73.985,\
                               row['locLat'], row['locLong']), axis=1)

468 μs ± 16.3 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [167]:
# Haversine applied on the rows
!pip install line_profiler
%load_ext line_profiler
%lprun -f haversine "df.apply(lambda row: haversine(40.671, -73.985, row['locLat'], row['locLong']), axis=1)"

The line_profiler extension is already loaded. To reload it, use:
  %reload_ext line_profiler


Timer unit: 1e-07 s

Total time: 0 s
File: C:\Users\rathn\AppData\Local\Temp\ipykernel_23188\1170580481.py
Function: haversine at line 1

Line #      Hits         Time  Per Hit   % Time  Line Contents
     1                                           def haversine(lat1, lon1, lat2, lon2):
     2                                               miles_constant = 3959
     3                                               lat1, lon1, lat2, lon2 = map(np.deg2rad, [lat1, lon1, lat2, lon2])
     4                                               dlat = lat2 - lat1 
     5                                               dlon = lon2 - lon1 
     6                                               
     7                                               a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
     8                                               c = 2 * np.arcsin(np.sqrt(a)) 
     9                                               mi = miles_constant * c
    10                        

In [170]:
# Vectorizedd implementation of Haversine Applied on Pandas series.
%timeit df['distance'] = haversine(40.671, -73.985,\
                                   df['locLat'], df['locLong'])

519 μs ± 15.5 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [172]:
# Vectorized implementation ,.
%lprun -f haversine "haversine(40.671, -73.985, df['locLat'], df['locLong'])"

Timer unit: 1e-07 s

Total time: 0 s
File: C:\Users\rathn\AppData\Local\Temp\ipykernel_23188\1170580481.py
Function: haversine at line 1

Line #      Hits         Time  Per Hit   % Time  Line Contents
     1                                           def haversine(lat1, lon1, lat2, lon2):
     2                                               miles_constant = 3959
     3                                               lat1, lon1, lat2, lon2 = map(np.deg2rad, [lat1, lon1, lat2, lon2])
     4                                               dlat = lat2 - lat1 
     5                                               dlon = lon2 - lon1 
     6                                               
     7                                               a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
     8                                               c = 2 * np.arcsin(np.sqrt(a)) 
     9                                               mi = miles_constant * c
    10                        

In [176]:
# Vectorized implementation of The Haversine applied on NumPy Arrays..

%timeit df['distance'] = haversine(40.671, -73.985,\
                         df['locLat'].values, df['locLong'].values)

48.4 μs ± 1.47 μs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [178]:
%%timeit

# Convert pandas arrays to NumPy ndarrays.
np_lati = df['locLat'].values
np_long = df['locLong'].values

3.39 μs ± 46 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)


In [180]:
%lprun -f haversine "df['distance'] = haversine(40.671, -73.985, df['locLat'].values, df['locLong'].values)"

Timer unit: 1e-07 s

Total time: 0 s
File: C:\Users\rathn\AppData\Local\Temp\ipykernel_23188\1170580481.py
Function: haversine at line 1

Line #      Hits         Time  Per Hit   % Time  Line Contents
     1                                           def haversine(lat1, lon1, lat2, lon2):
     2                                               miles_constant = 3959
     3                                               lat1, lon1, lat2, lon2 = map(np.deg2rad, [lat1, lon1, lat2, lon2])
     4                                               dlat = lat2 - lat1 
     5                                               dlon = lon2 - lon1 
     6                                               
     7                                               a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
     8                                               c = 2 * np.arcsin(np.sqrt(a)) 
     9                                               mi = miles_constant * c
    10                        

In [182]:
!pip install cython

%load_ext cython

The cython extension is already loaded. To reload it, use:
  %reload_ext cython


In [186]:
%%cython -a

# Haversine cythonized.
import numpy as np
cpdef haversine_cy(lat1, lon1, lat2, lon2):
    miles_constant = 3959
    lat1, lon1, lat2, lon2 = map(np.deg2rad, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1 
    dlon = lon2 - lon1 

    a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    c = 2 * np.arcsin(np.sqrt(a)) 
    mile = miles_constant * c
    return mile

Content of stdout:
_cython_magic_7303c6f1b5b9d4754cff845aabd5de16b1748f67.c
   Creating library C:\Users\rathn\.ipython\cython\Users\rathn\.ipython\cython\_cython_magic_7303c6f1b5b9d4754cff845aabd5de16b1748f67.cp312-win_amd64.lib and object C:\Users\rathn\.ipython\cython\Users\rathn\.ipython\cython\_cython_magic_7303c6f1b5b9d4754cff845aabd5de16b1748f67.cp312-win_amd64.exp
Generating code
Finished generating code

In [188]:
%timeit df['distance'] =\
       df.apply(lambda row: haversine_cy(40.671, -73.985,\
                row['locLat'], row['locLong']), axis=1)

467 μs ± 25.1 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [190]:
%%cython -a
# Haversine cythonized
from libc.math cimport sin, cos, acos, asin, sqrt

cdef deg2rad_cy(float deg):
    cdef float rad
    rad = 0.01745329252*deg
    return rad
    
cpdef haversine_cy_dtyped(float lat1, float lon1, float lat2, float lon2):
    cdef: 
        float dlon
        float dlat
        float a
        float c
        float mi
    
    lat1, lon1, lat2, lon2 = map(deg2rad_cy, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1 
    dlon = lon2 - lon1 
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * asin(sqrt(a)) 
    mil = 3959 * c
    return mil

Content of stdout:
_cython_magic_b4dc6073b86c252ee2026de6792e7d3b60634678.c
   Creating library C:\Users\rathn\.ipython\cython\Users\rathn\.ipython\cython\_cython_magic_b4dc6073b86c252ee2026de6792e7d3b60634678.cp312-win_amd64.lib and object C:\Users\rathn\.ipython\cython\Users\rathn\.ipython\cython\_cython_magic_b4dc6073b86c252ee2026de6792e7d3b60634678.cp312-win_amd64.exp
Generating code
Finished generating code

In [192]:
%timeit df['distance'] =\
df.apply(lambda row: haversine_cy_dtyped(40.671, -73.985,\
                              row['locLat'], row['locLong']), axis=1)

281 μs ± 11.5 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
