# My version of shared notes

In [None]:
!pip install line_profiler

Collecting line_profiler
  Downloading line_profiler-4.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (34 kB)
Downloading line_profiler-4.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (750 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m750.2/750.2 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: line_profiler
Successfully installed line_profiler-4.2.0


In [14]:
# Import necessary libraries
import numpy as np
import pandas as pd
import time
from math import radians, sin, cos, sqrt, atan2

# Load the dataset ("clinic.csv" is a relative path, resolved against the kernel's working directory)
clinic_data = pd.read_csv("clinic.csv")

In [12]:
clinic_data.columns

Index(['bizID', 'bizCat', 'bizCatSub', 'bizName', 'bizAddr', 'bizCity',
       'bizState', 'bizZip', 'bizPhone', 'bizFax', 'bizEmail', 'bizURL',
       'locAreaCode', 'locFIPS', 'locTimeZone', 'locDST', 'locLat', 'locLong',
       'locMSA', 'locPMSA', 'locCounty'],
      dtype='object')

In [15]:
clinic_data.head()

Unnamed: 0,bizID,bizCat,bizCatSub,bizName,bizAddr,bizCity,bizState,bizZip,bizPhone,bizFax,...,bizURL,locAreaCode,locFIPS,locTimeZone,locDST,locLat,locLong,locMSA,locPMSA,locCounty
0,1,Clinics,Clinics,Hino Ronald H MD,98-151 Pali Momi Street Suite 142,Aiea,HI,96701,(808)487-2477,,...,,808,15003,PST-2,N,21.398,-157.8981,3320.0,,Honolulu
1,2,Clinics,Clinics,Farmer Joesph F Md,1225 Breckenridge Drive,Little Rock,AR,72205,(501)225-2594,,...,,501,5119,CST,Y,34.7495,-92.3533,4400.0,,Pulaski
2,3,Clinics,Clinics & Medical Centers,Najjar Fadi Md,1155 West Linda Avenue Suite B,Hermiston,OR,97838,(541)289-1122,,...,,541,41059,PST,Y,45.8456,-119.2817,,,Umatilla
3,4,Clinics,Clinics & Medical Centers,Kittson Memorial Upper Level Nursing Home,1010 South Birch Avenue,Hallock,MN,56728,(218)843-2525,,...,,218,27069,CST,Y,48.7954,-97.009,,,Kittson
4,5,Clinics,Clinics & Medical Centers,Thompson Robert B Md,100 North Eagle Creek Drive,Lexington,KY,40509,(859)258-4000,,...,www.lexingtonclinic.com,859,21067,EST,Y,37.9935,-84.3712,4280.0,,Fayette


In [16]:
clinic_data['locLong'].head()

Unnamed: 0,locLong
0,-157.8981
1,-92.3533
2,-119.2817
3,-97.009
4,-84.3712


In [17]:
# Keep only the two coordinate columns and discard rows where either value is missing.
# Note: this rebinds `clinic_data`, so every later cell sees just these columns.
clinic_data = clinic_data.loc[:, ['locLat', 'locLong']].dropna()
coords = clinic_data.values  # [Modified]  2-D NumPy array: column 0 = locLat, column 1 = locLong

# Haversine great-circle distance; the result is in miles (the angle is scaled by 3959)
def haversine(lat1, lon1, lat2, lon2):
    """Return the haversine distance in miles between two points.

    All four arguments are angles in degrees. They are only ever passed
    through NumPy ufuncs and arithmetic, so each may be a scalar or an
    array-like (NumPy array, pandas Series); the result follows the usual
    broadcasting of the inputs.
    """
    earth_radius_miles = 3959

    # Convert every angle from degrees to radians.
    phi1 = np.deg2rad(lat1)
    lam1 = np.deg2rad(lon1)
    phi2 = np.deg2rad(lat2)
    lam2 = np.deg2rad(lon2)

    half_dphi = (phi2 - phi1) / 2
    half_dlam = (lam2 - lam1) / 2

    # Haversine term, then the central angle between the two points.
    hav = np.sin(half_dphi)**2 + np.cos(phi1) * np.cos(phi2) * np.sin(half_dlam)**2
    central_angle = 2 * np.arcsin(np.sqrt(hav))
    return earth_radius_miles * central_angle

# [Modified]  Use the first row's longitude and latitude as the reference point
print(coords[0, 1])  # longitude (column 1 = locLong)
print(coords[0, 0])  # latitude  (column 0 = locLat)

-157.8981
21.398


In [22]:
%load_ext line_profiler

The line_profiler extension is already loaded. To reload it, use:
  %reload_ext line_profiler


# Base

In [43]:
## Define a function to manually loop over all rows and return a series of distances
def haversine_looping(clinic_data):
    """Baseline: compute the distance for each row with a plain Python loop.

    Each row's 'locLat' / 'locLong' values are looked up positionally via
    ``.iloc`` and passed to ``haversine`` together with the fixed reference
    point (21.3980, -157.8981).

    Returns a list of distances, one per row, in row order.
    """
    row_count = len(clinic_data)
    distance_list = [
        haversine(
            21.3980,
            -157.8981,
            clinic_data.iloc[position]['locLat'],
            clinic_data.iloc[position]['locLong'],
        )
        for position in range(row_count)
    ]
    return distance_list
# [Modified] cProfile.run("clinic_data['distance'] = haversine_looping(clinic_data)")

# Time the baseline loop; note that every timing iteration also (re)writes the 'distance' column.
%timeit clinic_data['distance'] = haversine_looping(clinic_data)


2.59 ms ± 62.7 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


# Iterrows Haversine

In [28]:
#%%% vectorize code by using series and iterrows
# Haversine applied on rows via iteration
%%timeit
haversine_series = []
for index, row in clinic_data.iterrows():
    haversine_series.append(haversine(21.3980, -157.8981, row['locLat'], row['locLong']))
#cProfile.run("df['distance'] = haversine_series")
clinic_data['distance'] = haversine_series

1.68 ms ± 94.7 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


# Apply on rows

In [32]:
# %%% Optimize further
# Timing apply on the Haversine function
%%timeit
clinic_data['distance'] = clinic_data.apply(lambda row: haversine(21.3980, -157.8981, row['locLat'], row['locLong']), \
                          axis=1)

%lprun -f haversine clinic_data.apply(lambda row: haversine(21.3980, -157.8981, row['locLat'], row['locLong']), axis=1)


4.35 ms ± 204 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


# Vectorized implementation of Haversine applied on Pandas series

In [33]:
%%timeit
# Vectorized implementation of Haversine applied on Pandas series
clinic_data['distance'] = haversine(21.3980, -157.8981, clinic_data['locLat'], clinic_data['locLong'])
%lprun -f haversine haversine(21.3980, -157.8981,\
                              clinic_data['locLat'], clinic_data['locLong'])
# cProfile.run("clinic_data['distance'] = haversine(21.3980, -157.8981, clinic_data['locLat'], clinic_data['locLong'])")



5.2 ms ± 133 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


# Vectorized implementation of Haversine applied on NumPy arrays

In [34]:
%%timeit
#  Vectorized implementation of Haversine applied on NumPy arrays
clinic_data['distance'] = haversine(21.3980, -157.8981, clinic_data['locLat'].values, clinic_data['locLong'].values)
%lprun -f haversine clinic_data['distance'] = haversine(21.3980, -157.8981,\
                        clinic_data['locLat'].values, clinic_data['locLong'].values)
#  cProfile.run("clinic_data['distance'] = haversine(21.3980, -157.8981, clinic_data['locLat'].values, clinic_data['locLong'].values)")

1.86 ms ± 193 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


# Cythonized loop (unaltered)

In [35]:
%load_ext cython

In [36]:
%%cython -a

# Haversine cythonized (no other edits)
import numpy as np
cpdef haversine_cy(lat1, lon1, lat2, lon2):
    """Haversine distance in miles, compiled with Cython but otherwise untyped.

    The arguments are angles in degrees; all math goes through NumPy
    functions, exactly as in the pure-Python version.
    """
    radius_mi = 3959
    lat1_r, lon1_r, lat2_r, lon2_r = [
        np.deg2rad(angle) for angle in (lat1, lon1, lat2, lon2)
    ]
    half_lat_gap = (lat2_r - lat1_r) / 2
    half_lon_gap = (lon2_r - lon1_r) / 2
    hav = np.sin(half_lat_gap)**2 + np.cos(lat1_r) * np.cos(lat2_r) * np.sin(half_lon_gap)**2
    return radius_mi * (2 * np.arcsin(np.sqrt(hav)))

In [38]:
# Time a row-wise apply that calls the Cython-compiled (untyped) haversine_cy
# with the fixed reference point (21.3980, -157.8981).
%timeit clinic_data['distance'] =\
       clinic_data.apply(lambda row: haversine_cy(21.3980, -157.8981,\
                row['locLat'], row['locLong']), axis=1)

1.06 ms ± 60.9 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


# Redefined with data types and C libraries

In [39]:
%%cython -a
# Haversine cythonized
from libc.math cimport sin, cos, acos, asin, sqrt

# Degrees -> radians as a pure C function: typed argument and typed return value, so
# calling it never creates Python objects. The constant is pi / 180 at double precision.
cdef double deg2rad_cy(double deg):
    return 0.017453292519943295 * deg

cpdef haversine_cy_dtyped(double lat1, double lon1, double lat2, double lon2):
    """Haversine distance in miles using C doubles and the C math library.

    Parameters are angles in degrees (Python floats / NumPy scalars are
    converted to C doubles on entry). Returns the distance as a Python float.

    Double precision is used throughout: the libc math functions operate on
    doubles, and single-precision `float` noticeably degrades the result.
    The degree->radian conversion is called directly rather than through
    `map()`, which would box every value into a Python object and back.
    """
    cdef:
        double dlon
        double dlat
        double sin_half_dlat
        double sin_half_dlon
        double a
        double c
        double mi

    lat1 = deg2rad_cy(lat1)
    lon1 = deg2rad_cy(lon1)
    lat2 = deg2rad_cy(lat2)
    lon2 = deg2rad_cy(lon2)

    dlat = lat2 - lat1
    dlon = lon2 - lon1

    # Square by multiplication to stay in plain C arithmetic.
    sin_half_dlat = sin(dlat / 2)
    sin_half_dlon = sin(dlon / 2)
    a = sin_half_dlat * sin_half_dlat + cos(lat1) * cos(lat2) * sin_half_dlon * sin_half_dlon
    c = 2 * asin(sqrt(a))
    mi = 3959 * c
    return mi

In [41]:
# Time a row-wise apply that calls the typed Cython function haversine_cy_dtyped
# with the fixed reference point (21.3980, -157.8981).
%timeit clinic_data['distance'] =\
clinic_data.apply(lambda row: haversine_cy_dtyped(21.3980, -157.8981,\
                              row['locLat'], row['locLong']), axis=1)

961 µs ± 261 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
