In [1]:
import numpy as np

# Define a basic Haversine distance formula
def haversine(lat1, lon1, lat2, lon2):
    """Return the great-circle distance, in miles, between two points.

    Implements the haversine formula on a sphere of radius 3959 miles.

    Parameters
    ----------
    lat1, lon1 : latitude and longitude of the first point, in decimal degrees.
    lat2, lon2 : latitude and longitude of the second point, in decimal degrees.

    Every argument may be a scalar or an array-like (for example a NumPy
    array or a pandas Series). The computation uses only NumPy functions and
    arithmetic operators, so array-like inputs are processed element-wise and
    the result has the broadcast shape of the inputs.

    Returns
    -------
    The distance(s) in miles.
    """
    MILES = 3959  # sphere radius, in miles, that scales the central angle
    lat1, lon1, lat2, lon2 = map(np.deg2rad, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
    # Mathematically 0 <= a <= 1, but floating-point rounding can push it a
    # hair past 1 for (near-)antipodal points, which would make sqrt/arcsin
    # return NaN. Clamp so those inputs yield the half-circumference instead.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arcsin(np.sqrt(a))  # central angle in radians
    return MILES * c

In [2]:
def haversine_looping(df):
    """Compute the distance from a fixed origin to each row of ``df``, one row at a time.

    Every row is fetched by position with ``df.iloc`` and its ``'latitude'``
    and ``'longitude'`` values are handed to ``haversine`` together with the
    hard-coded origin (40.671, -73.985).

    Returns a plain Python list holding one distance per row, in row order.
    """
    origin_lat, origin_lon = 40.671, -73.985
    # The per-row positional lookups are kept on purpose: this is the explicit
    # Python-loop variant of the computation.
    return [
        haversine(origin_lat, origin_lon, df.iloc[pos]['latitude'], df.iloc[pos]['longitude'])
        for pos in range(len(df))
    ]

In [4]:
import pandas as pd
# Location of the source CSV; pandas reads it directly from this URL.
file_name = "https://raw.githubusercontent.com/rajeevratan84/datascienceforbusiness/master/new_york_hotels.csv"
# Decode the file as ISO-8859-1 instead of pandas' default UTF-8.
df = pd.read_csv(file_name, encoding = "ISO-8859-1")
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,ean_hotel_id,name,address1,city,state_province,postal_code,latitude,longitude,star_rating,high_rate,low_rate
0,269955,Hilton Garden Inn Albany/SUNY Area,1389 Washington Ave,Albany,NY,12206,42.68751,-73.81643,3.0,154.0272,124.0216
1,113431,Courtyard by Marriott Albany Thruway,1455 Washington Avenue,Albany,NY,12206,42.68971,-73.82021,3.0,179.01,134.0
2,108151,Radisson Hotel Albany,205 Wolf Rd,Albany,NY,12205,42.7241,-73.79822,3.0,134.17,84.16
3,254756,Hilton Garden Inn Albany Medical Center,62 New Scotland Ave,Albany,NY,12208,42.65157,-73.77638,3.0,308.2807,228.4597
4,198232,CrestHill Suites SUNY University Albany,1415 Washington Avenue,Albany,NY,12206,42.68873,-73.81854,3.0,169.39,89.39


In [5]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(1631, 11)

In [6]:
# This is a built in magic ipython command that provides timing for code executed in the cell
%%timeit

# Run the haversine looping function
df['distance'] = haversine_looping(df)

1 loop, best of 3: 609 ms per loop


In [7]:
%%timeit

# Haversine applied on rows via iteration
haversine_series = []
for index, row in df.iterrows():
    haversine_series.append(haversine(40.671, -73.985, row['latitude'], row['longitude']))
df['distance'] = haversine_series

1 loop, best of 3: 219 ms per loop


In [8]:
%%timeit

# Timing apply on the Haversine function
df['distance'] = df.apply(lambda row: haversine(40.671, -73.985, row['latitude'], row['longitude']), axis=1)

10 loops, best of 3: 61 ms per loop


In [9]:
 !pip install line_profiler
 %load_ext line_profiler

# Haversine applied on rows with line profiler
%lprun -f haversine df.apply(lambda row: haversine(40.671, -73.985, row['latitude'], row['longitude']), axis=1)

Collecting line_profiler
  Downloading https://files.pythonhosted.org/packages/66/eb/417ace64f45fee7a0394946f8e1f90f925420fd9b14f1f09abb5284a0ca4/line_profiler-3.1.0-cp36-cp36m-manylinux2010_x86_64.whl (63kB)
     |████████████████████████████████| 71kB 5.4MB/s 
Installing collected packages: line-profiler
Successfully installed line-profiler-3.1.0


In [10]:
%%timeit 

# Vectorized implementation of Haversine applied on Pandas series:
# haversine is called once with the whole latitude/longitude columns, so its
# NumPy operations run element-wise over every row in a single call.
df['distance'] = haversine(40.671, -73.985, df['latitude'], df['longitude'])

The slowest run took 13.24 times longer than the fastest. This could mean that an intermediate result is being cached.
100 loops, best of 3: 2.65 ms per loop


In [11]:
# Run our line profiler to inspect further: %lprun executes the statement and
# reports per-line timings for the function named with -f (here, haversine).
%lprun -f haversine df['distance'] = haversine(40.671, -73.985, df['latitude'], df['longitude'])

In [12]:
%%timeit

# Vectorized implementation of Haversine applied on NumPy arrays
# (.values gives each column's underlying NumPy array, so haversine works on
# plain ndarrays rather than on pandas Series)
df['distance'] = haversine(40.671, -73.985, df['latitude'].values, df['longitude'].values)

The slowest run took 15.72 times longer than the fastest. This could mean that an intermediate result is being cached.
1000 loops, best of 3: 300 µs per loop
