In [1]:
import numpy as np
import pandas as pd
from math import *


# “Magic” commands available through Jupyter
- Available only in IPython-based notebooks, not in plain Python
- Provide additional functionality on top of Python code
- Start with `%` (line magic: applies to just that line) or `%%` (cell magic: applies to the entire cell)

In [2]:
#Timing function runs with %timeit
def my_func():
    """Return the sum of the squares of the integers 1 through 10."""
    return sum(n * n for n in range(1, 11))

%timeit my_func

35.2 ns ± 0.512 ns per loop (mean ± std. dev. of 7 runs, 10000000 loops each)


In [3]:
# Load the hotels CSV into a DataFrame. The path is relative, so it resolves
# against the kernel's current working directory; the file is decoded as cp1252.
df=pd.read_csv('Documents/new_york_hotels.csv', encoding = 'cp1252')

In [4]:
# Preview the first rows; the 'latitude' and 'longitude' columns are the ones
# the distance computations below read.
df.head()

Unnamed: 0,ean_hotel_id,name,address1,city,state_province,postal_code,latitude,longitude,star_rating,high_rate,low_rate
0,269955,Hilton Garden Inn Albany/SUNY Area,1389 Washington Ave,Albany,NY,12206,42.68751,-73.81643,3.0,154.0272,124.0216
1,113431,Courtyard by Marriott Albany Thruway,1455 Washington Avenue,Albany,NY,12206,42.68971,-73.82021,3.0,179.01,134.0
2,108151,Radisson Hotel Albany,205 Wolf Rd,Albany,NY,12205,42.7241,-73.79822,3.0,134.17,84.16
3,254756,Hilton Garden Inn Albany Medical Center,62 New Scotland Ave,Albany,NY,12208,42.65157,-73.77638,3.0,308.2807,228.4597
4,198232,CrestHill Suites SUNY University Albany,1415 Washington Avenue,Albany,NY,12206,42.68873,-73.81854,3.0,169.39,89.39


In [5]:
def haversine(lat1, lon1, lat2, lon2, radius=3959):
    """Great-circle distance between two points using the haversine formula.

    Parameters
    ----------
    lat1, lon1 : scalar or array-like
        Latitude and longitude of the first point(s), in degrees.
    lat2, lon2 : scalar or array-like
        Latitude and longitude of the second point(s), in degrees.
        Scalars, NumPy arrays and pandas Series can be mixed; the NumPy
        operations below are applied element-wise.
    radius : float, optional
        Radius of the sphere. Defaults to 3959, the miles constant this
        function has always used, so existing callers still get miles.
        The result is expressed in the same unit as ``radius``.

    Returns
    -------
    scalar or array-like
        Distance(s) along the sphere's surface, matching the shape of the
        (broadcast) inputs.
    """
    lat1, lon1, lat2, lon2 = map(np.deg2rad, (lat1, lon1, lat2, lon2))
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = (
        np.sin(dlat / 2) ** 2
        + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
    )
    # Mathematically a <= 1, but floating-point rounding can push it a hair
    # above 1 for (near-)antipodal points, which would make arcsin return NaN.
    # Only the upper bound is clamped, so NaN from invalid inputs still shows.
    a = np.minimum(a, 1.0)
    c = 2 * np.arcsin(np.sqrt(a))
    return radius * c

In [6]:

# Row-by-row baseline: visit every row by position and return a list of distances
def haversine_looping(df):
    """Compute the haversine distance from (40.671, -73.985) to each row of ``df``.

    Each row is fetched by position with ``df.iloc`` and its 'latitude' and
    'longitude' values are passed to ``haversine`` one pair at a time.

    Returns a plain Python list with one distance per row, in row order.
    """
    row_count = len(df)
    distances = [
        haversine(40.671, -73.985, df.iloc[pos]['latitude'], df.iloc[pos]['longitude'])
        for pos in range(row_count)
    ]
    return distances

# Runtime for 1600 rows
Each timing below computes the distance from every row's location to a sample location at latitude 40.671, longitude -73.985.

In [7]:
%%timeit

# Manual loop: haversine_looping visits the rows one at a time by position
# and returns a list, which is stored as the 'distance' column.
df['distance'] = haversine_looping(df)

735 ms ± 40.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


# Looping with iterrows()

In [8]:
%%timeit
# Collect one distance per row; iterrows() yields (index, row) pairs, so the
# loop still runs once per row at the Python level.
haversine_series=[]
for index, row in df.iterrows():
    # Distance from the fixed reference point (40.671, -73.985) to this row.
    haversine_series.append(haversine(40.671,-73.985, row['latitude'], row['longitude']))
# Store the collected distances as the 'distance' column.
df['distance']=haversine_series

245 ms ± 12.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


# Is apply() faster than iterrows()?

In [9]:
%%timeit
# axis=1 makes apply() call the lambda once per row, passing that row in;
# haversine is therefore still evaluated one row at a time.
df['distance']=df.apply(lambda row: haversine(40.671,-73.985, row['latitude'], row['longitude']), axis=1)

112 ms ± 4.52 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


# Vectorized Function

In [16]:
%%timeit 
# Pass the whole 'latitude' and 'longitude' columns instead of single values:
# haversine's NumPy operations then run element-wise over every row in one call.
df['distance'] = haversine(40.671, -73.985, df['latitude'], df['longitude'])

2.03 ms ± 59.3 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)


# NumPy arrays can be used in place of Pandas series when the additional functionality offered by Pandas series isn’t critical.

In [17]:
%%timeit

# Vectorized implementation of Haversine applied on NumPy arrays:
# .values hands haversine the columns' underlying arrays instead of pandas Series.
df['distance'] = haversine(40.671, -73.985, df['latitude'].values, df['longitude'].values)

379 µs ± 24.6 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
