In [32]:
import pandas as pd
import time
import numpy as np

In [19]:
# Load the raw CSV into a DataFrame. Later cells read its AIRPORT,
# AIRPORT_IS_LATEST, LATITUDE and LONGITUDE columns.
# NOTE(review): presumably the BTS airport master-coordinate table -- confirm the
# provenance of this file; the path is relative to the notebook's working directory.
bts = pd.read_csv('963420066_T_MASTER_CORD.csv')

In [26]:
# Keep only the rows flagged as the latest record, index them by airport code,
# drop rows with a missing coordinate, and draw a reproducible 500-row sample
# (fixed seed) sorted by airport code.
df = (
    bts.loc[bts['AIRPORT_IS_LATEST'] == 1, ['AIRPORT', 'LATITUDE', 'LONGITUDE']]
    .set_index('AIRPORT')
    .dropna()
    .sample(n=500, random_state=42)
    .sort_index()
)

In [27]:
# Every (origin, destination) combination of the sampled airports.
idx = pd.MultiIndex.from_product([df.index, df.index], names=['orig', 'dest'])

# Broadcast the coordinates onto the pair index twice: once aligned on the
# 'orig' level (columns suffixed _1) and once on the 'dest' level (suffixed _2),
# then place the two frames side by side.
pairs = pd.concat(
    [
        df.add_suffix(suffix).reindex(idx, level=level)
        for suffix, level in (('_1', 'orig'), ('_2', 'dest'))
    ],
    axis=1,
)

In [28]:
# Preview the first rows of the pairwise table: one row per (orig, dest) pair
# with the origin coordinates in *_1 and the destination coordinates in *_2.
pairs.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,LATITUDE_1,LONGITUDE_1,LATITUDE_2,LONGITUDE_2
orig,dest,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1NY,1NY,42.636944,-77.052778,42.636944,-77.052778
1NY,6B0,42.636944,-77.052778,43.985556,-73.095556
1NY,A03,42.636944,-77.052778,58.4575,-154.023333
1NY,A06,42.636944,-77.052778,65.655556,-165.356389
1NY,A12,42.636944,-77.052778,60.785278,-158.864444


In [29]:
#Numpy version of calculating distance
def gcd_vec(lat1, lng1, lat2, lng2):
    '''
    Calculate great circle distance (vectorized with NumPy).
    http://www.johndcook.com/blog/python_longitude_latitude/

    Uses the spherical law of cosines on colatitude/longitude. All arguments
    are in degrees and are combined element-wise, so scalars, NumPy arrays and
    pandas Series all work.

    Parameters
    ----------
    lat1, lng1, lat2, lng2: float or array of float
        Latitude and longitude, in degrees, of the first and second point(s).

    Returns
    -------
    distance:
      distance from ``(lat1, lng1)`` to ``(lat2, lng2)`` in kilometers.
      NaN inputs propagate to NaN outputs.
    '''
    # Colatitude (angle from the north pole) and longitude, in radians.
    phi1 = np.deg2rad(90 - lat1)
    phi2 = np.deg2rad(90 - lat2)

    theta1 = np.deg2rad(lng1)
    theta2 = np.deg2rad(lng2)

    cos = (np.sin(phi1) * np.sin(phi2) * np.cos(theta1 - theta2) +
           np.cos(phi1) * np.cos(phi2))
    # Floating-point round-off can push the cosine marginally outside [-1, 1]
    # for identical or antipodal points, which would make arccos return NaN.
    # Clamp to the valid domain first (NaN inputs are left as NaN).
    cos = np.clip(cos, -1.0, 1.0)
    arc = np.arccos(cos)
    return arc * 6373  # radius of earth, in kilometers

#python version of calculating distance
import math

def gcd_py(lat1, lng1, lat2, lng2):
    '''
    Calculate great circle distance between two points (pure Python, scalars).
    http://www.johndcook.com/blog/python_longitude_latitude/

    Uses the spherical law of cosines on colatitude/longitude. All arguments
    are in degrees.

    Parameters
    ----------
    lat1, lng1, lat2, lng2: float
        Latitude and longitude, in degrees, of the first and second point.

    Returns
    -------
    distance:
      distance from ``(lat1, lng1)`` to ``(lat2, lng2)`` in kilometers.
      Identical points yield a value that is zero up to floating-point
      round-off (sub-metre). NaN inputs yield NaN.
    '''
    degrees_to_radians = math.pi / 180.0
    # Colatitude (angle from the north pole) and longitude, in radians.
    phi1 = (90 - lat1) * degrees_to_radians
    phi2 = (90 - lat2) * degrees_to_radians

    theta1 = lng1 * degrees_to_radians
    theta2 = lng2 * degrees_to_radians

    cos = (math.sin(phi1) * math.sin(phi2) * math.cos(theta1 - theta2) +
           math.cos(phi1) * math.cos(phi2))
    # Round-off can push the cosine marginally outside [-1, 1] for identical or
    # antipodal points, and math.acos raises ValueError outside that domain.
    # Clamp instead of rounding: rounding to 8 decimals quantizes the cosine and
    # collapses every pair closer than roughly 0.9 km to a distance of 0.
    # Plain comparisons are used (not min/max) so that NaN passes through.
    if cos > 1.0:
        cos = 1.0
    elif cos < -1.0:
        cos = -1.0
    arc = math.acos(cos)
    return arc * 6373  # radius of earth, in kilometers

In [30]:
# First approach: compute the distance for every row of `pairs` by calling the
# scalar Python function once per row, iterating over the rows as plain tuples.
# time.perf_counter() is used because it is a monotonic, high-resolution clock
# meant for measuring elapsed time; time.time() can jump if the system clock
# is adjusted.
t1 = time.perf_counter()
pd.Series([gcd_py(*x) for x in pairs.itertuples(index=False)], index=pairs.index)
t2 = time.perf_counter()
print(t2 - t1)

4.90146803855896


In [33]:
# Second approach: compute the distance for every row of `pairs` in one call to
# the vectorized function, passing whole latitude/longitude columns.
# time.perf_counter() is used because it is a monotonic, high-resolution clock
# meant for measuring elapsed time; time.time() can jump if the system clock
# is adjusted.
t1 = time.perf_counter()
pd.Series(
    gcd_vec(pairs['LATITUDE_1'], pairs['LONGITUDE_1'],
            pairs['LATITUDE_2'], pairs['LONGITUDE_2']),
    index=pairs.index,
)
t2 = time.perf_counter()
print(t2 - t1)

0.13023114204406738




In [34]:
# Third approach: compute the distance for every row of `pairs` with
# DataFrame.apply(axis=1), which iterates internally and hands each row to the
# scalar Python function as a Series.
# apply(axis=1) with a scalar-returning function already yields a Series, so no
# extra pd.Series(...) wrapper is needed.
# time.perf_counter() is used because it is a monotonic, high-resolution clock
# meant for measuring elapsed time; time.time() can jump if the system clock
# is adjusted.
t1 = time.perf_counter()
pairs.apply(
    lambda x: gcd_py(x['LATITUDE_1'], x['LONGITUDE_1'],
                     x['LATITUDE_2'], x['LONGITUDE_2']),
    axis=1,
)
t2 = time.perf_counter()
print(t2 - t1)

90.77761936187744
