This script helps identify the AWC closest to each birth node. We will use the following steps - 
1. Cross-join the birth-node and AWC location dataframes to get every unique birth-node-AWC combination (lane)
2. Calculate the distance for each lane
3. Eliminate all lanes longer than 20 km. The threshold is arbitrarily selected to reduce the size of the dataframe
4. Sort the lanes in ascending order (shortest lanes on top)
5. Drop duplicates at birth-node level, keeping only the first record, which gives us the closest AWC to the birth-node

In [1]:
import pandas as pd
from pathlib import Path
# Folder that holds the input/output CSV files: a directory named "data"
# located in the parent of the current working directory.
# The folder name must be a string; a bare `data` is an undefined name.
path_data = Path.cwd().parent / 'data'

In [3]:
# Read the two input tables from the data directory.
births_csv = path_data / '01 out_births_cleaned.csv'
awc_csv = path_data / '02 in_awc_locations.csv'
births_df = pd.read_csv(births_csv)
awc_locations_df = pd.read_csv(awc_csv)

STEP 1

In [2]:
from math import radians, cos, sin, asin, sqrt

def haversine(lon1, lat1, lon2, lat2, radius=6371):
    """
    Calculate the great circle distance between two points
    on the earth (specified in decimal degrees).

    Parameters
    ----------
    lon1, lat1 : float
        Longitude and latitude of the first point, in decimal degrees.
    lon2, lat2 : float
        Longitude and latitude of the second point, in decimal degrees.
    radius : float, optional
        Radius of the sphere. It determines the unit of the result:
        the default 6371 gives kilometers, 3956 gives miles.

    Returns
    -------
    float
        Great circle distance, expressed in the same unit as ``radius``.
    """
    # convert decimal degrees to radians
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])

    # haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2
    # Floating-point rounding can push `a` marginally above 1 for (near-)
    # antipodal points, which would make asin() raise a math domain error.
    # The comparison is False for NaN, so missing coordinates still yield NaN.
    if a > 1.0:
        a = 1.0
    c = 2 * asin(sqrt(a))
    return c * radius

In [4]:
# Prepare both dataframes and pair every birth node with every AWC.

# Side-specific coordinate names keep the two locations apart after merging.
birth_coord_names = {'Latitude': 'Birth Latitude', 'Longitude': 'Birth Longitude'}
awc_coord_names = {'Latitude': 'AWC Latitude', 'Longitude': 'AWC Longitude'}

# The row index doubles as the identifier of each birth node.
births_df['Location ID'] = births_df.index
births_df['key'] = 1
births_df = births_df.rename(columns=birth_coord_names)

awc_locations_df['key'] = 1
awc_locations_df = awc_locations_df.rename(columns=awc_coord_names)

# Every row carries the same key value, so merging on it yields all
# birth-node/AWC combinations (lanes).
lanes_df = pd.merge(births_df, awc_locations_df, on='key', how='outer')

STEP 2

In [6]:
def _lane_distance(lane):
    """Return the haversine distance between the birth node and the AWC of one lane row."""
    return haversine(
        lane['Birth Longitude'],
        lane['Birth Latitude'],
        lane['AWC Longitude'],
        lane['AWC Latitude'],
    )


# Evaluate the distance row by row (axis=1 passes each lane as a Series).
lanes_df['Distance'] = lanes_df.apply(_lane_distance, axis=1)

STEP 3

In [7]:
# Keep only the lanes whose distance is at most 20.
within_threshold = lanes_df['Distance'] <= 20
lanes_df = lanes_df.loc[within_threshold]

STEP 4

In [8]:
# Order the lanes by birth node and, within each node, by increasing distance,
# so the first row of every 'Location ID' is its shortest remaining lane.
lanes_df = lanes_df.sort_values(by=['Location ID', 'Distance'], ascending=True)

# Retain just that first row for each birth node.
lane_assignment_df = lanes_df.drop_duplicates(subset=['Location ID'], keep='first')

SAVE OUTPUT

In [10]:
# Write the lane assignments to the data directory, leaving out the index column.
output_csv = path_data / '02 out_lane_assignment.csv'
lane_assignment_df.to_csv(output_csv, index=False)