In [1]:
import pandas as pd
import re
import time
from geopy.geocoders import Nominatim

In [2]:
# Let DataFrame previews show up to 50 columns and 100 rows before pandas truncates them.
pd.options.display.max_columns = 50
pd.options.display.max_rows = 100

In [3]:
# Single Nominatim geocoding client shared by the whole notebook;
# geocode_lookup() sends every query through this object.
locator = Nominatim(
    user_agent = 'rohit_venkat',
)


# Ordered (pattern, replacement) rewrites applied to the street line.
# They run top to bottom and each one sees the output of the previous one,
# so the order is part of the behavior.
_STREET_REWRITES = (
    ('1005 D.B. TODD BLVD|1005 DR DB TODD JR BLVD|1005 DR. D. B. TODD BLVD|1005 DR. DB JR BLVD', 'MEHARRY MEDICAL COLLEGE'),
    ('.*1161 21ST.*', '1161 21ST AVE S'),
    ('3443 DICKERSON PIKE|3443 DICKERSON PK', 'TRISTAR SKYLINE MEDICAL CENTER'),
    ('3601 THE VANDERBILT.*|.*VANDERBILT CLINIC.*|.*TVC.*', 'THE VANDERBILT CLINIC'),
    ('.*OXFORD HOUSE.*', '1313 21ST AVE S'),
    ('.*MEDICAL CENTER NORTH.*|.*MEDICAL CTR N.*|.*MCN.*', '1161 21ST AVE S'),
    ('.*MEDICAL CENTER EAST.*|.*MED CTR E.*|.*MCE.*', '1215 21ST AVE S'),
    ('VANDERBILT CHILDREN.*', '2200 CHILDRENS WAY'),
)


def clean_up_address(address):
    """Build a single-line geocoder query from one provider address row.

    Parameters
    ----------
    address : pandas.Series
        Row holding at least 'address_first_line', 'city', 'state' and
        'zipcode' as strings. 'address_second_line' is not used.

    Returns
    -------
    str
        '<street>, <city>, <state>, <zipcode>' where the street line has had
        everything from the first unit designator (APT, BLDG, BOX, BUILDING,
        FL, RM, STE, SUITE, UNIT or '#') onward removed and the rewrite table
        applied, and two known misspellings of NASHVILLE are corrected.
    """
    street, town, state, zipcode = address[['address_first_line', 'city', 'state', 'zipcode']]

    # Only the text before the first unit designator is kept.
    street = re.split(' APT | BLDG | BOX | BUILDING | FL | RM | STE | SUITE | UNIT | #', street, maxsplit = 1)[0]

    for pattern, replacement in _STREET_REWRITES:
        street = re.sub(pattern, replacement, street)

    town = re.sub('NASHVILE|NASVHILLE', 'NASHVILLE', town)

    return ', '.join([street, town, state, zipcode])


def geocode_lookup(address, delay = 0.1):
    """Geocode one query string with the shared `locator` client.

    Parameters
    ----------
    address : str
        Query string, e.g. the output of clean_up_address().
    delay : float, optional
        Seconds to sleep before the request (default 0.1, the value this
        notebook has always used).
        NOTE(review): the public Nominatim usage policy asks for at most one
        request per second; consider passing delay = 1 for long runs.

    Returns
    -------
    tuple
        (geo_address, geo_zipcode, latitude, longitude). geo_zipcode is the
        five digits following 'Tennessee, ' in the returned address, or None
        when that text is absent. All four values are None when the geocoder
        raises or finds no match.
    """
    try:
        time.sleep(delay)
        location = locator.geocode(address)

    # Best-effort lookup: any geocoder error marks the address as unmatched.
    # `Exception` (not a bare except) lets KeyboardInterrupt / SystemExit
    # through, so a long-running batch can still be interrupted.
    except Exception:
        return None, None, None, None

    # geocode() returns None when the service has no result for the query.
    if location is None:
        return None, None, None, None

    zip_match = re.search('Tennessee, ([0-9]{5})', location.address)
    zipcode = zip_match.group(1) if zip_match else None

    return location.address, zipcode, location.latitude, location.longitude

In [4]:
# Provider table: every column is read as text.
tn_providers = pd.read_csv(
    '../data/tn_providers.csv',
    dtype = str,
)

# Hop-teaming table: only the two NPI columns are forced to text;
# the remaining columns keep the dtypes pandas infers.
hop_teaming = pd.read_csv(
    '../data/tn_hop_teaming.csv',
    dtype = dict.fromkeys(('from_npi', 'to_npi'), str),
)

In [5]:
# Addresses: 11,407; Matches: 9,465; Runtime: 253m 23.6s
# Every NPI found on either side of a hop-teaming row. Series.append was
# removed in pandas 2.0, so the two columns are combined with pd.concat.
teaming_npis = pd.concat([hop_teaming['from_npi'], hop_teaming['to_npi']], ignore_index = True)

# Distinct addresses of the providers with one of those NPIs.
tn_team_hopping_addresses = tn_providers[tn_providers['npi'].isin(teaming_npis)]
tn_team_hopping_addresses = tn_team_hopping_addresses[['address_first_line', 'address_second_line', 'city', 'state', 'zipcode']].drop_duplicates()

# Build the geocoder query per row, then geocode it (one network request per
# row) and attach the four result fields, aligned on the row index.
tn_team_hopping_addresses = tn_team_hopping_addresses.assign(geo_query = tn_team_hopping_addresses.apply(clean_up_address, axis = 1))
tn_team_hopping_addresses[['geo_address', 'geo_zipcode', 'lat', 'long']] = pd.DataFrame(tn_team_hopping_addresses['geo_query'].apply(geocode_lookup).tolist(), index = tn_team_hopping_addresses.index)
tn_team_hopping_addresses.to_csv('../data/tn_team_hopping_addresses.csv', index = False)

# Geocoded rows whose extracted zipcode differs from the provider's zipcode.
# NOTE(review): this is an exact string comparison; if provider zipcodes carry
# more than five digits they would always be flagged -- confirm against the data.
mismatched = tn_team_hopping_addresses[(~tn_team_hopping_addresses['geo_address'].isnull()) & (tn_team_hopping_addresses['geo_zipcode'] != tn_team_hopping_addresses['zipcode'])]
mismatched.to_csv('../data/tn_team_hopping_mismatched_addresses.csv', index = False)

# Rows for which the geocoder returned nothing.
unmatched = tn_team_hopping_addresses[tn_team_hopping_addresses['geo_address'].isnull()]
unmatched.to_csv('../data/tn_team_hopping_unmatched_addresses.csv', index = False)

In [6]:
# Addresses: 8,771; Matches: 6,651 Runtime: 120m 11.1s
# Distinct addresses of providers that appear as `from_npi` in a hop-teaming row.
tn_provider_addresses = tn_providers[tn_providers['npi'].isin(hop_teaming['from_npi'])]
tn_provider_addresses = tn_provider_addresses[['address_first_line', 'address_second_line', 'city', 'state', 'zipcode']].drop_duplicates()
tn_provider_addresses = tn_provider_addresses.assign(geo_query = tn_provider_addresses.apply(clean_up_address, axis = 1))

# Geocode this frame's own queries and align the results on this frame's own
# index. (The previous version geocoded tn_team_hopping_addresses here, which
# re-ran the larger address set and index-aligned another frame's results.)
tn_provider_addresses[['geo_address', 'geo_zipcode', 'lat', 'long']] = pd.DataFrame(tn_provider_addresses['geo_query'].apply(geocode_lookup).tolist(), index = tn_provider_addresses.index)
tn_provider_addresses.to_csv('../data/tn_provider_addresses.csv', index = False)

In [7]:
# Addresses: 4,822; Matches: 4,039; Runtime: 107m 21.1s
# Distinct addresses of providers that appear as `to_npi` in a hop-teaming row.
is_facility = tn_providers['npi'].isin(hop_teaming['to_npi'])
tn_facility_addresses = (
    tn_providers
    .loc[is_facility, ['address_first_line', 'address_second_line', 'city', 'state', 'zipcode']]
    .drop_duplicates()
)

# One geocoder query string per row.
tn_facility_addresses = tn_facility_addresses.assign(
    geo_query = tn_facility_addresses.apply(clean_up_address, axis = 1)
)

# Geocode every query (one network request per row) and attach the four
# result fields, aligned on the row index.
facility_geo = pd.DataFrame(
    [geocode_lookup(query) for query in tn_facility_addresses['geo_query']],
    index = tn_facility_addresses.index,
)
tn_facility_addresses[['geo_address', 'geo_zipcode', 'lat', 'long']] = facility_geo

tn_facility_addresses.to_csv('../data/tn_facility_addresses.csv', index = False)

In [8]:
# Addresses: 1,155; Matches: 973; Runtime: 11m 10.3s
# Distinct addresses of providers that appear as `to_npi` in a hop-teaming row
# and whose `cbsa` value is '34980'.
is_referred_to = tn_providers['npi'].isin(hop_teaming['to_npi'])
in_cbsa_34980 = tn_providers['cbsa'] == '34980'
nashville_facility_addresses = (
    tn_providers
    .loc[is_referred_to & in_cbsa_34980, ['address_first_line', 'address_second_line', 'city', 'state', 'zipcode']]
    .drop_duplicates()
)

# One geocoder query string per row.
nashville_facility_addresses = nashville_facility_addresses.assign(
    geo_query = nashville_facility_addresses.apply(clean_up_address, axis = 1)
)

# Geocode every query (one network request per row) and attach the four
# result fields, aligned on the row index.
nashville_geo = pd.DataFrame(
    [geocode_lookup(query) for query in nashville_facility_addresses['geo_query']],
    index = nashville_facility_addresses.index,
)
nashville_facility_addresses[['geo_address', 'geo_zipcode', 'lat', 'long']] = nashville_geo

nashville_facility_addresses.to_csv('../data/nashville_facility_addresses.csv', index = False)