In [1]:
import sys
sys.path.append('../src/')

import pandas as pd
import requests
import numpy as np
import geocoder
import data_io
import utils
import time
import tokens

In [2]:

# Manual corrections for location strings that failed to geocode.
# Each key is matched as a substring of the raw location string; insertion
# order matters because the cleaning functions walk the keys in order.
WEIRD_LOC_DICT = {
    'westseneca': 'west seneca, ny',
    'linolakes': 'lino lakes, mn',
    'newportrichey': 'new port richey, fl',
    'mayslick': 'mays lick, ky',
    'boiseid': 'boise, id',
    'dixonil': 'dixon, il',
    'dallastx': 'dallas, tx',
    '5779purpleleafcourtfrederickmd': 'frederick, md',
    '07060northplainfieldnj': 'north plainfield, nj',
    '08108haddontownshipnj': 'haddon township, nj',
    '33134coralgablesfl': 'coral gables, fl',
    '92587canyonlakeca': 'canyon lake, ca',
    '95682cameronparkca': 'cameron park, ca',
    '92075solanabeachca': 'solana beach, ca',
    'de forest, wi': 'deforest, wi',
    'horsesho, tx': 'horseshoe, tx',
    'wesley c, fl': 'wesley chapel, fl',
}

# API credentials come from the local `tokens` module.
CENSUS_KEY = tokens.CENSUS_KEY
MB_TOKENS = tokens.MAPBOX_TOKENS

# Inverse of utils.us_state_abbrev: maps each of its values back to its key.
# (assign_county_fips uses the looked-up value as the full state name, so
# us_state_abbrev is presumably full name -> abbreviation; verify in utils.)
STATE_DICT = {abbrev: full_name for full_name, abbrev in utils.us_state_abbrev.items()}

CLEANED_FOLDER_NAME = 'mapbox_geocode'
NOT_FOUND_FILENAME = 'locs_not_found_master_recode_v2.csv'

In [74]:
import string
def contains_only_digits(s):
    """Report whether ``s`` consists solely of ASCII digits.

    Returns True for "", "0", "123" and False for "1.2", "1,2", "-1", "a",
    "a1". Any value whose type is not exactly ``str`` (for example a NaN
    float from pandas) is also reported as True, so callers treat it the
    same way as a purely numeric entry.
    """
    if type(s) is not str:
        return True
    return all(ch in string.digits for ch in s)

def has_numbers(inputString):
    """Return True as soon as any character of ``inputString`` is a digit."""
    for char in inputString:
        if char.isdigit():
            return True
    return False


def _insert_state_comma(x, other_loc, state):
    """Rewrite a comma-less location as ``'<city>, <state>'``.

    Args:
        x: Location string that contains no comma.
        other_loc: Raw ``other_loc`` value; only echoed in the audit print.
        state: Lower-cased, non-empty state abbreviation.

    Returns:
        The location with ``', '`` inserted before the state abbreviation,
        or the sentinel ``'none'`` when nothing precedes the abbreviation
        (the string holds a state but no city).
    """
    # Audit trail: show every string that is about to be restructured.
    print(x, other_loc, state)

    # The abbreviation is expected at the END of the string, so search from
    # the right. Searching from the left splits names that merely contain
    # the same two letters earlier (e.g. 'indianapolisin' with state 'in').
    split_at = x.rfind(state)
    if split_at == -1:
        # Abbreviation absent: append it rather than slicing with -1, which
        # would chop off the last character of the city name.
        city, tail = x, state
    else:
        city, tail = x[:split_at], x[split_at:]

    if not city:
        return 'none'
    if city.endswith(' '):
        city = city[:-1]

    fixed = city + ', ' + tail
    print(fixed)
    return fixed


def _normalize_location(x, other_loc, state):
    """Shared cleaning pipeline behind clean_location and clean_other_loc.

    Steps, in order: hand-curated override from WEIRD_LOC_DICT, removal of
    numeric characters and dashes, removal of a trailing ``', us'``,
    insertion of the missing comma before the state abbreviation, expansion
    of ``'a f b'``, joining of ``'mc '`` prefixes, and removal of a single
    leading space.

    Args:
        x: Raw location string (must be a ``str``).
        other_loc: Raw ``other_loc`` value; only used for the audit print.
        state: State abbreviation, or a non-string (e.g. NaN) when unknown.

    Returns:
        The cleaned location string.
    """
    if isinstance(state, str):
        state = state.lower()

    # Hand-filled replacements are already clean; the first matching key wins.
    for key, replacement in WEIRD_LOC_DICT.items():
        if key in x:
            return replacement

    if has_numbers(x):
        # Drop zip codes / street numbers, then the dashes that separated them.
        x = ''.join(c for c in x if not c.isnumeric())
        x = x.replace(' - ', '').replace('-', '')

    if x.endswith(', us'):
        x = x[:-4]

    # An empty state string cannot be located in x, so treat it as unknown.
    if ',' not in x and isinstance(state, str) and state:
        x = _insert_state_comma(x, other_loc, state)

    x = x.replace('a f b', 'air force base')
    if 'mc ,' not in x:
        # 'mc kinney' -> 'mckinney'; leave a bare 'mc ,' alone.
        x = x.replace('mc ', 'mc')

    # startswith() is safe on the empty string, unlike x[0].
    if x.startswith(' '):
        x = x[1:]
    return x


def clean_location(x, other_loc, state):
    """Normalize a raw location string into ``'<city>, <state>'`` form.

    Args:
        x: Raw location string.
        other_loc: Companion ``other_loc`` value. Cleaning only runs when
            this is a string containing at least one non-digit character.
        state: State abbreviation (any case), or a non-string when missing.

    Returns:
        The cleaned location, ``'none'`` when the string holds only a state,
        or ``x`` unchanged when cleaning does not apply (``other_loc`` is
        numeric/non-string, or ``x`` itself is not a string).
    """
    if not contains_only_digits(other_loc) and isinstance(x, str):
        return _normalize_location(x, other_loc, state)
    return x


def clean_other_loc(x, other_loc, state):
    """Clean a location like clean_location, then drop its last two characters.

    After cleaning, the final two characters (where the state abbreviation
    sits in ``'<city>, <st>'``) are removed, leaving e.g. ``'brier, '``.

    Args:
        x: Raw location string.
        other_loc: Companion ``other_loc`` value. Cleaning only runs when
            this is a string containing at least one non-digit character.
        state: State abbreviation (any case), or a non-string when missing.

    Returns:
        The cleaned location minus its last two characters, or ``x``
        unchanged when cleaning does not apply.
    """
    if not contains_only_digits(other_loc) and isinstance(x, str):
        cleaned = _normalize_location(x, other_loc, state)
        return cleaned[:len(cleaned) - 2]
    return x


In [136]:

def get_token(index):
    """Pick a Mapbox access token for request number ``index``.

    Tokens are handed out round-robin over every entry of ``MB_TOKENS``, so
    consecutive indices alternate tokens. With exactly two tokens this is
    even index -> first token, odd index -> second token.

    Args:
        index: Integer request/row counter.

    Returns:
        One element of ``MB_TOKENS``.

    Raises:
        IndexError: if ``MB_TOKENS`` is empty.
    """
    if not MB_TOKENS:
        raise IndexError('MB_TOKENS is empty')
    return MB_TOKENS[index % len(MB_TOKENS)]
    
'''
GEOCODER FUNCTION
'''

def _fetch_county_fips(lat, lng):
    """Look up the county for a coordinate via the FCC census block API.

    Args:
        lat: Latitude of the point.
        lng: Longitude of the point.

    Returns:
        ``(state_county_fips, county_name)`` taken from the ``County`` entry
        of the JSON response, or ``None`` when the request fails, the body
        is not JSON, or the expected keys are missing.
    """
    fcc_req = f'https://geo.fcc.gov/api/census/block/find?latitude={lat}&longitude={lng}&showall=false&format=json&key={CENSUS_KEY}'
    try:
        # A timeout keeps one hung connection from stalling the whole run.
        resp = requests.get(fcc_req, timeout=30)
    except requests.RequestException as err:
        print('error in fip retrieval: ', err)
        return None
    try:
        result = resp.json()
    except ValueError:
        # Return here so a payload from an earlier row is never reused.
        print('error in fip retrieval: ', resp)
        print(type(resp))
        return None
    try:
        county_info = result['County']
        return county_info['FIPS'], county_info['name']
    except (KeyError, TypeError):
        return None


def _geocode_row(df, i, not_found_records):
    """Geocode row ``i`` of ``df`` in place and attach county FIPS data.

    Appends a dict to ``not_found_records`` when the row was geocoded (or
    had no usable location string) but no county FIPS could be attached.
    """
    location = df.loc[i, 'cleaned_other_loc']
    state = df.loc[i, 'state']
    if state not in STATE_DICT:
        print('state not in dict, ', i)
        return
    state_full = STATE_DICT[state]

    def record_not_found(query):
        print('error, could not retrieve fips for: ', query)
        not_found_records.append({
            'cleaned_location': query,
            'state': state,
            'og_location': df.loc[i, 'location'],
        })

    if not isinstance(location, str):
        # e.g. NaN: concatenating it with the state name would raise.
        record_not_found(location)
        return

    location = location + state_full
    # Fixed pause between geocoder calls (presumably for Mapbox rate limits).
    time.sleep(0.5)
    g = geocoder.mapbox(location, key=get_token(i)).json
    if g is None:
        print('nothing returned!')
        return
    if g.get('status') != 'OK':
        print('status issue: ', g)
        return
    if 'state' not in g:
        print('state not in keys?')
        print(g.keys())
        print(location, state_full)
        return
    # Only accept hits inside the expected US state.
    if g['state'] != state_full or g.get('country') != 'United States':
        return

    lat = g['lat']
    lng = g['lng']
    df.loc[i, 'lat'] = lat
    df.loc[i, 'lng'] = lng

    county_hit = _fetch_county_fips(lat, lng)
    if county_hit is None:
        record_not_found(location)
        return
    state_county_fips, county = county_hit
    df.loc[i, 'state_county_fips'] = state_county_fips
    df.loc[i, 'county'] = county


def assign_county_fips(df, min_index = 0):
    """Geocode every non-Canadian row of ``df`` and attach county FIPS codes.

    For each row from ``min_index`` on whose ``possible_canada`` is False,
    the ``cleaned_other_loc`` text plus the full state name is geocoded with
    Mapbox; accepted hits get ``lat``/``lng`` and then ``state_county_fips``
    and ``county`` from the FCC census block API. Progress is written to a
    CSV every 100 rows, and every location whose FIPS lookup failed is
    written to 'rescrape_locs_not_found_v3.csv' at the end.

    Rows are addressed with ``df.loc[i, ...]`` for ``i`` in
    ``range(min_index, len(df))``, so ``df`` needs index labels 0..len-1.

    Args:
        df: Frame with columns ``location``, ``cleaned_other_loc``,
            ``state`` and ``possible_canada``. Modified in place.
        min_index: First row to process (use to resume an interrupted run).

    Returns:
        The same ``df`` object, with the new columns filled where available.
    """
    start_time = time.time()
    # Collect failures in a list and build the frame once at the end, so
    # every failure is kept instead of overwriting a single row.
    not_found_records = []

    for i in range(min_index, len(df)):
        if df.loc[i, 'possible_canada'] == False:
            _geocode_row(df, i, not_found_records)

        if i % 100 == 0 and i > min_index:
            print(i)
            curr_time = time.time()
            print('min since start: ', (curr_time-start_time)/60)
            print('saving...')
            df.to_csv(data_io.input_cleaned/'geolocations'/f'loc_fill_progress_mapbox_{i}.csv',
                      encoding='utf-8')

    not_found_df = pd.DataFrame(not_found_records,
                                columns=['cleaned_location', 'state', 'og_location'])
    not_found_df.to_csv(data_io.input_cleaned/'geolocations'/'rescrape_locs_not_found_v3.csv',
                        encoding='utf-8', index=False)
    return df
    
    

In [34]:
# Load the unique locations to geocode. No `encoding` argument is passed:
# xlsx files carry their own encoding and current pandas rejects the keyword.
df = pd.read_excel(data_io.input_cleaned/'geolocations'/'unique_locations_to_scrape_all_years.xlsx')

In [35]:
# Inspect the column names of the loaded frame.
df.columns

Index(['location', 'other_loc', 'state', 'possible_canada'], dtype='object')

In [75]:
# Row-wise cleaning of the raw location text. Both derived columns are built
# from the `location` column; `other_loc` only decides whether cleaning runs.
df['cleaned_location'] = df.apply(
    lambda row: clean_location(row['location'], row['other_loc'], row['state']),
    axis=1,
)

# Same pipeline, but the result has its trailing state abbreviation removed.
df['cleaned_other_loc'] = df.apply(
    lambda row: clean_other_loc(row['location'], row['other_loc'], row['state']),
    axis=1,
)


va 3648va va
nrpa nr50pa pa
nr, pa
 newington nh 03801 newington nh nh
 newington, nh
 nm 87017 nm nm
, nm
brierwa 98036brierwa wa
brier, wa
agmo a0g4mo mo
ag, mo
kensingtonca 94708kensingtonca ca
kensington, ca
lighthousepointfl 33064lighthousepointfl fl
lighthousepoint, fl
philadelphiapa 19130philadelphiapa pa
philadelphia, pa
yardleypa 19067yardleypa pa
yardley, pa
nikolaevskak 99556nikolaevskak ak
nikolaevsk, ak
mawsonact 2607mawsonact ct
mawsona, ct
jamaicany 11417jamaicany ny
jamaica, ny
nambouraustralia 4560 - nambour - australia ia
nambouraustral, ia
jamesburgnj 08831jamesburgnj nj
jamesburg, nj
niskayunany 12309niskayunany ny
niskayuna, ny
cheyennewy 82007cheyennewy wy
cheyenne, wy
crestviewfl 32536crestviewfl fl
crestview, fl
wrightact 2611wrightact ct
wrighta, ct
superiorco 80027superiorco co
superior, co
atlantaga 30087atlantaga ga
atlanta, ga
dunlaptn 37327dunlaptn tn
dunlap, tn
reedsvillewv 26547reedsvillewv wv
reedsville, wv
va 3648va va
nrpa nr50pa pa
nr, pa
 newington 

In [90]:
# Run the full geocoding pass (slow: one throttled network call per row),
# then export the enriched frame to Excel.
outputs = assign_county_fips(df)
# The xlsxwriter options stop strings from being auto-converted to URLs/formulas.
# NOTE(review): newer pandas releases expect these under `engine_kwargs` and no
# longer accept `encoding` in to_excel — confirm against the installed version.
writer = pd.ExcelWriter(data_io.input_cleaned/'geolocations'/'cancer_unique_locs_w_fips_mapbox.xlsx',
                       engine='xlsxwriter', options = {'strings_to_urls':False, 'strings_to_formulas':False})
outputs.to_excel(writer, index=False, encoding='utf-8')
# close() flushes the workbook to disk.
writer.close()

state not in keys?
beaconsfield, Washington Washington
100
min since start:  1.1332007924715677
saving...
state not in keys?
utakarra, Washington Washington
200
min since start:  2.183064393202464
saving...
nothing returned!
saving progress in case of issue with limits
state not in keys?
broomfie, Colorado Colorado
300
min since start:  3.243596355120341
saving...
state not in keys?
fremantle, Washington Washington
nothing returned!
saving progress in case of issue with limits
400
min since start:  4.509144985675812
saving...
state not in keys?
mundaring, Washington Washington
nothing returned!
saving progress in case of issue with limits


Status code Unknown from https://api.mapbox.com/geocoding/v5/mapbox.places/gallagher, West Virginia.json: ERROR - HTTPSConnectionPool(host='api.mapbox.com', port=443): Read timed out. (read timeout=5.0)


nothing returned!
saving progress in case of issue with limits
500
min since start:  5.806674396991729
saving...
600
min since start:  6.9984957297643025
saving...
state not in keys?
israel, Illinois Illinois
state not in keys?
attadale, Washington Washington
nothing returned!
saving progress in case of issue with limits


Status code 429 from https://api.mapbox.com/geocoding/v5/mapbox.places/perrin, Texas.json: ERROR - 429 Client Error: Unknown for url: https://api.mapbox.com/geocoding/v5/mapbox.places/perrin,%20Texas.json?access_token=[REDACTED]


nothing returned!
saving progress in case of issue with limits
state not in keys?
perth bc, Washington Washington
700
min since start:  8.181849575042724
saving...
state not in keys?
nr, Pennsylvania Pennsylvania
nothing returned!
saving progress in case of issue with limits
800
min since start:  9.352406803766886
saving...
state not in keys?
mundijong, Washington Washington
nothing returned!
saving progress in case of issue with limits
900
min since start:  10.507759312788645
saving...
state not in keys?
kallaroo, Washington Washington
1000
min since start:  11.718306056658427
saving...
state not in keys?
petach tik, Illinois Illinois
1100
min since start:  12.905295677979787
saving...
state not in keys?
kardinya, Washington Washington
state not in keys?
voglo, California California
1200
min since start:  14.11553134918213
saving...
1300
min since start:  15.298754318555195
saving...
state not in keys?
kicc, California California
1400
min since start:  16.469980156421663
saving...
sta

Status code Unknown from https://api.mapbox.com/geocoding/v5/mapbox.places/bradford, Ohio.json: ERROR - HTTPSConnectionPool(host='api.mapbox.com', port=443): Read timed out. (read timeout=5.0)


nothing returned!
saving progress in case of issue with limits
2000
min since start:  23.592376430829365
saving...
state not in keys?
tzfat, Illinois Illinois
state not in keys?
cataby, Washington Washington
2100
min since start:  24.77429452339808
saving...
nothing returned!
saving progress in case of issue with limits
2200
min since start:  25.9578076839447
saving...
2300
min since start:  27.120320085684458
saving...
state not in keys?
kudardup, Washington Washington
2400
min since start:  28.31389262676239
saving...
2500
min since start:  29.59175401131312
saving...
state not in keys?
mooliabeenee, Washington Washington
2600
min since start:  30.83231415748596
saving...
2700
min since start:  32.09411746263504
saving...
2800
min since start:  33.29146919647852
saving...
nothing returned!
saving progress in case of issue with limits


Status code Unknown from https://api.mapbox.com/geocoding/v5/mapbox.places/terre hill, Pennsylvania.json: ERROR - HTTPSConnectionPool(host='api.mapbox.com', port=443): Read timed out. (read timeout=5.0)


nothing returned!
saving progress in case of issue with limits
2900
min since start:  34.58028984069824
saving...
3000
min since start:  35.90706319014232
saving...
3100
min since start:  37.371473689874016
saving...
3200
min since start:  38.57873911857605
saving...
state not in keys?
telaviv, Illinois Illinois
state not in keys?
spalding, Washington Washington
nothing returned!
saving progress in case of issue with limits
state not in keys?
ranford, Washington Washington
3300
min since start:  39.773038605848946
saving...
3400
min since start:  41.00050669511159
saving...
3500
min since start:  42.19309417804082
saving...
3600
min since start:  43.41201312541962
saving...
3700
min since start:  44.609592668215434
saving...
3800
min since start:  45.80174829959869
saving...
nothing returned!
saving progress in case of issue with limits
state not in keys?
hopetoun, Washington Washington
3900
min since start:  47.0204675078392
saving...
state not in keys?
yarloop, Washington Washington


Status code 429 from https://api.mapbox.com/geocoding/v5/mapbox.places/bennington, Indiana.json: ERROR - 429 Client Error: Unknown for url: https://api.mapbox.com/geocoding/v5/mapbox.places/bennington,%20Indiana.json?access_token=[REDACTED]


nothing returned!
saving progress in case of issue with limits
state not in keys?
elachbutting, Washington Washington
state not in keys?
, New Mexico New Mexico
state not in keys?
julimar, Washington Washington
4300
min since start:  51.80333469708761
saving...
4400
min since start:  52.999394563833874
saving...
4500
min since start:  54.205543688933055
saving...
4600
min since start:  55.46230252981186
saving...
4700
min since start:  56.68309169610341
saving...
state not in keys?
beechina, Washington Washington
4800
min since start:  58.00776523351669
saving...
state not in keys?
wooroloo, Washington Washington
state not in keys?
carnarvon, Washington Washington
4900
min since start:  59.28966868321101
saving...
state not in keys?
kebaringup, Washington Washington
state not in keys?
gillimanning, Washington Washington
5000
min since start:  60.7744399189949
saving...
5100
min since start:  62.145903793970746
saving...
5200
min since start:  63.395285137494405
saving...
state not in k

Status code Unknown from https://api.mapbox.com/geocoding/v5/mapbox.places/singer, Louisiana.json: ERROR - HTTPSConnectionPool(host='api.mapbox.com', port=443): Read timed out. (read timeout=5.0)


nothing returned!
saving progress in case of issue with limits
state not in keys?
karnup, Washington Washington
5300
min since start:  64.68500730991363
saving...
state not in keys?
learmonth, Washington Washington


Status code Unknown from https://api.mapbox.com/geocoding/v5/mapbox.places/marion, Mississippi.json: ERROR - HTTPSConnectionPool(host='api.mapbox.com', port=443): Read timed out. (read timeout=5.0)


nothing returned!
saving progress in case of issue with limits
state not in keys?
leinster, Washington Washington
5400
min since start:  66.07200440565745
saving...
state not in keys?
grimwade, Washington Washington
5500
min since start:  67.30644034147262
saving...
5600
min since start:  68.51071870326996
saving...
nothing returned!
saving progress in case of issue with limits
nothing returned!
saving progress in case of issue with limits
state not in keys?
voxl , California California
nothing returned!
saving progress in case of issue with limits
5700
min since start:  69.62796162366867
saving...
state not in keys?
dalwallinu, Washington Washington
state not in keys?
ag, Missouri Missouri
nothing returned!
saving progress in case of issue with limits
5800
min since start:  70.80635107755661
saving...
state not in keys?
broomehill, Washington Washington
state not in keys?
mawsona, Connecticut Connecticut
nothing returned!
saving progress in case of issue with limits
5900
min since sta

Status code Unknown from https://api.mapbox.com/geocoding/v5/mapbox.places/courtland, California.json: ERROR - HTTPSConnectionPool(host='api.mapbox.com', port=443): Read timed out. (read timeout=5.0)


nothing returned!
saving progress in case of issue with limits
state not in keys?
yarragadee, Washington Washington
nothing returned!
saving progress in case of issue with limits
6300
min since start:  77.85727317333222
saving...
nothing returned!
saving progress in case of issue with limits
nothing returned!
saving progress in case of issue with limits
6400
min since start:  79.0476393143336
saving...
state not in keys?
wrighta, Connecticut Connecticut
nothing returned!
saving progress in case of issue with limits
state not in keys?
mullaloo, Washington Washington
nothing returned!
saving progress in case of issue with limits
6500
min since start:  80.22873662312826
saving...
nothing returned!
saving progress in case of issue with limits
nothing returned!
saving progress in case of issue with limits
nothing returned!
saving progress in case of issue with limits
6600
min since start:  81.15699141820272
saving...
nothing returned!
saving progress in case of issue with limits
nothing ret