In [None]:
import sys
sys.path.append('../src/')

import pandas as pd
import requests
import numpy as np
import geocoder
import data_io
import utils
import time
import tokens

In [None]:

# Hand-filled replacements for locations that failed to geocode. When a key
# occurs anywhere inside a raw location string, the cleaning functions use
# the mapped value instead. Insertion order matters: the first match wins.
WEIRD_LOC_DICT = {
    'westseneca': 'west seneca, ny',
    'linolakes': 'lino lakes, mn',
    'newportrichey': 'new port richey, fl',
    'mayslick': 'mays lick, ky',
    'boiseid': 'boise, id',
    'dixonil': 'dixon, il',
    'dallastx': 'dallas, tx',
    '5779purpleleafcourtfrederickmd': 'frederick, md',
    '07060northplainfieldnj': 'north plainfield, nj',
    '08108haddontownshipnj': 'haddon township, nj',
    '33134coralgablesfl': 'coral gables, fl',
    '92587canyonlakeca': 'canyon lake, ca',
    '95682cameronparkca': 'cameron park, ca',
    '92075solanabeachca': 'solana beach, ca',
    'de forest, wi': 'deforest, wi',
    'horsesho, tx': 'horseshoe, tx',
    'wesley c, fl': 'wesley chapel, fl',
}

# API credentials are read from the local `tokens` module.
CENSUS_KEY = tokens.CENSUS_KEY
MB_TOKENS = tokens.MAPBOX_TOKENS


# Inverse of utils.us_state_abbrev: its values become the keys here.
STATE_DICT = {value: key for key, value in utils.us_state_abbrev.items()}
CLEANED_FOLDER_NAME = 'mapbox_geocode'
NOT_FOUND_FILENAME = 'locs_not_found_master_recode_v2.csv'

In [None]:
import string
def contains_only_digits(s):
    """Tell whether ``s`` is made up exclusively of ASCII digits.

    True for "", "0", "123"; False for "1.2", "1,2", "-1", "a", "a1".
    Any value that is not exactly a ``str`` (e.g. a missing value) is
    reported as True as well.
    """
    if type(s) != str:
        return True
    return all(ch in string.digits for ch in s)

def has_numbers(inputString):
    """Return True as soon as one character of ``inputString`` is a digit
    (according to ``str.isdigit``), False if none is."""
    for char in inputString:
        if char.isdigit():
            return True
    return False


def _normalize_location(x, state):
    """Rewrite a raw location string into ``"<city>, <state>"`` style.

    Steps, in order: apply the hand-written ``WEIRD_LOC_DICT`` overrides,
    drop numeric characters and dashes, strip a trailing ``", us"``, insert
    a missing comma in front of the state, expand ``"a f b"`` and close up
    ``"mc "`` prefixes, and remove one leading space.

    Args:
        x: location string to clean (must be a ``str``).
        state: state value of the row. It is only used when it is a ``str``,
            in which case it is lower-cased before being searched for.

    Returns:
        The cleaned string, or the sentinel ``'none'`` when a comma is
        needed but cannot be placed (the state does not occur in ``x`` or
        nothing precedes it).
    """
    if isinstance(state, str):
        state = state.lower()

    # Hand-curated fixes take precedence over every heuristic below; the
    # first key found inside the string wins.
    for fragment, replacement in WEIRD_LOC_DICT.items():
        if fragment in x:
            return replacement

    if has_numbers(x):
        # Remove numeric characters (e.g. zip codes), then the dashes that
        # may be left behind.
        x = ''.join(c for c in x if not c.isnumeric())
        x = x.replace(" - ", "").replace("-", "")

    if x.endswith(', us'):
        x = x[:-4]

    if ',' not in x and isinstance(state, str):
        # Search from the right: the state is expected after the city name,
        # and a left-to-right search would match the same letters inside the
        # city itself ("portland or" would become "p, ortland or").
        state_pos = x.rfind(state) if state else -1
        city = x[:state_pos].rstrip(' ') if state_pos > 0 else ''
        x = (city + ', ' + x[state_pos:]) if city else 'none'

    x = x.replace('a f b', 'air force base')
    if 'mc ' in x and 'mc ,' not in x:
        x = x.replace('mc ', 'mc')
    # startswith() is safe on an empty string, unlike indexing x[0].
    if x.startswith(' '):
        x = x[1:]
    return x


def clean_location(x, other_loc, state):
    """Return a cleaned ``"<city>, <state>"`` version of the location ``x``.

    Args:
        x: raw location value.
        other_loc: companion value used as a gate. When it is not a string,
            or is a string made only of digits, ``x`` is returned untouched.
        state: state value of the row (see ``_normalize_location``).

    Returns:
        The cleaned string, ``'none'`` when no comma could be placed, or
        ``x`` itself when cleaning is skipped (gate above, or ``x`` is not a
        string).
    """
    if contains_only_digits(other_loc) or not isinstance(x, str):
        return x
    return _normalize_location(x, state)


def clean_other_loc(x, other_loc, state):
    """Return the cleaned location with its last two characters removed.

    The cleaning is the same as in ``clean_location``; the final two
    characters are then cut off (presumably the two-letter state
    abbreviation, since ``assign_county_fips`` appends the full state name
    to this value).

    Args:
        x: raw location value.
        other_loc: companion value used as a gate. When it is not a string,
            or is a string made only of digits, ``x`` is returned untouched.
        state: state value of the row (see ``_normalize_location``).

    Returns:
        The truncated cleaned string, or ``x`` itself when cleaning is
        skipped (gate above, or ``x`` is not a string).
    """
    if contains_only_digits(other_loc) or not isinstance(x, str):
        return x
    return _normalize_location(x, state)[:-2]


In [None]:

def get_token(index, token_pool=None):
    """Pick an API token for request number ``index`` in round-robin order.

    Args:
        index: integer position of the request (e.g. the row number).
        token_pool: optional sequence of tokens to rotate through. Defaults
            to the module-level ``MB_TOKENS``.

    Returns:
        ``token_pool[index % len(token_pool)]``. With two tokens this means
        the first token for even indices and the second for odd ones.

    Raises:
        ValueError: if the pool is empty.
    """
    if token_pool is None:
        token_pool = MB_TOKENS
    if len(token_pool) == 0:
        raise ValueError('no Mapbox tokens configured')
    return token_pool[index % len(token_pool)]
    
'''
GEOCODER FUNCTION
'''

def _lookup_county(lat, lng):
    """Ask the FCC census-block API which county contains a coordinate.

    Returns:
        ``(state_county_fips, county_name)`` on success, or ``None`` when
        the response body is not JSON or lacks the ``County`` fields.
    """
    fcc_req = f'https://geo.fcc.gov/api/census/block/find?latitude={lat}&longitude={lng}&showall=false&format=json&key={CENSUS_KEY}'
    resp = requests.get(fcc_req)
    try:
        result = resp.json()
    except ValueError:
        # Response.json() raises a ValueError subclass for a non-JSON body.
        print('error in fip retrieval: ', resp)
        print(type(resp))
        return None
    try:
        return result['County']['FIPS'], result['County']['name']
    except (KeyError, TypeError):
        return None


def _geocode_row(df, i, state_full):
    """Geocode row ``i`` of ``df`` in place.

    Returns:
        A not-found record (dict) when coordinates were obtained but the
        county lookup failed, otherwise ``None``.
    """
    location = df.loc[i, 'cleaned_other_loc'] + state_full
    # Pause between geocoder calls (presumably to stay under a rate limit).
    time.sleep(0.5)
    g = geocoder.mapbox(location, key=get_token(i)).json
    if g is None:
        print('nothing returned!')
        return None
    if g['status'] != 'OK':
        print('status issue: ', g)
        return None
    if 'state' not in g:
        print('state not in keys?')
        print(g.keys())
        print(location, state_full)
        return None
    # Only accept a hit that lies in the expected US state.
    if g['state'] != state_full or g.get('country') != 'United States':
        return None

    lat = g['lat']
    lng = g['lng']
    df.loc[i, 'lat'] = lat
    df.loc[i, 'lng'] = lng

    # Get the FIPS code for the corresponding lat/long.
    county_info = _lookup_county(lat, lng)
    if county_info is None:
        print('error, could not retrieve fips for: ', location)
        return {'cleaned_location': location,
                'state': df.loc[i, 'state'],
                'og_location': df.loc[i, 'location']}
    state_county_fips, county = county_info
    df.loc[i, 'state_county_fips'] = state_county_fips
    df.loc[i, 'county'] = county
    return None


def assign_county_fips(df, min_index = 0):
    """Geocode each location and attach coordinates and county FIPS codes.

    For every row from ``min_index`` on whose ``possible_canada`` is False
    and whose ``state`` is a key of ``STATE_DICT``, the value of
    ``cleaned_other_loc`` plus the full state name is sent to the Mapbox
    geocoder. Accepted hits fill ``lat``/``lng``; the FCC census-block API
    then fills ``state_county_fips`` and ``county``.

    Side effects:
        * ``df`` is modified in place.
        * Every 100 rows a progress CSV ``loc_fill_progress_mapbox_<i>.csv``
          is written under ``data_io.input_cleaned/'geolocations'``.
        * Each row whose county lookup failed is written to
          ``rescrape_locs_not_found_v3.csv`` in the same folder.

    Args:
        df: table with ``possible_canada``, ``cleaned_other_loc``, ``state``
            and ``location`` columns. Rows are addressed with
            ``df.loc[i, ...]`` for ``i`` in ``range(min_index, len(df))``,
            so a default 0..n-1 integer index is assumed.
        min_index: first row to process (useful to resume a run).

    Returns:
        The same ``df`` object, updated.
    """
    start_time = time.time()
    # Collect failures in a list and build the frame once at the end, so
    # every failure gets its own row.
    not_found_rows = []
    for i in range(min_index, len(df)):
        if df.loc[i, 'possible_canada'] == False:
            state = df.loc[i, 'state']
            if state in STATE_DICT:
                missing = _geocode_row(df, i, STATE_DICT[state])
                if missing is not None:
                    not_found_rows.append(missing)
            else:
                print('state not in dict, ', i)
        if i % 100 == 0 and i > min_index:
            print(i)
            curr_time = time.time()
            print('min since start: ', (curr_time-start_time)/60)
            print('saving...')
            df.to_csv(data_io.input_cleaned/'geolocations'/f'loc_fill_progress_mapbox_{i}.csv',
                                     encoding='utf-8')

    not_found_df = pd.DataFrame(not_found_rows,
                                columns=['cleaned_location', 'state', 'og_location'])
    not_found_df.to_csv(data_io.input_cleaned/'geolocations'/'rescrape_locs_not_found_v3.csv', encoding='utf-8',
                     index=False)
    return df
    
    

In [None]:
# Load the unique locations that still need geocoding. No `encoding` keyword
# is passed: pandas.read_excel does not take one.
df = pd.read_excel(data_io.input_cleaned/'geolocations'/'unique_locations_to_scrape_all_years.xlsx')

In [None]:
# Display the column labels of the loaded table (the cells below read the
# 'location', 'other_loc' and 'state' columns).
df.columns

In [None]:
# Build both cleaned variants of the raw location, one row at a time.
df['cleaned_location'] = df.apply(
    lambda row: clean_location(row['location'], row['other_loc'], row['state']),
    axis=1,
)

df['cleaned_other_loc'] = df.apply(
    lambda row: clean_other_loc(row['location'], row['other_loc'], row['state']),
    axis=1,
)


In [None]:
# Run the geocoder over the whole table, then store the result as a workbook.
outputs = assign_county_fips(df)

# Keep xlsxwriter from turning strings into hyperlinks or formulas.
# NOTE(review): `options=` and `encoding=` are legacy pandas keywords;
# confirm the pinned pandas version still accepts them.
fips_workbook_path = data_io.input_cleaned/'geolocations'/'cancer_unique_locs_w_fips_mapbox.xlsx'
xlsx_options = {'strings_to_urls':False, 'strings_to_formulas':False}
writer = pd.ExcelWriter(fips_workbook_path, engine='xlsxwriter', options = xlsx_options)
outputs.to_excel(writer, index=False, encoding='utf-8')
writer.close()