In [2]:
import json
import os
from urllib.parse import urlencode

import pandas as pd
import redis
import requests

In [3]:
# Redis connection (local instance, database 0) that fetch_place uses as its cache.
redis_client = redis.Redis(host='localhost', port=6379, db=0)

In [4]:
# Google Maps API key. Read from the GOOGLE_MAPS_API_KEY environment variable so the
# real credential never has to be saved in the notebook; the placeholder is only a fallback.
api_key = os.environ.get("GOOGLE_MAPS_API_KEY", "the key")

In [5]:
def _component_short_name(place, component_type):
    """Return the short_name of the first address component tagged ``component_type``, or None."""
    for component in place.get('address_components', []):
        if component_type in component.get('types', []):
            return component.get('short_name')
    return None


def extract_lat_lng(address, address_type = 'administrative_area_level_2', update = False):
    """
    Take an address and find the latitude and longitude of the county it belongs to.

    The address is geocoded with ``fetch_place``. Depending on the type of the
    place that comes back:

    * country / state level: too coarse to map to a county, ``None`` is returned;
    * below county level (city, street, ...): the county and state are read from
      the place's address components and ``"<county>, <state>"`` is geocoded instead;
    * any other type that differs from ``address_type``: the address and its type
      are printed and the place's own coordinates are used.

    Parameters
    ----------
    address : str
        Address to geocode.
    address_type : str
        Expected place type; only used to decide whether a mismatch is printed.
    update : bool
        Forwarded to ``fetch_place`` to bypass the cache.

    Returns
    -------
    list or None
        ``[address, initial_place_type, lat, lng]`` where ``address`` is the county
        address when one was derived, otherwise the input. ``None`` when the place
        could not be resolved.
    """

    place = fetch_place(address, update)

    try:
        place_type = place['types'][0]
        initial_place_type = place_type

        # type is larger than a county, e.g. state, country
        if place_type in ('country', 'administrative_area_level_1'):
            print(address, place_type)
            return None

        # type is smaller than a county, e.g., city, street
        elif place_type in ('administrative_area_level_3', 'locality', 'route', 'establishment', 'neighborhood', 'airport', 'colloquial_area', 'political'):
            # Look the components up by type over the whole list rather than by
            # position, so long component lists and short ones are both handled.
            county = _component_short_name(place, 'administrative_area_level_2')
            state = _component_short_name(place, 'administrative_area_level_1')
            if not county or not state:
                print(address, place_type)
                return None
            address = f"{county}, {state}"
            place = fetch_place(address, update)

        elif place_type != address_type:
            print(address, place_type)

        location = place['geometry']['location']
        return [address, initial_place_type, location['lat'], location['lng']]

    except Exception as e:
        # Best effort: an empty or malformed place is reported and skipped.
        print(e)
        print(place)
        return None

In [6]:
def fetch_place(address, update:bool = False, timeout: float = 10):
    """
    Return the first Google Geocoding result for ``address`` as a dict.

    Results are cached in redis under the key ``f"{address}_place"``. A cached
    entry is served unless ``update`` is true, in which case the API is queried
    again and the cache entry is overwritten.

    Parameters
    ----------
    address : str
        Address sent to the geocoding endpoint.
    update : bool
        Ignore any cached entry and fetch from the API again.
    timeout : float
        Seconds to wait for the HTTP response before ``requests`` raises.

    Returns
    -------
    dict
        The first element of the response's ``results`` list, or ``{}`` when the
        status code is not 2xx or the response holds no results.
    """

    place_key = f"{address}_place"
    cached = None if update else redis_client.get(place_key)

    if cached:
        print('Found place in cache, serving from redis...')
        # An entry stored as JSON null is normalised to an empty dict.
        return json.loads(cached) or {}

    print('Could not find place in cache. Retrieving from Google Maps API...')
    endpoint = "https://maps.googleapis.com/maps/api/geocode/json"
    params = {"address": address, "key": api_key}
    url = f"{endpoint}?{urlencode(params)}"
    r = requests.get(url, timeout=timeout)

    place = {}
    if 200 <= r.status_code < 300:
        results = r.json().get('results')
        if results:
            place = results[0]

    # NOTE: empty outcomes are cached as well; call with update=True to retry them.
    redis_client.set(place_key, json.dumps(place))
    return place

In [7]:
# Geocode only the first 100 birthplaces as a sample.
df = pd.read_csv("results/bioguide_birth_places_schools.csv")
birthplaces = df["birthplace"][:100]
tick = 0

# One slot per sampled row; rows whose birthplace is not a string keep None.
locations = [None] * len(birthplaces)
for i in range(len(birthplaces)):
    birthplace = birthplaces[i]
    if type(birthplace) is str:
        location = extract_lat_lng(birthplace)
        locations[i] = location

Found place in cache, serving from redis...
Found place in cache, serving from redis...
Found place in cache, serving from redis...
Found place in cache, serving from redis...
Found place in cache, serving from redis...
Found place in cache, serving from redis...
Found place in cache, serving from redis...
Found place in cache, serving from redis...
Found place in cache, serving from redis...
Found place in cache, serving from redis...
Found place in cache, serving from redis...
Found place in cache, serving from redis...
Found place in cache, serving from redis...
Found place in cache, serving from redis...
Found place in cache, serving from redis...
Found place in cache, serving from redis...
Found place in cache, serving from redis...
Found place in cache, serving from redis...
Found place in cache, serving from redis...
Found place in cache, serving from redis...
Found place in cache, serving from redis...
Found place in cache, serving from redis...
Found place in cache, serving fr

In [8]:
# How many inputs were initially geocoded as a 'locality'.
locality_num = 0
for location in locations:
    # Entries that were never geocoded are None and have no type to inspect.
    if location is not None and location[1] == 'locality':
        locality_num += 1
locality_num

81

In [17]:
# Replace entries without a result by a row of four Nones so every row has the
# same width. Note that this updates `locations` in place.
for i, entry in enumerate(locations):
    if entry is None:
        locations[i] = [None] * 4

location_columns = ['address', 'initial_type', 'lat', 'lng']
df_locations = pd.DataFrame(locations, columns=location_columns)
df_locations

Unnamed: 0,address,initial_type,lat,lng
0,"Jefferson County, IA",locality,41.023636,-91.909924
1,"Dallas County, AL",locality,32.233214,-87.142289
2,"Douglas County, NE",locality,41.314812,-96.195132
3,"Todd County, Ky",administrative_area_level_2,36.833864,-87.142289
4,"Cambria County, PA",locality,40.489423,-78.747621
...,...,...,...,...
95,"Sacramento County, CA",locality,38.474670,-121.354163
96,"Bucks County, PA",locality,40.410796,-75.247906
97,"Northumberland County, PA",locality,40.867434,-76.687470
98,"Dallas County, TX",locality,32.802468,-96.835100


In [18]:
# Count the rows whose latitude is missing.
missing = sum(1 for lat in df_locations['lat'] if pd.isna(lat))
missing

2