In [None]:
# !pip install -U googlemaps

In [None]:
# !pip install censusgeocode

In [1]:
import googlemaps
from datetime import datetime

import censusgeocode as cg
import numpy as np
import pandas as pd
import json
import redis
import re

In [None]:
# Read the Google Maps API key from a local text file. The context manager closes the
# file handle, and strip() removes the trailing newline/whitespace that would otherwise
# be sent as part of the key.
# NOTE(review): the path is an absolute, user-specific location — adjust on other machines.
with open('/Users/elainewei/Desktop/google-api.txt') as key_file:
    key = key_file.read().strip()

gmaps = googlemaps.Client(key=key)

In [None]:
# Redis connection (local server, database 0) used by fetch_place to cache geocoding results.
redis_client = redis.Redis(
    host='localhost',
    port=6379,
    db=0,
)

In [None]:
def get_fips(string, update=False):
    """Resolve a free-text place description to a county GEOID (FIPS code).

    The text is geocoded with ``fetch_place``; when exactly one match comes back and
    ``check_county`` accepts it, the match's coordinates are passed to the Census
    geocoder (``cg.coordinates``) and the GEOID of the first returned county is used.

    Args:
        string: Place description. Non-string values are skipped.
        update: Forwarded to ``fetch_place``.

    Returns:
        A ``(fips, error)`` tuple. ``fips`` is the county GEOID or ``None``;
        ``error`` is a short message explaining why no code was produced, or
        ``None`` on success and for non-string input.
    """
    print(string)

    if not isinstance(string, str):
        return None, None

    if re.search(r'\(.*\)', string):
        # Strings containing a parenthesised part are treated as non-locations.
        return None, 'Not a location'

    # NOTE: exceptions raised by the lookup itself are not caught here and propagate.
    location_result = fetch_place(string, update)

    if len(location_result) == 0:
        return None, 'No matched places from Google'
    if len(location_result) > 1:
        return None, "More than one matched place from Google"
    if not check_county(location_result):
        return None, "No county matched to string from Google"

    try:
        c = cg.coordinates(x=location_result[0]['geometry']['location']['lng'],
                           y=location_result[0]['geometry']['location']['lat'])
        return c["Counties"][0]["GEOID"], None
    except Exception:
        # Catch ordinary failures only: a bare ``except`` would also swallow
        # KeyboardInterrupt/SystemExit and make a long batch run impossible to stop.
        return None, "Census geocoder error"
            

def check_county(location_result):
    """Report whether the first geocoding match carries county-level information.

    Looks at the ``address_components`` of ``location_result[0]`` and returns True
    when any component's ``types`` includes ``administrative_area_level_2`` or
    ``locality``; otherwise returns False.
    """
    # 'locality' is accepted as well because some cities are not subjected to any
    # county in the place information, e.g., NYC, St. Louis, Richmond.
    accepted_types = ('administrative_area_level_2', 'locality')
    return any(
        wanted in component['types']
        for component in location_result[0]['address_components']
        for wanted in accepted_types
    )


In [None]:
def fetch_place(string, update:bool = False):
    """
    Return the Google Maps geocoding result for ``string``, using Redis as a cache.

    The string itself is the cache key. On a cache hit the stored JSON is decoded and
    returned. On a miss, or when ``update`` is true, the Google Maps API is called
    and the result is written to the cache (as JSON) before being returned.

    Args:
        string: the text to geocode.
        update: when True, skip the cache read and refresh the cached entry.

    Returns:
        The decoded geocoding result (what ``gmaps.geocode`` returned, or its
        JSON round-trip from the cache).
    """
    place_key = string

    # A forced refresh never uses the cached value, so skip the Redis round trip.
    cached = None if update else redis_client.get(place_key)

    if not cached:
        print('Could not find place in cache. Retrieving from Google Maps API...')
        place = gmaps.geocode(string)
        redis_client.set(place_key, json.dumps(place))
        return place

    print('Found place in cache, serving from redis...')
    return json.loads(cached)

In [None]:
# Try a single address, passing update=True so fetch_place ignores any cached entry.
get_fips(
    'Oak Park, Cook County, Ill.',
    update=True,
)

In [15]:
# Load the sample timeline table; the geocoding loop that follows looks for
# location_1, location_2, ... columns in it.
df = pd.read_csv('results/sample_timeline.csv')

In [16]:
# Geocode every location_<n> column in turn, adding a fips<n> and an error<n> column
# next to it. The loop stops at the first <n> for which no location column exists.
i = 1

while f'location_{i}' in df.columns:
    # get_fips returns (fips, error) per cell; zip(*...) splits the pairs into two columns.
    df[f'fips{i}'], df[f'error{i}'] = zip(
        *df[f'location_{i}'].map(get_fips)
    )
    # Write the frame out after each column so finished work is kept on disk.
    df.to_csv('results/sample_timeline_fips.csv', index=False)
    i += 1

print("Finished: ")
display(df)

NameError: name 'get_fips' is not defined

In [2]:
# Reload the geocoded timeline from results/sample_timeline_fips.csv (the file the
# geocoding loop writes), so the reshaping cells below can start from disk.
df = pd.read_csv('results/sample_timeline_fips.csv')

In [3]:
# Remove every column whose name matches 'location', then every column whose name
# matches 'error'.
df = df.drop(columns=list(df.filter(regex='location')))
df = df.drop(columns=list(df.filter(regex='error')))

In [4]:
# Display the frame to check which columns remain.
df

Unnamed: 0,id,start_1,end_1,start_2,end_2,start_3,end_3,start_4,end_4,start_5,...,fips6,fips7,fips8,fips9,fips10,fips11,fips12,fips13,fips14,fips15
0,E000293,1959,1977,1977,1981,1981,1985,1985,2021,2005,...,,,,,,,,,,
1,K000394,1982,2000,2000,2002,2002,2004,2004,2010,2010,...,,,,,,,,,,
2,T000478,1961,1979,1979,1983,1983,1987,1987,2011,2011,...,,,,,,,,,,
3,M000523,1955,1974,1974,1978,1978,1984,1984,1988,1988,...,,,,,,,,,,
4,F000479,1969,1987,1987,1991,1991,1993,1993,1999,1999,...,42003.0,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,T000467,1959,1981,1981,1988,1988,1990,1990,1995,1995,...,,,42027.0,,,,,,,
96,C001117,1971,1989,1989,1993,1993,1998,1998,2019,2019,...,,,,,,,,,,
97,A000378,1965,1983,1983,1987,1987,2002,2002,2019,2019,...,,,,,,,,,,
98,M001210,1963,1981,1981,1985,1985,1989,1989,2015,2015,...,,,,,,,,,,


In [88]:
# Reshape wide -> long: one row per (id, stint number) with start/end/fips columns.
# The stint number is dropped afterwards, and rows without a FIPS code are discarded.
df_long = (
    pd.wide_to_long(df, stubnames=['start_', 'end_', 'fips'], i="id", j="number")
    .reset_index()
    .drop(columns=['number'])
    .rename(columns={'start_': 'start', 'end_': 'end'})
    .dropna(subset=['fips'])
)

In [89]:
# Normalise the year columns: open-ended markers become 2023, '?' becomes missing,
# and the result is converted to a numeric dtype.
for col in ('start', 'end'):
    df_long[col] = pd.to_numeric(
        df_long[col]
        .replace(['present', 'Present', 'Current'], 2023)
        .replace('?', np.nan),
        downcast='integer',
    )

# A stint with no end year is treated as ending in the year it started.
df_long['end'] = df_long['end'].fillna(df_long['start'])


In [90]:
# Stint length in years (end - start), stored in the smallest integer dtype that fits.
df_long['length'] = pd.to_numeric(
    df_long['end'] - df_long['start'],
    downcast='integer',
)

In [91]:
# Display the long-format frame, including the computed length column.
df_long

Unnamed: 0,id,start,end,fips,length
0,E000293,1959,1977.0,17031.0,18
1,K000394,1982,2000.0,25025.0,18
2,T000478,1961,1979.0,36065.0,18
3,M000523,1955,1974.0,13121.0,19
4,F000479,1969,1987.0,42011.0,18
...,...,...,...,...,...
1068,C001077,2019,2023.0,8005.0,4
1081,C001085,2001,2001.0,26163.0,0
1123,W000154,1979,2009.0,51510.0,30
1223,W000154,2021,2021.0,51013.0,0


In [92]:
# Expand each stint into one column per year: year0 is the start year, year1..year<k-1>
# are start+1..start+k-1, and year<k> (k = length) holds the end year.

# Series.max() skips missing values, whereas the builtin max() gives order-dependent
# results when NaN is present. int() turns the (possibly narrow, downcast) NumPy
# scalar into a plain Python int before it is used in arithmetic.
max_len = int(df_long['length'].max())
new_columns = [f'year{i}' for i in range(1, max_len+1)]

# Create the new columns, pre-filled with pd.NaT as the "no year" marker
# (the later dropna treats it as missing).
df_long[new_columns] = pd.DataFrame([[pd.NaT] * (max_len)] * len(df_long), index=df_long.index)

for idx, row in df_long.iterrows():
    start_year = row['start']
    length = row['length']
    # Nothing to fill for unknown, zero, or negative lengths. A negative length
    # (end before start) must not reach the indexing below: new_columns[length-1]
    # would count from the end of the list and write into an unrelated year column.
    if pd.isna(length) or length <= 0:
        continue
    length = int(length)
    for i in range(1, length):
        df_long.at[idx, new_columns[i-1]] = start_year + i
    # The last populated column carries the recorded end year.
    df_long.at[idx, new_columns[length-1]] = row['end']

df_long = df_long.rename(columns={'start': 'year0'})
df_long = df_long.drop(['end', 'length'], axis='columns')


In [93]:
# Names of all per-year columns: year0 .. year<max_len>.
years = [f'year{n}' for n in range(max_len+1)]

# Melt the per-year columns into rows, discard empty year slots, order by lawmaker
# and year, and keep one row per distinct (id, year, fips).
df_long2 = (
    df_long
    .melt(id_vars=['id', 'fips'], value_vars=years, value_name='year')
    .drop(columns=['variable'])
    .dropna(subset=['year'])
    .sort_values(by=['id', 'year'])
    .loc[:, ['id', 'year', 'fips']]
    .drop_duplicates()
)

Unnamed: 0,id,year,fips
49,A000361,1946,22049.0
418,A000361,1947,22049.0
787,A000361,1948,22049.0
1156,A000361,1949,22049.0
1525,A000361,1950,22049.0
...,...,...,...
2176,Z000017,2019,11001.0
2545,Z000017,2020,11001.0
2914,Z000017,2021,11001.0
3283,Z000017,2022,11001.0


In [95]:
# Display the id / year / fips table.
df_long2

Unnamed: 0,id,year,fips
49,A000361,1946,22049.0
418,A000361,1947,22049.0
787,A000361,1948,22049.0
1156,A000361,1949,22049.0
1525,A000361,1950,22049.0
...,...,...,...
2176,Z000017,2019,11001.0
2545,Z000017,2020,11001.0
2914,Z000017,2021,11001.0
3283,Z000017,2022,11001.0


In [96]:
# Save the id / year / fips table (without the index) for the comparison cells below.
df_long2.to_csv('results/sample_location_fips.csv', index=False)

In [97]:
# sample_df: the full sample set of lawmakers. It is loaded separately because some
# ids were deleted from the new table in the dropna process.
sample_df = pd.read_csv('results/sample_bios_after109nd.csv') # get the sample set since some were deleted in the dropna process
# old_df: the earlier county-year table (keyed by bioguide_id) that the new results are compared against.
old_df = pd.read_csv('Lawmaker Location by County-Year, BG and CD (Best Version, June 2019).csv')
# new_df: the id / year / fips table written earlier in this notebook.
new_df = pd.read_csv('results/sample_location_fips.csv')

In [98]:
# Ids of the sampled lawmakers, as a plain list for the isin() filters.
sample = sample_df['id'].tolist()

In [105]:
# Restrict both tables to the sampled lawmakers; exact duplicate rows are removed
# from the old table only.
filtered_old_df = old_df.loc[old_df['bioguide_id'].isin(sample)].drop_duplicates()
filtered_new_df = new_df.loc[new_df['id'].isin(sample)]

In [116]:
# Display a de-duplicated view of the new table. The result is not assigned, so
# filtered_new_df itself is left unchanged.
filtered_new_df.drop_duplicates()

Unnamed: 0,id,year,fips
0,A000361,1946.0,22049.0
1,A000361,1947.0,22049.0
2,A000361,1948.0,22049.0
3,A000361,1949.0,22049.0
4,A000361,1950.0,22049.0
...,...,...,...
3625,Z000017,2019.0,11001.0
3626,Z000017,2020.0,11001.0
3627,Z000017,2021.0,11001.0
3628,Z000017,2022.0,11001.0


In [111]:
# Outer-join the new and old tables on (lawmaker id, year) so rows present in only one
# of them are kept; columns with the same name in both get _new / _old suffixes.
compare_df = filtered_new_df.merge(filtered_old_df, left_on=['id', 'year'], right_on=['bioguide_id', 'year'], how='outer', suffixes=('_new', '_old'))

In [107]:
# Display the merged comparison frame.
compare_df

Unnamed: 0,id,year,fips_new,bioguide_id,last_name,first_name,icpsr_id,fips_old,concernflag.bgcd,concernflag.urap
0,A000361,1946.0,22049.0,A000361,Alexander,Rodney,90327.0,22049,0.0,0.0
1,A000361,1947.0,22049.0,,,,,,,
2,A000361,1948.0,22049.0,,,,,,,
3,A000361,1949.0,22049.0,,,,,,,
4,A000361,1950.0,22049.0,,,,,,,
...,...,...,...,...,...,...,...,...,...,...
4939,,2006.0,,Z000017,Zeldin,Lee,21539.0,,0.0,0.0
4940,,2007.0,,Z000017,Zeldin,Lee,21539.0,,0.0,0.0
4941,,2008.0,,Z000017,Zeldin,Lee,21539.0,,0.0,0.0
4942,,2009.0,,Z000017,Zeldin,Lee,21539.0,,0.0,0.0


In [112]:
# Rows that exist only in the old table have no 'id' after the outer merge;
# take the id from bioguide_id for those.
compare_df['id'] = compare_df['id'].fillna(compare_df['bioguide_id'])

# Keep only the comparison columns, ordered by lawmaker and year.
columns = ['id', 'year', 'fips_new', 'fips_old']
compare_df = compare_df.loc[:, columns].sort_values(['id', 'year'])

In [114]:
# Save the side-by-side new/old FIPS comparison (without the index).
compare_df.to_csv('results/sample_compare_fips.csv', index=False)