# Load cached results of US Census batch geocoding

In [3]:
import pandas as pd
import io
from sqlalchemy import create_engine, Table, Column, Integer, String, MetaData, PrimaryKeyConstraint


In [4]:
import glob

path = '/opt/data/GIS-Census/florida/geocoding'
files = glob.glob(f"{path}/*.csv")

# Define column names
column_names = ['voterid', 'address_in', 'match_indicator', 'match_type', 'address_out', 'long_lat', 'tiger_edge', 'street_side', 'fips_state', 'fips_county', 'census_tract', 'census_block']

# Define the data types for each column
column_types = {
    'voterid': 'int64',
    'address_in': 'str',
    'match_indicator': 'str',
    'match_type': 'str',
    'address_out': 'str',
    'long_lat': 'str',
    'tiger_edge': 'str',
    'street_side': 'str',
    'fips_state': 'str',
    'fips_county': 'str',
    'census_tract': 'str',
    'census_block': 'str'
}

# chatgpt suggestion/alternative
# Define the metadata and table with primary key constraint
# metadata = MetaData()
# table = Table(
#     'geolocations', metadata,
#     Column('voterid', Integer, primary_key=True),  # This is the index column from the DataFrame
#     Column('match_indicator', String),
#     Column('match_type', String),
#     Column('address_in', String),
#     Column('address_out', String),
#     Column('long_lat', String),
#     Column('tiger_edge', String),
#     Column('street_side', String),
#     Column('fips_state', String),
#     Column('fips_county', String),
#     Column('census_tract', String),
#     Column('census_block', String),
    # PrimaryKeyConstraint('id')  # Define the primary key constraint
# )

In [44]:
df_geocoded = []
for f in files:
    csv = pd.read_csv(f, header=None, on_bad_lines='warn', low_memory=False, index_col=None, dtype=column_types, names=column_names,)
    df_matches = csv[csv['address_out'].isna() == False]
    print(f"File: {f} csv:({len(csv)}) -- matches:({len(df_matches)})")
    df_geocoded.append(df_matches)
    
# combine all the little df's together...    
df_geocoded = pd.concat(df_geocoded)
df_geocoded

File: /opt/data/GIS-Census/florida/geocoding/district_responses-69000.csv csv:(1000) -- matches:(951)
File: /opt/data/GIS-Census/florida/geocoding/district_responses-233000.csv csv:(1000) -- matches:(965)
File: /opt/data/GIS-Census/florida/geocoding/data_result_7.csv csv:(5000) -- matches:(3640)
File: /opt/data/GIS-Census/florida/geocoding/district_responses-6000.csv csv:(1000) -- matches:(976)
File: /opt/data/GIS-Census/florida/geocoding/district_responses-309000.csv csv:(1000) -- matches:(965)
File: /opt/data/GIS-Census/florida/geocoding/district_responses-241000.csv csv:(1000) -- matches:(969)
File: /opt/data/GIS-Census/florida/geocoding/district_responses-445000.csv csv:(1000) -- matches:(533)
File: /opt/data/GIS-Census/florida/geocoding/district_responses-264000.csv csv:(1000) -- matches:(961)
File: /opt/data/GIS-Census/florida/geocoding/district_responses-203000.csv csv:(1000) -- matches:(981)
File: /opt/data/GIS-Census/florida/geocoding/district_responses-245000.csv csv:(1000) -

Unnamed: 0,voterid,address_in,match_indicator,match_type,address_out,long_lat,tiger_edge,street_side,fips_state,fips_county,census_tract,census_block
0,103817568,"12566 Mission Hills Cir S, Jacksonville, FL, 3...",Match,Exact,"12566 MISSION HILLS CIR S, JACKSONVILLE, FL, 3...","-81.4959208485605,30.37552925684173",83941007,L,12,031,014334,2001
1,132459240,"1548 Oak Ridge Dr W, Jacksonville, FL, 32225",Match,Exact,"1548 OAK RIDGE DR W, JACKSONVILLE, FL, 32225","-81.5175612592775,30.347330152139023",83936709,L,12,031,014328,2003
2,116077807,"11643 Dunes Way Dr N, Jacksonville, FL, 32225",Match,Exact,"11643 DUNES WAY DR N, JACKSONVILLE, FL, 32225","-81.50784565362517,30.360735718988167",640433643,R,12,031,014333,1005
3,103529009,"12547 Masters Ridge Dr, Jacksonville, FL, 32225",Match,Exact,"12547 MASTERS RIDGE DR, JACKSONVILLE, FL, 32225","-81.49696020758995,30.372375202129547",83986461,L,12,031,014334,2001
4,130864686,"12496 Masters Ridge Dr, Jacksonville, FL, 32225",Match,Exact,"12496 MASTERS RIDGE DR, JACKSONVILLE, FL, 32225","-81.49796247944619,30.37238874038468",83986461,R,12,031,014334,2001
...,...,...,...,...,...,...,...,...,...,...,...,...
994,103504015,"14135 Ivylgail Dr N, Jacksonville, FL, 32225",Match,Exact,"14135 IVYLGAIL DR N, JACKSONVILLE, FL, 32225","-81.47185563759984,30.36685643354997",83906113,L,12,031,014330,1008
996,103750897,"13783 Quartz Crystal Dr, Jacksonville, FL, 32225",Match,Exact,"13783 QUARTZ CRYSTAL DR, JACKSONVILLE, FL, 32225","-81.48136082294849,30.358010796389294",83980602,L,12,031,014335,2004
997,118368272,"2759 McCormick Woods Dr, Jacksonville, FL, 32225",Match,Exact,"2759 MCCORMICK WOODS DR, JACKSONVILLE, FL, 32225","-81.48335930815978,30.36120952365991",83941000,R,12,031,014334,2009
998,103516226,"14064 Broken Bow Dr S, Jacksonville, FL, 32225",Match,Exact,"14064 BROKEN BOW DR S, JACKSONVILLE, FL, 32225","-81.47444713016063,30.355822053000622",83980944,R,12,031,014335,2000


In [52]:
df_geocoded['geocoder'] = 'US Census'
df_geocoded['placeid'] = ''
df_geocoded['geoid'] = df_geocoded['fips_state'] +df_geocoded['fips_county'] + df_geocoded['census_tract'] + df_geocoded['census_block']
# Geocoding file count:(670) --> record count:(654778)      -- was:(761218 with non-matches)
print(f"Geocoding file count:({len(files)}) --> record count:({len(df_geocoded)})")

Geocoding file count:(670) --> record count:(654778)


In [53]:
# matched: 654778, unmatched: 106440
df_geo = df_geocoded[df_geocoded['address_out'].isna() == False]
print(f"matched:({len(df_geo)})")

matched:(654778)


In [56]:
# duplicates(?): 222635 (full), 105175 (matched only)
# keep=first
df_geo.sort_values(by='voterid', inplace=True)
df_dups = df_geo[df_geo.duplicated(subset=['voterid'], keep='first')]

In [57]:
# 597078
df_undupped = df_geo.drop_duplicates(subset=['voterid'], keep='first')
df_undupped

Unnamed: 0,voterid,address_in,match_indicator,match_type,address_out,long_lat,tiger_edge,street_side,fips_state,fips_county,census_tract,census_block,geocoder,placeid,geoid
559,100002687,"7044 Deer Lodge Cir APT 110, Jacksonville, FL,...",Match,Non_Exact,"7044 DEERLODGE CIR, JACKSONVILLE, FL, 32256","-81.59145747292257,30.244896884401214",628032509,R,12,031,016605,2011,US Census,,120310166052011
95,100005608,"6925 Cartier CIR, Jacksonville, FL, 32208",Match,Exact,"6925 CARTIER CIR, JACKSONVILLE, FL, 32208","-81.7057683536849,30.39448921709453",83891528,R,12,031,011300,2010,US Census,,120310113002010
375,100008211,"1401 Riverplace Blvd 2309, Jacksonville, FL, 3...",Match,Non_Exact,"1401 RIVERPLACE BLVD, JACKSONVILLE, FL, 32207","-81.65632528432121,30.318594816974723",639971428,L,12,031,000800,2001,US Census,,120310008002001
441,100008843,"2305 La Mesa Dr E, Jacksonville, FL, 32217",Match,Non_Exact,"2305 LA MESA DR, JACKSONVILLE, FL, 32217","-81.63521984804595,30.252000692785202",83916805,R,12,031,016500,4021,US Census,,120310165004021
380,100012958,"10061 Sweetwater Pkwy APT 239, Jacksonville, F...",Match,Exact,"10061 SWEETWATER PKWY, JACKSONVILLE, FL, 32256","-81.50636926182125,30.219390011992516",650105568,L,12,031,014424,3010,US Census,,120310144243010
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
114,132789129,"3614 Effee St, Jacksonville, FL, 32209",Match,Exact,"3614 EFFEE ST, JACKSONVILLE, FL, 32209","-81.68961842825928,30.36020125138515",83893737,L,12,031,002801,2024,US Census,,120310028012024
67,132789169,"5226 Polan Ln, Jacksonville, FL, 32209",Match,Exact,"5226 POLAN LN, JACKSONVILLE, FL, 32209","-81.70664572654061,30.36692697597664",83891975,L,12,031,011600,1029,US Census,,120310116001029
24,132789187,"2332 Mindanao Dr, Jacksonville, FL, 32246",Match,Exact,"2332 MINDANAO DR, JACKSONVILLE, FL, 32246","-81.51870493941993,30.30774686474382",83941831,L,12,031,014338,2001,US Census,,120310143382001
432,132789210,"4535 120Th St 8, Jacksonville, FL, 32244",Match,Non_Exact,"4535 120TH ST, JACKSONVILLE, FL, 32244","-81.6999902517232,30.232380908053415",605436429,R,12,031,013301,1015,US Census,,120310133011015


In [59]:
# 654778 x 15
df_matched = df_geocoded[df_geocoded['match_indicator']=='Match']
print(f"Matched row count:({len(df_matched)})")

df_matched = df_undupped[df_undupped['match_indicator']=='Match']
print(f"Matched UnDupped row count:({len(df_matched)})")

df_matched

Matched row count:(654778)
Matched UnDupped row count:(597078)


Unnamed: 0,voterid,address_in,match_indicator,match_type,address_out,long_lat,tiger_edge,street_side,fips_state,fips_county,census_tract,census_block,geocoder,placeid,geoid
559,100002687,"7044 Deer Lodge Cir APT 110, Jacksonville, FL,...",Match,Non_Exact,"7044 DEERLODGE CIR, JACKSONVILLE, FL, 32256","-81.59145747292257,30.244896884401214",628032509,R,12,031,016605,2011,US Census,,120310166052011
95,100005608,"6925 Cartier CIR, Jacksonville, FL, 32208",Match,Exact,"6925 CARTIER CIR, JACKSONVILLE, FL, 32208","-81.7057683536849,30.39448921709453",83891528,R,12,031,011300,2010,US Census,,120310113002010
375,100008211,"1401 Riverplace Blvd 2309, Jacksonville, FL, 3...",Match,Non_Exact,"1401 RIVERPLACE BLVD, JACKSONVILLE, FL, 32207","-81.65632528432121,30.318594816974723",639971428,L,12,031,000800,2001,US Census,,120310008002001
441,100008843,"2305 La Mesa Dr E, Jacksonville, FL, 32217",Match,Non_Exact,"2305 LA MESA DR, JACKSONVILLE, FL, 32217","-81.63521984804595,30.252000692785202",83916805,R,12,031,016500,4021,US Census,,120310165004021
380,100012958,"10061 Sweetwater Pkwy APT 239, Jacksonville, F...",Match,Exact,"10061 SWEETWATER PKWY, JACKSONVILLE, FL, 32256","-81.50636926182125,30.219390011992516",650105568,L,12,031,014424,3010,US Census,,120310144243010
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
114,132789129,"3614 Effee St, Jacksonville, FL, 32209",Match,Exact,"3614 EFFEE ST, JACKSONVILLE, FL, 32209","-81.68961842825928,30.36020125138515",83893737,L,12,031,002801,2024,US Census,,120310028012024
67,132789169,"5226 Polan Ln, Jacksonville, FL, 32209",Match,Exact,"5226 POLAN LN, JACKSONVILLE, FL, 32209","-81.70664572654061,30.36692697597664",83891975,L,12,031,011600,1029,US Census,,120310116001029
24,132789187,"2332 Mindanao Dr, Jacksonville, FL, 32246",Match,Exact,"2332 MINDANAO DR, JACKSONVILLE, FL, 32246","-81.51870493941993,30.30774686474382",83941831,L,12,031,014338,2001,US Census,,120310143382001
432,132789210,"4535 120Th St 8, Jacksonville, FL, 32244",Match,Non_Exact,"4535 120TH ST, JACKSONVILLE, FL, 32244","-81.6999902517232,30.232380908053415",605436429,R,12,031,013301,1015,US Census,,120310133011015


# Save US Census batch geocoded records to table
**NOTE:** this drops and recreates the table, beware! 

In [60]:
host = 'localhost'
port = 5437
db = 'geocoder'
engine = create_engine(f'postgresql://sean:pass1234@{host}:{port}/{db}')

In [61]:
table = 'geolocations'
df_matched.head(0).to_sql(table, engine, if_exists='replace', index=True)

0

In [62]:
conn = engine.raw_connection()
cur = conn.cursor()
output = io.StringIO()
df_matched.to_csv(output, sep='\t', header=False, index=True)
output.seek(0)
contents = output.getvalue()
cur.copy_from(output, table, null="")  # null values become ''
conn.commit()
cur.close()
conn.close()

# General sanity checking
numbers add up?? 

In [28]:
# 106440 x 15
df_unmatched = df_geocoded[df_geocoded['match_indicator']!='Match']
outfile = f"{path}/unmatched-census-geocoded.csv"
print(f"Unmatched row count:({len(df_unmatched)}) -- writing to:({outfile}")
df_unmatched.to_csv(outfile)
df_unmatched

Unmatched row count:(106440) -- writing to:(/opt/data/GIS-Census/florida/geocoding/unmatched-census-geocoded.csv


Unnamed: 0,voterid,address_in,match_indicator,match_type,address_out,long_lat,tiger_edge,street_side,fips_state,fips_county,census_tract,census_block,geocoder,placeid,geoid
22,103804235,"11148 Windhaven Dr S, Oscoda, FL, 48750",No_Match,,,,,,,,,,US Census,,
24,118682821,"3259 Brachenbury Ln, AUGUSTA, FL, 30909",No_Match,,,,,,,,,,US Census,,
30,117091742,"11216 Ft Caroline Rd, Jacksonville, FL, 32225",Tie,,,,,,,,,,US Census,,
31,103431942,"1960 Holly Oaks Ravine Dr, Callahan, FL, 32011",No_Match,,,,,,,,,,US Census,,
33,104097093,"3941 High Pine Rd, Jacksonville, FL, 32225",No_Match,,,,,,,,,,US Census,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
919,103283451,"14073 La Launa Cir N, Jacksonville, FL, 32225",No_Match,,,,,,,,,,US Census,,
934,128405388,"13919 Sugar Pine Ct, Virginia Beach, FL, 23455",No_Match,,,,,,,,,,US Census,,
942,124001212,"1760 Chandelier Cir W, FPO, FL, 9834",No_Match,,,,,,,,,,US Census,,
973,127921986,"13621 Mt Pleasant Rd, Jacksonville, FL, 32225",Tie,,,,,,,,,,US Census,,
