In [5]:
import geopandas as gpd
import pandas as pd

precinct_shapes = gpd.read_file('data/master_precinct_shapes.csv')
turfed_precincts = pd.read_csv('data/turfs_regions_base.csv', dtype={'van_precinct_id': str})


In [6]:
turfed_precincts.columns

Index(['van_precinct_id', 'county_name', 'van_precinct_name', 'Current Region',
       'Current Turf', 'Comments', 'HDs', 'voters', 'supporters', 'Unnamed: 9',
       'Unnamed: 10'],
      dtype='object')

In [7]:
precinct_shapes.columns

Index(['PrcnctName', 'van_precinct_name', 'CountyName', 'county_name',
       'CountyFIPS', 'county_fips', 'GEOID', 'van_precinct_id', 'WKT',
       'geometry'],
      dtype='object')

In [8]:
# filter only columns from precinct_shapes that we want
shapes_filtered = precinct_shapes[['van_precinct_id', 'PrcnctName', 'GEOID', 'geometry']].copy()

# rename PrcnctName to precinct_name_doe
shapes_filtered = shapes_filtered.rename(columns={'PrcnctName': 'precinct_name_doe'})

# filter only data we want from turfs_precincts
turfs_filtered = turfed_precincts[['van_precinct_id', 'county_name', 'van_precinct_name', 
                                  'Current Region', 'Current Turf', 'HDs', 'voters', 'supporters']].copy()

# left join the geometries onto the turf data
merged_gdf = turfs_filtered.merge(shapes_filtered, on='van_precinct_id', how='left')

# convert to gdf
merged_gdf = gpd.GeoDataFrame(merged_gdf, crs='EPSG:4326')

print(f"Original turf data: {len(turfed_precincts)} records")
print(f"Original shape data: {len(precinct_shapes)} records") 
print(f"Merged data: {len(merged_gdf)} records")
print(f"Records with geometry: {merged_gdf.geometry.notna().sum()}")
print(f"Records without geometry: {merged_gdf.geometry.isna().sum()}")
print(f"All turf records preserved: {len(merged_gdf) == len(turfed_precincts)}")

print("\nColumns in merged dataset:")
print(merged_gdf.columns.tolist())

print("\nFirst few rows:")
print(merged_gdf.head())

Original turf data: 2527 records
Original shape data: 2541 records
Merged data: 2535 records
Records with geometry: 2535
Records without geometry: 0
All turf records preserved: False

Columns in merged dataset:
['van_precinct_id', 'county_name', 'van_precinct_name', 'Current Region', 'Current Turf', 'HDs', 'voters', 'supporters', 'precinct_name_doe', 'GEOID', 'geometry']

First few rows:
  van_precinct_id        county_name                   van_precinct_name  \
0          297094  Alexandria (City)           103 - Lyles Crouch School   
1          297100  Alexandria (City)  109 - Fire Department Headquarters   
2          297095  Alexandria (City)                 104 - Durant Center   
3          297093  Alexandria (City)                     102 - City Hall   
4          297101  Alexandria (City)        201 - Naomi L. Brooks School   

     Current Region       Current Turf HDs  voters  supporters  \
0  R01 - Inner Nova  R01A - Alexandria   5    4035        3065   
1  R01 - Inner Nova 

In [9]:
# Check if any van_precinct_ids were lost in the merge
original_ids = set(turfed_precincts['van_precinct_id'])
merged_ids = set(merged_gdf['van_precinct_id'])

missing_ids = original_ids - merged_ids
print(f"Original turf data had {len(original_ids)} unique van_precinct_ids")
print(f"Merged data has {len(merged_ids)} unique van_precinct_ids")
print(f"Missing van_precinct_ids: {len(missing_ids)}")

if missing_ids:
    print(f"\nThe following {len(missing_ids)} van_precinct_ids were lost:")
    for missing_id in sorted(missing_ids):
        print(f"  {missing_id}")
else:
    print("\n✓ All van_precinct_ids from original turf data are preserved in the merge!")

Original turf data had 2527 unique van_precinct_ids
Merged data has 2527 unique van_precinct_ids
Missing van_precinct_ids: 0

✓ All van_precinct_ids from original turf data are preserved in the merge!


In [17]:
merged_gdf.to_csv('output/turfed_precincts_pre_AOK.csv', index=False)

In [14]:
# Check which geometries from the original shapes data are missing in the merged data
original_shape_ids = set(precinct_shapes['van_precinct_id'])
merged_ids = set(merged_gdf['van_precinct_id'])

missing_shapes = original_shape_ids - merged_ids

print(f"Original shapes data had {len(original_shape_ids)} unique van_precinct_ids")
print(f"Merged data has {len(merged_ids)} unique van_precinct_ids") 
print(f"Shapes missing from merged data: {len(missing_shapes)}")

if missing_shapes:
    print(f"\nThe following {len(missing_shapes)} shapes were not included in the merge:")
    missing_shape_details = precinct_shapes[precinct_shapes['van_precinct_id'].isin(missing_shapes)]
    print(missing_shape_details[['van_precinct_id', 'PrcnctName', 'CountyName', 'GEOID']].to_string(index=False))
else:
    print("\n✓ All shapes from original data are in the merged dataset!")

Original shapes data had 2533 unique van_precinct_ids
Merged data has 2527 unique van_precinct_ids
Shapes missing from merged data: 6

The following 6 shapes were not included in the merge:
van_precinct_id    PrcnctName        CountyName       GEOID
                FAIRFAX COURT    FAIRFAX COUNTY 51059000700
         296106         Hayes GLOUCESTER COUNTY 51073000401
         296107  Sarahs Creek GLOUCESTER COUNTY 51073000402
         296568  PRECINCT 3-1   NOTTOWAY COUNTY 51135000301
        2228518        1 TOWN     ORANGE COUNTY 51137000103
        2228477        3 TOWN     ORANGE COUNTY 51137000303


In [15]:
# GEOIDs from the missing shapes
missing_geoids = ['51059000700', '51073000401', '51073000402', '51135000301', '51137000103', '51137000303']

# Filter the original shapes data by these GEOIDs
geoid_check = precinct_shapes[precinct_shapes['GEOID'].isin(missing_geoids)]

print(f"Checking {len(missing_geoids)} GEOIDs in original shapes data:")
print(geoid_check[['GEOID', 'van_precinct_id', 'PrcnctName', 'CountyName']].to_string(index=False))

# Check if any of these have null/empty van_precinct_id
null_van_ids = geoid_check[geoid_check['van_precinct_id'].isna() | (geoid_check['van_precinct_id'] == '')]
if len(null_van_ids) > 0:
    print(f"\n{len(null_van_ids)} of these have null/empty van_precinct_id:")
    print(null_van_ids[['GEOID', 'van_precinct_id', 'PrcnctName', 'CountyName']].to_string(index=False))
else:
    print("\nAll of these have assigned van_precinct_ids")

Checking 6 GEOIDs in original shapes data:
      GEOID van_precinct_id    PrcnctName        CountyName
51059000700                 FAIRFAX COURT    FAIRFAX COUNTY
51073000401          296106         Hayes GLOUCESTER COUNTY
51073000402          296107  Sarahs Creek GLOUCESTER COUNTY
51135000301          296568  PRECINCT 3-1   NOTTOWAY COUNTY
51137000103         2228518        1 TOWN     ORANGE COUNTY
51137000303         2228477        3 TOWN     ORANGE COUNTY

1 of these have null/empty van_precinct_id:
      GEOID van_precinct_id    PrcnctName     CountyName
51059000700                 FAIRFAX COURT FAIRFAX COUNTY


In [16]:
# The van_precinct_ids from the missing shapes (excluding the null one)
missing_van_ids = ['296106', '296107', '296568', '2228518', '2228477']

# Check if these exist in the turf data
for van_id in missing_van_ids:
    exists_in_turf = van_id in turfed_precincts['van_precinct_id'].values
    print(f"van_precinct_id {van_id} exists in turf data: {exists_in_turf}")

# Also check what's in turf data for these
turf_check = turfed_precincts[turfed_precincts['van_precinct_id'].isin(missing_van_ids)]
print(f"\nFound {len(turf_check)} of these van_precinct_ids in turf data:")
if len(turf_check) > 0:
    print(turf_check[['van_precinct_id', 'county_name', 'van_precinct_name', 'Current Region']].to_string(index=False))

van_precinct_id 296106 exists in turf data: False
van_precinct_id 296107 exists in turf data: False
van_precinct_id 296568 exists in turf data: False
van_precinct_id 2228518 exists in turf data: False
van_precinct_id 2228477 exists in turf data: False

Found 0 of these van_precinct_ids in turf data:
