In [14]:
import geopandas as gpd
import pandas as pd

precinct_shapes = gpd.read_file('data/master_precinct_shapes.csv')
turfed_precincts = pd.read_csv('data/turfs_regions_base.csv', dtype={'van_precinct_id': str})


In [15]:
turfed_precincts.columns

Index(['van_precinct_id', 'county_name', 'van_precinct_name', 'Current Region',
       'Current Turf', 'Comments', 'HDs', 'voters', 'supporters', 'Unnamed: 9',
       'Unnamed: 10'],
      dtype='object')

In [16]:
precinct_shapes.columns

Index(['PrcnctName', 'van_precinct_name', 'CountyName', 'county_name',
       'CountyFIPS', 'county_fips', 'GEOID', 'van_precinct_id', 'WKT',
       'geometry'],
      dtype='object')

In [17]:
# filter only columns from precinct_shapes that we want
shapes_filtered = precinct_shapes[['van_precinct_id', 'PrcnctName', 'GEOID', 'geometry']].copy()

# rename PrcnctName to precinct_name_doe
shapes_filtered = shapes_filtered.rename(columns={'PrcnctName': 'precinct_name_doe'})

# filter only data we want from turfs_precincts
turfs_filtered = turfed_precincts[['van_precinct_id', 'county_name', 'van_precinct_name', 
                                  'Current Region', 'Current Turf', 'HDs', 'voters', 'supporters']].copy()

# left join the geometries onto the turf data
merged_gdf = turfs_filtered.merge(shapes_filtered, on='van_precinct_id', how='left')

# convert to gdf
merged_gdf = gpd.GeoDataFrame(merged_gdf, crs='EPSG:4326')

print(f"Original turf data: {len(turfed_precincts)} records")
print(f"Original shape data: {len(precinct_shapes)} records") 
print(f"Merged data: {len(merged_gdf)} records")
print(f"Records with geometry: {merged_gdf.geometry.notna().sum()}")
print(f"Records without geometry: {merged_gdf.geometry.isna().sum()}")
print(f"All turf records preserved: {len(merged_gdf) == len(turfed_precincts)}")

print("\nColumns in merged dataset:")
print(merged_gdf.columns.tolist())

print("\nFirst few rows:")
print(merged_gdf.head())

Original turf data: 2527 records
Original shape data: 2541 records
Merged data: 2535 records
Records with geometry: 2535
Records without geometry: 0
All turf records preserved: False

Columns in merged dataset:
['van_precinct_id', 'county_name', 'van_precinct_name', 'Current Region', 'Current Turf', 'HDs', 'voters', 'supporters', 'precinct_name_doe', 'GEOID', 'geometry']

First few rows:
  van_precinct_id        county_name                   van_precinct_name  \
0          297094  Alexandria (City)           103 - Lyles Crouch School   
1          297100  Alexandria (City)  109 - Fire Department Headquarters   
2          297095  Alexandria (City)                 104 - Durant Center   
3          297093  Alexandria (City)                     102 - City Hall   
4          297101  Alexandria (City)        201 - Naomi L. Brooks School   

     Current Region       Current Turf HDs  voters  supporters  \
0  R01 - Inner Nova  R01A - Alexandria   5    4035        3065   
1  R01 - Inner Nova 

In [18]:
# Check if any van_precinct_ids were lost in the merge
original_ids = set(turfed_precincts['van_precinct_id'])
merged_ids = set(merged_gdf['van_precinct_id'])

missing_ids = original_ids - merged_ids
print(f"Original turf data had {len(original_ids)} unique van_precinct_ids")
print(f"Merged data has {len(merged_ids)} unique van_precinct_ids")
print(f"Missing van_precinct_ids: {len(missing_ids)}")

if missing_ids:
    print(f"\nThe following {len(missing_ids)} van_precinct_ids were lost:")
    for missing_id in sorted(missing_ids):
        print(f"  {missing_id}")
else:
    print("\n✓ All van_precinct_ids from original turf data are preserved in the merge!")

Original turf data had 2527 unique van_precinct_ids
Merged data has 2527 unique van_precinct_ids
Missing van_precinct_ids: 0

✓ All van_precinct_ids from original turf data are preserved in the merge!


In [19]:
merged_gdf.to_csv('output/turfed_precincts_pre_AOK.csv', index=False)

In [20]:
# Check which geometries from the original shapes data are missing in the merged data
original_shape_ids = set(precinct_shapes['van_precinct_id'])
merged_ids = set(merged_gdf['van_precinct_id'])

missing_shapes = original_shape_ids - merged_ids

print(f"Original shapes data had {len(original_shape_ids)} unique van_precinct_ids")
print(f"Merged data has {len(merged_ids)} unique van_precinct_ids") 
print(f"Shapes missing from merged data: {len(missing_shapes)}")

if missing_shapes:
    print(f"\nThe following {len(missing_shapes)} shapes were not included in the merge:")
    missing_shape_details = precinct_shapes[precinct_shapes['van_precinct_id'].isin(missing_shapes)]
    print(missing_shape_details[['van_precinct_id', 'PrcnctName', 'CountyName', 'GEOID']].to_string(index=False))
else:
    print("\n✓ All shapes from original data are in the merged dataset!")

Original shapes data had 2533 unique van_precinct_ids
Merged data has 2527 unique van_precinct_ids
Shapes missing from merged data: 6

The following 6 shapes were not included in the merge:
van_precinct_id    PrcnctName        CountyName       GEOID
                FAIRFAX COURT    FAIRFAX COUNTY 51059000700
         296106         Hayes GLOUCESTER COUNTY 51073000401
         296107  Sarahs Creek GLOUCESTER COUNTY 51073000402
         296568  PRECINCT 3-1   NOTTOWAY COUNTY 51135000301
        2228518        1 TOWN     ORANGE COUNTY 51137000103
        2228477        3 TOWN     ORANGE COUNTY 51137000303


In [21]:
# GEOIDs from the missing shapes
missing_geoids = ['51059000700', '51073000401', '51073000402', '51135000301', '51137000103', '51137000303']

# Filter the original shapes data by these GEOIDs
geoid_check = precinct_shapes[precinct_shapes['GEOID'].isin(missing_geoids)]

print(f"Checking {len(missing_geoids)} GEOIDs in original shapes data:")
print(geoid_check[['GEOID', 'van_precinct_id', 'PrcnctName', 'CountyName']].to_string(index=False))

# Check if any of these have null/empty van_precinct_id
null_van_ids = geoid_check[geoid_check['van_precinct_id'].isna() | (geoid_check['van_precinct_id'] == '')]
if len(null_van_ids) > 0:
    print(f"\n{len(null_van_ids)} of these have null/empty van_precinct_id:")
    print(null_van_ids[['GEOID', 'van_precinct_id', 'PrcnctName', 'CountyName']].to_string(index=False))
else:
    print("\nAll of these have assigned van_precinct_ids")

Checking 6 GEOIDs in original shapes data:
      GEOID van_precinct_id    PrcnctName        CountyName
51059000700                 FAIRFAX COURT    FAIRFAX COUNTY
51073000401          296106         Hayes GLOUCESTER COUNTY
51073000402          296107  Sarahs Creek GLOUCESTER COUNTY
51135000301          296568  PRECINCT 3-1   NOTTOWAY COUNTY
51137000103         2228518        1 TOWN     ORANGE COUNTY
51137000303         2228477        3 TOWN     ORANGE COUNTY

1 of these have null/empty van_precinct_id:
      GEOID van_precinct_id    PrcnctName     CountyName
51059000700                 FAIRFAX COURT FAIRFAX COUNTY


In [22]:
# The van_precinct_ids from the missing shapes (excluding the null one)
missing_van_ids = ['296106', '296107', '296568', '2228518', '2228477']

# Check if these exist in the turf data
for van_id in missing_van_ids:
    exists_in_turf = van_id in turfed_precincts['van_precinct_id'].values
    print(f"van_precinct_id {van_id} exists in turf data: {exists_in_turf}")

# Also check what's in turf data for these
turf_check = turfed_precincts[turfed_precincts['van_precinct_id'].isin(missing_van_ids)]
print(f"\nFound {len(turf_check)} of these van_precinct_ids in turf data:")
if len(turf_check) > 0:
    print(turf_check[['van_precinct_id', 'county_name', 'van_precinct_name', 'Current Region']].to_string(index=False))

van_precinct_id 296106 exists in turf data: False
van_precinct_id 296107 exists in turf data: False
van_precinct_id 296568 exists in turf data: False
van_precinct_id 2228518 exists in turf data: False
van_precinct_id 2228477 exists in turf data: False

Found 0 of these van_precinct_ids in turf data:


In [23]:
merged_gdf.columns

Index(['van_precinct_id', 'county_name', 'van_precinct_name', 'Current Region',
       'Current Turf', 'HDs', 'voters', 'supporters', 'precinct_name_doe',
       'GEOID', 'geometry'],
      dtype='object')

In [24]:
import pandas as pd
import geopandas as gpd
from shapely import wkt
import json
import os

# Load original data
print("Loading original CSV...")
df = pd.read_csv("output/turfed_precincts_pre_AOK.csv")
print(f"Loaded {len(df):,} precincts")

# Convert WKT to geometries
print("Converting WKT to geometries...")
df['geometry'] = df['geometry'].apply(wkt.loads)
gdf = gpd.GeoDataFrame(df, geometry='geometry', crs='EPSG:4326')

# Simplify geometries IN PLACE - replace the original geometry column
print("Simplifying geometries...")
gdf['geometry'] = gdf['geometry'].simplify(0.0001)

# Calculate centroids for marker-based views
print("Calculating centroids...")
# Suppress the CRS warning - we know it's geographic but centroids are fine for our use
import warnings
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    gdf['centroid_lat'] = gdf.geometry.centroid.y
    gdf['centroid_lon'] = gdf.geometry.centroid.x

# Calculate bounds for each precinct (useful for zooming)
print("Calculating bounds...")
bounds = gdf.geometry.bounds
gdf['min_lon'] = bounds['minx']
gdf['min_lat'] = bounds['miny']
gdf['max_lon'] = bounds['maxx']
gdf['max_lat'] = bounds['maxy']

# Create a version without geometry for faster filtering
df_metrics = gdf.drop(columns=['geometry']).copy()

# Save different versions for different use cases

# 1. Full GeoJSON with simplified geometries (for map display)
print("Saving GeoJSON...")
with open('output/precincts_simplified.geojson', 'w') as f:
    f.write(gdf.to_json())

# 2. Metrics-only CSV (for filtering and stats - MUCH faster to load)
print("Saving metrics CSV...")
df_metrics.to_csv('output/precincts_metrics.csv', index=False)

# 3. Create pre-aggregated summaries by region and turf
print("Creating aggregated summaries...")
summary_by_region = df_metrics.groupby('Current Region').agg({
    'voters': 'sum',
    'supporters': 'sum',
    'van_precinct_id': 'count'
}).rename(columns={'van_precinct_id': 'precinct_count'}).reset_index()

summary_by_turf = df_metrics.groupby(['Current Region', 'Current Turf']).agg({
    'voters': 'sum',
    'supporters': 'sum',
    'van_precinct_id': 'count'
}).rename(columns={'van_precinct_id': 'precinct_count'}).reset_index()

summary_by_region.to_csv('output/summary_by_region.csv', index=False)
summary_by_turf.to_csv('output/summary_by_turf.csv', index=False)

# 4. Optional: Create separate GeoJSON files by region (for lazy loading)
print("Creating region-specific files...")
os.makedirs('output/regions', exist_ok=True)
for region in gdf['Current Region'].unique():
    region_gdf = gdf[gdf['Current Region'] == region]
    safe_filename = region.replace('/', '_').replace(' ', '_')
    with open(f'output/regions/{safe_filename}.geojson', 'w') as f:
        f.write(region_gdf.to_json())
    print(f"  - Saved {region}: {len(region_gdf):,} precincts")

print("\nPreprocessing complete!")
print("\n=== File Size Comparison ===")
import os

# Check original file size
original_size = os.path.getsize("output/turfed_precincts_pre_AOK.csv") / (1024**2)
print(f"Original CSV with WKT: {original_size:.2f} MB")

# Check new files
geojson_size = os.path.getsize("output/precincts_simplified.geojson") / (1024**2)
metrics_size = os.path.getsize("output/precincts_metrics.csv") / (1024**2)

print(f"Simplified GeoJSON: {geojson_size:.2f} MB")
print(f"Metrics CSV (no geometry): {metrics_size:.2f} MB")
print(f"\nGeoJSON size ratio: {geojson_size/original_size:.1%} of original")

# Memory comparison
print("\n=== Memory Usage ===")
print(f"Original dataframe in memory: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")
print(f"GeoDataFrame with simplified geometry: {gdf.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

print(f"\nFiles saved to output/ directory")

Loading original CSV...
Loaded 2,535 precincts
Converting WKT to geometries...
Simplifying geometries...
Calculating centroids...
Calculating bounds...
Saving GeoJSON...
Saving metrics CSV...
Creating aggregated summaries...
Creating region-specific files...
  - Saved R01 - Inner Nova: 94 precincts
  - Saved R02 - Fairfax: 202 precincts
  - Saved R03 - Loudoun: 165 precincts
  - Saved R04 - Prince William: 176 precincts
  - Saved R12 - Shenandoah: 263 precincts
  - Saved R05 - Fredericksburg: 154 precincts
  - Saved R13 - Southwest: 482 precincts
  - Saved R06 - North Richmond: 176 precincts
  - Saved R07 - South Richmond: 101 precincts
  - Saved R08 - Newport News: 88 precincts
  - Saved R11 - Southside: 172 precincts
  - Saved R09 - Suffolk: 158 precincts
  - Saved R10 - Virginia Beach / Norfolk: 180 precincts
  - Saved R14 - Charlottesville: 124 precincts

Preprocessing complete!

=== File Size Comparison ===
Original CSV with WKT: 98.70 MB
Simplified GeoJSON: 15.69 MB
Metrics CSV (