In [13]:
import geopandas as gpd
import pandas as pd

precinct_shapes = gpd.read_file('data/master_precinct_shapes.csv')
turfed_precincts = pd.read_csv('data/turfs_regions_base3.csv', dtype={'van_precinct_id': str})


In [14]:
turfed_precincts.columns

Index(['van_precinct_id', 'van_precinct_name', 'county_name', 'Current Region',
       'Current Turf', 'HDs', 'Zips', 'voters', 'supporters'],
      dtype='object')

In [15]:
precinct_shapes.columns

Index(['PrcnctName', 'van_precinct_name', 'CountyName', 'county_name',
       'CountyFIPS', 'county_fips', 'GEOID', 'van_precinct_id', 'WKT',
       'geometry'],
      dtype='object')

In [16]:
# filter only columns from precinct_shapes that we want
shapes_filtered = precinct_shapes[['van_precinct_id', 'PrcnctName', 'GEOID', 'geometry']].copy()

# rename PrcnctName to precinct_name_doe
shapes_filtered = shapes_filtered.rename(columns={'PrcnctName': 'precinct_name_doe'})

# filter only data we want from turfs_precincts
turfs_filtered = turfed_precincts[['van_precinct_id', 'county_name', 'van_precinct_name', 
                                  'Current Region', 'Current Turf', 'HDs', 'voters', 'supporters']].copy()

# left join the geometries onto the turf data
merged_gdf = turfs_filtered.merge(shapes_filtered, on='van_precinct_id', how='left')

# convert to gdf
merged_gdf = gpd.GeoDataFrame(merged_gdf, crs='EPSG:4326')

print(f"Original turf data: {len(turfed_precincts)} records")
print(f"Original shape data: {len(precinct_shapes)} records") 
print(f"Merged data: {len(merged_gdf)} records")
print(f"Records with geometry: {merged_gdf.geometry.notna().sum()}")
print(f"Records without geometry: {merged_gdf.geometry.isna().sum()}")
print(f"All turf records preserved: {len(merged_gdf) == len(turfed_precincts)}")

print("\nColumns in merged dataset:")
print(merged_gdf.columns.tolist())

print("\nFirst few rows:")
print(merged_gdf.head())

Original turf data: 2532 records
Original shape data: 2541 records
Merged data: 2540 records
Records with geometry: 2540
Records without geometry: 0
All turf records preserved: False

Columns in merged dataset:
['van_precinct_id', 'county_name', 'van_precinct_name', 'Current Region', 'Current Turf', 'HDs', 'voters', 'supporters', 'precinct_name_doe', 'GEOID', 'geometry']

First few rows:
  van_precinct_id county_name   van_precinct_name        Current Region  \
0         1071015     Loudoun        314 - Legacy         R03 - Loudoun   
1         1569552    Caroline          602 - Lake  R05 - Fredericksburg   
2         1569568     Fairfax      428 - Pinewood  R04 - Prince William   
3         1569574     Fairfax      909 - Oak Hill         R02 - Fairfax   
4         1569620     Russell  601 - East Lebanon       R13 - Southwest   

          Current Turf  HDs  voters  supporters precinct_name_doe  \
0  R03G - South Riding  026    3226        1890            LEGACY   
1         R05C - HD 

In [17]:
merged_gdf.to_csv('output/turfed_precincts_pre_AOK.csv', index=False)

In [18]:
merged_gdf.columns

Index(['van_precinct_id', 'county_name', 'van_precinct_name', 'Current Region',
       'Current Turf', 'HDs', 'voters', 'supporters', 'precinct_name_doe',
       'GEOID', 'geometry'],
      dtype='object')

In [19]:
import pandas as pd
import geopandas as gpd
from shapely import wkt
import json
import os

# Load original data
print("Loading original CSV...")
df = pd.read_csv("output/turfed_precincts_pre_AOK.csv")
print(f"Loaded {len(df):,} precincts")

# Convert WKT to geometries
print("Converting WKT to geometries...")
df['geometry'] = df['geometry'].apply(wkt.loads)
gdf = gpd.GeoDataFrame(df, geometry='geometry', crs='EPSG:4326')

# Simplify geometries IN PLACE - replace the original geometry column
print("Simplifying geometries...")
gdf['geometry'] = gdf['geometry'].simplify(0.0001)

# Calculate centroids for marker-based views
print("Calculating centroids...")
# Suppress the CRS warning - we know it's geographic but centroids are fine for our use
import warnings
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    gdf['centroid_lat'] = gdf.geometry.centroid.y
    gdf['centroid_lon'] = gdf.geometry.centroid.x

# Calculate bounds for each precinct (useful for zooming)
print("Calculating bounds...")
bounds = gdf.geometry.bounds
gdf['min_lon'] = bounds['minx']
gdf['min_lat'] = bounds['miny']
gdf['max_lon'] = bounds['maxx']
gdf['max_lat'] = bounds['maxy']

# Create a version without geometry for faster filtering
df_metrics = gdf.drop(columns=['geometry']).copy()

# Save different versions for different use cases

# 1. Full GeoJSON with simplified geometries (for map display)
print("Saving GeoJSON...")
with open('output/precincts_simplified.geojson', 'w') as f:
    f.write(gdf.to_json())

# 2. Metrics-only CSV (for filtering and stats - MUCH faster to load)
print("Saving metrics CSV...")
df_metrics.to_csv('output/precincts_metrics.csv', index=False)

# 3. Create pre-aggregated summaries by region and turf
print("Creating aggregated summaries...")
summary_by_region = df_metrics.groupby('Current Region').agg({
    'voters': 'sum',
    'supporters': 'sum',
    'van_precinct_id': 'count'
}).rename(columns={'van_precinct_id': 'precinct_count'}).reset_index()

summary_by_turf = df_metrics.groupby(['Current Region', 'Current Turf']).agg({
    'voters': 'sum',
    'supporters': 'sum',
    'van_precinct_id': 'count'
}).rename(columns={'van_precinct_id': 'precinct_count'}).reset_index()

summary_by_region.to_csv('output/summary_by_region.csv', index=False)
summary_by_turf.to_csv('output/summary_by_turf.csv', index=False)

# 4. Optional: Create separate GeoJSON files by region (for lazy loading)
print("Creating region-specific files...")
os.makedirs('output/regions', exist_ok=True)
for region in gdf['Current Region'].unique():
    region_gdf = gdf[gdf['Current Region'] == region]
    safe_filename = region.replace('/', '_').replace(' ', '_')
    with open(f'output/regions/{safe_filename}.geojson', 'w') as f:
        f.write(region_gdf.to_json())
    print(f"  - Saved {region}: {len(region_gdf):,} precincts")

print("\nPreprocessing complete!")
print("\n=== File Size Comparison ===")
import os

# Check original file size
original_size = os.path.getsize("output/turfed_precincts_pre_AOK.csv") / (1024**2)
print(f"Original CSV with WKT: {original_size:.2f} MB")

# Check new files
geojson_size = os.path.getsize("output/precincts_simplified.geojson") / (1024**2)
metrics_size = os.path.getsize("output/precincts_metrics.csv") / (1024**2)

print(f"Simplified GeoJSON: {geojson_size:.2f} MB")
print(f"Metrics CSV (no geometry): {metrics_size:.2f} MB")
print(f"\nGeoJSON size ratio: {geojson_size/original_size:.1%} of original")

# Memory comparison
print("\n=== Memory Usage ===")
print(f"Original dataframe in memory: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")
print(f"GeoDataFrame with simplified geometry: {gdf.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

print(f"\nFiles saved to output/ directory")

Loading original CSV...
Loaded 2,540 precincts
Converting WKT to geometries...
Simplifying geometries...
Calculating centroids...
Calculating bounds...
Saving GeoJSON...
Saving metrics CSV...
Creating aggregated summaries...
Creating region-specific files...
  - Saved R03 - Loudoun: 168 precincts
  - Saved R05 - Fredericksburg: 152 precincts
  - Saved R04 - Prince William: 190 precincts
  - Saved R02 - Fairfax: 191 precincts
  - Saved R13 - Southwest: 482 precincts
  - Saved R09 - Suffolk: 167 precincts
  - Saved R07 - South Richmond: 129 precincts
  - Saved R12 - Shenandoah: 261 precincts
  - Saved R11 - Southside: 172 precincts
  - Saved R10 - Virginia Beach / Norfolk: 170 precincts
  - Saved R01 - Inner Nova: 86 precincts
  - Saved R06 - North Richmond: 152 precincts
  - Saved R14 - Charlottesville: 125 precincts
  - Saved R08 - Newport News: 95 precincts

Preprocessing complete!

=== File Size Comparison ===
Original CSV with WKT: 98.78 MB
Simplified GeoJSON: 15.71 MB
Metrics CSV (