In [2]:
import pandas as pd 
import geopandas as gpd
import numpy as np

In [3]:
df_election = pd.read_csv('./data/fips_df.csv')
gdf = gpd.read_file('./data/Congressional_Districts.geojson')  # replace with your geojson file path

# Convert all Timestamp objects to strings
gdf = gdf.map(lambda x: str(x) if isinstance(x, pd.Timestamp) else x)

# ...Data Processing...
# Update '00' districts to '01' in GeoJSON data
gdf['DISTRICT'] = gdf['DISTRICT'].replace('00', '01')
df_election['District'] = df_election['District'].apply(lambda x: str(x).zfill(2))

# Convert the STATEFP20 columns to string in both DataFrames
gdf['STATEFP20'] = gdf['STATEFP20'].astype(str).str.zfill(2)
df_election['STATEFP20'] = df_election['STATEFP20'].astype(str).str.zfill(2)

# Convert STATEFP20 columns to string
gdf['STATEFP20'] = gdf['STATEFP20'].astype(str)
df_election['STATEFP20'] = df_election['STATEFP20'].astype(str)
gdf['DISTRICT'] = gdf['DISTRICT'].astype(str)
df_election['District'] = df_election['District'].astype(str)


# Remove any leading or trailing whitespace
gdf['DISTRICT'] = gdf['DISTRICT'].str.strip()
df_election['District'] = df_election['District'].str.strip()

merged_gdf = gdf.merge(df_election, left_on=['STATEFP20', 'DISTRICT'], right_on=['STATEFP20', 'District'], how='left')



In [4]:
merged_gdf.columns

Index(['OBJECTID', 'STATEFP20', 'GEOID20', 'CD118FP', 'NAMELSAD20', 'LSAD20',
       'CDSESSN', 'MTFCC20', 'FUNCSTAT20', 'ALAND20', 'AWATER20', 'INTPTLAT20',
       'INTPTLON20', 'OFFICE_ID', 'BIOGUIDE_ID', 'OFFICE_AUDIT_ID', 'PREFIX',
       'FIRSTNAME', 'MIDDLENAME', 'LASTNAME', 'SUFFIX', 'LISTING_NAME',
       'PHONE', 'WEBSITEURL', 'VACANT', 'CONTACTFORMURL', 'PHOTOURL',
       'FACE_BOOK_URL', 'TWITTER_URL', 'YOUTUBE_URL', 'INSTAGRAM_URL',
       'FLICKR_URL', 'PARTY', 'DISTRICT', 'STATE', 'VACANCY', 'ROOM_NUM',
       'HOB', 'COMMITTEE_ASSIGNMENTS', 'LAST_UPDATED', 'SHAPE_Length',
       'SHAPE_Area', 'geometry', 'Unnamed: 0', 'State', 'District', 'Name',
       'Party', 'Incumbent', 'Winner', 'Vote Percentage', 'Raised', 'Spent',
       'Total_Spent_Per_District', 'Percent_Spent'],
      dtype='object')

In [8]:
merged_gdf = merged_gdf[round(merged_gdf['Percent_Spent'], 2)]

Unnamed: 0,OBJECTID,STATEFP20,GEOID20,CD118FP,NAMELSAD20,LSAD20,CDSESSN,MTFCC20,FUNCSTAT20,ALAND20,...,District,Name,Party,Incumbent,Winner,Vote Percentage,Raised,Spent,Total_Spent_Per_District,Percent_Spent
642,355,46,4600,0,Congressional District (at Large),C1,118,G5200,N,196346000000.0,...,1,Dusty Johnson,R,True,1.0,81.8,1693504.0,699356.0,699356.0,100.0


In [23]:

# If Winner == 1.0, we use the actual value, else we use None
merged_gdf['display_name'] = np.where(merged_gdf['Winner'] == 1.0, merged_gdf['Name'], None)
merged_gdf['display_party'] = np.where(merged_gdf['Winner'] == 1.0, merged_gdf['Party'], None)
merged_gdf['display_state'] = np.where(merged_gdf['Winner'] == 1.0, merged_gdf['State'], None)
merged_gdf['display_district'] = np.where(merged_gdf['Winner'] == 1.0, merged_gdf['District'], None)
merged_gdf['display_incumbent'] = np.where(merged_gdf['Winner'] == 1.0, merged_gdf['Incumbent'], None)
merged_gdf['display_percent_spent'] = np.where(merged_gdf['Winner'] == 1.0, merged_gdf['Percent_Spent'], None)

In [24]:

merged_gdf.to_parquet('./data/merged_gdf.parquet', compression='gzip')


Unnamed: 0,OBJECTID,STATEFP20,GEOID20,CD118FP,NAMELSAD20,LSAD20,CDSESSN,MTFCC20,FUNCSTAT20,ALAND20,...,Raised,Spent,Total_Spent_Per_District,Percent_Spent,display_name,display_state,display_party,display_incumbent,display_percent_spent,display_district
0,1,1,101,1,Congressional District 1,C2,118,G5200,N,14843690000.0,...,1971321.0,1859349.0,1938322.0,95.925703,Jerry Carl,AL,R,False,95.925703,1.0
1,1,1,101,1,Congressional District 1,C2,118,G5200,N,14843690000.0,...,80095.0,78973.0,1938322.0,4.074297,,,,,,
2,2,1,102,2,Congressional District 2,C2,118,G5200,N,26956090000.0,...,650807.0,669368.0,725356.0,92.281307,Barry Moore,AL,R,False,92.281307,2.0
3,2,1,102,2,Congressional District 2,C2,118,G5200,N,26956090000.0,...,56050.0,55988.0,725356.0,7.718693,,,,,,
4,3,1,103,3,Congressional District 3,C2,118,G5200,N,21381390000.0,...,1193111.0,1218564.0,1259535.0,96.747133,Mike D Rogers,AL,R,True,96.747133,3.0
5,3,1,103,3,Congressional District 3,C2,118,G5200,N,21381390000.0,...,50273.0,40971.0,1259535.0,3.252867,,,,,,
6,4,1,104,4,Congressional District 4,C2,118,G5200,N,22807400000.0,...,1255076.0,1323812.0,1323812.0,100.0,Robert B Aderholt,AL,R,True,100.0,4.0
7,5,1,105,5,Congressional District 5,C2,118,G5200,N,8697664000.0,...,655365.0,210045.0,210045.0,100.0,Mo Brooks,AL,R,True,100.0,5.0
8,6,1,106,6,Congressional District 6,C2,118,G5200,N,8332760000.0,...,907219.0,909082.0,909082.0,100.0,Gary Palmer,AL,R,True,100.0,6.0
9,7,1,107,7,Congressional District 7,C2,118,G5200,N,28156460000.0,...,2168165.0,1495957.0,1495957.0,100.0,Terri Sewell,AL,D,True,100.0,7.0
