In [76]:
import pdfplumber
import pandas as pd
import re
import os
import geopandas as gpd

import matplotlib.pyplot as plt
import seaborn as sns

In [48]:
# Accumulator for the per-file precinct DataFrames; the PDF and CSV loading
# cells append to it, and it is later concatenated into `precincts_df`.
precincts = list()

In [51]:
# Label prefix, output key and converter for the three summary rows that are
# expected (in this order) after a precinct header in the PDF text.
_PDF_STAT_ROWS = (
    ('REGISTERED VOTERS - TOTAL', 'registered_voters', int),
    ('BALLOTS CAST - TOTAL', 'ballots_cast', int),
    ('VOTER TURNOUT - TOTAL', 'voter_turnout', float),
)
_PDF_RECORD_COLUMNS = ['district', 'registered_voters', 'ballots_cast', 'voter_turnout']


def _parse_precinct_lines(lines):
    """Extract one record per precinct from the non-blank text lines of a PDF.

    A line that starts with four digits, one whitespace character and capital
    letters is treated as a precinct header; everything after the first space
    is the district name. The line directly after the header is skipped, and
    the following lines are matched, in order, against the prefixes in
    ``_PDF_STAT_ROWS``.

    Parameters
    ----------
    lines : list of str
        Text lines in reading order.

    Returns
    -------
    list of dict
        Dicts with the keys ``district``, ``registered_voters``,
        ``ballots_cast`` and ``voter_turnout``. A statistic whose row was not
        found is ``None`` rather than a value left over from an earlier
        precinct. Header-like lines followed by none of the three rows are
        not reported at all.

    Raises
    ------
    ValueError
        If the last token of a matched statistic row is not a number.
    """
    records = []
    idx = 0
    while idx < len(lines):
        header = lines[idx]
        idx += 1
        if not re.match(r"^\d{4}\s[A-Z]+", header):
            continue

        record = {'district': ' '.join(header.split(' ')[1:])}
        cursor = idx + 1  # the line right after the header is skipped unread
        found_any = False
        for prefix, key, convert in _PDF_STAT_ROWS:
            record[key] = None
            # Bounds check: a header close to the end of the text must not
            # index past the last line.
            if cursor < len(lines) and lines[cursor].startswith(prefix):
                token = lines[cursor].split()[-1]
                # Tolerate thousands separators and a trailing percent sign.
                record[key] = convert(token.replace(',', '').rstrip('%'))
                found_any = True
                cursor += 1

        if not found_any:
            # Looked like a header but no statistics followed; resume scanning
            # on the very next line so a real header there is not missed.
            continue
        records.append(record)
        idx = cursor
    return records


# Parse every PDF in `precinct/` and append one DataFrame per file to the
# notebook-level `precincts` list. Re-running this cell without resetting
# `precincts` first appends the same files again.
for file in sorted(os.listdir('precinct')):
    if not file.endswith('pdf'):
        continue
    curr_file = os.path.join('precinct', file)

    # The election year is the first run of four digits in the file name.
    year_match = re.search(r'\d{4}', file)
    if year_match is None:
        raise ValueError(f"No four-digit year found in PDF file name: {file!r}")
    curr_year = int(year_match.group(0))

    data = []
    with pdfplumber.open(curr_file) as pdf:
        for page in pdf.pages:
            # Guard against pages that yield no text at all.
            text = page.extract_text() or ''
            data.extend(line for line in text.split('\n') if line.strip())

    new_df = pd.DataFrame(_parse_precinct_lines(data), columns=_PDF_RECORD_COLUMNS)
    if new_df.empty:
        # Surface a failed parse instead of appending an empty frame.
        print(f"No precinct records parsed from {curr_file}; skipping")
        continue
    new_df['year'] = curr_year
    new_df['is_midterm'] = curr_year % 4 != 0
    precincts.append(new_df)


In [52]:
# Columns every CSV must provide (after renaming) before it can be used.
_CSV_REQUIRED_COLUMNS = ['district', 'registered_voters', 'ballots_cast', 'voter_turnout']

# Load every CSV in `precinct/`, normalise its column names and append the
# result to the notebook-level `precincts` list. Re-running this cell without
# resetting `precincts` first appends the same files again.
for file in sorted(os.listdir('precinct')):
    if not file.endswith('csv'):
        continue
    curr_file = 'precinct/' + file
    curr_df = pd.read_csv(curr_file)
    # The year is the second underscore-separated part of the file name, up to
    # the first dot.
    curr_year = int(file.split('_')[1].split('.')[0])

    # The district column is called either 'Precinct' or 'County'. Only one of
    # them is renamed so a file carrying both does not end up with two
    # 'district' columns.
    district_source = 'Precinct' if 'Precinct' in curr_df.columns else 'County'
    curr_df = curr_df.rename(columns={
        district_source: 'district',
        'Registered Voters': 'registered_voters',
        'Ballots Cast': 'ballots_cast',
        'Voter Turnout': 'voter_turnout',
    })

    # Validate the schema before touching any column, so a file with an
    # unexpected layout is reported and skipped instead of raising part-way.
    missing = [col for col in _CSV_REQUIRED_COLUMNS if col not in curr_df.columns]
    if missing:
        print(f"Skipping {curr_file}: missing columns {missing}")
        display(curr_df.head())
        continue

    curr_df['year'] = curr_year
    curr_df['district'] = curr_df['district'].str.upper()
    # Turnout may be read as text such as '73.44%' or already as a number;
    # going through str handles both.
    curr_df['voter_turnout'] = (
        curr_df['voter_turnout']
        .astype(str)
        .str.replace('%', '', regex=False)
        .astype(float)
    )
    curr_df['is_midterm'] = curr_year % 4 != 0
    precincts.append(curr_df[_CSV_REQUIRED_COLUMNS + ['year', 'is_midterm']])

# One long table of all files loaded so far, ordered by year then district.
precincts_df = pd.concat(precincts).sort_values(by=['year', 'district'])

In [53]:
# Spot check: show every row recorded for the ALEPPO district.
precincts_df.loc[precincts_df['district'].eq('ALEPPO')]

Unnamed: 0,district,registered_voters,ballots_cast,voter_turnout,year,is_midterm
0,ALEPPO,1461,1073,73.44,2008,False
0,ALEPPO,1411,824,58.4,2010,True
0,ALEPPO,1390,984,70.79,2012,False
0,ALEPPO,1370,698,50.95,2014,True
0,ALEPPO,1373,1036,75.46,2016,False
0,ALEPPO,1374,891,64.85,2018,True
0,ALEPPO,1388,1121,80.76,2020,False
0,ALEPPO,1379,987,71.57,2022,True
0,ALEPPO,1363,1128,82.76,2024,False


In [80]:
# Persist the combined table (the DataFrame index is written as well).
precincts_df.to_csv('total_precincts.csv')

# District names that occur in 2024 and in 2014, respectively.
relevant_districts_2024 = precincts_df.loc[precincts_df['year'].eq(2024), 'district'].unique()
relevant_districts_2014 = precincts_df.loc[precincts_df['year'].eq(2014), 'district'].unique()

# Keep only districts whose name is present in both of those years.
relevant_districts = set(relevant_districts_2024) & set(relevant_districts_2014)

# All rows, for every year, that belong to one of those districts.
in_both_years = precincts_df['district'].isin(relevant_districts)
relevant_df = precincts_df[in_both_years]

In [81]:
# Read the voting-district shapes and upper-case the `Muni_War_1` key that is
# used to join them to the `district` column.
allegheny_districts = gpd.read_file('voting_districts.geojson').pipe(
    lambda shapes: shapes.assign(Muni_War_1=shapes['Muni_War_1'].str.upper())
)

# Left join: every precinct row is kept, with the shape columns attached where
# the names match. NOTE: `relevant_df` is rebound to the merged result, so
# running this cell a second time merges the shape columns in again.
relevant_df = pd.merge(
    relevant_df,
    allegheny_districts,
    how='left',
    left_on='district',
    right_on='Muni_War_1',
)
relevant_df

Unnamed: 0,district,registered_voters,ballots_cast,voter_turnout,year,is_midterm,OBJECTID,NAME,TYPE,LABEL,...,MWD_NOPA_1,OPA_MUNI_1,MWD_PAD_1,Pseud4_12,PseudoNu_5,Muni_War_1,Shape__Area,Shape__Length,RotateMap,geometry
0,ALEPPO,1461,1073,73.44,2008,False,19,ALEPPO,TOWNSHIP,Aleppo Township,...,10101,901,1010001,0001,1,ALEPPO,4.944878e+07,43972.637115,90,"POLYGON ((-80.15974 40.54168, -80.15974 40.541..."
1,ASPINWALL DIST 1,910,644,70.77,2008,False,47,ASPINWALL,BOROUGH,Aspinwall Borough,...,10201,801,1020001,0002,2,ASPINWALL DIST 1,4.794887e+06,10563.280842,0,"POLYGON ((-79.90141 40.4858, -79.90168 40.4858..."
2,ASPINWALL DIST 2,651,448,68.82,2008,False,672,ASPINWALL,BOROUGH,Aspinwall Borough,...,10202,801,1020002,0003,3,ASPINWALL DIST 2,3.365936e+06,7455.357920,0,"POLYGON ((-79.90215 40.49978, -79.90214 40.499..."
3,ASPINWALL DIST 3,882,602,68.25,2008,False,682,ASPINWALL,BOROUGH,Aspinwall Borough,...,10203,801,1020003,0004,4,ASPINWALL DIST 3,2.642739e+06,6554.334509,0,"POLYGON ((-79.89682 40.49301, -79.89685 40.492..."
4,AVALON WARD 1,1144,700,61.19,2008,False,485,AVALON,BOROUGH,Avalon Borough,...,10310,802,1030100,0005,5,AVALON WARD 1,5.197085e+06,12207.108951,0,"POLYGON ((-80.06724 40.50315, -80.06714 40.503..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10162,WILKINSBURG WARD 3 DIST 4,1061,705,66.45,2024,False,798,WILKINSBURG,BOROUGH,Wilkinsburg Borough,...,22834,866,2280304,1318,1318,WILKINSBURG WARD 3 DIST 4,2.179915e+06,6971.000305,0,"POLYGON ((-79.8878 40.44049, -79.88795 40.4404..."
10163,WILKINSBURG WARD 3 DIST 5,1090,656,60.18,2024,False,781,WILKINSBURG,BOROUGH,Wilkinsburg Borough,...,22835,866,2280305,1319,1319,WILKINSBURG WARD 3 DIST 5,4.452737e+06,9573.866657,0,"POLYGON ((-79.89238 40.45026, -79.89204 40.450..."
10164,WILKINSBURG WARD 3 DIST 6,516,269,52.13,2024,False,785,WILKINSBURG,BOROUGH,Wilkinsburg Borough,...,22836,866,2280306,1320,1320,WILKINSBURG WARD 3 DIST 6,2.323231e+06,8257.952197,90,"POLYGON ((-79.88409 40.44915, -79.88407 40.449..."
10165,WILMERDING DIST 1,481,248,51.56,2024,False,894,WILMERDING,BOROUGH,Wilmerding Borough,...,22901,867,2290001,1321,1321,WILMERDING DIST 1,5.853308e+06,14090.529156,90,"POLYGON ((-79.81408 40.4006, -79.81407 40.4006..."


In [85]:
# Wrap the merged table in a GeoDataFrame and write it out as GeoJSON.
relevant_gdf = gpd.GeoDataFrame(relevant_df)
relevant_gdf.to_file('relevant_precincts.geojson', driver='GeoJSON')