# Clean polling place data for analysis

In [1]:
%load_ext lab_black

In [2]:
import pandas as pd
import numpy as np

In [3]:
pd.options.display.max_columns = 1000
pd.options.display.max_rows = 1000
pd.options.display.max_colwidth = None

---

In [4]:
src = pd.read_csv(
    "output/polling_places_analysis.csv", dtype={"year": str, "fips": str}
)

In [5]:
src.head()

Unnamed: 0,fips,place,state,total_reg_voters,mail_ballots_sent,poll_place_elect_day,poll_place_early,total_votes_cast,year
0,100100000,AUTAUGA COUNTY,AL,43695.0,1329.0,18.0,1.0,27813.0,2020
1,100300000,BALDWIN COUNTY,AL,176668.0,11147.0,50.0,1.0,110214.0,2020
2,100500000,BARBOUR COUNTY,AL,17850.0,726.0,16.0,1.0,10560.0,2020
3,100700000,BIBB COUNTY,AL,15014.0,332.0,8.0,1.0,9630.0,2020
4,100900000,BLOUNT COUNTY,AL,41927.0,1032.0,24.0,1.0,27665.0,2020


---

### Clean up and categorize place types

In [6]:
src["place_type"] = "Other"

In [7]:
src.loc[src["place"].str.contains("COUNTY", case=False), "place_type"] = "County"
src.loc[src["place"].str.contains("PARISH", case=False), "place_type"] = "County"
src.loc[src["place"].str.contains("CITY", case=False), "place_type"] = "City"
src.loc[src["place"].str.contains("TOWN", case=False), "place_type"] = "Town"
src.loc[src["place"].str.contains("VILLAGE", case=False), "place_type"] = "Village"

In [8]:
src.place_type.value_counts()

County     8536
Town       5470
Other      3398
City       2284
Village    1393
Name: place_type, dtype: int64

In [9]:
src["place_clean"] = src["place"].str.title().str.replace(" County", "", regex=False)

In [10]:
src.total_reg_voters.describe()

count     21063
unique    13917
top         0.0
freq        103
Name: total_reg_voters, dtype: object

---

In [11]:
missing_values = [
    "-999999.0",
    "-99.0",
    "-888888.0",
    "-88.0",
    "-999999: Data Not Available",
    "-888888: Not Applicable",
    "",
]
src = src.replace(missing_values, np.NaN)

In [17]:
src.sort_values("total_votes_cast", ascending=False).head()

Unnamed: 0,fips,place,state,total_reg_voters,mail_ballots_sent,poll_place_elect_day,poll_place_early,total_votes_cast,year,place_type,place_clean
177,603700000,LOS ANGELES COUNTY,CA,7122542.0,6067822.0,811.0,811.0,4263059.0,2020,County,Los Angeles
6636,603700000,LOS ANGELES COUNTY,CA,6754224.0,2412019.0,4535.0,6.0,3551506.0,2016,County,Los Angeles
13104,603700000,LOS ANGELES COUNTY,CA,4758437.0,31159.0,4623.0,1.0,3136412.0,2012,County,Los Angeles
76,401300000,MARICOPA COUNTY,AZ,2863040.0,2258790.0,175.0,91.0,2089563.0,2020,County,Maricopa
16513,7200000000,PUERTO RICO,PR,2402941.0,1711.0,7865.0,110.0,1878969.0,2012,Other,Puerto Rico


In [13]:
src[
    [
        "total_reg_voters",
        "mail_ballots_sent",
        "total_votes_cast",
        "poll_place_elect_day",
        "poll_place_early",
    ]
] = src[
    [
        "total_reg_voters",
        "mail_ballots_sent",
        "total_votes_cast",
        "poll_place_elect_day",
        "poll_place_early",
    ]
].astype(
    float
)

In [50]:
src[src["total_reg_voters"] > 100000].isnull().sum()

fips                      0
place                     0
state                     0
total_reg_voters          0
mail_ballots_sent         2
poll_place_elect_day     29
poll_place_early        293
total_votes_cast          0
year                      0
place_type                0
place_clean               0
dtype: int64

In [68]:
df = src[src["total_reg_voters"] > 100].copy()

---

### Polling place rate

Total number of election day polling places divided by the total population and multiplied by 100,000.

In [69]:
df["poll_place_100k"] = round(
    (df["poll_place_elect_day"] / df["total_reg_voters"]) * 100000, 2
)

In [70]:
df.sort_values("poll_place_100k", ascending=False).head(20)

Unnamed: 0,fips,place,state,total_reg_voters,mail_ballots_sent,poll_place_elect_day,poll_place_early,total_votes_cast,year,place_type,place_clean,poll_place_100k
818,1717300000,SHELBY COUNTY,IL,14997.0,2058.0,8609.0,1636.0,12042.0,2020,County,Shelby,57404.81
121,507500000,LAWRENCE COUNTY,AR,8502.0,351.0,4835.0,,5208.0,2020,County,Lawrence,56868.97
813,1716300000,ST. CLAIR COUNTY,IL,4030.0,528.0,1924.0,504.0,120617.0,2020,County,St. Clair,47741.94
131,509500000,MONROE COUNTY,AR,4297.0,174.0,1023.0,-88.0,2817.0,2020,County,Monroe,23807.31
158,514900000,YELL COUNTY,AR,10382.0,390.0,1860.0,4551.0,6815.0,2020,County,Yell,17915.62
3996,4833500000,MITCHELL COUNTY,TX,4524.0,172.0,688.0,1765.0,2603.0,2020,County,Mitchell,15207.78
124,508100000,LITTLE RIVER COUNTY,AR,7201.0,290.0,1051.0,,5200.0,2020,County,Little River,14595.2
102,503700000,CROSS COUNTY,AR,10310.0,396.0,1371.0,5159.0,6986.0,2020,County,Cross,13297.77
10443,4830100000,LOVING COUNTY,TX,112.0,0.0,5.0,1.0,62.0,2016,County,Loving,4464.29
15757,3111900000,MADISON COUNTY,NE,381.0,30.0,16.0,,14306.0,2012,County,Madison,4199.48


---

### Export

In [20]:
df.to_csv("output/polling_places_analysis_clean.csv", index=False)