<a href="https://colab.research.google.com/github/rayvoelker/cincy-py-2022-06-30/blob/main/2022_06_39_cincy_py.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Thursday, June 30, 2022

## CincyPy Reconnect!

<img src="https://raw.githubusercontent.com/rayvoelker/cincy-py-2022-06-30/main/cincy_py.jpg">

Ray Voelker | ray.voelker@gmail.com

### Data Vis

1. Hamiltion County Ohio Board of Elections -- Voter lists:
   - https://votehamiltoncountyohio.gov/campaign-media/voter-lists/

1. U.S. Census Bureau -- The 2020 ZIP Code Tabulation Areas (ZCTAs) (Polygons With All Geocodes)
   - https://www.census.gov/geographies/mapping-files/2020/geo/tiger-line-file.html

In [1]:
# NOTE: pandas, and altair are already included with the base colab image...
# !pip install -U pandas --quiet
# !pip install -U altair --quiet

!pip install -U duckdb --quiet

import duckdb
import pandas as pd
import altair as alt

from google.colab import data_table
# enable the formatter for more functionality with tabular data
data_table.enable_dataframe_formatter()

In [2]:
# get the current Hamilton County registerd voter list ...
# downloaded and compressed from here: https://votehamiltoncountyohio.gov/campaign-media/voter-lists/
!wget --quiet https://github.com/rayvoelker/cincy-py-2022-06-30/raw/main/VoterListExport-20220627-no.tar.xz --output-document=VoterListExport.tar.xz

# casually download ~500 MB of shape data ... 
# we'll get to this later ... 
!wget --quiet https://www2.census.gov/geo/tiger/TIGER2021/ZCTA520/tl_2021_us_zcta520.zip --output-document=tl_2021_us_zcta520.zip

# if the above doesn't work, the file is also in this repo:
# !wget --quiet https://github.com/rayvoelker/cincy-py-2022-06-30/raw/main/tl_2021_us_zcta520.tar.xz --output-document=tl_2021_us_zcta520.tar.xz

# load the voter list data into a DataFrame
df = pd.read_csv(
    "VoterListExport.tar.xz", 
    parse_dates=['RegisteredDate'],
    low_memory=False
)

In [3]:
# take a look at the columns in the voter list dataframe ...
df.columns

Index(['VoterListExport-20220627-no.csv', 'PrecinctNum', 'PrecinctSplit',
       'PrecinctName', 'RegisteredDate', 'FirstName', 'MiddleName', 'LastName',
       'SuffixName', 'MiddleName.1', 'SuffixName.1', 'Phone', 'Status',
       'PartyCode', 'BirthYear', 'AddressPreDirectional', 'AddressNumber',
       'AddressFraction', 'AddressStreet', 'AddressSuffix', 'AddressOther',
       'CityName', 'AddressZip', 'Congress', 'Senate', 'House', 'Judicial',
       'School', 'CountySchool', 'VocationalSchool', 'RegistrationDate',
       'GENERAL_NOV_2022', 'PRIMARY_AUG_2022', 'PRIMARY_MAY_2022',
       'GENERAL_NOV_2021', 'GENERAL_NOV_2020', 'PRIMARY_MARCH_2020',
       'GENERAL_NOV_2019', 'GENERAL_NOV_2018', 'PRIMARY_MAY_2018',
       'GENERAL_NOV_2017'],
      dtype='object')

In [4]:
df.head()



Unnamed: 0,VoterListExport-20220627-no.csv,PrecinctNum,PrecinctSplit,PrecinctName,RegisteredDate,FirstName,MiddleName,LastName,SuffixName,MiddleName.1,...,GENERAL_NOV_2022,PRIMARY_AUG_2022,PRIMARY_MAY_2022,GENERAL_NOV_2021,GENERAL_NOV_2020,PRIMARY_MARCH_2020,GENERAL_NOV_2019,GENERAL_NOV_2018,PRIMARY_MAY_2018,GENERAL_NOV_2017
0,2100436.0,2203.0,0.0,CINCINNATI 22-C,2022-06-08,J,,A JR,,,...,,,,,,,,,,
1,1885774.0,2510.0,0.0,CINCINNATI 25-J,2016-01-28,LHOUCINE,,AACHAQ,,,...,,,,,,,,P-NON,,
2,2009691.0,2510.0,0.0,CINCINNATI 25-J,2019-05-29,OUMAIMA,,AACHAQ,,,...,,,,,,,,,,
3,2087869.0,2506.0,0.0,CINCINNATI 25-F,2021-10-04,LAURRA MICHELLE,,AAGAARD,,,...,,,,,,,,,,
4,2100228.0,8707.0,0.0,SPRINGFIELD G,2022-06-09,ALICEN ROSEMARY TISCH,,AALAEI,,,...,,,,,,,,,,


In [5]:
# rename the column to "VoterID"
df = df.rename(columns={'VoterListExport-20220627-no.csv': 'VoterID'})
# convert the VoterID to integer (it defaulted to numeric since there maybe a null)
df['VoterID'] = df['VoterID'].astype("Int64")

In [6]:
# DuckDB allows us to efficently run SQL on the Pandas DataFrame!

# connect to an in-memory database
con = duckdb.connect()

# https://duckdb.org/2021/05/14/sql-on-pandas.html
result_df = con.execute("""\
  SELECT
    VoterId,
    AddressNumber,
    AddressStreet,
    AddressSuffix,
    CityName,
    AddressZip,
    BirthYear,
    PartyCode,
    PrecinctNum,
    PrecinctName,
    RegisteredDate,
    strftime('%Y', RegisteredDate) as reg_year, 
    Status, 
    PRIMARY_MAY_2022
  FROM
    df
  ORDER BY RANDOM()
  LIMIT 1000
  """
).df()

result_df

Unnamed: 0,VoterID,AddressNumber,AddressStreet,AddressSuffix,CityName,AddressZip,BirthYear,PartyCode,PrecinctNum,PrecinctName,RegisteredDate,reg_year,Status,PRIMARY_MAY_2022
0,1563602,849.0,FOREST ACRES,DR,CINCINNATI,45255.0,1984.0,NON,5631.0,ANDERSON EE,2005-12-09,2005,A,
1,742143,674.0,CEDARHILL,DR,CINCINNATI,45246.0,1961.0,DEM,5208.0,SPRINGDALE H,1989-07-17,1989,A,P-DEM
2,2007061,1.0,MATTHEWS,CT,CINCINNATI,45246.0,1966.0,NON,7002.0,GLENDALE B,2019-04-17,2019,A,
3,1970674,2958.0,WESTRIDGE,AVE,CINCINNATI,45238.0,1986.0,NON,2612.0,CINCINNATI 26-L,2018-04-09,2018,I,
4,1832581,122.0,WESTFIELD,DR,HARRISON,45030.0,1958.0,NON,3402.0,HARRISON B,2014-04-01,2014,I,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,2014867,309.0,VINE,ST,CINCINNATI,45202.0,1990.0,NON,602.0,CINCINNATI 6-B,2019-08-27,2019,A,
996,1719093,7004.0,MOUNT VERNON,AVE,CINCINNATI,45227.0,1992.0,NON,8002.0,MARIEMONT B,2010-10-04,2010,I,
997,1901149,5057.0,COAD,DR,CINCINNATI,45237.0,1997.0,NON,701.0,CINCINNATI 7-A,2016-05-11,2016,A,
998,525964,7563.0,WALTS,WAY,CINCINNATI,45247.0,1941.0,NON,5917.0,COLERAIN Q,1989-07-17,1989,A,


In [7]:
# What is the voter status by year of registration?
# Query:
#
result_df = con.execute("""\
  SELECT
  strftime('%Y', RegisteredDate) as 'Registraion Year', 
  Status, 
  count(*) as count_voters
  FROM
  df
  WHERE
  'Registraion Year' IS NOT NULL
  AND 'Status' IS NOT NULL
  GROUP BY 1,2
  HAVING count(*) > 1
  ORDER BY 3 DESC
""").df()

result_df

Unnamed: 0,Registraion Year,Status,count_voters
0,1989,A,103105
1,2020,A,42222
2,2016,A,27754
3,2004,A,22860
4,2018,A,21819
...,...,...,...
71,1986,A,14
72,1967,A,3
73,1970,I,2
74,1985,A,2


In [8]:
# What is the voter status by year of registration?
# Chart:
#
alt.Chart(result_df).mark_bar().encode(
    x=alt.X('count_voters', ), # sort='-y'
    y=alt.Y('Registraion Year', sort='-x'), # sort='-x'
    tooltip=['Registraion Year', 'count_voters', 'Status'],
    color=alt.Color(
        'Status',
        scale=alt.Scale(
            # https://vega.github.io/vega/docs/schemes/
            # scheme='set1'
            scheme='set2'
        )
    )
).properties(
    title=''
).configure_title(
    # fontSize=20,
    anchor='start',
)

In [9]:
# What distribution of voters by age in Hamilton County?
# Query:
#
result_df = con.execute("""\
  SELECT
  cast(BirthYear as INTEGER) as BirthYear,
  lower(PartyCode) as party_code,
  count(*) as count_voters
  FROM
  df
  WHERE
  BirthYear IS NOT NULL
  AND 'Registraion Year' IS NOT NULL
  AND 'Status' IS NOT NULL
  GROUP BY 1, 2
  HAVING count(*) > 1
  ORDER BY 3 DESC
""").df()

result_df

Unnamed: 0,BirthYear,party_code,count_voters
0,1990,non,11626
1,1991,non,11554
2,1989,non,11517
3,1992,non,11515
4,1993,non,11424
...,...,...,...
289,1989,lib,2
290,1995,lib,2
291,1951,lib,2
292,1915,dem,2


In [10]:
# What distribution of voters by age in Hamilton County?
# Chart:
#
alt.Chart(result_df).mark_bar().encode(
    y=alt.Y('count_voters', ), # sort='-y'
    x=alt.X('BirthYear:N', sort='-x'), # sort='-x'
    tooltip=['BirthYear', 'count_voters', 'party_code'],
    color=alt.Color(
        'party_code',
        scale=alt.Scale(
            # https://vega.github.io/vega/docs/schemes/
            # scheme='set1'
            domain=['dem', 'rep', 'non'],
            range=['#1404bd', '#de0100', 'grey']

        )
    )
).properties(
    title=''
).configure_title(
    # fontSize=20,
    anchor='start',
)

In [11]:
# What distribution of voters by age in Hamilton County?
# Chart:
#
alt.Chart(result_df[result_df['party_code'].isin(['rep', 'dem', 'non'])]).mark_bar().encode(
    y=alt.Y('count_voters', ), # sort='-y'
    x=alt.X('BirthYear:N', sort='-x'), # sort='-x'
    tooltip=['BirthYear', 'count_voters', 'party_code'],
    color=alt.Color(
        'party_code',
        scale=alt.Scale(
            # https://vega.github.io/vega/docs/schemes/
            # scheme='set3'
            domain=['dem', 'rep', 'non'],
            range=['#1404bd', '#de0100', 'grey']
        )
    )
).properties(
    title='',
).facet(
    'party_code',
    columns=1
)

In [12]:
# What are the party codes in the data?
result_df = con.execute("""\
  SELECT
  lower(PartyCode) as party_code,
  count(*) as count_voters
  FROM
  df
  WHERE
  PartyCode IS NOT NULL
  GROUP BY 
  1
  ORDER BY 2 DESC
""").df()

result_df

Unnamed: 0,party_code,count_voters
0,non,431157
1,dem,85813
2,rep,72755
3,lib,100
4,u,1
5,con,1


In [13]:
# What is the voter status by PartyCode?
# Query:
#
result_df = con.execute("""\
  SELECT
  -- strftime('%Y', RegisteredDate) as 'Registraion Year',
  lower(PartyCode) as party_code,
  -- CASE PartyCode
  --    WHEN 'non' THEN 'NON'
  --    ELSE PartyCode
  --END as PartyCode,
  -- 'FirstName', 'MiddleName', 'LastName', 'SuffixName',
  -- 'MiddleName.1', 'SuffixName.1', 'Phone', 
  Status, 
  count(*) as count_voters
  FROM
  df
  WHERE
  lower(PartyCode) IN (
    SELECT
    lower(PartyCode) as party_code
    FROM
    df
    WHERE
    PartyCode IS NOT NULL
    GROUP BY 
    1
    HAVING count(*) >= 100
  )
  GROUP BY 
  1,2
""").df()

result_df

Unnamed: 0,party_code,Status,count_voters
0,non,I,109132
1,non,A,321940
2,dem,A,84595
3,rep,A,72456
4,dem,I,1210
5,rep,I,294
6,rep,M,5
7,non,M,85
8,lib,A,98
9,dem,M,8


In [14]:
# What is the voter status by PartyCode?
# Chart:
#
alt.Chart(result_df).mark_bar().encode(
    x=alt.X('count_voters', ),
    y=alt.Y('party_code', sort='-x'),
    tooltip=['party_code', 'count_voters', 'Status'],
    color=alt.Color(
        'Status',
        scale=alt.Scale(
            # https://vega.github.io/vega/docs/schemes/
            scheme='dark2'
        )
    )
).properties(
    title='Voter Status by PartyCode'
).configure_title(
    # fontSize=20,
    anchor='middle',
)

In [15]:
# What percentage of registered voters voted in the 2022 Primary Election?

# Query:
#
result_df = con.execute("""\
  WITH count_data as (
    SELECT
    PRIMARY_MAY_2022,
    count(*) as count_voters
    from
    df
    GROUP BY 1
  )
  SELECT
  PRIMARY_MAY_2022,
  ROUND(
    ((count_voters * 1.0 ) / (SELECT SUM(count_voters) FROM count_data)) * 100.0, 
    2
  ) as pct
  FROM
  count_data
""").df()

result_df

Unnamed: 0,PRIMARY_MAY_2022,pct
0,,83.37
1,P-REP,8.42
2,A-REP,1.09
3,P-DEM,5.41
4,A-DEM,1.56
5,P-NON,0.13
6,A-NON,0.02


In [16]:
# What percentage of registered voters voted in the 2022 Primary Election?

alt.Chart(result_df).mark_bar().encode(
    x=alt.X('pct', ),
    # y=alt.Y('party_code', sort='-x'),
    tooltip=['PRIMARY_MAY_2022', 'pct',],
    color=alt.Color(
        'PRIMARY_MAY_2022',
        scale=alt.Scale(
            # https://vega.github.io/vega/docs/schemes/
            scheme='accent'
        )
    )
).properties(
    title='Percent Voters Voting in PRIMARY_MAY_2022',
    width=600,
    # height=300
).configure_title(
    # fontSize=20,
    anchor='middle',
)

In [17]:
result_df = con.execute("""\
  SELECT
    AddressZip,
    count(PRIMARY_MAY_2022) AS count_voters
  FROM
    df
  WHERE
    PRIMARY_MAY_2022 IS NOT NULL
    AND AddressZip IS NOT NULL
  GROUP BY 1
""").df()

result_df

Unnamed: 0,AddressZip,count_voters
0,45231.0,5046
1,45215.0,4112
2,45237.0,2374
3,45224.0,2787
4,45246.0,1747
5,45209.0,1187
6,45244.0,2716
7,45202.0,1790
8,45241.0,2832
9,45208.0,3621


In [18]:
!pip install -U geopandas --quiet
!pip install -U folium matplotlib mapclassify --quiet
# !pip install -U pyshp
# !pip install -U shapely
# !pip install -U descartes
# !pip install -U osmnx

[K     |████████████████████████████████| 1.0 MB 7.7 MB/s 
[K     |████████████████████████████████| 6.3 MB 28.0 MB/s 
[K     |████████████████████████████████| 16.7 MB 188 kB/s 
[K     |████████████████████████████████| 95 kB 1.0 MB/s 
[K     |████████████████████████████████| 11.2 MB 13.1 MB/s 
[K     |████████████████████████████████| 930 kB 32.6 MB/s 
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
datascience 0.10.6 requires folium==0.2.1, but you have folium 0.12.1.post1 which is incompatible.
albumentations 0.1.12 requires imgaug<0.2.7,>=0.2.5, but you have imgaug 0.2.9 which is incompatible.[0m
[?25h

In [19]:
import pandas as pd
import geopandas as gpd
import duckdb

# connect to an in-memory database
con = duckdb.connect()

df = pd.read_csv(
    "VoterListExport.tar.xz", 
    parse_dates=['RegisteredDate'],
    low_memory=False
)

gdf_org = gpd.read_file('tl_2021_us_zcta520.zip')

In [20]:
result_df = con.execute("""\
  SELECT
    cast(AddressZip as INTEGER) as AddressZip,
    count(PRIMARY_MAY_2022) AS count_voters
  FROM
    df
  WHERE
    PRIMARY_MAY_2022 IS NOT NULL
    AND AddressZip IS NOT NULL
  GROUP BY 1
  HAVING count(PRIMARY_MAY_2022) >= 5
""").df()


# merge the above dataframe and the dataframe containing our 
# GEOID20
result_df.rename(columns={"AddressZip": "GEOID20"}, inplace=True)
# convert the column to string so we can do our merge
result_df['GEOID20'] = result_df['GEOID20'].astype(str)

gdf = gdf_org.merge(result_df, on='GEOID20', how='inner')

gdf[
    gdf['GEOID20'].isin(
      [str(zip_code) for zip_code in result_df['GEOID20']]
    )
].explore(
    "count_voters",
    cmap="YlGn"
)

In [21]:
# Hamilton County Ohio Voters in the 2020 General Elections

result_df = con.execute("""\
  SELECT
    cast(AddressZip as INTEGER) as AddressZip,
    count(GENERAL_NOV_2020) AS count_voters
  FROM
    df
  WHERE
    GENERAL_NOV_2020 IS NOT NULL
    AND AddressZip IS NOT NULL
  GROUP BY 1
  HAVING count(GENERAL_NOV_2020) >= 5
""").df()


# merge the above dataframe and the dataframe containing our 
# GEOID20
result_df.rename(columns={"AddressZip": "GEOID20"}, inplace=True)
# convert the column to string so we can do our merge
result_df['GEOID20'] = result_df['GEOID20'].astype(str)

gdf = gdf_org.merge(result_df, on='GEOID20', how='inner')

gdf[
    gdf['GEOID20'].isin(
      [str(zip_code) for zip_code in result_df['GEOID20']]
    )
].explore(
    "count_voters",
    cmap="YlGn"
)

In [22]:
result_df = con.execute("""\
  SELECT
  -- PRIMARY_MARCH_2016,
  lower(PartyCode) as PartyCode,
  cast(AddressZip as INTEGER) as AddressZip,
  count(*) AS count_voters
  FROM
  df
  WHERE
  lower(PartyCode) = 'rep'
  AND AddressZip is not null
  GROUP BY 1,2

""").df()

# merge the above dataframe and the dataframe containing our 
# GEOID20
result_df.rename(columns={"AddressZip": "GEOID20"}, inplace=True)
# convert the column to string so we can do our merge
result_df['GEOID20'] = result_df['GEOID20'].astype(str)

gdf = gdf_org.merge(result_df, on='GEOID20', how='inner')

gdf[
    gdf['GEOID20'].isin(
      [str(zip_code) for zip_code in result_df['GEOID20']]
    )
].explore(
    "count_voters",
    cmap="Reds"
)

In [23]:
result_df = con.execute("""\
  SELECT
  -- PRIMARY_MARCH_2016,
  lower(PartyCode) as PartyCode,
  cast(AddressZip as INTEGER) as AddressZip,
  count(*) AS count_voters
  FROM
  df
  WHERE
  lower(PartyCode) = 'dem'
  AND AddressZip is not null
  GROUP BY 1,2

""").df()

# merge the above dataframe and the dataframe containing our 
# GEOID20
result_df.rename(columns={"AddressZip": "GEOID20"}, inplace=True)
# convert the column to string so we can do our merge
result_df['GEOID20'] = result_df['GEOID20'].astype(str)

gdf = gdf_org.merge(result_df, on='GEOID20', how='inner')

gdf[
    gdf['GEOID20'].isin(
      [str(zip_code) for zip_code in result_df['GEOID20']]
    )
].explore(
    "count_voters",
    cmap="Blues"
)