## Import data from the web - county information for redistricting problem

In [1]:
# pip install beautifulsoup4 --user




In [2]:
# Libraries used by this notebook: pandas for the table, requests for the
# download, BeautifulSoup for extracting the text of the response.
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs

# Census Bureau county adjacency file (tab-separated plain text).
url = 'https://www2.census.gov/geo/docs/reference/county_adjacency.txt'

# Download the file.
# NOTE(review): verify=False turns off TLS certificate verification. It is
# kept because this notebook relies on it to get past a certificate error;
# remove it once the certificate problem is resolved.
# The timeout keeps a stalled connection from hanging the notebook forever.
html_page = requests.get(url, verify=False, timeout=30)

# Fail loudly on an HTTP error instead of parsing an error page as data.
html_page.raise_for_status()



In [3]:
# Feed the raw response bytes to BeautifulSoup with the 'html.parser'
# backend; the resulting object exposes the document text via `soup.text`.
soup = bs(markup=html_page.content, features='html.parser')

# Echo the parsed document as this cell's output.
soup

"Autauga County, AL"	01001	"Autauga County, AL"	01001
		"Chilton County, AL"	01021
		"Dallas County, AL"	01047
		"Elmore County, AL"	01051
		"Lowndes County, AL"	01085
		"Montgomery County, AL"	01101
"Baldwin County, AL"	01003	"Baldwin County, AL"	01003
		"Clarke County, AL"	01025
		"Escambia County, AL"	01053
		"Mobile County, AL"	01097
		"Monroe County, AL"	01099
		"Washington County, AL"	01129
		"Escambia County, FL"	12033
"Barbour County, AL"	01005	"Barbour County, AL"	01005
		"Bullock County, AL"	01011
		"Dale County, AL"	01045
		"Henry County, AL"	01067
		"Pike County, AL"	01109
		"Russell County, AL"	01113
		"Clay County, GA"	13061
		"Quitman County, GA"	13239
		"Stewart County, GA"	13259
"Bibb County, AL"	01007	"Bibb County, AL"	01007
		"Chilton County, AL"	01021
		"Hale County, AL"	01065
		"Jefferson County, AL"	01073
		"Perry County, AL"	01105
		"Shelby County, AL"	01117
		"Tuscaloosa County, AL"	01125
"Blount County, AL"	01009	"Blount County, AL"	01009
		"Cullman County, AL"

In [4]:
# Break the downloaded text into physical lines.
raw_lines = soup.text.split('\n')

# Split every non-blank line on tabs.
# - Blank lines (e.g. the one left by a trailing newline) are skipped so they
#   do not become an all-empty row that a later forward fill would turn into
#   a duplicate of the last record.
# - A stray carriage return is stripped so it cannot stay attached to the
#   last field of a row.
# Continuation lines start with two empty fields: they list a further
# neighbour of the parent county named on an earlier line.
data = [line.rstrip('\r').split('\t') for line in raw_lines if line.strip()]

# Build the adjacency table: one row per (parent county, neighbouring county).
columns = ['parent_county_desc', 'parent_state_county', 'child_state_county_desc', 'child_state_county']
df = pd.DataFrame(data, columns=columns)

In [5]:
import os

# Peek at the raw table.
df.head(10)

# Continuation rows leave the parent columns empty. Mark those cells as
# missing, then carry the parent county down from the row above.
# One replace is enough: once '' has become NA there is nothing left for a
# second `replace("", None)` to act on, and replacing with None behaves
# differently across pandas versions.
df = df.replace('', pd.NA).ffill()

# Peek at the first 20 rows after the fill.
df.head(20)

# Remove the double quotes that wrap the county descriptions.
df = df.replace('"', '', regex=True)

# Keep only rows whose parent county is in Washington
# (parent_state_county starts with 53). na=False treats any cell that is
# still missing as "no match" rather than producing an unusable NA mask.
df_washington = df[df['parent_state_county'].str.startswith('53', na=False)]

# ...and whose neighbouring county is in Washington as well
# (child_state_county starts with 53).
df_washington = df_washington[df_washington['child_state_county'].str.startswith('53', na=False)]

# Peek at the first 10 Washington rows.
df_washington.head(10)

# Distinct Washington counties, taken from the child_state_county codes.
counties = df_washington['child_state_county'].drop_duplicates()

# Number of distinct counties.
counties.count()

# Export the Washington adjacency table to a CSV file under the
# "001. Data Bases" directory, creating that directory first so the write
# does not fail when the notebook runs in a fresh checkout.
output_path = '001. Data Bases/washington_counties_adjacency.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df_washington.to_csv(output_path, index=False)


https://en.wikipedia.org/wiki/List_of_counties_in_Washington

In [6]:
# Obtain a complete list of counties for the selected state.


In [7]:
# Obtain demographic data on the total population and the percentage of the population that is
# white only in each county. These data should come from the US Census of Population from 2020 or later.
# A summary list is provided at https://worldpopulationreview.com/us-counties
# If possible, gather data on past statewide elections, so you can see the proportions of votes for
# Democratic versus Republican candidates.

In [8]:
# Note counties that are geographically adjacent to one another: https://www2.census.gov/geo/docs/reference/county_adjacency.txt

In [9]:
# Set partitioning. Use integer programming (set partitioning) to obtain an algorithmic/optimal redistricting. 
# Assign every county in your selected state to exactly one congressional district while striving to meet your 
# objective through maximization or minimization.
#
# This site has a good example: https://coin-or.github.io/pulp/CaseStudies/a_set_partitioning_problem.html

In [10]:
# Population balance. Try to satisfy population balance (one-person-one-vote). That is, congressional districts 
# should have approximately the same population. Consider strategies for assigning more than one representative 
# to counties with high-population centers as long as elections are county-wide. Do not divide counties geographically.

In [11]:
# Compact districts. Try to ensure that congressional districts are geographically compact (that is, composed of
# counties that are adjacent to one another). Describe the constraints or objectives employed to accomplish this
# goal. Note any difficulties encountered in setting up constraints or objectives.

In [12]:
# Solve the integer programming problem using Python PuLP or AMPL.  Note any difficulties encountered, given 
# the size of the integer programming problem.

In [13]:
# Consider secondary goals of redistricting, such as encouraging equal representation across races. For example, 
# you may try to achieve as much racial balance (percentage white alone versus other races) as possible across all 
# congressional districts. Another secondary goal may be to ensure that the proportions of Democratic versus Republican 
# representatives are approximately equal to the proportions of Democratic and Republican voters in recent statewide elections.