## Import data from the web - county information for redistricting problem

In [92]:
pip install beautifulsoup4 --user

Note: you may need to restart the kernel to use updated packages.


In [93]:
# start by importing the pandas library
import pandas as pd

# read html page using beautifulsoup
url = 'https://api.census.gov/data/2020/dec/pl?get=P3_001N,P3_003N&for=county:*&in=state:53'

# read the html page using beautifulsoup
import requests
from bs4 import BeautifulSoup as bs


# get the html page
# ignore the ssl certificate error
html_page = requests.get(url, verify=False)



In [94]:
# parse the html page
soup = bs(html_page.content, 'html.parser')

# based on soup.text object create a list of lists
# split each row by \n
# split each column by ,

# create a list of lists
data = []
for row in soup.text.split('\n'):
    data.append(row.split(','))

#remove [, ] from all the elements in the list
data = [[element.replace('[', '').replace(']', '') for element in row] for row in data]

# remove " and empty strings from all the elements in the list
data = [[element.replace('"', '') for element in row if element != ''] for row in data]

# create a dataframe
df = pd.DataFrame(data[1:], columns=data[0])

# print the dataframe
print(df)

    P3_001N  P3_003N state county
0     13630     6621    53    001
1     17729    16099    53    003
2    152644   113123    53    005
3     61405    45417    53    007
4     64588    53896    53    009
5    385835   300790    53    011
6      3277     2896    53    013
7     85596    72143    53    015
8     32199    22276    53    017
9      5831     4352    53    019
10    66302    33500    53    021
11     1799     1661    53    023
12    70900    43781    53    025
13    60530    49109    53    027
14    71175    57361    53    029
15  1813470  1070548    53    033
16   219552   169927    53    035
17    36115    30252    53    037
18    64000    54326    53    041
19     8582     7782    53    043
20    32978    22925    53    047
21    19358    16338    53    049
22   709366   481554    53    053
23    15258    13439    53    055
24     9798     8531    53    059
25   639797   441622    53    061
26    36608    31669    53    065
27   231089   175784    53    067
28    49597   

In [95]:
# convert P3_001N and P3_003N to numeric
df['P3_001N'] = pd.to_numeric(df['P3_001N'])
df['P3_003N'] = pd.to_numeric(df['P3_003N'])

# rename P3_001N and P3_003N to Total_Population and Total_Population_White_Alone
df = df.rename(columns={'P3_001N': 'Total_Population', 'P3_003N': 'Total_Population_White_Alone'})

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39 entries, 0 to 38
Data columns (total 4 columns):
 #   Column                        Non-Null Count  Dtype 
---  ------                        --------------  ----- 
 0   Total_Population              39 non-null     int64 
 1   Total_Population_White_Alone  39 non-null     int64 
 2   state                         39 non-null     object
 3   county                        39 non-null     object
dtypes: int64(2), object(2)
memory usage: 1.3+ KB
None
       Total_Population  Total_Population_White_Alone
count      3.900000e+01                  3.900000e+01
mean       1.544792e+05                  1.074570e+05
std        3.183633e+05                  1.973257e+05
min        1.799000e+03                  1.661000e+03
25%        1.801250e+04                  1.558950e+04
50%        5.326500e+04                  3.743000e+04
75%        1.271770e+05                  8.867600e+04
max        1.813470e+06                  1.070548e+06


In [99]:
# show the dataframe info
print(df.info())

# show the dataframe describe and all the statistics
print(df.describe(include='all'))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39 entries, 0 to 38
Data columns (total 4 columns):
 #   Column                        Non-Null Count  Dtype 
---  ------                        --------------  ----- 
 0   Total_Population              39 non-null     int64 
 1   Total_Population_White_Alone  39 non-null     int64 
 2   state                         39 non-null     object
 3   county                        39 non-null     object
dtypes: int64(2), object(2)
memory usage: 1.3+ KB
None
        Total_Population  Total_Population_White_Alone state county
count       3.900000e+01                  3.900000e+01    39     39
unique               NaN                           NaN     1     39
top                  NaN                           NaN    53    001
freq                 NaN                           NaN    39      1
mean        1.544792e+05                  1.074570e+05   NaN    NaN
std         3.183633e+05                  1.973257e+05   NaN    NaN
min         1.79900

In [101]:
# combine state and county to create a new column called state_county

df['state_county'] = df['state'] + df['county']

# drop the state and county columns
df = df.drop(columns=['state', 'county'])

# show the dataframe info
print(df.head())

   Total_Population  Total_Population_White_Alone state_county
0             13630                          6621        53001
1             17729                         16099        53003
2            152644                        113123        53005
3             61405                         45417        53007
4             64588                         53896        53009


In [102]:
# save the dataframe to a csv file
# inside '"001. Data Bases" folder
df.to_csv('001. Data Bases/washington_census_data.csv', index=False)

https://en.wikipedia.org/wiki/List_of_counties_in_Washington