In [1]:
from splinter import Browser
from bs4 import BeautifulSoup
import pandas as pd

## Objective: get information from wikipedia table
### https://simple.wikipedia.org/wiki/List_of_U.S._states_by_population
    
Table contains 11 cols:
    - Rank in the fifty states, 2018
    - Rank in states & territories
    - Name
    - Population estimate, July 1, 2019
    - Census population, April 1, 2010
    - Percent increase from 2010-2019
    - Total seats in the house of rep 2013-2023
    - Estimated population per electoral vote, 2018
    - Estimated population per House seat, 2018
    - Census population per House seat, 2010
    - Percent of the total U.S. population, 2018
    
    

##### Definition for constants

In [2]:
NCOL = 11  # number of table colums. Check during table inspection
NLINE = 60 # number of table lines. Check during table inspection

### Scrape wiki page
Look page: https://stackoverflow.com/questions/2010481/how-do-you-get-all-the-rows-from-a-particular-table-using-beautifulsoup

In [3]:
executable_path = {'executable_path': 'chromedriver.exe'}
browser = Browser('chrome', **executable_path, headless=False)
url = 'https://simple.wikipedia.org/wiki/List_of_U.S._states_by_population'
browser.visit(url)

In [4]:
html = browser.html
soup = BeautifulSoup(html, 'html.parser')
print(soup.prettify())

<html class="client-js" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   List of U.S. states by population - Simple English Wikipedia, the free encyclopedia
  </title>
  <script>
   document.documentElement.className=document.documentElement.className.replace(/(^|\s)client-nojs(\s|$)/,"$1client-js$2");RLCONF={"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"List_of_U.S._states_by_population","wgTitle":"List of U.S. states by population","wgCurRevisionId":6644038,"wgRevisionId":6644038,"wgArticleId":48729,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["CS1 maint: Archived copy as title","Lists about U.S. states"],"wgBreakFrames":!1,"wgPageContentLanguage":"en","wgPageContentModel":"wikitext","wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","Au

#### find the table

In [5]:
soup = BeautifulSoup(html)
my_table = soup.find('table',class_='wikitable')
my_table

<table class="wikitable sortable" style="width:100%; text-align:center;">
<tbody><tr style="vertical-align: top;">
<th style="vertical-align: middle">Rank in the fifty states, 2018
</th>
<th style="vertical-align: middle">Rank in states &amp; territories
</th>
<th style="vertical-align: middle">Name
</th>
<th style="vertical-align: middle">Population estimate, July 1, 2019<br/><sup class="reference" id="cite_ref-4"><a href="#cite_note-4">[4]</a></sup>
</th>
<th style="vertical-align: middle">Census population, April 1, 2010<br/><sup class="reference" id="cite_ref-5"><a href="#cite_note-5">[5]</a></sup>
</th>
<th>Percent increase from 2010-2019<br/><sup class="reference" id="cite_ref-6"><a href="#cite_note-6">[note 1]</a></sup>
</th>
<th style="vertical-align: middle">Total seats in the <a class="mw-redirect" href="/wiki/U.S._House_of_Representatives" title="U.S. House of Representatives">U.S. House of Representatives</a>, 2013–2023
</th>
<th style="vertical-align: middle">Estimated pop

In [6]:
rows = my_table.findChildren(['th', 'tr'])
rows

[<tr style="vertical-align: top;">
 <th style="vertical-align: middle">Rank in the fifty states, 2018
 </th>
 <th style="vertical-align: middle">Rank in states &amp; territories
 </th>
 <th style="vertical-align: middle">Name
 </th>
 <th style="vertical-align: middle">Population estimate, July 1, 2019<br/><sup class="reference" id="cite_ref-4"><a href="#cite_note-4">[4]</a></sup>
 </th>
 <th style="vertical-align: middle">Census population, April 1, 2010<br/><sup class="reference" id="cite_ref-5"><a href="#cite_note-5">[5]</a></sup>
 </th>
 <th>Percent increase from 2010-2019<br/><sup class="reference" id="cite_ref-6"><a href="#cite_note-6">[note 1]</a></sup>
 </th>
 <th style="vertical-align: middle">Total seats in the <a class="mw-redirect" href="/wiki/U.S._House_of_Representatives" title="U.S. House of Representatives">U.S. House of Representatives</a>, 2013–2023
 </th>
 <th style="vertical-align: middle">Estimated population per electoral vote, 2018<br/><sup class="reference" id="c

#### Get values of the table

In [7]:
values =[]   # element in the table
for row in rows:
    cells = row.findChildren('td')
    for cell in cells:
        values.append(cell.text)

values

['7000100000000000000♠1\n',
 '7000100000000000000♠1\n',
 '\xa0California\n',
 '39,865,590\n',
 '37,254,523\n',
 '6.98%\n',
 '7001530000000000000♠53\n',
 '719,219\n',
 '746,359\n',
 '702,885\n',
 '12.01%\n',
 '7000200000000000000♠2\n',
 '7000200000000000000♠2\n',
 '\xa0Texas\n',
 '29,206,997\n',
 '25,145,561\n',
 '15.86%\n',
 '7001360000000000000♠36\n',
 '755,312\n',
 '797,273\n',
 '698,503\n',
 '8.80%\n',
 '7000300000000000000♠3\n',
 '7000300000000000000♠3\n',
 '\xa0Florida\n',
 '21,299,325\n',
 '18,801,310\n',
 '13.3%\n',
 '7001270000000000000♠27\n',
 '734,459\n',
 '788,864\n',
 '696,468\n',
 '6.44%\n',
 '7000400000000000000♠4\n',
 '7000400000000000000♠4\n',
 '\xa0New York\n',
 '19,542,209\n',
 '19,378,102\n',
 '0.8%\n',
 '7001270000000000000♠27\n',
 '673,869\n',
 '723,786\n',
 '717,707\n',
 '5.91%\n',
 '7000500000000000000♠5\n',
 '7000500000000000000♠5\n',
 '\xa0Pennsylvania\n',
 '12,807,060\n',
 '12,702,379\n',
 '0.8%\n',
 '7001180000000000000♠18\n',
 '640,353\n',
 '711,503\n',
 '70

In [8]:
# Compute number of table lines.
nline_read = len(values)/NCOL

if (nline_read == NLINE):
    print(f"Recovered all {nline_read} lines. \nPlease continue to next step")
else:  
    print(f"Number of recovered line is not correct.\nPlease check your data and code.")

Recovered all 60.0 lines. 
Please continue to next step


In [9]:
rank_state=[]
rank_all = []
name = []
est_pop_2019=[]
census_pop_2010 = []
perc_increase_2010_2019 =[]
seat_rep_2013=[]
est_pop_elctoral_2018=[]
est_pop_house_2018=[]
census_pop_house_2010 =[]
perc_pop_US_2018=[]


for i in range(int(nline_read)):
    rank_state.append(values[i*NCOL])
    rank_all.append(values[i*NCOL + 1])
    name.append(values[i*NCOL + 2])
    est_pop_2019.append(values[i*NCOL + 3])
    census_pop_2010.append(values[i*NCOL + 4])
    perc_increase_2010_2019.append(values[i*NCOL + 5])
    seat_rep_2013.append(values[i*NCOL + 6])
    est_pop_elctoral_2018.append(values[i*NCOL + 7])
    est_pop_house_2018.append(values[i*NCOL + 8])
    census_pop_house_2010.append(values[i*NCOL + 9]) 
    perc_pop_US_2018.append(values[i*NCOL + 10])

In [10]:
# check reading and allocation of value

for i in range(5):
    print(f"{rank_state[i]},{rank_all[i]}")

7000100000000000000♠1
,7000100000000000000♠1

7000200000000000000♠2
,7000200000000000000♠2

7000300000000000000♠3
,7000300000000000000♠3

7000400000000000000♠4
,7000400000000000000♠4

7000500000000000000♠5
,7000500000000000000♠5



In [11]:
for i in range(5):
    print(f"{name[i]}")

 California

 Texas

 Florida

 New York

 Pennsylvania



In [12]:
for i in range(5):
    print(f"{est_pop_2019[i]} ,{census_pop_2010[i]}, {perc_increase_2010_2019[i]} ")

39,865,590
 ,37,254,523
, 6.98%
 
29,206,997
 ,25,145,561
, 15.86%
 
21,299,325
 ,18,801,310
, 13.3%
 
19,542,209
 ,19,378,102
, 0.8%
 
12,807,060
 ,12,702,379
, 0.8%
 


In [13]:
for i in range(5):
    print(f"{seat_rep_2013[i]} ,{est_pop_elctoral_2018[i]}, {est_pop_house_2018[i]} ,{census_pop_house_2010[i]}")

7001530000000000000♠53
 ,719,219
, 746,359
 ,702,885

7001360000000000000♠36
 ,755,312
, 797,273
 ,698,503

7001270000000000000♠27
 ,734,459
, 788,864
 ,696,468

7001270000000000000♠27
 ,673,869
, 723,786
 ,717,707

7001180000000000000♠18
 ,640,353
, 711,503
 ,705,715



In [14]:
for i in range(5):
    print(f"{perc_pop_US_2018[i]}")

12.01%

8.80%

6.44%

5.91%

3.87%



### Before saving, reformat of data is required

In [15]:
## State name
state =[]
for state_name in name:
    state.append(state_name[1:len(state_name)-1])

for i in range(5):
    print(f"{state[i]}")

California
Texas
Florida
New York
Pennsylvania


In [16]:
# Ranking of population per state
rank_state_formatted = []
rank_all_formatted = []

count =0
for rank in rank_state:
    if (count < 9): 
        rank_state_formatted.append(rank[len(rank)-2:len(rank)-1])
        rank_all_formatted.append(rank[len(rank)-2:len(rank)-1])
        count=count+1
    else:
        rank_state_formatted.append(rank[len(rank)-3:len(rank)-1]) 
        rank_all_formatted.append(rank[len(rank)-3:len(rank)-1])   

print(rank_state_formatted[0:5])


for rank in rank_all:
    if (count < 9): 
        rank_all_formatted.append(rank[len(rank)-2:len(rank)-1])
        count=count+1
    else:
        rank_all_formatted.append(rank[len(rank)-3:len(rank)-1])   

print(rank_all_formatted[0:55])

['1', '2', '3', '4', '5']
['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', ' —', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', '41', '42', '43', '44', '45', '46', '47', '48', '', '49', '50', '', '', '']


In [17]:
# percentatage of total US population

perc_pop_US_2018_format=[]

for perc in perc_pop_US_2018:
    perc_pop_US_2018_format.append(float(perc[0:len(perc)-2]))

print(perc_pop_US_2018_format[0:5])

[12.01, 8.8, 6.44, 5.91, 3.87]


In [18]:
# percentatage of total US population

est_pop_2019_format=[]

for pop in est_pop_2019:
    tmp = pop[0:len(pop)-1]
    est_pop_2019_format.append((tmp))

print(est_pop_2019_format[0:5])

['39,865,590', '29,206,997', '21,299,325', '19,542,209', '12,807,060']


## Save the table in dataframe (only elements we need)

In [21]:
table_df = pd.DataFrame ()

table_df['state']= state
table_df['rank_state']= rank_state_formatted
table_df['estimated_pop_2019']= est_pop_2019_format
table_df['perc_pop_2018']= (perc_pop_US_2018_format)
table_df.head()

Unnamed: 0,state,rank_state,estimated_pop_2019,perc_pop_2018
0,California,1,39865590,12.01
1,Texas,2,29206997,8.8
2,Florida,3,21299325,6.44
3,New York,4,19542209,5.91
4,Pennsylvania,5,12807060,3.87


In [22]:
table_df['estimated_pop_2019']=table_df['estimated_pop_2019'].str.replace(',','')
table_df

Unnamed: 0,state,rank_state,estimated_pop_2019,perc_pop_2018
0,California,1,39865590,12.01
1,Texas,2,29206997,8.8
2,Florida,3,21299325,6.44
3,New York,4,19542209,5.91
4,Pennsylvania,5,12807060,3.87
5,Illinois,6,12741080,3.85
6,Ohio,7,11689442,3.53
7,Georgia,8,10519475,3.18
8,North Carolina,9,10383620,3.14
9,Michigan,10,9995915,3.02


In [23]:
puerto_rico_idx=table_df[table_df['state'] =='Puerto Rico'].index
table_df.drop(puerto_rico_idx,inplace=True)
table_df

Unnamed: 0,state,rank_state,estimated_pop_2019,perc_pop_2018
0,California,1.0,39865590,12.01
1,Texas,2.0,29206997,8.8
2,Florida,3.0,21299325,6.44
3,New York,4.0,19542209,5.91
4,Pennsylvania,5.0,12807060,3.87
5,Illinois,6.0,12741080,3.85
6,Ohio,7.0,11689442,3.53
7,Georgia,8.0,10519475,3.18
8,North Carolina,9.0,10383620,3.14
9,Michigan,10.0,9995915,3.02


In [24]:
rank_index = table_df[table_df['rank_state'] ==''].index
table_df.drop(rank_index,inplace=True)

table_df

Unnamed: 0,state,rank_state,estimated_pop_2019,perc_pop_2018
0,California,1,39865590,12.01
1,Texas,2,29206997,8.8
2,Florida,3,21299325,6.44
3,New York,4,19542209,5.91
4,Pennsylvania,5,12807060,3.87
5,Illinois,6,12741080,3.85
6,Ohio,7,11689442,3.53
7,Georgia,8,10519475,3.18
8,North Carolina,9,10383620,3.14
9,Michigan,10,9995915,3.02


In [25]:
table_df.to_csv('US_population_2018.csv')