In [5]:
import pandas as pd
import numpy as np

[Scraping a Wikipedia table with Beautiful Soup](https://scipython.com/blog/scraping-a-wikipedia-table-with-beautiful-soup/)

## Governor Affiliation

[Wiki link](https://en.wikipedia.org/wiki/List_of_United_States_governors)

In [564]:
import urllib.request

# Download the Wikipedia list of U.S. governors and cache the HTML on disk.
url = "https://en.wikipedia.org/wiki/List_of_United_States_governors"
# The context manager closes the HTTP response even if reading fails.
with urllib.request.urlopen(url) as req:
    # Explicit codec; UTF-8 is what bytes.decode() uses by default.
    article = req.read().decode('utf-8')

# Write with an explicit encoding: the platform default (e.g. cp1252 on
# Windows) may not be able to represent every character in the page.
with open('usa_govs.html', 'w', encoding='utf-8') as fo:
    fo.write(article)

In [565]:
# Expected header labels of the governors table; they also name the columns
# of the (initially empty) frame that receives the scraped rows.
columns = [
    'State',
    'Portrait',
    'Governor',
    'Party',
    'Born',
    'Prior public experience',
    'Inauguration',
    'End of term',
    'Past governors',
]
wiki_gov_scrape = pd.DataFrame(columns=columns)

In [566]:
from bs4 import BeautifulSoup

# Load the cached governors page and turn it into soup.
# The context manager closes the file handle (a bare open().read() leaves it
# open); the explicit encoding keeps decoding independent of the platform locale.
with open('usa_govs.html', encoding='utf-8') as cached_page:
    article = cached_page.read()
soup = BeautifulSoup(article, 'html.parser')
tables = soup.find_all('table', class_='sortable')

# The governors table is the first sortable table on the page, so it is taken
# by position. A previous header search compared the scraped <th> text with
# `columns`, reportedly never matched, and its result was never used.
governor_table = tables[0]

# Each data row is read as ten cells: the nine header labels plus one trailing
# cell kept under 'test'. The labels are spelled out here so the cell does not
# depend on whichever `columns` list happens to be live in the kernel.
row_labels = ['State', 'Portrait', 'Governor', 'Party', 'Born',
              'Prior public experience', 'Inauguration', 'End of term',
              'Past governors', 'test']

governor_rows = []
for tr in governor_table.find_all('tr'):
    tds = tr.find_all('td')
    if not tds:
        continue  # rows without <td> cells (e.g. header rows) carry no data
    cells = [td.text.strip() for td in tds[:10]]
    if len(cells) != len(row_labels):
        # Fail loudly on a malformed row instead of misaligning columns.
        raise ValueError(
            f'expected {len(row_labels)} cells in governor row, got {len(cells)}: {cells!r}'
        )
    governor_rows.append(dict(zip(row_labels, cells)))

# Build the frame once from all records: DataFrame.append was removed in
# pandas 2.0 and re-copies the frame on every row. Rebuilding from scratch also
# makes the cell safe to re-run without duplicating rows.
wiki_gov_scrape = pd.DataFrame(governor_rows, columns=row_labels)

In [567]:
# Remove three scraped columns from the frame, in place and in this order.
for unwanted in ('Portrait', 'Party', 'test'):
    del wiki_gov_scrape[unwanted]

In [568]:
# Rename the seven remaining columns by position, then derive typed features.
# NOTE(review): the labels are positional, so the column scraped under 'Born'
# becomes 'party' and 'Prior public experience' becomes 'born' -- presumably the
# scraped cells sit one column to the right of their header labels; confirm.
wiki_gov_scrape.columns = ['state','governor','party','born','prior_exp','inauguration','end_term']
wiki_gov_scrape = wiki_gov_scrape.reset_index(drop = True)

# Raw strings stop \d and \s being read as (invalid) string escapes;
# expand=False makes extract() return a Series for the single capture group.
wiki_gov_scrape['birth_date'] = pd.to_datetime(
    wiki_gov_scrape['born'].str.extract(r'(\d{4}-\d{1,2}-\d{1,2})', expand=False)
)
wiki_gov_scrape['term_limit'] = wiki_gov_scrape['end_term'].str.contains('term limits')
# regex=False matches the literal text "U.S."; as a regex each '.' would match
# any character and could flag unrelated text.
wiki_gov_scrape['fed_exp'] = wiki_gov_scrape['prior_exp'].str.contains('U.S.', regex=False)
# Two-digit age captured as text; it is cast to int in a later cell.
wiki_gov_scrape['age'] = wiki_gov_scrape['born'].str.extract(r'age\s(\d\d)', expand=False)
wiki_gov_scrape['inauguration'] = pd.to_datetime(wiki_gov_scrape['inauguration'])
# NOTE: dividing a Timedelta by 365.25 yields another Timedelta, so the value
# displays as e.g. "72 days 11:37:49" where the day count is the age in years.
# Left unchanged because the same representation is used for the extra rows
# added later in the notebook.
wiki_gov_scrape['age_of_inaug'] = (wiki_gov_scrape['inauguration'] - wiki_gov_scrape['birth_date'])/365.25
wiki_gov_scrape['retiring'] = wiki_gov_scrape['end_term'].str.contains('retiring')

In [569]:
# Reduce end_term to the bare year text.
# regex=False makes the annotations literal matches on every pandas version:
# the default of str.replace changed from regex=True to regex=False in pandas
# 2.0, so the previous escaped pattern '\(term limits\)' would stop matching.
# The final step strips every kind of whitespace (including non-breaking
# spaces) in one pass.
wiki_gov_scrape['end_term'] = (
    wiki_gov_scrape['end_term']
    .str.replace('(term limits)', '', regex=False)
    .str.replace('(retiring)', '', regex=False)
    .str.replace(r'\s+', '', regex=True)
)

# Manual corrections addressed by row label.
# NOTE(review): 31, 47 and 22 depend on the current row order of the scraped
# table; they will silently hit other rows if that order changes -- consider
# selecting by state name once the intended states are confirmed.
wiki_gov_scrape.loc[31,'end_term'] = 2022 # term limits take effect in state
wiki_gov_scrape.loc[47,'party'] = 'Republican' # identifies as Republican at national level
wiki_gov_scrape.loc[22,'party'] = 'Democratic' # part of Democratic party, but Democratic-Farmer-Labor name at state level

In [570]:
# Remove the 'born' column from the frame in place.
wiki_gov_scrape.drop(columns=['born'], inplace=True)

In [571]:
# Summarise the frame's column dtypes and non-null counts.
wiki_gov_scrape.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype          
---  ------        --------------  -----          
 0   state         50 non-null     object         
 1   governor      50 non-null     object         
 2   party         50 non-null     object         
 3   prior_exp     50 non-null     object         
 4   inauguration  50 non-null     datetime64[ns] 
 5   end_term      50 non-null     object         
 6   birth_date    50 non-null     datetime64[ns] 
 7   term_limit    50 non-null     bool           
 8   fed_exp       50 non-null     bool           
 9   age           50 non-null     object         
 10  age_of_inaug  50 non-null     timedelta64[ns]
 11  retiring      50 non-null     bool           
dtypes: bool(3), datetime64[ns](2), object(6), timedelta64[ns](1)
memory usage: 3.8+ KB


In [572]:
# Cast the year and age columns (still text at this point) to integers.
wiki_gov_scrape = wiki_gov_scrape.astype(
    {column_name: int for column_name in ('end_term', 'age')}
)

In [573]:
# Preview the first rows of the governors frame.
wiki_gov_scrape.head()

Unnamed: 0,state,governor,party,prior_exp,inauguration,end_term,birth_date,term_limit,fed_exp,age,age_of_inaug,retiring
0,Alabama,Kay Ivey,Republican,"Lieutenant Governor, Treasurer",2017-04-10,2023,1944-10-15,False,False,75,72 days 11:37:49.404517,False
1,Alaska,Mike Dunleavy,Republican,Alaska Senate,2018-12-03,2022,1961-05-05,False,False,59,57 days 13:54:49.527720,False
2,Arizona,Doug Ducey,Republican,Treasurer,2015-01-05,2023,1964-04-09,True,False,56,50 days 17:46:26.858316,False
3,Arkansas,Asa Hutchinson,Republican,Under Secretary of Homeland Security for Borde...,2015-01-13,2023,1950-12-03,True,True,69,64 days 02:41:38.562628,False
4,California,Gavin Newsom,Democratic,"Lieutenant Governor, Mayor of San Francisco",2019-01-07,2023,1967-10-10,False,False,52,51 days 05:51:52.114989,False


In [574]:
# Compare the state names in the governors frame with those in `statepop`.
# NOTE(review): `list_diff` and `statepop` are defined outside this section;
# judging by the output, list_diff returns a dict mapping each unmatched name
# to the list it is missing from -- confirm against its definition.
list_diff(wiki_gov_scrape['state'].unique(),statepop['state'].unique())

{'District of Columbia': 'missing from list1',
 'Puerto Rico': 'missing from list1'}

In [575]:
# Hand-entered rows for the two jurisdictions reported as missing from the
# scraped governors frame, one record per jurisdiction.
district_records = [
    {
        'state': 'District of Columbia',
        'governor': 'Muriel Bowser',
        'party': 'Democratic',
        'prior_exp': 'Council of the District of Columbia',
        'inauguration': pd.to_datetime('2015-Jan-2'),
        'end_term': 2023,
        'birth_date': pd.to_datetime('1972-Aug-2'),
        'term_limit': False,
        'fed_exp': False,
        'age': 48,
        'retiring': False,
    },
    {
        'state': 'Puerto Rico',
        'governor': 'Wanda Vázquez Garced',
        'party': 'Republican',
        'prior_exp': 'Secretary of Justice',
        'inauguration': pd.to_datetime('2019-Aug-7'),
        'end_term': 2021,
        'birth_date': pd.to_datetime('1960-July-9'),
        'term_limit': False,
        'fed_exp': False,
        'age': 60,
        'retiring': False,
    },
]
districts = pd.DataFrame(district_records)

In [576]:
# Time between birth and inauguration, scaled by 365.25.
# NOTE: the result is still a Timedelta, so its displayed "days" figure is the
# age in years.
span_to_inauguration = districts['inauguration'] - districts['birth_date']
districts['age_of_inaug'] = span_to_inauguration / 365.25

In [577]:
# Add the hand-entered rows to the scraped governors.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# is the drop-in equivalent. Row labels are kept as they are, so the added rows
# retain the labels they have in `districts`.
wiki_gov_scrape = pd.concat([wiki_gov_scrape, districts])

In [578]:
# Repeat the state-name comparison against `statepop`; the empty dict shown
# below indicates no remaining mismatches.
list_diff(wiki_gov_scrape['state'].unique(),statepop['state'].unique())

{}

## State Legislature Makeup

[Wiki link](https://en.wikipedia.org/wiki/List_of_United_States_state_legislatures)

In [37]:
import urllib.request

# Download the Wikipedia list of U.S. state legislatures into memory.
url = "https://en.wikipedia.org/wiki/List_of_United_States_state_legislatures"
# The context manager closes the HTTP response even if reading fails.
with urllib.request.urlopen(url) as req:
    # Explicit codec; UTF-8 is what bytes.decode() uses by default.
    article = req.read().decode('utf-8')

# Caching the page to disk is disabled; `article` is parsed from memory.
#with open('usa_legis.html', 'w') as fo:
    #fo.write(article)

In [38]:
# Column names for the legislature scrape: three general fields followed by
# name / party strength / term length for the lower and then the upper house.
columns = [
    'state',
    'executive',
    'legislature_name',
    'low_house_name',
    'low_party_strength',
    'low_term_yrs',
    'up_house_name',
    'up_party_strength',
    'up_term_yrs',
]
wiki_leg_scrape = pd.DataFrame(columns=columns)

In [39]:
from bs4 import BeautifulSoup

# Turn the downloaded legislature page (held in `article`) into soup and get
# the sortable <table>s.
soup = BeautifulSoup(article, 'html.parser')
tables = soup.find_all('table', class_='sortable')

# The table of interest is taken by position: the first sortable table.
# A previous header search compared the scraped <th> text with `columns`,
# reportedly never matched, and its result was never used.
legislature_table = tables[0]

# Labels for the first nine cells of each data row, spelled out here so the
# cell does not depend on whichever `columns` list is live in the kernel.
row_labels = ['state', 'executive', 'legislature_name',
              'low_house_name', 'low_party_strength', 'low_term_yrs',
              'up_house_name', 'up_party_strength', 'up_term_yrs']

legislature_rows = []
for tr in legislature_table.find_all('tr'):
    tds = tr.find_all('td')
    if not tds:
        continue  # rows without <td> cells (e.g. header rows) carry no data
    cells = [td.text.strip() for td in tds[:9]]
    if len(cells) != len(row_labels):
        # Fail loudly on a malformed row instead of misaligning columns.
        raise ValueError(
            f'expected {len(row_labels)} cells in legislature row, got {len(cells)}: {cells!r}'
        )
    legislature_rows.append(dict(zip(row_labels, cells)))

# Build the frame once from all records: DataFrame.append was removed in
# pandas 2.0 and re-copies the frame on every row. Rebuilding from scratch also
# makes the cell safe to re-run without duplicating rows.
wiki_leg_scrape = pd.DataFrame(legislature_rows, columns=row_labels)

In [40]:
# Renumber the rows 0..n-1, discarding the previous index.
wiki_leg_scrape = wiki_leg_scrape.reset_index(drop=True)

In [41]:
import re

In [98]:
def party_finder(cell, crit = 'maj_party'):
    '''
    Extracts party information from a string in the format of "R 15-2"

    input
    -----
    cell: str
        String in the format of "R 15-2", with the first letter and number representing majority party.
        Any single character may separate the two numbers (hyphen or en dash).
    crit: str
        Statistic to return. Can be one of "maj_party","min_party","maj_party_num","min_party_num"

    returns
    -------
    str
        The requested statistic; seat counts are returned as digit strings.
        An empty string is returned when `cell` is not a string (e.g. NaN) or
        does not contain the expected pattern.

    raises
    ------
    ValueError
        If `crit` is not one of the four supported names.
    '''
    valid_crit = ('maj_party', 'min_party', 'maj_party_num', 'min_party_num')
    if crit not in valid_crit:
        # A misspelled statistic used to yield a silent None.
        raise ValueError(f'crit must be one of {valid_crit}, got {crit!r}')

    # Missing values reach .apply() as float NaN, which re.search rejects.
    if not isinstance(cell, str):
        return ''

    # regex creates 3 groups. See: https://regexr.com/5aibj
    # Raw string so \s and \d are regex escapes, not (invalid) string escapes.
    stat = re.search(r'([A-Z])\s(\d{1,}).(\d{1,})', cell)
    if stat is None:
        return ''

    maj_party, maj_party_num, min_party_num = stat.groups()
    if crit == 'maj_party':
        # first group is the single capital letter before the seat counts
        return maj_party
    if crit == 'maj_party_num':
        # second group is the majority seat count
        return maj_party_num
    if crit == 'min_party_num':
        # third group is the minority seat count
        return min_party_num
    # crit == 'min_party': the text only names the majority, so the minority is
    # derived from it; any majority other than R or D maps to 'I'.
    return {'R': 'D', 'D': 'R'}.get(maj_party, 'I')

In [99]:
# Derive majority/minority party and seat-count columns for both chambers from
# their party-strength text (lower house first, then upper, per statistic).
party_stats = ['maj_party','maj_party_num','min_party','min_party_num']
for stat_name in party_stats:
    for chamber in ('low', 'up'):
        strength_text = wiki_leg_scrape[f'{chamber}_party_strength']
        wiki_leg_scrape[f'{chamber}_{stat_name}'] = strength_text.apply(party_finder, crit=stat_name)

In [95]:
def vac_finder(cell):
    '''
    Finds the number of vacancies in the state legislature

    input
    -----
    cell: str
        Party-strength text such as "R 75–28, 1 vac"

    returns
    -------
    str or float
        The digits immediately preceding "vac" (as a string), or np.nan when
        `cell` is not a string or mentions no vacancies.
    '''
    # Missing values reach .apply() as float NaN, which re.search rejects.
    if not isinstance(cell, str):
        return np.nan
    # Raw string so \d and \s are regex escapes, not (invalid) string escapes.
    vac_search = re.search(r'(\d{1,})\svac', cell)
    if vac_search is None:
        return np.nan
    return vac_search.group(1)

In [100]:
# Extract the vacancy count for each chamber (upper first, then lower) from
# its party-strength text.
for chamber in ('up', 'low'):
    strength_text = wiki_leg_scrape[f'{chamber}_party_strength']
    wiki_leg_scrape[f'{chamber}_vacancies'] = strength_text.apply(vac_finder)

In [101]:
# Display the legislature frame, including the derived party and vacancy columns.
wiki_leg_scrape

Unnamed: 0,state,executive,legislature_name,low_house_name,low_party_strength,low_term_yrs,up_house_name,up_party_strength,up_term_yrs,low_maj_party,low_maj_party_num,low_min_party,low_min_party_num,up_maj_party,up_maj_party_num,up_min_party,up_min_party_num,low_vacancies,up_vacancies
0,Alabama,Governor,Legislature,House of Representatives,"R 75–28, 1 vac",4.0,State Senate,R 27–8,4,R,75.0,D,28.0,R,27,D,8,1.0,
1,Alaska,Governor,Legislature,House of Representatives,"Coalition 23 (15D, 6R, 2 ind)–17R",2.0,Senate,R 13–7,4,,,,,R,13,D,7,,
2,Arizona,Governor,State Legislature,House of Representatives,R 31-29,2.0,Senate,R 17–13,2,R,31.0,D,29.0,R,17,D,13,,
3,Arkansas,Governor,General Assembly,House of Representatives,"R 76–25, 2 vac",2.0,Senate,R 26–9,4,R,76.0,D,25.0,R,26,D,9,2.0,
4,California,Governor,State Legislature[nb 1],State Assembly,"D 61–17, 1 ind, 1 vac",2.0,State Senate,D 29–11,4,D,61.0,R,17.0,D,29,R,11,1.0,
5,Colorado,Governor,General Assembly,House of Representatives,D 41–24,2.0,Senate,D 19–16,4,D,41.0,R,24.0,D,19,R,16,,
6,Connecticut,Governor,General Assembly,House of Representatives,D 91–60,2.0,Senate,D 22–14,2,D,91.0,R,60.0,D,22,R,14,,
7,Delaware,Governor,General Assembly,House of Representatives,D 26–15,2.0,Senate,D 12–9,4,D,26.0,R,15.0,D,12,R,9,,
8,Florida,Governor,Legislature,House of Representatives,"R 72–46, 2 vac",2.0,Senate,R 23–17,4,R,72.0,D,46.0,R,23,D,17,2.0,
9,Georgia,Governor,General Assembly,House of Representatives,R 105–75,2.0,State Senate,R 35–21,2,R,105.0,D,75.0,R,35,D,21,,
