In [1]:
# Import libraries
import requests
from bs4 import BeautifulSoup
import unidecode as uni
import re
import pandas as pd

In [2]:
# Fetch and scrape 115th congress wiki
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() turns an HTTP error into an exception instead of letting an
# error page be parsed into a silently empty member list.
page = requests.get('https://en.wikipedia.org/wiki/115th_United_States_Congress', timeout=30)
page.raise_for_status()
soup = BeautifulSoup(page.content, 'html.parser')
# Only the first two tables with class "multicol" are scanned
# (presumably the Senate and House member lists -- confirm against the page layout).
results = soup.find_all('table', class_ = 'multicol')
# Capture (member name, party abbreviation) pairs from markup shaped like
# `title=...>Name</a> (P)`; unidecode transliterates non-ASCII characters first.
# The pattern is written as a raw string so the regex escapes are not parsed as
# (invalid) Python string escapes; the pattern text itself is unchanged.
congress115 = re.findall(r'title\=.*\>(\w.*)\<\/a\> \((\w*)\)', uni.unidecode(str(results[0:2])))

In [3]:
# Fetch and scrape 116th congress wiki
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() turns an HTTP error into an exception instead of letting an
# error page be parsed into a silently empty member list.
page = requests.get('https://en.wikipedia.org/wiki/116th_United_States_Congress', timeout=30)
page.raise_for_status()
# Decode leniently: bytes that are not valid UTF-8 are dropped rather than raising.
soup = BeautifulSoup(page.content.decode('utf-8', 'ignore'), 'html.parser')
# Only the first two tables with class "multicol" are scanned
# (presumably the Senate and House member lists -- confirm against the page layout).
results = soup.find_all('table', class_ = 'multicol')
# Capture (member name, party abbreviation) pairs from markup shaped like
# `title=...>Name</a> (P)`; unidecode transliterates non-ASCII characters first.
# The pattern is written as a raw string so the regex escapes are not parsed as
# (invalid) Python string escapes; the pattern text itself is unchanged.
congress116 = re.findall(r'title\=.*\>(\w.*)\<\/a\> \((\w*)\)', uni.unidecode(str(results[0:2])))

In [4]:
# Build one table of (full_name, party) rows from both scraped member lists,
# 115th congress entries first, then 116th.
congress = pd.DataFrame(
    [*congress115, *congress116],
    columns=['full_name', 'party'],
)

In [5]:
# Remove name suffixes
# The earlier pattern `(| Jr.| III)` began with an empty alternative. Alternatives
# are tried left to right, so the empty branch always matched first and no suffix
# was ever removed. The empty branch is dropped here, and the dot is escaped so it
# matches only a literal period.
regex = re.compile(r'( Jr\.| III)')
# `name` holds the suffix-free full names; `congress['full_name']` is left as scraped.
name = congress['full_name'].apply(lambda full_name: regex.sub('', full_name))

In [6]:
# Derive lowercase first/last names: split each cleaned name on single spaces and
# keep the first and the last piece (middle names and initials are discarded).
congress['first_name'] = [full.lower().split(' ')[0] for full in name]
congress['last_name'] = [full.lower().split(' ')[-1] for full in name]

In [7]:
# Normalise the 'DFL' party label to 'D'.
# Literal (non-regex) substring replacement, so any label containing 'DFL' is affected.
congress['party'] = congress['party'].str.replace('DFL', 'D', regex=False)

In [8]:
# Show every row whose party label is neither 'D' nor 'R'.
congress.loc[(congress['party'] != 'D') & (congress['party'] != 'R')]

Unnamed: 0,full_name,party,first_name,last_name
39,Angus King,I,angus,king
92,Bernie Sanders,I,bernie,sanders
557,Gregorio Sablan,I,gregorio,sablan
597,Angus King,I,angus,king
649,Bernie Sanders,ID,bernie,sanders
1103,Gregorio Sablan,I,gregorio,sablan


In [9]:
# Relabel every member whose party is not 'D' or 'R' as 'D'. In the rows displayed
# above these are only 'I'/'ID' members, who caucus with the Democrats.
# NOTE(review): this is a catch-all -- any other label scraped in the future would
# also become 'D'; confirm that is intended.
other_party = ~congress['party'].isin(['D', 'R'])
congress.loc[other_party, 'party'] = 'D'

In [10]:
# Collapse fully identical rows (keeping the first occurrence), then write the
# result to CSV without the index column.
congress = congress.drop_duplicates(keep='first')
congress.to_csv(path_or_buf="../data/congress-wikiscrape.csv", index=False)