In [7]:
# Import libraries
import re
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup as BS
from IPython.core.display import HTML

#### 1\. Start by scraping the data from the summary page for Tennessee's 2nd District, which is available at https://www.opensecrets.org/races/summary?cycle=2020&id=TN02&spec=N.
* The data that we want is contained in the "Total Raised and Spent" table.
* Make a DataFrame showing, for each candidate:
    * the candidate's name
    * the candidate's party
    * state
    * district number
    * whether the candidate was an incumbent
    * whether the candidate won the race
    * the total amount raised by that candidate (as a numeric variable)
    * the total amount spent by the candidate (as a numeric variable)

#### 2\. Once you have working code for Tennessee's 2nd District, expand on your code to capture all of Tennessee's districts.

#### 3\. Once you have working code for all of Tennessee's districts, expand on it to capture all states and districts. The number of representatives each state has can be found in a table on this page: https://www.britannica.com/topic/United-States-House-of-Representatives-Seats-by-State-1787120

In [2]:
# Function that checks webpage contents to make sure url is valid
def check_soup(year, state, district):
    """Report whether the OpenSecrets candidates page for a race looks valid.

    The page is fetched for the given cycle/state/district and is considered
    valid when its <title> contains the election year and the page holds at
    least one <table>.

    Parameters
    ----------
    year : str or int
        Election cycle, e.g. '2020'. Converted to str before the title check.
    state : str
        Two-letter state code used in the race id, e.g. 'TN'.
    district : str
        Suffix of the race id, e.g. '02' for a house district or 'S1' for a
        senate seat.

    Returns
    -------
    bool
        True when both validity conditions hold, False otherwise (including
        when the page has no <title> element at all).
    """
    URL = f'https://www.opensecrets.org/races/candidates?cycle={year}&id={state}{district}&spec=N'
    response = requests.get(URL)
    soup = BS(response.text)

    # A page without a <title> cannot mention the year, so it is not valid
    title = soup.find('title')
    if title is None:
        return False

    # URL is valid if webpage contains election year in title and at least one table
    return str(year) in title.text and len(soup.find_all('table')) > 0

In [3]:
# Helper that converts a column of dollar strings to integers
def _parse_dollar_amounts(amounts):
    """Turn a Series of strings such as '$1,234' into a list of ints.

    When the column cannot be parsed (it is not a string column, or an entry
    is not a whole number once '$' and ',' are removed) the original values
    are returned unchanged as a list.
    """
    try:
        return amounts.str.replace('[$,]', '', regex=True).astype(int).to_list()
    except (AttributeError, TypeError, ValueError):
        return amounts.to_list()


# Function that web scrapes election candidate info
def get_election_info(year, state, district, data=None):
    """Scrape every candidate of one race and add them as rows to `data`.

    Each <strong> element on the page is treated as one candidate, and the
    page's <table> elements are consumed three at a time per candidate, in
    document order: totals, contribution types, disclosure types.

    Parameters
    ----------
    year : str
        Election cycle, e.g. '2020'.
    state : str
        Two-letter state code used in the race id, e.g. 'TN'.
    district : str
        Suffix of the race id, e.g. '02' or 'S1'.
    data : pandas.DataFrame, optional
        Frame to extend. A new empty frame is used when omitted.

    Returns
    -------
    pandas.DataFrame
        `data` followed by one row per candidate. Existing columns keep their
        order; columns not yet present are added after them. `data` itself is
        returned when the page lists no candidates.

    Raises
    ------
    ValueError
        If a <strong> element does not look like 'Name (P)'.
    StopIteration
        If the page has fewer than three tables per <strong> element.
    """
    if data is None:
        data = pd.DataFrame()

    # Retrieve election webpage based on year, state, and district
    URL = f'https://www.opensecrets.org/races/candidates?cycle={year}&id={state}{district}&spec=N'
    response = requests.get(URL)
    soup = BS(response.text)

    # Store webpage elements containing candidate info and finances
    candidates = soup.find_all('strong')
    tables = iter(soup.find_all('table'))

    # Collect one record per candidate and build the frame once at the end
    records = []
    for candidate in candidates:

        # Parse name and party from candidate string
        text = candidate.text.replace('\n', '').replace('\t', '')
        match = re.match(r'(.+?)(?=\s\((\w)\))', text)
        if match is None:
            raise ValueError(
                f'Cannot parse candidate name and party from {text!r} '
                f'({year} {state}{district})'
            )
        name, party = match.group(1), match.group(2)

        # Parse incumbent and winner status from candidate string
        qualifiers = [qualifier.text for qualifier in candidate.find_all('i')]
        incumbent = 'Incumbent' in qualifiers
        winner = 'Winner' in qualifiers

        # Parse total raised and spent from table 1
        # (read_html is given a file-like object; literal HTML strings are deprecated)
        table_1 = pd.read_html(StringIO(str(next(tables))))[0]
        total_category = table_1[0].str.strip(':').to_list()
        total_amount = _parse_dollar_amounts(table_1[1])

        # Parse contribution types from table 2
        table_2 = pd.read_html(StringIO(str(next(tables))))[0]
        contribution_category = (
            table_2['Type of Contribution']
            .str.replace(r'[^A-Za-z\s\-]', '', regex=True)
            .str.strip()
            .to_list()
        )
        contribution_amounts = _parse_dollar_amounts(table_2['Amount'])

        # Parse disclosure types from table 3
        table_3 = pd.read_html(StringIO(str(next(tables))))[0]
        disclosure_category = table_3['Type of Disclosure'].to_list()
        disclosure_amount = _parse_dollar_amounts(table_3['Amount'])

        # Concatenate keys together as list
        keys = (['Name', 'Party', 'Year', 'State', 'District', 'Incumbent', 'Winner'] +
                total_category + contribution_category + disclosure_category)

        # Concatenate values together as list
        values = ([name, party, year, state, district, incumbent, winner] +
                  total_amount + contribution_amounts + disclosure_amount)

        records.append(dict(zip(keys, values)))

    # Nothing scraped: hand the input back untouched
    if not records:
        return data

    new_rows = pd.DataFrame(records)

    # With no existing rows, only the column order of `data` has to be kept;
    # skipping concat here avoids concatenating with an empty frame
    if len(data) == 0:
        ordered = list(data.columns) + [col for col in new_rows.columns
                                        if col not in data.columns]
        return new_rows.reindex(columns=ordered)

    # DataFrame.append was removed in pandas 2.0; concat is the replacement
    return pd.concat([data, new_rows], ignore_index=True)

In [4]:
# Election cycles to scrape: every even year from 2020 back to 2000, as strings
years = [str(cycle) for cycle in range(2020, 1998, -2)]

# Two-letter codes used in the race ids (the 50 states plus DC)
states = (
    'AL AK AZ AR CA CO CT DC DE FL GA '
    'HI ID IL IN IA KS KY LA ME MD '
    'MA MI MN MS MO MT NE NV NH NJ '
    'NM NY NC ND OH OK OR PA RI SC '
    'SD TN TX UT VT VA WA WV WI WY'
).split()

# Zero-padded district numbers '01' through '53'
districts = [f'{num:02d}' for num in range(1, 54)]

# Output columns, grouped by where the value comes from
column_names = (
    # candidate and race identification
    ['Name', 'Party', 'Year', 'State', 'District', 'Incumbent', 'Winner']
    # totals
    + ['Raised', 'Spent', 'Cash on Hand']
    # contribution types
    + ['Individual Contributions',
       'Small Individual Contributions',
       'Large Individual Contributions',
       'PAC Contributions',
       'Candidate self-financing',
       'Other']
    # disclosure types
    + ['Full Disclosure', 'Incomplete Disclosure', 'No Disclosure']
)

# Empty frame that fixes the column order before any rows are scraped
data = pd.DataFrame(columns=column_names)

In [5]:
# Walk every election cycle and, within it, every state
for year in years:
    for state in states:

        # Senate seats first: scrape each of the two seats that has a valid page
        for seat in ('S1', 'S2'):
            if check_soup(year, state, seat):
                data = get_election_info(year, state, seat, data)

        # House districts next, in ascending order; the first district
        # without a valid page ends the scan for this state
        for district in districts:
            if check_soup(year, state, district):
                data = get_election_info(year, state, district, data)
            else:
                break

In [6]:
# Save the scraped rows as CSV, leaving out the DataFrame index
data.to_csv(path_or_buf='../data/election_finances.csv', index=False)