## Scrape State Leg election History 

In [2]:
import pandas as pd
import requests 
import re
import numpy as np
from bs4 import BeautifulSoup
import time # To put the system to sleep
import random # for random numbers
import json
import pandasql as psql
import itertools

In [3]:
states = [
'Alabama','Alaska','Arizona','Arkansas','California','Colorado','Connecticut','Delaware',
'Florida','Georgia','Hawaii','Idaho','Illinois','Indiana','Iowa','Kansas','Kentucky',
'Louisiana','Maine','Maryland','Massachusetts','Michigan','Minnesota','Mississippi',
'Missouri','Montana','Nebraska','Nevada','New_Hampshire','New_Jersey','New_Mexico',
'New_York','North_Carolina','North_Dakota','Ohio','Oklahoma','Oregon','Pennsylvania',
'Rhode_Island','South_Carolina','South_Dakota','Tennessee','Texas','Utah','Vermont',
'Virginia','Washington','West_Virginia','Wisconsin','Wyoming']

#urls = ['https://ballotpedia.org/{}_State_Legislature'.format(state) for state in states]

In [4]:
def ballotpedia_state_senate_control_scraper(state):
    '''
    for a given state, scrapes the senate election results tables 
    '''
    url = 'https://ballotpedia.org/{}_State_Legislature'.format(state)
    page = requests.get(url)
    
    if page.status_code == 200:
        soup = BeautifulSoup(page.content, 'html.parser')
        senate_table = soup.find('table', {'class': 'wikitable'}).find_next('table')
        senate_results = []
        for row in senate_table.find_all('tr')[1:]:  # skip the header row
            cols = row.find_all('td')
            if cols:
                senate_results.append([col.text.strip() for col in cols])
        
        senate_rows = senate_table.find_all('tr')

        # Extract headers for Senate
        senate_headers = [re.sub(r'[^\w\s\d]', '',header.text.strip())[:2] for header in senate_rows[0].find_all('th')]

        list_1 =[]
        
        for col in senate_headers[1:]:
            if col[0]=='9':
                list_1.append('19'+col)
            else:
                list_1.append('20'+col)

        first_col = np.where(senate_headers[0]=='Pa','Year','Year')
        first_col_string = re.sub(r'[^\w\s\d]', '',np.array2string(first_col, separator=', ').strip('[]'))
        
        senate_headers = [first_col_string] + list_1

        senate_df = pd.DataFrame(senate_results, columns = senate_headers)  # Adjust column names


        return(senate_df) 
    
    else:
        return({state:page.status_code})

def ballotpedia_state_house_control_scraper(state):
    '''
    for a given state, scrapes the house election results tables 
    '''
    url = 'https://ballotpedia.org/{}_State_Legislature'.format(state)
    page = requests.get(url)
    
    if page.status_code == 200:
        soup = BeautifulSoup(page.content, 'html.parser')
        senate_table = soup.find('table', {'class': 'wikitable'}).find_next('table')

# Step 4: Extract House of Representatives election results
        house_table = senate_table.find_next('table').find_next('table') 
        house_results = []
        for row in house_table.find_all('tr')[1:]:  # skip the header row
            cols = row.find_all('td')
            if cols:
                house_results.append([col.text.strip() for col in cols])
        
        house_rows = house_table.find_all('tr')

        # Extract headers for House
        house_headers = [re.sub(r'[^\w\s\d]', '',header.text.strip())[:2] for header in house_rows[0].find_all('th')]
        
        list_1 =[]
        
        for col in house_headers[1:]:
            if col[0]=='9':
                list_1.append('19'+col)
            else:
                list_1.append('20'+col)
        
        first_col = np.where(house_headers[0]=='Pa','Year','Year')
        first_col_string = re.sub(r'[^\w\s\d]', '',np.array2string(first_col, separator=', ').strip('[]'))
        
        house_headers = [first_col_string] + list_1
        
        house_df = pd.DataFrame(house_results, columns = house_headers)  # Adjust column names

        return(house_df) 

    else:
        return({state:page.status_code})


In [5]:
senate_results={}
for state in states:
    df = ballotpedia_state_senate_control_scraper(state)
    
    if df.shape[0]<=3:
        
       df = df.set_index('Year').T
       df['state']=state
       df['Democrats'] = pd.to_numeric(df['Democrats'].str.replace(r'\D+', '', regex=True))
       df['Republicans'] = pd.to_numeric(df['Republicans'].str.replace(r'\D+', '', regex=True))
       df['sen_control'] = np.where(df['Democrats']>df['Republicans'],'D','R')
       df.reset_index(names='elec_year',inplace=True)
       df['elec_year']=pd.to_numeric(df['elec_year'])
       df['leg_start'] = df['elec_year']+ 1
       df['leg_end'] = df['elec_year'].shift(-1, fill_value=0)
    # select only two party results
       df=df[['state','Democrats','Republicans','sen_control','elec_year','leg_start','leg_end']]

    else: pass 

    senate_results[state]=df
    # Put the system to sleep for a random draw of time (be kind)
    time.sleep(random.uniform(.5,1))
    
    

ChunkedEncodingError: ('Connection broken: IncompleteRead(4505994 bytes read, 695531 more expected)', IncompleteRead(4505994 bytes read, 695531 more expected))

In [66]:
del senate_results['Nebraska']

In [57]:
house_results={}
house_states = [state for state in states if state!='Nebraska']
for state in house_states:
    try:
        df = ballotpedia_state_house_control_scraper(state)

        if df.shape[0]<=3:
           df = df.set_index('Year').T
           df['state']=state
           df['Democrats'] = pd.to_numeric(df['Democrats'].str.replace(r'\D+', '', regex=True))
           df['Republicans'] = pd.to_numeric(df['Republicans'].str.replace(r'\D+', '', regex=True))
           df['house_control'] = np.where(df['Democrats']>df['Republicans'],'D','R')
           df.reset_index(names='elec_year',inplace=True)
           df['elec_year']=pd.to_numeric(df['elec_year'])
           df['leg_start'] = df['elec_year']+ 1
           df['leg_end'] = df['elec_year'].shift(-1, fill_value=0)
           df=df[['state','elec_year','Democrats','Republicans','house_control','leg_start','leg_end']]

        else: pass 
    
        house_results[state]=df
    
    except:
        print(f'An error occurred with state: {state}')
        
    # Put the system to sleep for a random draw of time (be kind)
    time.sleep(random.uniform(.5,1))
    
 

An error occurred with state: Nebraska


In [67]:
# concatenate dictionary values 
all_house_results = pd.concat(house_results.values(), ignore_index=True)
all_senate_results = pd.concat(senate_results.values(), ignore_index=True)

# Export the concatenated DataFrame to a CSV file
all_house_results.to_csv('house_leg_election_results.csv', index=False)
all_senate_results.to_csv('senate_leg_election_results.csv', index=False)


In [6]:
Years = list(range(1999,2020))

In [7]:
# Create the cross product of the two lists
combinations = list(itertools.product(Years, states))

# Create a DataFrame from the combinations
df_base = pd.DataFrame(combinations, columns=['Year', 'state'])


In [9]:
gov_data = pd.read_csv('data/project_political_data - gov_party.csv')

In [15]:
gov_data.year.unique()

array([2024, 2023, 2022, 2021, 2020, 2019, 2018, 2017])

In [10]:
# reloading senate and house leg results to not have to re run loop & to update dataset to change year to 2017 instead of 2019 
all_senate_results = pd.read_csv('data/senate_leg_election_results.csv')
all_house_results = pd.read_csv('data/house_leg_election_results.csv')

In [11]:
query = """
select 

df_base.Year
, df_base.state

, all_senate_results.elec_year as sen_elec_year
, all_senate_results.leg_start as sen_leg_start
, all_senate_results.leg_end as sen_leg_end
, all_senate_results.sen_control

, all_house_results.elec_year as house_elec_year
, all_house_results.leg_start as house_leg_start
, all_house_results.leg_end as house_leg_end
, all_house_results.house_control 

, gov_data.governor_party

from df_base
left join all_senate_results 
    on df_base.state = all_senate_results.state 
    and df_base.Year between all_senate_results.leg_start and all_senate_results.leg_end
left join all_house_results
    on df_base.state = all_house_results.state 
    and df_base.Year between all_house_results.leg_start and all_house_results.leg_end
left join gov_data 
    on df_base.state = gov_data.state 
    and df_base.Year = gov_data.year

group by 1,2,3,4,5,6,7,8,9,10,11

"""

results = psql.sqldf(query,locals())

In [13]:
results.Year.unique()

array([1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009,
       2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019])

In [16]:
# filtering to 2017 to incorporate a lag indicator of politics 
political_control_data = results[results['Year']==2017].copy()

In [18]:
# creating a boolean variable to indicate whether a state was governed by a gop trifecta 
political_control_data['gop_trifecta'] = np.where(political_control_data['sen_control']+political_control_data['house_control']+political_control_data['governor_party']=='RRR' ,1,0)

In [19]:
political_control_data.reset_index(inplace=True)

In [20]:
# update values for Nebraska since unicameral structure messed with flow 
political_control_data.loc[26,'sen_control']='R'
political_control_data.loc[26,'gop_trifecta']=1

In [21]:
political_control_data.to_csv('data/state_gov_control_2017.csv')