In [7]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import re
import aiohttp
import asyncio
import random
import time
%run ../data/states_districts_update.py

In [8]:
user_agents = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:54.0) Gecko/20100101 Firefox/54.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14931",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; AS; rv:11.0) like Gecko",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/17.17134",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36",
    "Mozilla/5.0 (Windows NT 5.1; rv:7.0.1) Gecko/20100101 Firefox/7.0.1",
    "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; AS; rv:11.0) like Gecko",
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36",
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:15.0) Gecko/20100101 Firefox/15.0.1",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/18.17763",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36",
]

In [9]:
async def scrape_candidate_data(state_district, cycle_year):
    headers = {
    'User-Agent': random.choice(user_agents)
    }
    endpoint = f'https://www.opensecrets.org/races/candidates?cycle={cycle_year}&id={state_district}&spec=N'

    timeout = aiohttp.ClientTimeout(total=600)
    connector = aiohttp.TCPConnector(limit=2)
    
    async with aiohttp.ClientSession(connector=connector, timeout=timeout) as session:
        async with session.get(endpoint) as res:
            if res.status == 200:  # HTTP Status Code 200 means OK
                text = await res.text()
                soup = BeautifulSoup(text, 'html.parser')
            else:
                print(f"Failed to retrieve {endpoint}: {res.status}")
                return []  # Return an empty list or another suitable default value

    pandas_data = []
    bad_districts = []

    try:
        # Locate the main div containing all candidate blocks
        candidates_div = soup.find('div', class_="Members--list")
        
        if candidates_div:
            # Extract candidate blocks
            candidates_blocks = candidates_div.find_all('div', class_='Members--list-item')
            
            for block in candidates_blocks:
                # Extract the name, party, and other details
                candidate_h2_text = block.find('h2').get_text(strip=True)
                pattern = re.compile(r"(.+?) \((R|D|I)\)")
                match = pattern.match(candidate_h2_text)
                name, party = match.groups()
                incumbent = "Incumbent" in candidate_h2_text

                # Check winner status via tag and class directly
                winner = block.find('span', class_='winner') is not None

                # Extract vote percentage by looking directly for the relevant span and class
                vote_pct_tag = block.find('span', class_='Members--vote-pct')
                vote_pct = None
                if vote_pct_tag:
                    vote_pct = vote_pct_tag.get_text(strip=True).strip("()").replace('% of vote', '').strip()

               # Loop through each 'td' with the label names
                    # Find the 'td' element containing the label name
                    raised_label_td = block.find('td', string="Raised:")
                    spent_label_td = block.find('td', string="Spent:")
                    
                    # If the label is found, find the next 'td' sibling (which should contain the dollar value)
                    if raised_label_td or spent_label_td:
                        raised = raised_label_td.find_next_sibling('td').get_text()
                        spent = spent_label_td.find_next_sibling('td').get_text()

                        pandas_data.append({
                            "Year": cycle_year,
                            "State": state_district[:2],
                            "District": state_district[2:],
                            "Name": name,
                            "Party": party,
                            "Incumbent": incumbent,
                            "Winner": winner,
                            "Vote Percentage": vote_pct,
                            "Raised": raised,
                            "Spent": spent
                        })
        else:
            print(f"No candidate data found for {state_district}")
            bad_districts.append(state_district)
            
    except Exception as e:
        print(f"An error occurred while processing {state_district} for {cycle_year}: {str(e)}")
        bad_districts.append(state_district)

    print(state_district + '-' + cycle_year)
    
    return pandas_data, bad_districts



In [10]:
def get_district_codes():
    district_codes = []
    for state in congressional_districts_update:
        for district in congressional_districts_update[state]:
            code = state + district
            district_codes.append(code)
    return district_codes

In [11]:
async def gather_data(district_codes, years):
    all_data = []
    bad_districts = []
    tasks = [scrape_candidate_data(code, year) for year in years for code in district_codes]
   

    results = await asyncio.gather(*tasks, return_exceptions=True)

    for data, bad_district in results:
        all_data.extend(data)
        bad_districts.extend(bad_district)

    df = pd.DataFrame(all_data)

    return df, bad_districts

# Example usage in a Jupyter Notebook cell:

district_codes = get_district_codes() 
print(len(district_codes))
district_codes_1 = district_codes[:100]
district_codes_2 = district_codes[100:200]
district_codes_3 = district_codes[200:300]
district_codes_4 = district_codes[300:]



439


In [12]:
print(len(district_codes_1))
years = ['2018','2020']
df_1, bad_districts_1 = await gather_data(district_codes_1, years)
print(bad_districts_1)
df_1

100
AL04-2018
AL07-2018
AZ08-2018
AL02-2018
CA08-2018
AZ05-2018
AZ01-2018
CA02-2018
AR02-2018
CA09-2018
CA04-2018
AZ03-2018
CA14-2018
AK01-2018
AR01-2018
AL06-2018
AL05-2018
AZ04-2018
AR04-2018
AZ07-2018
AZ02-2018
AR03-2018
AL03-2018
AL01-2018
CA15-2018
AZ09-2018
CA07-2018
CA18-2018
CA24-2018
CA17-2018
CA19-2018
CA06-2018
CA03-2018
AZ06-2018
CA20-2018
An error occurred while processing CA13 for 2018: 'NoneType' object has no attribute 'groups'
CA13-2018
CA05-2018
CA28-2018
CA12-2018
CA25-2018
CA38-2018
CA16-2018
CA11-2018
CA01-2018
CA22-2018
CA10-2018
CA33-2018
CA37-2018
CA26-2018
CA21-2018
CA27-2018
CA29-2018
CA43-2018
An error occurred while processing CA34 for 2018: 'NoneType' object has no attribute 'groups'
CA34-2018
CA36-2018
CO01-2018
CO04-2018
CA45-2018
CA42-2018
CA47-2018
CA31-2018
CA48-2018
CO05-2018
CA50-2018
CA52-2018
An error occurred while processing CO02 for 2018: 'NoneType' object has no attribute 'groups'
CO02-2018
CO07-2018
CA30-2018
CA41-2018
CA23-2018
CA32-2018
CA53

Unnamed: 0,Year,State,District,Name,Party,Incumbent,Winner,Vote Percentage,Raised,Spent
0,2018,AL,01,Bradley Byrne,R,True,True,63.2,"$1,460,041","$831,634"
1,2018,AL,01,Robert Kennedy Jr.,D,False,False,36.8,"$46,845","$46,845"
2,2018,AL,02,Martha Roby,R,True,True,61.4,"$2,559,441","$2,263,209"
3,2018,AL,02,Tabitha Isner,D,False,False,38.4,"$519,145","$519,145"
4,2018,AL,03,Mike D Rogers,R,True,True,63.8,"$1,349,986","$1,347,847"
...,...,...,...,...,...,...,...,...,...,...
361,2020,FL,10,Vennia Francois,R,False,False,36.4,"$335,480","$332,852"
362,2020,FL,11,Daniel Webster,R,True,True,66.7,"$691,261","$586,965"
363,2020,FL,11,Dana Cottrell,D,False,False,33.3,"$62,658","$62,380"
364,2020,FL,12,Gus Bilirakis,R,True,True,62.9,"$1,604,137","$1,548,634"


In [13]:
print(len(district_codes_2))
df_2, bad_districts_2 = await gather_data(district_codes_2, years)
print(bad_districts_2)
df_2

100
FL23-2018
GA03-2018
GA06-2018
GA02-2018
FL20-2018
FL15-2018
FL17-2018
FL16-2018
FL14-2018
FL18-2018
GA05-2018
FL13-2018
GA08-2018
FL21-2018
IL03-2018
FL26-2018
HI02-2018
GA01-2018
FL25-2018
IL05-2018
FL19-2018
FL22-2018
ID02-2018
GA04-2018
GA10-2018
FL24-2018
GA11-2018
GA12-2018
GA07-2018
An error occurred while processing GA09 for 2018: 'NoneType' object has no attribute 'groups'
GA09-2018
IL01-2018
An error occurred while processing ID01 for 2018: 'NoneType' object has no attribute 'groups'
ID01-2018
IL16-2018
An error occurred while processing GA13 for 2018: 'NoneType' object has no attribute 'groups'
GA13-2018
GA14-2018
HI01-2018
IL11-2018
FL27-2018
IN01-2018
IN04-2018
IL02-2018
IL10-2018
IN06-2018
IL09-2018
IL13-2018
IL06-2018
IL14-2018
IL15-2018
KY01-2018
An error occurred while processing IL12 for 2018: 'NoneType' object has no attribute 'groups'
IL12-2018
IL04-2018
IL18-2018
KS04-2018
IA02-2018
IL07-2018
IA01-2018
IL08-2018
IA04-2018
IA03-2018
IN03-2018
IN07-2018
KY02-2018


Unnamed: 0,Year,State,District,Name,Party,Incumbent,Winner,Vote Percentage,Raised,Spent
0,2018,FL,13,Charlie Crist,D,True,True,57.6,"$3,112,230","$1,531,229"
1,2018,FL,13,George Buck,R,False,False,42.4,"$40,828","$40,819"
2,2018,FL,14,Kathy Castor,D,True,True,100.0,"$757,607","$658,645"
3,2018,FL,15,Kristen Carlson,D,False,False,47.0,"$2,067,005","$2,060,050"
4,2018,FL,15,Ross Spano,R,False,True,53.0,"$929,814","$928,169"
...,...,...,...,...,...,...,...,...,...,...
368,2020,MA,08,Stephen Lynch,D,True,True,80.7,"$905,614","$1,134,662"
369,2020,MA,09,Bill Keating,D,True,True,61.3,"$708,680","$652,448"
370,2020,MA,09,Helen Brady,R,False,False,36.3,"$28,233","$23,735"
371,2020,MI,01,John Bergman,R,True,True,61.6,"$1,868,512","$1,733,466"


In [14]:
df_3, bad_districts_3 = await gather_data(district_codes_3, years)
print(bad_districts_3)
df_3

MN01-2018
MN07-2018
MI02-2018
MN03-2018
NE03-2018
An error occurred while processing MI08 for 2018: 'NoneType' object has no attribute 'groups'
MI08-2018
MI14-2018
MN05-2018
MS02-2018
MI06-2018
MN04-2018
MO07-2018
MI09-2018
MI13-2018
MN06-2018
MN02-2018
MO05-2018
An error occurred while processing MI12 for 2018: 'NoneType' object has no attribute 'groups'
MI12-2018
An error occurred while processing MT01 for 2018: 'NoneType' object has no attribute 'groups'
MT01-2018
MO08-2018
NH01-2018
NE02-2018
MI04-2018
MO01-2018
NJ03-2018
NJ08-2018
MI10-2018
MI11-2018
MI05-2018
MI03-2018
NJ01-2018
NV01-2018
MO06-2018
NJ10-2018
MS04-2018
NV02-2018
MI07-2018
NV03-2018
An error occurred while processing NJ07 for 2018: 'NoneType' object has no attribute 'groups'
NJ07-2018
NJ04-2018
MS01-2018
NJ02-2018
MO03-2018
NE01-2018
NY08-2018
An error occurred while processing NM03 for 2018: 'NoneType' object has no attribute 'groups'
NM03-2018
NY01-2018
MO02-2018
NY02-2018
NY03-2018
NJ12-2018
MN08-2018
MS03-2018


Unnamed: 0,Year,State,District,Name,Party,Incumbent,Winner,Vote Percentage,Raised,Spent
0,2018,MI,02,Bill Huizenga,R,True,True,55.3,"$2,136,543","$2,166,958"
1,2018,MI,02,Rob Davidson,D,False,False,43.0,"$1,253,063","$1,248,766"
2,2018,MI,03,Justin Amash,R,True,True,54.4,"$628,562","$764,398"
3,2018,MI,03,Catherine Albro,D,False,False,43.2,"$164,519","$152,791"
4,2018,MI,04,John Moolenaar,R,True,True,62.6,"$1,168,833","$1,087,763"
...,...,...,...,...,...,...,...,...,...,...
354,2020,NC,13,Scott Huffman,D,False,False,31.8,"$193,334","$189,967"
355,2020,ND,01,Kelly Armstrong,R,True,True,69.0,"$1,269,346","$1,080,105"
356,2020,ND,01,Zach Raknerud,D,False,False,27.6,"$28,047","$26,029"
357,2020,OH,01,Steve Chabot,R,True,True,51.8,"$3,177,647","$2,861,464"


In [15]:
df_4, bad_districts_4 = await gather_data(district_codes_4, years)
print(bad_districts_4)
df_4

OR05-2018
PA09-2018
OK01-2018
OH02-2018
OK03-2018
OH06-2018
OR04-2018
OH05-2018
OH11-2018
OH10-2018
OH03-2018
OR03-2018
An error occurred while processing OH14 for 2018: 'NoneType' object has no attribute 'groups'
OH14-2018
OH12-2018
OK05-2018
OH13-2018
PA01-2018
OH04-2018
OH15-2018
PA03-2018
OH07-2018
OK04-2018
PA08-2018
PA04-2018
OH08-2018
An error occurred while processing OK02 for 2018: 'NoneType' object has no attribute 'groups'
OK02-2018
OR01-2018
PA11-2018
An error occurred while processing PA07 for 2018: 'NoneType' object has no attribute 'groups'
PA07-2018
PA17-2018
OH09-2018
RI01-2018
PA06-2018
OH16-2018
PA18-2018
RI02-2018
TX16-2020
PA12-2018
PA02-2018
TN02-2020
An error occurred while processing TN02 for 2018: 'NoneType' object has no attribute 'groups'
TN02-2018
PA05-2018
PA14-2018
PA10-2018
TN06-2018
SC02-2018
PA13-2018
An error occurred while processing PA16 for 2018: 'NoneType' object has no attribute 'groups'
PA16-2018
TN05-2018
TN03-2018
OR02-2018
SC03-2018
An error o

Unnamed: 0,Year,State,District,Name,Party,Incumbent,Winner,Vote Percentage,Raised,Spent
0,2018,OH,02,Brad Wenstrup,R,True,True,57.6,"$1,452,518","$1,596,661"
1,2018,OH,02,Jill Schiller,D,False,False,41.2,"$605,948","$585,051"
2,2018,OH,03,Joyce Beatty,D,True,True,73.6,"$1,087,553","$765,403"
3,2018,OH,04,Jim Jordan,R,True,True,65.3,"$1,239,307","$1,830,320"
4,2018,OH,04,Janet Garrett,D,False,False,34.7,"$697,563","$699,370"
...,...,...,...,...,...,...,...,...,...,...
520,2020,WI,07,Tricia Zunker,D,False,False,39.2,"$1,261,957","$1,232,690"
521,2020,WI,08,Mike Gallagher,R,True,True,64.0,"$3,202,905","$2,841,801"
522,2020,WI,08,Amanda Stuck,D,False,False,36.0,"$416,978","$399,916"
523,2020,WY,01,Liz Cheney,R,True,True,68.6,"$3,003,883","$3,060,167"


In [19]:
# Bad districts whose finance data is hard to scrape
bad_districts = bad_districts_1 + bad_districts_2 + bad_districts_3 + bad_districts_4
print(len(bad_districts))
print(bad_districts)
# This is data for all districts except for bad districts
df = pd.concat([df_1, df_2, df_3, df_4], ignore_index=True)
df.to_csv('../data/scraped_data_multiyear.csv')

75
['CA13', 'CA34', 'CA40', 'CO02', 'CA29', 'CO03', 'CO05', 'CO06', 'CT03', 'GA09', 'GA13', 'ID01', 'IL12', 'LA01', 'MD03', 'MD05', 'GA05', 'GA08', 'GA13', 'HI02', 'IL06', 'IL07', 'IN09', 'KY02', 'MI08', 'MI12', 'MT01', 'NJ07', 'NM01', 'NM03', 'NY07', 'NY17', 'NY21', 'NY27', 'NC02', 'NC04', 'NC13', 'MI02', 'MN05', 'NV04', 'NY02', 'NY10', 'NY17', 'NY25', 'NY27', 'NC02', 'OH01', 'OH14', 'OK02', 'PA07', 'PA16', 'SC04', 'SD01', 'TN02', 'TX06', 'TX18', 'UT01', 'VA04', 'VA07', 'VA11', 'WV02', 'PR00', 'OH07', 'OH12', 'PA12', 'TX02', 'TX05', 'TX12', 'TX17', 'TX24', 'UT02', 'UT03', 'UT04', 'VA11', 'PR00']
