In [1]:
import re
import requests
from bs4 import BeautifulSoup
from collections import defaultdict

In [2]:
# Base URL that page paths are appended to when building request URLs.
URL_PREFIX = "https://ballotpedia.org/"
# Pattern for the leading part of a link: an optional http(s) scheme, an
# optional "ballotpedia.org" host, and the slash that follows. Substituting it
# with "" leaves only the page path. The dot is escaped so it matches a literal
# "." rather than any character.
URL_REGEX = r"^(?:https?://)?(?:ballotpedia\.org)?/"

In [3]:
# Download the state-level elections overview page.
STATE_URL = URL_PREFIX + "North_Carolina_elections,_2020"
# requests applies no timeout by default, so set one (in seconds) to keep the
# cell from hanging forever on a stalled connection.
state_page = requests.get(STATE_URL, timeout=30)
# Fail here on an HTTP error status instead of parsing an error page later.
state_page.raise_for_status()

In [4]:
# Parse the downloaded state page and pick out the table whose id is "offices".
state_soup = BeautifulSoup(state_page.content, features="html.parser")
offices = state_soup.find("table", id="offices")

In [11]:
# Group the race-page links found in the offices table by the RACE_NAMES
# category their link text belongs to. The category keys ("votebox" / "table")
# presumably name the markup each race page uses for its results -- confirm
# against the code that consumes race_urls.
race_urls = defaultdict(set)
RACE_NAMES = {
    "votebox": {"President of the United States", "U.S. Senate", "U.S. House", "Other state executive"},
    "table": {"State Senate", "State House"}
}
# href=True restricts the search to anchors that actually carry an href, so an
# anchor without one no longer raises KeyError below.
for link in offices.find_all("a", href=True):
    url = re.sub(URL_REGEX, "", link["href"])  # drop scheme/host, keep the page path
    name = link.get_text().strip()
    for category, names in RACE_NAMES.items():
        if name in names:
            race_urls[category].add(url)
            break  # first matching category wins, like the original if/elif

print(race_urls)

defaultdict(<class 'set'>, {'votebox': {'North_Carolina_state_executive_official_elections,_2020', 'United_States_House_of_Representatives_elections_in_North_Carolina,_2020', 'United_States_Senate_election_in_North_Carolina,_2020', 'Presidential_election_in_North_Carolina,_2020'}, 'table': {'North_Carolina_House_of_Representatives_elections,_2020', 'North_Carolina_State_Senate_elections,_2020'}})


In [11]:
# Download the page for a single race (the 2020 NC gubernatorial election).
RACE_URL = URL_PREFIX + "North_Carolina_gubernatorial_election,_2020"
# requests applies no timeout by default, so set one (in seconds) to keep the
# cell from hanging forever on a stalled connection.
race_page = requests.get(RACE_URL, timeout=30)
# Fail here on an HTTP error status instead of parsing an error page later.
race_page.raise_for_status()

In [16]:
# Parse the race page and gather every <div> carrying the "votebox" CSS class.
race_soup = BeautifulSoup(race_page.content, features="html.parser")
voteboxes = race_soup.find_all("div", class_="votebox")

In [22]:
# Build a mapping from candidate link (href with the URL_REGEX prefix removed)
# to {"name": <link text>, "races": [<votebox header text>, ...]}. A candidate
# that appears in several voteboxes accumulates one "races" entry per votebox.
cands = defaultdict(lambda: {"races": []})
for div in voteboxes:
    race = div.find("h5", {"class": "votebox-header-election-type"}).get_text().strip()
    for td in div.find_all("td", {"class": "votebox-results-cell--text"}):
        a = td.find("a", href=True)
        if a is None:
            # A results cell with no link has no candidate URL to key on;
            # skip it rather than crash on None["href"].
            continue
        url = re.sub(URL_REGEX, "", a["href"])
        name = a.get_text().strip()
        cands[url]["name"] = name
        cands[url]["races"].append(race)

print(cands)

defaultdict(<function <lambda> at 0x7fdf572bdaf0>, {'https://ballotpedia.org/Roy_Cooper': {'races': ['General election for Governor of North Carolina', 'Democratic primary for Governor of North Carolina'], 'name': 'Roy Cooper'}, 'https://ballotpedia.org/Dan_Forest': {'races': ['General election for Governor of North Carolina', 'Republican primary for Governor of North Carolina'], 'name': 'Dan Forest'}, 'https://ballotpedia.org/Steven_DiFiore_II': {'races': ['General election for Governor of North Carolina'], 'name': 'Steven DiFiore  II'}, 'https://ballotpedia.org/Al_Pisano': {'races': ['General election for Governor of North Carolina'], 'name': 'Al Pisano'}, 'https://ballotpedia.org/Ernest_Reeves': {'races': ['Democratic primary for Governor of North Carolina'], 'name': 'Ernest Reeves'}, 'https://ballotpedia.org/Holly_Grange': {'races': ['Republican primary for Governor of North Carolina'], 'name': 'Holly Grange'}})


In [14]:
# Download the profile page for a single candidate.
CAND_URL = URL_PREFIX + "Roy_Cooper"
# requests applies no timeout by default, so set one (in seconds) to keep the
# cell from hanging forever on a stalled connection.
cand_page = requests.get(CAND_URL, timeout=30)
# Fail here on an HTTP error status instead of parsing an error page later.
cand_page.raise_for_status()

In [15]:
# Parse the candidate page and gather every <div> carrying the "widget-row" CSS class.
cand_soup = BeautifulSoup(cand_page.content, features="html.parser")
widgets = cand_soup.find_all("div", class_="widget-row")

In [16]:
# Extract the candidate's party and education entries from the widget rows.
EDU_REGEX = re.compile(r"\s*Education\s*")
info = {"name": "Roy Cooper", "races": [], "edu": {}}

# The party is taken from the third CSS class of the first widget row
# (the notebook's author notes it is always found there).
if widgets:
    info["party"] = widgets[0]["class"][2]

rows = iter(widgets)

# Phase 1: consume rows up to and including the one whose <p> matches EDU_REGEX.
for row in rows:
    if row.find("p", string=EDU_REGEX):
        break

# Phase 2: each following row is recorded as key -> value until a row with
# the "value-only" class ends the education section.
for row in rows:
    if "value-only" in row["class"]:
        break
    degree = row.find("div", {"class": "widget-key"}).get_text().strip()
    school = row.find("div", {"class": "widget-value"}).get_text().strip()
    info["edu"][degree] = school

print(info)

{'name': 'Roy Cooper', 'races': [], 'edu': {"Bachelor's": 'University of North Carolina, Chapel Hill, 1979', 'Law': 'University of North Carolina School of Law, 1982'}, 'party': 'Democratic'}
