In [73]:
import requests
from IPython.core.display import HTML
styles = requests.get("https://raw.githubusercontent.com/Harvard-IACS/2018-CS109A/master/content/styles/cs109.css").text
HTML(styles)

from bs4 import BeautifulSoup
import re
import pandas as pd
import time
import json
from pathlib import Path
import numpy as np
import os
from os import listdir
from os.path import isfile, join

# Data Collection - Web Scraping - Data Parsing 

In [228]:
# Declare global variables
states = ['Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California', 'Colorado', 'Connecticut', 'Delaware', 'Florida', 'Georgia', 'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky', 'Louisiana', 'Maine', 'Maryland', 'Massachusetts', 'Michigan', 'Minnesota', 'Mississippi', 'Missouri', 'Montana', 'Nebraska', 'Nevada', 'New Hampshire', 'New Jersey', 'New Mexico', 'New York', 'North Carolina', 'North Dakota', 'Ohio', 'Oklahoma', 'Oregon', 'Pennsylvania', 'Rhode Island', 'South Carolina', 'South Dakota', 'Tennessee', 'Texas', 'Utah', 'Vermont', 'Virginia', 'Washington', 'West Virginia', 'Wisconsin', 'Wyoming', 'District of Columbia']

# National unemployement rate by month from 1948 to 2018
# Source: https://data.bls.gov/pdq/SurveyOutputServlet
national_unemployement_rate = pd.read_csv('data/national_unemployement_1948_2018.csv')

# Get the presidental job approval
# Source: https://www.gallup.com
# https://news.gallup.com/interactives/185273/presidential-job-approval-center.aspx
with open('data/all_presidential_job_approval_gallup.json') as f:
    presidential_approval = json.load(f)
presidential_approval = presidential_approval['AllPresidents']['HistoricalPresident']
presidential_approval_df = pd.DataFrame.from_dict([x['PresidentData'] for x in presidential_approval])
display(presidential_approval_df.head())

Unnamed: 0,DatesinOffice,DaysInOffice,EndDate,FirstTermAverage,JobApprovalHigh,JobApprovalLow,OverallAverage,Party,PresidentName,SecondTermAverage,StartDate
0,2017-Present,658,,-,45.0,35.0,39.5,Rep.,Donald J. Trump,-,2017-01-20
1,2009-2017,2922,2017-01-20,48,67.0,40.0,48.0,Dem.,Barack Obama,47,2009-01-20
2,2001-2009,2922,2009-01-20,62.2,90.0,25.0,49.4,Rep.,George W. Bush,36.5,2001-01-20
3,1993-2001,2922,2001-01-20,49.6,73.0,37.0,55.1,Dem.,Bill Clinton,60.6,1993-01-20
4,1989-1993,1461,1993-01-20,60.9,89.0,29.0,60.9,Rep.,George H. W. Bush,-,1989-01-20


In [388]:
display(national_unemployement_rate.head())

Unnamed: 0,Year,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec
0,1948,3.4,3.8,4.0,3.9,3.5,3.6,3.6,3.9,3.8,3.7,3.8,4.0
1,1949,4.3,4.7,5.0,5.3,6.1,6.2,6.7,6.8,6.6,7.9,6.4,6.6
2,1950,6.5,6.4,6.3,5.8,5.5,5.4,5.0,4.5,4.4,4.2,4.2,4.3
3,1951,3.7,3.4,3.4,3.1,3.0,3.2,3.1,3.1,3.3,3.5,3.5,3.1
4,1952,3.2,3.1,2.9,2.9,3.0,3.0,3.2,3.4,3.1,3.0,2.8,2.7


In [4]:
# Get once the necessary pages
presidential_page = requests.get('https://en.wikipedia.org/wiki/United_States_presidential_election')

In [453]:
# List of the US presidents
president_elected_history = pd.read_csv('data/president_elected_history.csv', sep=';')
display(president_elected_history.head())

Unnamed: 0,year,president_elected,president_elected_party,can_be_re_elected
0,1824,John Quincy Adams,DR,1
1,1828,Andrew Jackson,D,1
2,1832,Andrew Jackson,D,0
3,1836,Martin Van Buren,D,1
4,1840,William Henry Harrison,W,1


In [222]:
district_page = requests.get('https://en.wikipedia.org/wiki/United_States_House_of_Representatives_elections,_2016')

In [231]:
def extract_district_list(page):
    district_list = []

    if page.status_code == 200:
        soup = BeautifulSoup(page.text, "html.parser")
        districts = soup.find_all('a', href=re.compile(r'(.*\/wiki\/.* )|(.*_congressional_district)'))
        
        for district in districts:
            # print(district.string)
            if any(substring in district.string for substring in states) \
            and district.string not in district_list \
            and "'s" not in district.string \
            and "12th" not in district.string \
            and '1st' not in district.string:
                district_state = ''
                # Get the corresponding state
                for state in states:
                    if state in district.string:
                        district_state = state
                
                district_list.append({
                    'name': district.string,
                    'page_url': 'https://en.wikipedia.org' + district['href'],
                    'state': district_state
                })
    
    return district_list

districts = extract_district_list(district_page)
districts_df = pd.DataFrame(districts)

In [436]:
"""
Get the national level factors
Source: https://en.wikipedia.org/wiki/United_States_presidential_election
"""

# From a tag, extract the number of seats
def extract_seats(tag):
    if tag.findAll('b'):
        d_seats = tag.b.extract().string
    elif tag.sup and tag.sup.decompose():
        d_seats = tag.sup.decompose()
    elif tag.string is None:
        d_seats = tag.text
    else:
        d_seats = tag.string
    return int(d_seats)

def extract_seats_change(tag):
    if tag.sup:
        d_seats_change = tag.text.split('[', 1)[0]
    else:
        d_seats_change = tag.text
    return int(d_seats_change.replace('–', '-'))

# Get the house election years
def extract_house_elections_history():
    house_elections_history = []

    # If the file doesn't exist, get the data from the webpage and store the content to a new file
    filename = 'data/list_of_house_elections_page.html'
    if Path(filename).is_file():
        with open(filename) as my_file:
            list_of_house_elections_page = my_file.read()
    else:
        print('no file')
        list_of_house_elections_page = requests.get('https://en.wikipedia.org/wiki/List_of_United_States_House_of_Representatives_elections,_1856%E2%80%93present')
        with open(filename,'w') as outfile:
            outfile.write(list_of_house_elections_page.text)

    soup = BeautifulSoup(list_of_house_elections_page, "html.parser")

    # Find the election years
    data = []
    elections_pages = []
    for t in soup.find_all('a', title=lambda x: x and 'United States House of Representatives elections,' in x):
        if len(t.string) == 4:
            elections_pages.append({
                'year': int(t.string),
                'url':'https://en.wikipedia.org'+t.attrs['href']
            })
            year = int(t.string)+2

            cols = t.parent.parent.find_all('td')

            # Get the number of Democrat seats
            d_seats = extract_seats(cols[1])

            # Get the change in the number of Democrat seats
            d_seats_change = extract_seats_change(cols[2])

            # Get the number of Republican seats
            r_seats = extract_seats(cols[3])
            
            # Get the change in the number of Republican seats
            r_seats_change_by_year = extract_seats_change(cols[4])
            
            #print(1 if year in presidential_years else 0)

            idx = (np.abs(president_elected_history['year'].values-year+1)).argmin()
            president_can_be_re_elected = president_elected_history['can_be_re_elected'].loc[[idx]].values[0]
            president_party = president_elected_history['president_elected_party'].loc[[idx]].values[0]

            # Look for president overall job approval average
            president_name = president_elected_history['president_elected'].loc[[idx]].values[0]
            president_overall_avg_job_approval = presidential_approval_df.loc[presidential_approval_df['PresidentName'] == president_name]['OverallAverage']
            president_overall_avg_job_approval = float(president_overall_avg_job_approval.values[0])/100 if president_overall_avg_job_approval.values.size else None
            
            # Get the national unemployement rate for November
            oct_unemployement_rate = national_unemployement_rate.loc[national_unemployement_rate['Year'] == year]['Oct']
            
            oct_unemployement_rate = oct_unemployement_rate.values[0] if oct_unemployement_rate.values.size else None
            
            data.append({
                'year': year,
                'is_presidential_year': 1 if year in president_elected_history['year'].unique() else 0,
                'president_party': president_party,
                'president_can_be_re_elected': president_can_be_re_elected,
                'president_overall_avg_job_approval': president_overall_avg_job_approval,
                'oct_unemployement_rate': oct_unemployement_rate,
                'last_democrat_seats': d_seats,
                'last_republican seats': r_seats,
                'last_house_majority': 'R' if d_seats < r_seats else 'D'
            })

    return data, elections_pages

data, house_elections_pages = extract_house_elections_history()
data_df = pd.DataFrame(data)
national_level_factors = data_df[[
    'year', 
    'is_presidential_year', 
    'president_party', 
    'president_can_be_re_elected', 
    'president_overall_avg_job_approval', 
    'oct_unemployement_rate',
    'last_democrat_seats', 
    'last_republican seats', 
    'last_house_majority']]
display(national_level_factors.sort_values('year', ascending=False).head())

Unnamed: 0,year,is_presidential_year,president_party,president_can_be_re_elected,president_overall_avg_job_approval,oct_unemployement_rate,last_democrat_seats,last_republican seats,last_house_majority
80,2018,0,R,1,0.395,3.7,194,241,R
79,2016,1,R,1,0.395,4.9,188,247,R
78,2014,0,D,0,0.48,5.7,201,234,R
77,2012,1,D,0,0.48,7.8,193,242,R
76,2010,0,D,1,0.48,9.4,257,178,D


In [428]:
"""
Get the state level factors
"""

# Historical presidential election results by state
# Source: https://en.wikipedia.org/wiki/List_of_United_States_presidential_election_results_by_state
election_results_df = pd.read_csv('data/presidential_election_results_by_state.csv')
election_results_df.head()

Unnamed: 0,State,Alabama,Alaska,Arizona,Arkansas,California,Colorado,Connecticut,Delaware,District of Columbia,...,South Dakota,Tennessee,Texas,Utah,Vermont,Virginia,Washington,West Virginia,Wisconsin,Wyoming
0,1789,,,,,,,GW,GW,,...,,,,,,GW,,,,
1,1792,,,,,,,GW,GW,,...,,,,,GW,GW,,,,
2,1796,,,,,,,F,F,,...,,DR,,,F,DR,,,,
3,1800,,,,,,,F,F,,...,,DR,,,F,DR,,,,
4,1804,,,,,,,F,F,,...,,DR,,,DR,DR,,,,


In [232]:
"""
Get the district level factor
"""
# Get the district pages if they have not been stored on disk yet
for district in districts:
    filename = 'data/district_pages/{}.html'.format(district['name'])
    
    # Check if the page has been stored on disk
    if Path(filename).is_file() is False:
        print(district['name'], '- No page')
        # Get the page
        result = requests.get(district['page_url'])
        with open(filename,'w') as outfile:
            outfile.write(result.text)
        time.sleep(2)

undesirable_chars = ['\*', '%', '\(incumbent\)', '\(inc.\)', '\(write-in\)']
        
def parse_district_house_results(filename, district, state):
    district_house_results = []
    with open(filename) as my_file:
        soup = BeautifulSoup(my_file.read(), "html.parser")
        #print(soup)
        
        # Find the election results tables
        caption = soup.find_all('caption')
        elems = []
        for capt in caption:
            x = capt.get_text()
            if ('United States House of Representatives elections,' in x or
                'congressional district election' in x or
                'US House election, ' in x or
                'Congressional District House Election'
            ):
                elems.append(capt)

        #print(len(elems))

        for capt in elems:
            # Find the date
            match = re.match(r'.*([1-2][0-9]{3})', capt.text)
            if match is None:
                continue

            # Then it found a match!
            year = int(match.group(1))
            #print(year)

            # Get the result table itself
            table = capt.find_parent('table')
            table_body = table.find('tbody')
            rows = table_body.find_all('tr')

            for row in rows:
                cols = row.find_all('td')
                cols = [ele.text.strip() for ele in cols]
                cols = [ele for ele in cols if ele] # Get rid of empty values

                if len(cols) and cols[0] in ['Republican', 'Democratic']:
                    print(cols)
                    
                    percent = np.NaN
                    if len(cols) > 3 and cols[3] != 'N/A':
                        percent = float(re.sub("|".join(undesirable_chars), "", cols[3]))/100
                    
                    votes = np.NaN
                    if len(cols) > 2 and cols[2] == 'N/A':
                        votes = np.NaN
                    elif len(cols) > 2 and '%' not in cols[2] and cols[2] != '100.00':
                        votes = int(cols[2].replace(',', '').replace('.', ''))
                    elif len(cols) > 2 and ('%' in cols[2] or cols[2] == '100.00'):
                        percent = float(re.sub("|".join(undesirable_chars), "", cols[2]))/100
                    
                    district_house_results.append({
                        'year': year,
                        'candidate_party': 'R' if cols[0] == "Republican" else 'D',
                        'candidate_name': re.sub("|".join(undesirable_chars), "", cols[1]),
                        'votes': votes,
                        'percent': percent
                    })
                    
    return pd.DataFrame(district_house_results)
    
def get_district_level_factors(district):
    state = districts_df.loc[districts_df['name'] == district]['state'].values[0]
    dist_level_factors = []
    # Get the page of the district
    html_filename = 'data/district_pages/{}.html'.format(district)
    json_filename = 'data/district_pages/{}.json'.format(district)
    
    # If there is no already formated data, get them from the corresponding Wikipedia page
    if Path(json_filename).is_file() is False:
        district_house_results = parse_district_house_results(html_filename, district, state)
        display(district_house_results)
    else:
        print('get from json')
        district_house_results = pd.read_json(json_filename)
        display(district_house_results)
        
    # Now, for each year
    for year in district_house_results['year'].unique():
        # If there is more than 1 candidate this year
        # Get the current year
        if len(district_house_results.loc[district_house_results['year'] == year]) > 1:
            curr_year_idx = district_house_results.loc[district_house_results['year'] == year]['votes'].idxmax()
            curr_year = district_house_results.loc[[curr_year_idx]]
        else:
            curr_year = district_house_results        
        
        # Get previous year
        prev_year = district_house_results.loc[district_house_results['year'] == year-2]

        if prev_year.empty is False:
            # If there is more than 1 candidate the previous year
            if len(prev_year) > 1:
                prev_year_winner_idx = prev_year['votes'].idxmax()
                prev_year_winner = prev_year.loc[[prev_year_winner_idx]]
            else:
                prev_year_winner = prev_year

            # Get the incumbent name
            incumbent = prev_year_winner['candidate_name'].values[0]

            # Get the elections previously won by the incumbent
            incumbent_history = district_house_results.loc[
                (district_house_results['candidate_name'] == incumbent) &
                (district_house_results['year'] < year)
            ]

            incubent_first_elected_idx = incumbent_history['year'].idxmin()
            incubent_first_elected = incumbent_history.loc[[incubent_first_elected_idx]]['year'].values[0]
            incubent_is_candidate = curr_year.loc[curr_year['candidate_name'] == incumbent].empty

            dist_data = {
                'year': year,
                'state': state,
                'district': district,
                'incumbent': incumbent,
                'incumbent_party': 'R' if prev_year_winner['candidate_party'].values[0] == "Republican" else 'D',
                'incumbent_count_victories': len(incumbent_history),
                'incumbent_first_elected': incubent_first_elected,
                'incumbent_running_re_election': 0 if incubent_is_candidate else 1,
                'candidate_elected_party': curr_year['candidate_party'].values[0]
            }

            dist_level_factors.append(dist_data)
        else:
            print('yop')
            dist_data = {
                'year': year,
                'state': state,
                'district': district,
                'incumbent': np.NaN,
                'incumbent_party': np.NaN,
                'incumbent_count_victories': np.NaN,
                'incumbent_first_elected': np.NaN,
                'incumbent_running_re_election': np.NaN,
                'candidate_elected_party': curr_year['candidate_party'].values[0]
            }
            dist_level_factors.append(dist_data)

    return dist_level_factors

#for district in ['Alabama 1', 'Alabama 2']:
for district in ['Arkansas 1']:
    district_level_factors = get_district_level_factors(district)
    display(pd.DataFrame(district_level_factors).sort_values('year', ascending=True))
    #display(district_level_factors)

['Democratic', 'Robert Marion Berry*', '129,701', '67%']
['Republican', 'Tommy F. Robinson', '64,357', '33%']
['Democratic', 'Robert Marion Berry*', '162,388', '67%']
['Republican', 'Vernon Humphrey', '81,556', '33%']
['Democratic', 'Robert Marion Berry*', '127,577', '69%']
['Republican', 'Mickey Stumbaugh', '56,611', '31%']
['Democratic', 'Robert Marion Berry*', '124,304', '100%']
['Republican', 'Rick Crawford', '93,224', '52%']
['Democratic', 'Chad Causey', '78,267', '43%']
['Republican', 'Rick Crawford*', '138,800', '56%']
['Democratic', 'Scott Ellington', '96,601', '39%']
['Republican', 'Rick Crawford*', '124,139', '63%']
['Democratic', 'Jackie McPherson', '63,555', '32%']
['Republican', 'Rick Crawford*', '183,866', '76.28%']


Unnamed: 0,candidate_name,candidate_party,percent,votes,year
0,Robert Marion Berry,D,0.67,129701,2002
1,Tommy F. Robinson,R,0.33,64357,2002
2,Robert Marion Berry,D,0.67,162388,2004
3,Vernon Humphrey,R,0.33,81556,2004
4,Robert Marion Berry,D,0.69,127577,2006
5,Mickey Stumbaugh,R,0.31,56611,2006
6,Robert Marion Berry,D,1.0,124304,2008
7,Rick Crawford,R,0.52,93224,2010
8,Chad Causey,D,0.43,78267,2010
9,Rick Crawford,R,0.56,138800,2012


yop


Unnamed: 0,candidate_elected_party,district,incumbent,incumbent_count_victories,incumbent_first_elected,incumbent_party,incumbent_running_re_election,state,year
0,D,Arkansas 1,,,,,,Arkansas,2002
1,D,Arkansas 1,Robert Marion Berry,1.0,2002.0,D,1.0,Arkansas,2004
2,D,Arkansas 1,Robert Marion Berry,2.0,2002.0,D,1.0,Arkansas,2006
3,D,Arkansas 1,Robert Marion Berry,3.0,2002.0,D,1.0,Arkansas,2008
4,R,Arkansas 1,Robert Marion Berry,4.0,2002.0,D,0.0,Arkansas,2010
5,R,Arkansas 1,Rick Crawford,1.0,2010.0,D,1.0,Arkansas,2012
6,R,Arkansas 1,Rick Crawford,2.0,2010.0,D,1.0,Arkansas,2014
7,D,Arkansas 1,Rick Crawford,3.0,2010.0,D,1.0,Arkansas,2016


## Get the House and Senate election result pages for all the available years on ballotpedia.org

In [310]:
def check_or_save_page(filename, url):
    # Check if the page has been stored on disk
    if Path(filename).is_file() is False:
        #print('No page')
        # Get the page
        result = requests.get(url)
        with open(filename,'w') as outfile:
            outfile.write(result.text)
        time.sleep(2)
    #else:
        #print('We got it')
        
    with open(filename) as my_file:
        soup = BeautifulSoup(my_file.read(), "html.parser")
        
    return soup

def get_house_senate_state_list():
    house_state_list = []
    senate_state_list = []
    filename = 'data/ballotpedia/house_state_list_src.html'
    url = 'https://ballotpedia.org/U.S._House_battlegrounds,_2018'
    
    # Check if the page has been stored on disk
    soup = check_or_save_page(filename, url)
    
    # Find the list of the U.S. Senate Elections by State (2018) pages
    table = soup.find('table', { 'class': 'infobox' })
    for link in table.find_all('a', href=lambda x: x and '/United_States_Senate_election_in_' in x):
        senate_state_list.append({
            'state': link.text,
            'url': 'https://ballotpedia.org{}'.format(link['href'])
        })

    # Find the list of the U.S. House Elections by State (2018) pages
    table = soup.find('table', { 'class': 'infobox' })
    for link in table.find_all('a', href=lambda x: x and (
        '/United_States_House_of_Representatives_election_in_' in x or
        '/United_States_House_of_Representatives_elections_in_' in x
    )):
        house_state_list.append({
            'state': link.text,
            'url': 'https://ballotpedia.org{}'.format(link['href'])
        })

    return house_state_list, senate_state_list

def get_district_pages(dict_page_url, year, state, district):
    """ 
        Recursively get all available previous election result pages
        for a given district
    """
    print('Will get house/{}/{}/{}.html'.format(state, district, year))
    # Create the directories if necessary
    if not os.path.exists('data/ballotpedia/house/'):
        os.makedirs('data/ballotpedia/house/')
    if not os.path.exists('data/ballotpedia/house/{}'.format(state)):
        os.makedirs('data/ballotpedia/house/{}'.format(state))
    if not os.path.exists('data/ballotpedia/house/{}/{}'.format(state, district)):
        os.makedirs('data/ballotpedia/house/{}/{}'.format(state, district))
    
    filename = 'data/ballotpedia/house/{}/{}/{}.html'.format(state, district, year)
    dict_soup = check_or_save_page(filename, dict_page_url)
    
    # Check if there is a link to a previous electoral year for this state
    table = dict_soup.find('table', { 'class': 'infobox' })
    div = table.find('div', style=lambda x: x and '#A3B1BF' in x and 'float:left;' in x)
            
    # If there is one
    if div is not None:
        # Extract the link election year
        prev_year = int(re.match(r'.*([1-2][0-9]{3})', div.text).group(1))
        
        if prev_year < year:                
            # Get the link to this disctict House election results parge
            link = div.find('a')
            #print(link['href'])

            # Get this page
            url = 'https://ballotpedia.org{}'.format(link['href'])
            get_district_pages(url, prev_year, state, district)

def get_house_senate_state_districts_list(house_state_list):
    start_year = 2018
    state_district_list = []
    for house_state in house_state_list:
        # To remove
        #if house_state['state'] != 'Maryland':
        #    continue

        filename = 'data/ballotpedia/2018_house_{}.html'.format(house_state['state'])
        
        # Check if the page has been stored on disk
        soup = check_or_save_page(filename, house_state['url'])
        #print(soup)
        
        # Get the district page links
        table = soup.find('table', { 'class': 'infobox' })
        
        links = table.find_all('a', href=lambda x: x and (
            '_Congressional_District_election,_' in x
        ))
        
        if len(links) == 0:
            title = soup.find('b', text=lambda x : x and 'District Pages' in x)
            links = title.parent.parent.find_all('a', href=lambda x: x and (
                '_Congressional_District_election,_' in x
            ))

        for link in links:
            print(link.text)
            url = 'https://ballotpedia.org{}'.format(link['href'])
            state_district_list.append({
                'state': house_state['state'],
                'district': link.text
            })
            #print(' |-', url)

            # Get the page
            get_district_pages(url, start_year, house_state['state'], link.text)
            
    return state_district_list

house_state_list, senate_state_list = get_house_senate_state_list()
state_district_list = get_house_senate_state_districts_list(house_state_list)

District 1
Will get house/Alabama/District 1/2018.html
Will get house/Alabama/District 1/2016.html
Will get house/Alabama/District 1/2014.html
Will get house/Alabama/District 1/2012.html
District 2
Will get house/Alabama/District 2/2018.html
Will get house/Alabama/District 2/2016.html
Will get house/Alabama/District 2/2014.html
Will get house/Alabama/District 2/2012.html
District 3
Will get house/Alabama/District 3/2018.html
Will get house/Alabama/District 3/2016.html
Will get house/Alabama/District 3/2014.html
Will get house/Alabama/District 3/2012.html
District 4
Will get house/Alabama/District 4/2018.html
Will get house/Alabama/District 4/2016.html
Will get house/Alabama/District 4/2014.html
Will get house/Alabama/District 4/2012.html
District 5
Will get house/Alabama/District 5/2018.html
Will get house/Alabama/District 5/2016.html
Will get house/Alabama/District 5/2014.html
Will get house/Alabama/District 5/2012.html
District 6
Will get house/Alabama/District 6/2018.html
Will get h

## Extract the House election results for every districts and years

In [389]:
def extract_district_data(state_district_list):
    results = []
    undesirable_chars = ['\*', '%', 'Incumbent', '\(D\)', '\(R\)']
    for item in state_district_list:
        # To remove
        #if item['state'] != 'New Hampshire' or item['district'] != 'District 2':
        if item['state'] != 'Wyoming':
            continue

        # Get the pages
        directory = 'data/ballotpedia/house/{}/{}'.format(item['state'], item['district'])
        files = [f for f in listdir(directory) if isfile(join(directory, f))]
        
        # For each year, get the district data
        for file in files:
            # Extract the year
            year = int(re.match(r'.*([1-2][0-9]{3})', file).group(1))
            candidate_rows = []
            
            # To remove
            #if year != 2018:
            #    continue
            
            # Get the page content
            filename = 'data/ballotpedia/house/{}/{}/{}'.format(item['state'], item['district'], file)
            with open(filename) as my_file:
                soup = BeautifulSoup(my_file.read(), "html.parser")
            
            # The 2018 pages requires a different approach
            if year == 2018:
                #print(2018)
                # Find the result table
                table = soup.find('table',  { 'class': 'results_table' })
                rows = table.find_all('tr')
                
                for row in rows:
                    cols = row.find_all('td')
                    cols = [ele.text.strip() for ele in cols]
                    cols = [ele for ele in cols if ele] # Get rid of empty values
                    
                    # Check if is incumbant
                    incumbent = 1 if row.find('b') and row.find('b').find('u') else 0
                    #print(cols)
                    if len(cols) == 4 and cols[0] == '✔':
                        is_winner = 1
                        name = cols[1] +' Incumbent' if incumbent == 1 else cols[1]
                        percent = cols[2] if len(cols) > 1 else np.NaN
                        votes = cols[3] if len(cols) > 2 else np.NaN
                        party = 'Democratic' if '(D)' in cols[1] else 'Republican'
                        candidate_rows.append([party, name, percent, votes, is_winner])
                        
                    elif len(cols) == 3 and '(D)' in cols[0] or '(R)' in cols[0]:
                        is_winner = 0
                        name = cols[0] +' Incumbent' if incumbent == 1 else cols[0]
                        percent = cols[1] if len(cols) > 1 else np.NaN
                        
                        votes = cols[2] if len(cols) > 2 else np.NaN
                        if len(cols) > 1:
                            party = 'Democratic' if '(D)' in cols[0] else 'Republican'
                        else:
                            party = np.NaN
                            
                        candidate_rows.append([party, name, percent, votes, is_winner])
                
            else:            
                # Find the result table
                th = soup.find('th', colspan='5', style=lambda x: x and 'background-color:#444' in x)
                table = th.find_parent('table')
                #table_body = table.find('tbody')
                rows = table.find_all('tr')
                #print(rows)

                for row in rows:
                    cols = row.find_all('td')
                    cols = [ele.text.strip() for ele in cols]
                    cols = [ele for ele in cols if ele] # Get rid of empty values

                    # Ignore the rows not about the candidates
                    if 'Republican' not in cols and not 'Democratic' in cols:
                        continue

                    # Check if the candidate won the elections
                    is_winner = 1 if row.find('a', title="Won") else 0
                    cols.append(is_winner)
                    candidate_rows.append(cols)
                    
            # If there was only one candidate
            if len(candidate_rows) == 1:
                if type(candidate_rows[0][3]) is int:
                    candidate_rows[0].append(candidate_rows[0][3])
                    candidate_rows[0][3] = np.NaN

            for candidate in candidate_rows:
                #print(year, item['district'], candidate)
                
                # Get and format the candidate party
                candidate_party = 'R' if candidate[0] == 'Republican' else 'D'
                
                # Get and clean the candidate name
                candidate_name = re.sub("|".join(undesirable_chars), "", candidate[1]).rstrip()
                
                # Get and clean the candidate percent
                if type(candidate[2]) is str:
                    candidate_percent = float(candidate[2].replace('%', ''))
                else:
                    candidate_percent = candidate[2]
                
                # Get and clean the candidate vote
                if type(candidate[3]) is str:
                    candidate_vote = int(candidate[3].replace(',', ''))
                else:
                    candidate_vote = candidate[3]
                
                # Determine whether or not the candidate is incumbent
                candidate_is_incumbent = 1 if 'Incumbent' in candidate[1] else 0
                
                results.append({
                    'year': year,
                    'state': item['state'],
                    'district': item['district'] if item['district'] != 'General election' else 'At-Large',
                    'name': candidate_name,
                    'party': candidate_party,
                    'percent': candidate_percent,
                    'votes': candidate_vote,
                    'is_incumbent': candidate_is_incumbent,
                    'won': candidate[4]
                })
                
                #print(results)
                #print('')

        #soup = BeautifulSoup(my_file.read(), "html.parser")
        #print(soup)
    return results

district_election_results = extract_district_data(state_district_list)
district_election_results_df = pd.DataFrame(district_election_results)
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(district_election_results_df)

Unnamed: 0,district,is_incumbent,name,party,percent,state,votes,won,year
0,At-Large,1,Cynthia Lummis,R,69.0,Wyoming,166452,1,2012
1,At-Large,0,Chris Henrichsen,D,23.9,Wyoming,57573,0,2012
2,At-Large,1,Cynthia Lummis,R,68.5,Wyoming,113038,1,2014
3,At-Large,0,Richard Grayson,D,22.9,Wyoming,37803,0,2014
4,At-Large,0,Liz Cheney,R,62.0,Wyoming,156176,1,2016
5,At-Large,0,Ryan Greene,D,30.0,Wyoming,75466,0,2016
6,At-Large,1,Liz Cheney,R,63.6,Wyoming,127830,1,2018
7,At-Large,0,Greg Hunter,D,29.8,Wyoming,59876,0,2018


In [286]:
type('sqd')

str

# Assemble the final dataset

In [None]:


dataset = pd.DataFrame(
    [
        [
            'Train',
            len(data_train.loc[data_train['Diagnosis'] == 1]),
            len(data_train.loc[data_train['Diagnosis'] == 2]),
            len(data_train.loc[data_train['Diagnosis'] == 3]),
            len(data_train.loc[data_train['Diagnosis'] == 2]) / len(data_train.loc[data_train['Diagnosis'] == 3])
        ],
        [
            'Test', 
            len(data_test.loc[data_test['Diagnosis'] == 1]),
            len(data_test.loc[data_test['Diagnosis'] == 2]),
            len(data_test.loc[data_test['Diagnosis'] == 3]),
            len(data_test.loc[data_test['Diagnosis'] == 2]) / len(data_test.loc[data_test['Diagnosis'] == 3])
        ]
    ], 
    columns=list(['year'])
)

In [None]:
display(dataset)