In [1]:
import requests
from IPython.core.display import HTML
styles = requests.get("https://raw.githubusercontent.com/Harvard-IACS/2018-CS109A/master/content/styles/cs109.css").text
HTML(styles)

from bs4 import BeautifulSoup
import re
import pandas as pd
import time
import json
from pathlib import Path
import numpy as np
import os
from os import listdir
from os.path import isfile, join

Lets define some usefull functions we will use later:

In [2]:
def check_or_save_page(filename, url):
    """
        Check if the file exist, if not get the page
        from the url and store in on the disk
        Returns the file content as a soup
    """
    # Check if the page has been stored on disk
    if Path(filename).is_file() is False:
        #print('No page')
        # Get the page
        result = requests.get(url)
        with open(filename,'w') as outfile:
            outfile.write(result.text)
        time.sleep(2)
    #else:
        #print('We got it')
        
    with open(filename) as my_file:
        soup = BeautifulSoup(my_file.read(), "html.parser")
        
    return soup

# Data Collection - Web Scraping - Data Parsing 

In [3]:
# Declare global variables
states = ['Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California', 'Colorado', 'Connecticut', 'Delaware', 'Florida', 'Georgia', 'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky', 'Louisiana', 'Maine', 'Maryland', 'Massachusetts', 'Michigan', 'Minnesota', 'Mississippi', 'Missouri', 'Montana', 'Nebraska', 'Nevada', 'New Hampshire', 'New Jersey', 'New Mexico', 'New York', 'North Carolina', 'North Dakota', 'Ohio', 'Oklahoma', 'Oregon', 'Pennsylvania', 'Rhode Island', 'South Carolina', 'South Dakota', 'Tennessee', 'Texas', 'Utah', 'Vermont', 'Virginia', 'Washington', 'West Virginia', 'Wisconsin', 'Wyoming', 'District of Columbia']

# National unemployement rate by month from 1948 to 2018
# Source: https://data.bls.gov/pdq/SurveyOutputServlet
national_unemployement_rate = pd.read_csv('data/national_unemployement_1948_2018.csv')

# Get the presidental job approval
# Source: https://www.gallup.com
# https://news.gallup.com/interactives/185273/presidential-job-approval-center.aspx
with open('data/all_presidential_job_approval_gallup.json') as f:
    presidential_approval = json.load(f)
presidential_approval = presidential_approval['AllPresidents']['HistoricalPresident']
presidential_approval_df = pd.DataFrame.from_dict([x['PresidentData'] for x in presidential_approval])
display(presidential_approval_df.head())

Unnamed: 0,DatesinOffice,DaysInOffice,EndDate,FirstTermAverage,JobApprovalHigh,JobApprovalLow,OverallAverage,Party,PresidentName,SecondTermAverage,StartDate
0,2017-Present,658,,-,45.0,35.0,39.5,Rep.,Donald J. Trump,-,2017-01-20
1,2009-2017,2922,2017-01-20,48,67.0,40.0,48.0,Dem.,Barack Obama,47,2009-01-20
2,2001-2009,2922,2009-01-20,62.2,90.0,25.0,49.4,Rep.,George W. Bush,36.5,2001-01-20
3,1993-2001,2922,2001-01-20,49.6,73.0,37.0,55.1,Dem.,Bill Clinton,60.6,1993-01-20
4,1989-1993,1461,1993-01-20,60.9,89.0,29.0,60.9,Rep.,George H. W. Bush,-,1989-01-20


In [4]:
display(national_unemployement_rate.head())

Unnamed: 0,Year,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec
0,1948,3.4,3.8,4.0,3.9,3.5,3.6,3.6,3.9,3.8,3.7,3.8,4.0
1,1949,4.3,4.7,5.0,5.3,6.1,6.2,6.7,6.8,6.6,7.9,6.4,6.6
2,1950,6.5,6.4,6.3,5.8,5.5,5.4,5.0,4.5,4.4,4.2,4.2,4.3
3,1951,3.7,3.4,3.4,3.1,3.0,3.2,3.1,3.1,3.3,3.5,3.5,3.1
4,1952,3.2,3.1,2.9,2.9,3.0,3.0,3.2,3.4,3.1,3.0,2.8,2.7


In [5]:
# Get once the necessary pages
presidential_page = requests.get('https://en.wikipedia.org/wiki/United_States_presidential_election')

In [6]:
# List of the US presidents
president_elected_history = pd.read_csv('data/president_elected_history.csv', sep=';')
display(president_elected_history.head())

Unnamed: 0,year,president_elected,president_elected_party,can_be_re_elected
0,1824,John Quincy Adams,DR,1
1,1828,Andrew Jackson,D,1
2,1832,Andrew Jackson,D,0
3,1836,Martin Van Buren,D,1
4,1840,William Henry Harrison,W,1


In [7]:
"""
Get the national level factors
Source: https://en.wikipedia.org/wiki/United_States_presidential_election
"""

# From a tag, extract the number of seats
def extract_seats(tag):
    if tag.findAll('b'):
        d_seats = tag.b.extract().string
    elif tag.sup and tag.sup.decompose():
        d_seats = tag.sup.decompose()
    elif tag.string is None:
        d_seats = tag.text
    else:
        d_seats = tag.string
    return int(d_seats)

def extract_seats_change(tag):
    if tag.sup:
        d_seats_change = tag.text.split('[', 1)[0]
    else:
        d_seats_change = tag.text
    return int(d_seats_change.replace('–', '-'))

# Get the house election years
def extract_house_elections_history():
    house_elections_history = []

    # If the file doesn't exist, get the data from the webpage and store the content to a new file
    filename = 'data/list_of_house_elections_page.html'
    if Path(filename).is_file():
        with open(filename) as my_file:
            list_of_house_elections_page = my_file.read()
    else:
        print('no file')
        list_of_house_elections_page = requests.get('https://en.wikipedia.org/wiki/List_of_United_States_House_of_Representatives_elections,_1856%E2%80%93present')
        with open(filename,'w') as outfile:
            outfile.write(list_of_house_elections_page.text)

    soup = BeautifulSoup(list_of_house_elections_page, "html.parser")

    # Find the election years
    data = []
    elections_pages = []
    for t in soup.find_all('a', title=lambda x: x and 'United States House of Representatives elections,' in x):
        if len(t.string) == 4:
            elections_pages.append({
                'year': int(t.string),
                'url':'https://en.wikipedia.org'+t.attrs['href']
            })
            year = int(t.string)+2

            cols = t.parent.parent.find_all('td')

            # Get the number of Democrat seats
            d_seats = extract_seats(cols[1])

            # Get the change in the number of Democrat seats
            d_seats_change = extract_seats_change(cols[2])

            # Get the number of Republican seats
            r_seats = extract_seats(cols[3])
            
            # Get the change in the number of Republican seats
            r_seats_change_by_year = extract_seats_change(cols[4])
            
            #print(1 if year in presidential_years else 0)

            idx = (np.abs(president_elected_history['year'].values-year+1)).argmin()
            president_can_be_re_elected = president_elected_history['can_be_re_elected'].loc[[idx]].values[0]
            president_party = president_elected_history['president_elected_party'].loc[[idx]].values[0]

            # Look for president overall job approval average
            president_name = president_elected_history['president_elected'].loc[[idx]].values[0]
            president_overall_avg_job_approval = presidential_approval_df.loc[presidential_approval_df['PresidentName'] == president_name]['OverallAverage']
            president_overall_avg_job_approval = float(president_overall_avg_job_approval.values[0])/100 if president_overall_avg_job_approval.values.size else None
            
            # Get the national unemployement rate for November
            oct_unemployement_rate = national_unemployement_rate.loc[national_unemployement_rate['Year'] == year]['Oct']
            
            oct_unemployement_rate = oct_unemployement_rate.values[0] if oct_unemployement_rate.values.size else None
            
            data.append({
                'year': year,
                'is_presidential_year': 1 if year in president_elected_history['year'].unique() else 0,
                'president_party': president_party,
                'president_can_be_re_elected': president_can_be_re_elected,
                'president_overall_avg_job_approval': president_overall_avg_job_approval,
                'oct_unemployement_rate': oct_unemployement_rate,
                'last_democrat_seats': d_seats,
                'last_republican seats': r_seats,
                'last_house_majority': 'R' if d_seats < r_seats else 'D'
            })

    return data, elections_pages

data, house_elections_pages = extract_house_elections_history()
data_df = pd.DataFrame(data)
national_level_factors = data_df[[
    'year', 
    'is_presidential_year', 
    'president_party', 
    'president_can_be_re_elected', 
    'president_overall_avg_job_approval', 
    'oct_unemployement_rate',
    'last_democrat_seats', 
    'last_republican seats', 
    'last_house_majority']]
display(national_level_factors.sort_values('year', ascending=False).head())

UnicodeDecodeError: 'charmap' codec can't decode byte 0x90 in position 89968: character maps to <undefined>

In [None]:
"""
Get the state level factors
"""

# Historical presidential election results by state
# Source: https://en.wikipedia.org/wiki/List_of_United_States_presidential_election_results_by_state
election_results_df = pd.read_csv('data/presidential_election_results_by_state.csv')
election_results_df.head()

## wikipedia.org
### Get the House and Senate election result pages for all the available years

In [8]:
"""
Get the district level factor
"""
def get_district_list():
    district_list = []
    url = 'https://en.wikipedia.org/wiki/United_States_House_of_Representatives_elections,_2016'
    filename = 'data/wikipedia/all_state_districts_list_page.html'
    
    # Check if the page has been stored on disk
    soup = check_or_save_page(filename, url)
    
    # Find the districts page links
    districts = soup.find_all('a', href=re.compile(r'(.*\/wiki\/.* )|(.*_congressional_district)'))
    
    for district in districts:
        if any(substring in district.string for substring in states) \
        and district.string not in district_list \
        and "'s" not in district.string \
        and "12th" not in district.string \
        and '1st' not in district.string:
            district_state = ''
            # Get the corresponding state
            for state in states:
                if state in district.string:
                    district_state = state
            
            # Format the district name
            if 'at-large' in district.string:
                dist_name = 'At-Large'
            else:
                # Find the district number
                dist_number = [int(s) for s in district.string.split() if s.isdigit()]
                if len(dist_number) > 0:
                    dist_number = dist_number[0]
                    dist_name = 'District {}'.format(dist_number)
                else:
                    continue
                
            #print(district_state, dist_name)
            
            district_list.append({
                'name': dist_name,
                'page_url': 'https://en.wikipedia.org{}'.format(district['href']),
                'state': district_state
            })
            
    # Remove duplicate in the list
    district_list = [dict(t) for t in {tuple(d.items()) for d in district_list}]
    
    return district_list

def get_wiki_district_pages(districts):
    # Get the district pages if they have not been stored on disk yet
    for district in districts:
        filename = 'data/district_pages/{}.html'.format(district['name'])

        # Check if the page has been stored on disk
        check_or_save_page(filename, district['page_url'])
        
def parse_district_house_results(filename, district, state):
    undesirable_chars = ['\*', '%', '\(incumbent\)', '\(inc.\)', '\(write-in\)']
    district_house_results = []
    with open(filename) as my_file:
        soup = BeautifulSoup(my_file.read(), "html.parser")
        
        # Find the election results tables
        caption = soup.find_all('caption')
        elems = []
        for capt in caption:
            x = capt.get_text()
            if ('United States House of Representatives elections,' in x or
                'congressional district election' in x or
                'US House election, ' in x or
                'Congressional District House Election'
            ):
                elems.append(capt)

        for capt in elems:
            # Find the date
            match = re.match(r'.*([1-2][0-9]{3})', capt.text)
            if match is None:
                continue

            # Then it found a match!
            year = int(match.group(1))
            #print(year)

            # Get the result table itself
            table = capt.find_parent('table')
            table_body = table.find('tbody')
            rows = table_body.find_all('tr')

            for row in rows:
                cols = row.find_all('td')
                cols = [ele.text.strip() for ele in cols]
                cols = [ele for ele in cols if ele] # Get rid of empty values

                if len(cols) and cols[0] in ['Republican', 'Democratic']:
                    print(cols)
                    
                    percent = np.NaN
                    if len(cols) > 3 and cols[3] != 'N/A':
                        percent = float(re.sub("|".join(undesirable_chars), "", cols[3]))/100
                    
                    votes = np.NaN
                    if len(cols) > 2 and cols[2] == 'N/A':
                        votes = np.NaN
                    elif len(cols) > 2 and '%' not in cols[2] and cols[2] != '100.00':
                        votes = int(cols[2].replace(',', '').replace('.', ''))
                    elif len(cols) > 2 and ('%' in cols[2] or cols[2] == '100.00'):
                        percent = float(re.sub("|".join(undesirable_chars), "", cols[2]))/100
                    
                    district_house_results.append({
                        'year': year,
                        'candidate_party': 'R' if cols[0] == "Republican" else 'D',
                        'candidate_name': re.sub("|".join(undesirable_chars), "", cols[1]),
                        'votes': votes,
                        'percent': percent
                    })
                    
    return pd.DataFrame(district_house_results)
    
def get_district_level_factors(district):
    state = districts_df.loc[districts_df['name'] == district]['state'].values[0]
    dist_level_factors = []
    # Get the page of the district
    html_filename = 'data/district_pages/{}.html'.format(district)
    json_filename = 'data/district_pages/{}.json'.format(district)
    
    # If there is no already formated data, get them from the corresponding Wikipedia page
    if Path(json_filename).is_file() is False:
        district_house_results = parse_district_house_results(html_filename, district, state)
        display(district_house_results)
    else:
        print('get from json')
        district_house_results = pd.read_json(json_filename)
        display(district_house_results)
        
    # Now, for each year
    for year in district_house_results['year'].unique():
        # If there is more than 1 candidate this year
        # Get the current year
        if len(district_house_results.loc[district_house_results['year'] == year]) > 1:
            curr_year_idx = district_house_results.loc[district_house_results['year'] == year]['votes'].idxmax()
            curr_year = district_house_results.loc[[curr_year_idx]]
        else:
            curr_year = district_house_results        
        
        # Get previous year
        prev_year = district_house_results.loc[district_house_results['year'] == year-2]

        if prev_year.empty is False:
            # If there is more than 1 candidate the previous year
            if len(prev_year) > 1:
                prev_year_winner_idx = prev_year['votes'].idxmax()
                prev_year_winner = prev_year.loc[[prev_year_winner_idx]]
            else:
                prev_year_winner = prev_year

            # Get the incumbent name
            incumbent = prev_year_winner['candidate_name'].values[0]

            # Get the elections previously won by the incumbent
            incumbent_history = district_house_results.loc[
                (district_house_results['candidate_name'] == incumbent) &
                (district_house_results['year'] < year)
            ]

            incubent_first_elected_idx = incumbent_history['year'].idxmin()
            incubent_first_elected = incumbent_history.loc[[incubent_first_elected_idx]]['year'].values[0]
            incubent_is_candidate = curr_year.loc[curr_year['candidate_name'] == incumbent].empty

            dist_data = {
                'year': year,
                'state': state,
                'district': district,
                'incumbent': incumbent,
                'incumbent_party': 'R' if prev_year_winner['candidate_party'].values[0] == "Republican" else 'D',
                'incumbent_count_victories': len(incumbent_history),
                'incumbent_first_elected': incubent_first_elected,
                'incumbent_running_re_election': 0 if incubent_is_candidate else 1,
                'candidate_elected_party': curr_year['candidate_party'].values[0]
            }

            dist_level_factors.append(dist_data)
        else:
            print('yop')
            dist_data = {
                'year': year,
                'state': state,
                'district': district,
                'incumbent': np.NaN,
                'incumbent_party': np.NaN,
                'incumbent_count_victories': np.NaN,
                'incumbent_first_elected': np.NaN,
                'incumbent_running_re_election': np.NaN,
                'candidate_elected_party': curr_year['candidate_party'].values[0]
            }
            dist_level_factors.append(dist_data)

    return dist_level_factors

#for district in ['Alabama 1', 'Alabama 2']:
#for district in ['Arkansas 1']:
#    district_level_factors = get_district_level_factors(district)
#    display(pd.DataFrame(district_level_factors).sort_values('year', ascending=True))
#    #display(district_level_factors)

def get_wiki_districts_house_results(districts_list):
    candidate_results = []
    wiki_undesirable_chars = [
        '\*', '%', '\(Incumbent\)', '\(incumbent\)', '\(inc.\)', '\(write-in\)', 
        '\(as a write-in\)'
    ]
    for district in districts_list:
        # To remove
        #if district['state'] != 'Texas' or district['name'] != 'District 17':
        #if district['state'] != 'Wyoming':
        #    continue
        
        print('Will get results for house/{}/{}.html'.format(district['state'], district['name']))
        print('Source: {}'.format(district['page_url']))
        
        # In some cases, the wikipedia page is too messy to crawl
        # So I manually gather the informations into a json file
        # If this file exist, it will be prefered
        json_filename = 'data/wikipedia/house/{}/{}.json'.format(district['state'], district['name'])
        if Path(json_filename).is_file() is True:
            print('Data are store in a formated JSON')
            continue
        
        # Create the directories if necessary
        if not os.path.exists('data/wikipedia/house'):
            os.makedirs('data/wikipedia/house')
        if not os.path.exists('data/wikipedia/house/{}'.format(district['state'])):
            os.makedirs('data/wikipedia/house/{}'.format(district['state']))
            
        filename = 'data/wikipedia/house/{}/{}.html'.format(district['state'], district['name'])
        
        # Check if the page has been stored on disk
        soup = check_or_save_page(filename, district['page_url'])
        
        # Find the results tables
        caption = soup.find_all('caption')
        tables = []
        for capt in caption:
            x = capt.get_text()
            if ('United States House of Representatives elections,' in x or
                'congressional district election' in x or
                'US House election, ' in x or
                'Congressional District House Election'
            ):
                # print(capt)
                table = capt.find_parent('table')
                tables.append(table)
        
        # For each result table, extract the results
        for table in tables:
            # Get the year
            table_title = table.find('caption')
            
            # If this is a table about a special election, skip it
            if 'Special' in table_title.text:
                continue
            
            year_match = re.match(r'.*([1-2][0-9]{3})', table_title.text)
            
            # If there is no year match, then this table isn't of interest
            if year_match is None:
                continue
            
            year = int(year_match.group(1))
            # print(year)
            
            # Get the result table itself
            rows = table.find('tbody').find_all('tr')
            candidate_rows = []
            for row in rows:
                cols = row.find_all('td')
                cols = [ele.text.strip() for ele in cols]
                # If all the values of the cols are empty strings, continue
                if all(v is '' for v in cols):
                    continue
                    
                # print(cols)
                
                # If this row contains a candidate results
                if len(cols) > 2 and cols[1] in ['Republican', 'Democratic']:
                    # print(cols)
                    
                    party = 'R' if cols[1] == 'Republican' else 'D'
                    name = cols[2]
                    votes = int(cols[3].replace(',', '').replace('[8]', '').replace('c', '').replace('.', '').replace(' ', '')) if cols[3] != '' else np.NaN
                    percent = float(cols[4].replace('%', '')) if cols[4] != '' else np.NaN
                    
                    candidate_rows.append({
                        'year': year,
                        'state': district['state'],
                        'district': district['name'],
                        'is_incumbent': np.NaN,
                        'name': name,
                        'party': party,
                        'percent': percent,
                        'votes': votes,
                        'won': 0
                    })
                    
            # If we found no candidate data, continue
            if len(candidate_rows) == 0:
                continue
            
            # Enrich the candidates data
            max_percent = max([x['percent'] for x in candidate_rows])
            for candidate in candidate_rows:
                # Check if the candidate won the elections
                if candidate['percent'] == max_percent:
                    candidate['won'] = 1
                    
                # Check if we can determine if the candidate is an incumbent
                if '(inc.)' in candidate['name'] or '(incumbent)' in candidate['name'] or '(Incumbent)' in candidate['name']:
                    candidate['is_incumbent'] = 1

                # Clean the candidate name
                candidate['name'] = re.sub("|".join(wiki_undesirable_chars), "", candidate['name'])
                    
                candidate_results.append(candidate)
                
            # If we found that one of the candidates is an incumbent, the others are sets to 0
            max_incumbent = max([x['is_incumbent'] for x in candidate_rows])
            #print(type(max_incumbent))
            if max_incumbent == 1:
                for candidate in candidate_rows:
                    candidate['is_incumbent'] = 0 if candidate['is_incumbent'] != 1 else 1

    return candidate_results

districts_list = get_district_list()
# districts_df = pd.DataFrame(districts_list)
# display(districts_df.loc[districts_df['state'] == 'Wyoming'])

wiki_house_history = get_wiki_districts_house_results(districts_list)
# Store in disk
wiki_house_history_df = pd.DataFrame(wiki_house_history)
wiki_house_history_df.to_csv('data/wikipedia/house_results.csv', encoding='utf-8')

UnicodeDecodeError: 'charmap' codec can't decode byte 0x90 in position 1458927: character maps to <undefined>

**We now have on disk ALL the available historical district results from Wikipedia**

Lets take a look:

In [9]:
test_df = pd.read_csv('data/wikipedia/house_results.csv', index_col=0)
display(test_df)

Unnamed: 0,district,is_incumbent,name,party,percent,state,votes,won,year
0,District 19,,Randy Neugebauer,R,85.00,Texas,160136.0,1,2012
1,District 19,,Randy Neugebauer,R,78.00,Texas,106059.0,1,2010
2,District 19,,Andy Wilson,D,19.00,Texas,25984.0,0,2010
3,District 19,,Randy Neugebauer,R,72.00,Texas,168501.0,1,2008
4,District 19,,Dwight Fullingim,D,25.00,Texas,58030.0,0,2008
5,District 19,,Randy Neugebauer,R,68.00,Texas,92811.0,1,2006
6,District 19,,Robert Ricketts,D,30.00,Texas,40853.0,0,2006
7,District 19,,Randy Neugebauer,R,58.40,Texas,136459.0,1,2004
8,District 19,,Charles Stenholm,D,40.00,Texas,93531.0,0,2004
9,District 42,1.0,Clair Burgener,R,67.40,California,155965.0,1,1972


## ballotpedia.org

So far so good but the 2018 results are missing on Wikipedia and the available data are not always exhaustives. So I decided to get the same informations from a different source: Ballotpedia.  
Here we have the complete 2018 results as well as historical date from 2012.  
Note that the incumbent information is consistent.  

### Get the House and Senate election result pages for all the available years

In [10]:
def get_house_senate_state_list():
    house_state_list = []
    senate_state_list = []
    filename = 'data/ballotpedia/house_state_list_src.html'
    url = 'https://ballotpedia.org/U.S._House_battlegrounds,_2018'
    
    # Check if the page has been stored on disk
    soup = check_or_save_page(filename, url)
    
    # Find the list of the U.S. Senate Elections by State (2018) pages
    table = soup.find('table', { 'class': 'infobox' })
    for link in table.find_all('a', href=lambda x: x and '/United_States_Senate_election_in_' in x):
        senate_state_list.append({
            'state': link.text,
            'url': 'https://ballotpedia.org{}'.format(link['href'])
        })

    # Find the list of the U.S. House Elections by State (2018) pages
    table = soup.find('table', { 'class': 'infobox' })
    for link in table.find_all('a', href=lambda x: x and (
        '/United_States_House_of_Representatives_election_in_' in x or
        '/United_States_House_of_Representatives_elections_in_' in x
    )):
        house_state_list.append({
            'state': link.text,
            'url': 'https://ballotpedia.org{}'.format(link['href'])
        })

    return house_state_list, senate_state_list

def get_district_pages(dict_page_url, year, state, district):
    """ 
        Recursively get all available previous election result pages
        for a given district
    """
    print('Will get house/{}/{}/{}.html'.format(state, district, year))
    # Create the directories if necessary
    if not os.path.exists('data/ballotpedia/house/'):
        os.makedirs('data/ballotpedia/house/')
    if not os.path.exists('data/ballotpedia/house/{}'.format(state)):
        os.makedirs('data/ballotpedia/house/{}'.format(state))
    if not os.path.exists('data/ballotpedia/house/{}/{}'.format(state, district)):
        os.makedirs('data/ballotpedia/house/{}/{}'.format(state, district))
    
    filename = 'data/ballotpedia/house/{}/{}/{}.html'.format(state, district, year)
    dict_soup = check_or_save_page(filename, dict_page_url)
    
    # Check if there is a link to a previous electoral year for this state
    table = dict_soup.find('table', { 'class': 'infobox' })
    div = table.find('div', style=lambda x: x and '#A3B1BF' in x and 'float:left;' in x)
            
    # If there is one
    if div is not None:
        # Extract the link election year
        prev_year = int(re.match(r'.*([1-2][0-9]{3})', div.text).group(1))
        
        if prev_year < year:                
            # Get the link to this disctict House election results parge
            link = div.find('a')
            #print(link['href'])

            # Get this page
            url = 'https://ballotpedia.org{}'.format(link['href'])
            get_district_pages(url, prev_year, state, district)

def get_house_senate_state_districts_list(house_state_list):
    start_year = 2018
    state_district_list = []
    for house_state in house_state_list:
        # To remove
        #if house_state['state'] != 'Maryland':
        #    continue

        filename = 'data/ballotpedia/2018_house_{}.html'.format(house_state['state'])
        
        # Check if the page has been stored on disk
        soup = check_or_save_page(filename, house_state['url'])
        #print(soup)
        
        # Get the district page links
        table = soup.find('table', { 'class': 'infobox' })
        
        links = table.find_all('a', href=lambda x: x and (
            '_Congressional_District_election,_' in x
        ))
        
        if len(links) == 0:
            title = soup.find('b', text=lambda x : x and 'District Pages' in x)
            links = title.parent.parent.find_all('a', href=lambda x: x and (
                '_Congressional_District_election,_' in x
            ))

        for link in links:
            print(link.text)
            url = 'https://ballotpedia.org{}'.format(link['href'])
            state_district_list.append({
                'state': house_state['state'],
                'district': link.text
            })
            #print(' |-', url)

            # Get the page
            get_district_pages(url, start_year, house_state['state'], link.text)
            
    return state_district_list

house_state_list, senate_state_list = get_house_senate_state_list()
state_district_list = get_house_senate_state_districts_list(house_state_list)

UnicodeDecodeError: 'charmap' codec can't decode byte 0x90 in position 246930: character maps to <undefined>

### Extract the House election results for every districts and years

In [11]:
def extract_district_data(state_district_list):
    results = []
    undesirable_chars = ['\*', '%', 'Incumbent', '\(D\)', '\(R\)']
    for item in state_district_list:
        # To remove
        #if item['state'] != 'New Hampshire' or item['district'] != 'District 2':
        #if item['state'] != 'Wyoming':
        #    continue

        # Get the pages
        directory = 'data/ballotpedia/house/{}/{}'.format(item['state'], item['district'])
        files = [f for f in listdir(directory) if isfile(join(directory, f))]
        
        # For each year, get the district data
        for file in files:
            # Extract the year
            year = int(re.match(r'.*([1-2][0-9]{3})', file).group(1))
            candidate_rows = []
            
            # To remove
            #if year != 2018:
            #    continue
            
            # Get the page content
            filename = 'data/ballotpedia/house/{}/{}/{}'.format(item['state'], item['district'], file)
            with open(filename) as my_file:
                soup = BeautifulSoup(my_file.read(), "html.parser")
            
            # The 2018 pages requires a different approach
            if year == 2018:
                #print(2018)
                # Find the result table
                table = soup.find('table',  { 'class': 'results_table' })
                rows = table.find_all('tr')
                
                for row in rows:
                    cols = row.find_all('td')
                    cols = [ele.text.strip() for ele in cols]
                    cols = [ele for ele in cols if ele] # Get rid of empty values
                    
                    # Check if is incumbant
                    incumbent = 1 if row.find('b') and row.find('b').find('u') else 0
                    #print(cols)
                    if len(cols) == 4 and cols[0] == '✔':
                        is_winner = 1
                        name = cols[1] +' Incumbent' if incumbent == 1 else cols[1]
                        percent = cols[2] if len(cols) > 1 else np.NaN
                        votes = cols[3] if len(cols) > 2 else np.NaN
                        party = 'Democratic' if '(D)' in cols[1] else 'Republican'
                        candidate_rows.append([party, name, percent, votes, is_winner])
                        
                    elif len(cols) == 3 and '(D)' in cols[0] or '(R)' in cols[0]:
                        is_winner = 0
                        name = cols[0] +' Incumbent' if incumbent == 1 else cols[0]
                        percent = cols[1] if len(cols) > 1 else np.NaN
                        
                        votes = cols[2] if len(cols) > 2 else np.NaN
                        if len(cols) > 1:
                            party = 'Democratic' if '(D)' in cols[0] else 'Republican'
                        else:
                            party = np.NaN
                            
                        candidate_rows.append([party, name, percent, votes, is_winner])
                
            else:            
                # Find the result table
                th = soup.find('th', colspan='5', style=lambda x: x and 'background-color:#444' in x)
                table = th.find_parent('table')
                #table_body = table.find('tbody')
                rows = table.find_all('tr')
                #print(rows)

                for row in rows:
                    cols = row.find_all('td')
                    cols = [ele.text.strip() for ele in cols]
                    cols = [ele for ele in cols if ele] # Get rid of empty values

                    # Ignore the rows not about the candidates
                    if 'Republican' not in cols and not 'Democratic' in cols:
                        continue

                    # Check if the candidate won the elections
                    is_winner = 1 if row.find('a', title="Won") else 0
                    cols.append(is_winner)
                    candidate_rows.append(cols)
                    
            # If there was only one candidate
            if len(candidate_rows) == 1:
                if type(candidate_rows[0][3]) is int:
                    candidate_rows[0].append(candidate_rows[0][3])
                    candidate_rows[0][3] = np.NaN

            for candidate in candidate_rows:
                #print(year, item['district'], candidate)
                
                # Get and format the candidate party
                candidate_party = 'R' if candidate[0] == 'Republican' else 'D'
                
                # Get and clean the candidate name
                candidate_name = re.sub("|".join(undesirable_chars), "", candidate[1]).rstrip()
                
                # Get and clean the candidate percent
                if type(candidate[2]) is str:
                    candidate_percent = float(candidate[2].replace('%', ''))
                else:
                    candidate_percent = candidate[2]
                
                # Get and clean the candidate vote
                if type(candidate[3]) is str:
                    candidate_vote = int(candidate[3].replace(',', ''))
                else:
                    candidate_vote = candidate[3]
                
                # Determine whether or not the candidate is incumbent
                candidate_is_incumbent = 1 if 'Incumbent' in candidate[1] else 0
                
                results.append({
                    'year': year,
                    'state': item['state'],
                    'district': item['district'] if item['district'] != 'General election' else 'At-Large',
                    'name': candidate_name,
                    'party': candidate_party,
                    'percent': candidate_percent,
                    'votes': candidate_vote,
                    'is_incumbent': candidate_is_incumbent,
                    'won': candidate[4]
                })
                
                #print(results)
                #print('')

        #soup = BeautifulSoup(my_file.read(), "html.parser")
        #print(soup)
    return results

ballo_house_history = extract_district_data(state_district_list)

# Store on disk
ballo_house_history_df = pd.DataFrame(ballo_house_history)
ballo_house_history_df.to_csv('data/ballotpedia/ballo_results.csv', encoding='utf-8')

NameError: name 'state_district_list' is not defined

## Merge the data from wikipedia.org and  ballotpedia.org

Now we have two dataset with the same columns and some overlaping data. Its time to merge them.  
It appears that the data from ballotpedia.org are more consistent so we will favor them.

In [24]:
ballo_df = pd.read_csv('data/ballotpedia/ballo_results.csv', index_col=0)
wikipedia_df = pd.read_csv('data/wikipedia/house_results.csv', index_col=0)

merged_df =  pd.concat([wikipedia_df.loc[wikipedia_df['year'] < 2012], ballo_df])

unemp_df = pd.read_csv('data/unemployment/unemp_2012_2017.csv',sep=';')
unemp_df = unemp_df[['year','state','district','unemp_rate_16']]
unemp_df['state'] = unemp_df['state'].str.rsplit(',').str[-1].str.strip()
unemp_df['district'] = unemp_df.district.str.extract('(\d+)').astype(int)
unemp_df = unemp_df[unemp_df['district'] < 100]
unemp_df['district'] = 'District ' + unemp_df['district'].astype(str)

merged_df = merged_df.join(unemp_df.set_index(['year', 'state', 'district']), on=['year', 'state', 'district']).copy()
merged_df.head()

Unnamed: 0,district,is_incumbent,name,party,percent,state,votes,won,year,unemp_rate_16
1,District 19,,Randy Neugebauer,R,78.0,Texas,106059.0,1,2010,
2,District 19,,Andy Wilson,D,19.0,Texas,25984.0,0,2010,
3,District 19,,Randy Neugebauer,R,72.0,Texas,168501.0,1,2008,
4,District 19,,Dwight Fullingim,D,25.0,Texas,58030.0,0,2008,
5,District 19,,Randy Neugebauer,R,68.0,Texas,92811.0,1,2006,


## Derive from the data

# Assemble the final dataset

In [None]:
merged_df