In [1]:
import argparse
import glob
import os
import re
import sys

import pandas as pd
import xlrd

sys.path.append('../')

## Parse in the XLS file
Reference: [Using Python to Parse Spreadsheet Data (SitePoint)](https://www.sitepoint.com/using-python-parse-spreadsheet-data/)

The goal is to parse the election-results workbooks into a single table whose layout matches, as closely as we can, that of `csv-candidates-2016.xls`, which has the following fields:
1. COUNTY_ID
2. COUNTY_NAME
3. CONTEST_ID
4. CONTEST_NAME
5. CANDIDATE_ID
6. CANDIDATE_NAME
7. INCUMBENT_FLAG
8. WRITE_IN_FLAG
9. PARTY_ID
10. PARTY_NAME
11. VOTE_TOTAL

In [2]:
def determine_incumbency_status(candidate_name):
    """
    Given a Candidate Name determine whether the candidate was the incumbent.

    Keyword Args:
        candidate_name: The Candidate's Name (a string). In the CA SOS Data, the candidate is
                        marked as an incumbent by putting a "*" in their name.
    Return:
        'Y' if the name contains a "*" (i.e. the candidate is the incumbent), else 'N'.
        Note that the flag is a one-character string, not a boolean.
    """
    # A plain substring test gives the same answer as the former regular-expression search
    # and avoids its invalid backslash escape inside a non-raw string literal.
    return 'Y' if '*' in candidate_name else 'N'

In [3]:
def process_contest(rows, election_name, contest_name):
    """
    Process Contest Results From CA SOS Statewide Election Results

    Keyword Args:
      rows: List of Rows from an XLRD Worksheet Object; each row is a sequence of cells that
            expose a ``.value`` attribute.
            The first row should be the candidate names (read from columns 1 and 2).
            The second row should be the candidates' parties (read from columns 1 and 2).
            Every later row is treated as data: a label (e.g. the county name) in column 0
            and the two candidates' votes in columns 1 and 2.
            The last row should be the 'Totals' (Votes) Row; it is kept in the output like
            any other labelled row.
      election_name: The name of the broader general election (e.g. 2016 General)
      contest_name: The name of the specific contest within the election (e.g. Governor)
    Return:
      contest_results: Pandas DataFrame with the contest results unique at the:
                         1. contest_name
                         2. county_name
                         3. candidate_name
                       level, with the columns election_name, county_name, contest_name,
                       candidate_name, incumbent_flag, party_name, vote_total (in that order).
                       All rows of the first candidate come before those of the second.
    Raises:
      IndexError: if there are fewer than two rows, or a row read here has fewer than
                  three cells.

    Note, this current implementation will only select the first two candidates.
    """
    # Map each votes column to its candidate and party. Keying by column (rather than by
    # candidate name) keeps the two candidates apart even if their names are blank or equal.
    candidate_row, party_row = rows[0], rows[1]
    column_to_candidate = {
        'votes1': candidate_row[1].value,
        'votes2': candidate_row[2].value,
    }
    column_to_party = {
        'votes1': party_row[1].value,
        'votes2': party_row[2].value,
    }

    # Parse Values. The two header rows are skipped explicitly so that they can never be
    # mistaken for data, even when their first cell is not blank.
    counties = []
    votes1 = []
    votes2 = []
    for row in rows[2:]:
        label = row[0].value
        if not isinstance(label, str):
            # Non-text cells (e.g. numbers) would make re.search raise a TypeError.
            label = str(label)
        # Skip spacer rows (blank label) and the percentage rows.
        if not label.strip() or re.search('percent', label, re.IGNORECASE):
            continue
        counties.append(label)
        votes1.append(row[1].value)
        votes2.append(row[2].value)

    wide = pd.DataFrame({
        'county_name': counties,
        'votes1': votes1,
        'votes2': votes2,
    })

    # Pivot the DataFrame to get at the county/candidate level
    long = pd.melt(wide, id_vars=['county_name'], var_name='vote_column', value_name='vote_total')
    candidate_names = long['vote_column'].map(column_to_candidate)

    # Assemble the output with its columns in their final order
    output_columns = [
        'election_name',
        'county_name',
        'contest_name',
        'candidate_name',
        'incumbent_flag',
        'party_name',
        'vote_total',
    ]
    contest_results = pd.DataFrame({
        'election_name': election_name,
        'county_name': long['county_name'],
        'contest_name': contest_name,
        'candidate_name': candidate_names,
        'incumbent_flag': candidate_names.apply(determine_incumbency_status),
        'party_name': long['vote_column'].map(column_to_party),
        'vote_total': long['vote_total'],
    }, columns=output_columns)
    return contest_results

In [4]:
def get_district_metadata(rows, candidate_row_offset=2):
    """
    Gets the Metadata associated with District Level Election Results from CA SOS Data

    Keyword Args:
      rows: List of Rows from an XLRD Worksheet Object; each row is a sequence of cells that
            expose a ``.value`` attribute.
            This should be a list of all rows from an Excel file containing Election Results
      candidate_row_offset: Number of rows between a district's name row and the first row
            of its results block. Defaults to 2, the layout this notebook has always assumed.
    Returns:
      district_metadata: Python Dictionary with the contest_name as keys and as values another dictionary with keys:
                           row_indices=(starting_index, end_index),
                           contest_name,
                         end_index is exclusive (one past the 'Totals' row), so
                         rows[starting_index:end_index] is the contest's block.
                         If two districts share a name, the later one replaces the earlier.
    """
    district_metadata = {}
    # (row index, name) of the most recent district header that has no Totals row yet
    open_header = None

    # Contest Name is inside the spreadsheet
    for idx, row in enumerate(rows):
        label = row[0].value
        if not isinstance(label, str):
            # Non-text cells (e.g. numbers) would make re.search raise a TypeError.
            label = str(label)

        if re.search('totals', label, re.IGNORECASE):
            # Pair the Totals row with the header that precedes it. A Totals row that
            # has no open header (e.g. a stray or repeated one) is ignored instead of
            # shifting the pairing of every following district.
            if open_header is not None:
                header_idx, contest_name = open_header
                district_metadata[contest_name] = {
                    'row_indices': (header_idx + candidate_row_offset, idx + 1),
                    'contest_name': contest_name,
                }
                open_header = None
        elif re.search('district', label, re.IGNORECASE):
            open_header = (idx, label)

    return district_metadata

In [5]:
# Workbooks that are split into one contest per district
district_elections = [
    '40-board-of-equalization.xls',
    '43-congress.xls',
    '58-state-senator.xls',
    '64-state-assemblymember.xls',
]

# Workbooks that hold a single statewide contest each
statewide_elections = [
    '19-governor.xls',
    '22-lieutenant-governor.xls',
    '25-secretary-of-state (1).xls',
    '28-controller.xls',
    '31-treasurer.xls',
    '34-attorney-general.xls',
    '37-insurance-commissioner.xls',
    '85-superintendent-of-public-instruction.xls',
]

In [6]:
# Hard Code Path to File for now
directory = '/mnt/c/Users/vla/git/datasci-congressional-data/src/casos/'
year = '2014'

# os.listdir returns entries in arbitrary order, so sort the .xls workbooks to keep the
# processing order (and therefore the row order of the combined results) reproducible.
dir_files = sorted(f for f in os.listdir(os.path.join(directory, year)) if f.endswith('.xls'))

In [7]:
# Show the .xls workbooks that were found for the selected year
dir_files

['19-governor.xls',
 '22-lieutenant-governor.xls',
 '25-secretary-of-state (1).xls',
 '28-controller.xls',
 '31-treasurer.xls',
 '34-attorney-general.xls',
 '37-insurance-commissioner.xls',
 '40-board-of-equalization.xls',
 '43-congress.xls',
 '58-state-senator.xls',
 '64-state-assemblymember.xls',
 '85-superintendent-of-public-instruction.xls']

In [8]:
# Hard Code Election Name for now as the Year + 'General'
election_name = '{} General'.format(year)

# Create Results List where each element will be a DataFrame of Contest Results
results = []
for filename in dir_files:
    file = os.path.join(directory, year, filename)
    workbook = xlrd.open_workbook(file)
    worksheet = workbook.sheet_by_index(0)  # By Default usually there's only one sheet. This is Hacky. How to be robust?
    rows = [worksheet.row(r) for r in range(worksheet.nrows)]

    if filename in statewide_elections:
        # Extract Contest Name from Filename
        # Reg Exp: https://stackoverflow.com/questions/8199398/extracting-only-characters-from-a-string-in-python
        # Join every alphabetic word of the file stem, e.g. '22-lieutenant-governor.xls'
        # -> 'lieutenant governor'. Keeping only the first word would collapse that name
        # to 'lieutenant'. The extension is stripped first so 'xls' is not taken as a word.
        stem = os.path.splitext(filename)[0]
        contest_name = ' '.join(re.findall(r"(?i)\b[a-z]+\b", stem))
        contest_rows = rows
        contest_results = process_contest(rows=contest_rows, election_name=election_name, contest_name=contest_name)
        results.append(contest_results)
    elif filename in district_elections:
        # For District Elections: one contest per district block inside the sheet
        district_metadata = get_district_metadata(rows)
        for district in district_metadata.values():
            contest_name = district['contest_name']
            starting_idx, ending_idx = district['row_indices']
            contest_rows = rows[starting_idx:ending_idx]
            contest_results = process_contest(rows=contest_rows, election_name=election_name, contest_name=contest_name)
            results.append(contest_results)
    else:
        # Make it visible when a workbook is in neither list instead of dropping it silently
        print('Skipping {}: not listed in statewide_elections or district_elections'.format(filename))

In [10]:
# Stack the per-contest DataFrames into one table, renumbering the index from 0
df = pd.concat(results, ignore_index=True)

In [11]:
# Preview the first rows of the combined results
df.head()

Unnamed: 0,election_name,county_name,contest_name,candidate_name,incumbent_flag,party_name,vote_total
0,2014 General,Alameda,governor,"Edmund G. ""Jerry"" Brown*",Y,DEM,293081
1,2014 General,Alpine,governor,"Edmund G. ""Jerry"" Brown*",Y,DEM,284
2,2014 General,Amador,governor,"Edmund G. ""Jerry"" Brown*",Y,DEM,5682
3,2014 General,Butte,governor,"Edmund G. ""Jerry"" Brown*",Y,DEM,29520
4,2014 General,Calaveras,governor,"Edmund G. ""Jerry"" Brown*",Y,DEM,6870


In [12]:
# (number of rows, number of columns) of the combined results
df.shape

(2050, 7)

In [17]:
# Derive the output file from `directory` and `year` so that switching the year no longer
# overwrites the 2014 output; for year = '2014' this is the same path as before.
# NOTE(review): writing the legacy .xls format depends on an Excel writer engine that newer
# pandas releases may not provide - confirm the environment (or move to .xlsx).
output_path = os.path.join(directory, 'csv-candidates-{}.xls'.format(year))
df.to_excel(output_path, index=False)