In [1]:
import psycopg2
import numpy as np
import pandas as pd

In [2]:
# OpenElections California general-election result files, newest first.
# 2010 onwards is one file per election; 2002-2008 has separate Assembly and Senate files.
urls = [
    'https://raw.githubusercontent.com/openelections/openelections-data-ca/master/' + relative_path
    for relative_path in (
        '2016/20161108__ca__general.csv',
        '2014/20141104__ca__general.csv',
        '2012/20121106__ca__general.csv',
        '2010/20101102__ca__general.csv',
        '2008/20081104__ca__general__state_assembly.csv',
        '2008/20081104__ca__general__state_senate.csv',
        '2006/20061107__ca__general__state_assembly.csv',
        '2006/20061107__ca__general__state_senate.csv',
        '2004/20041102__ca__general__state_assembly.csv',
        '2004/20041102__ca__general__state_senate.csv',
        '2002/20021105__ca__general__state_assembly.csv',
        '2002/20021105__ca__general__state_senate.csv',
    )
]

In [4]:
# Download each results file and tag its rows with the election year,
# which is the first four characters of the file name.
dfs = []
for url in urls:
    df = pd.read_csv(url).assign(year=int(url.rsplit('/', 1)[-1][:4]))
    dfs.append(df)

In [5]:
# Stack every results file, keep only the state legislative races, and collapse
# rows that share the same candidate/race keys into one row with summed votes.
votes = pd.concat(dfs)
votes = votes[votes['office'].isin(['State Assembly', 'State Senate'])]
# Aggregate only the `votes` column: a bare .sum() also tries to aggregate every
# other column, and string columns are concatenated or rejected on recent pandas.
# NOTE(review): groupby drops rows whose key columns are NaN by default, so rows
# with a missing `party` are excluded here -- confirm that is intended.
votes = votes.groupby(
    ['candidate', 'office', 'district', 'party', 'year'], as_index=False
)['votes'].sum()

In [6]:
# Manual name corrections: rewrite each spelling on the left to the form on the right
# (presumably so the same person matches across election years -- TODO confirm).
votes['candidate'] = votes['candidate'].replace({
    'Sally Lieber': 'Sally J. Lieber',
    'Nicole Parra': 'Nicole M. Parra',
    'Anthony Portantino': 'Anthony J. Portantino',
    'Isadore Hall, III': 'Isadore Hall',
    'Cameron Smyth': 'Cameron M. Smyth',
    'Alan Lowenthal': 'Alan S. Lowenthal',
    'Reggie Jones-Sawyer': 'Reginald Byron Jones-Sawyer',
    'Eric Linder': 'Eric F. Linder',
})

In [7]:
# Total votes cast in each race, keyed by (office, district, year).
# Only `votes` is aggregated: summing the whole frame would also "sum" the
# candidate/party strings on recent pandas, and merging that result back into
# `votes` would produce candidate_x/candidate_y columns.
totals = (
    votes.groupby(['office', 'district', 'year'], as_index=False)['votes']
    .sum()
    .rename(columns={'votes': 'total_votes'})
)

In [8]:
# Attach each race's total to its candidate rows, then express votes as a percentage of it.
votes = pd.merge(votes, totals, on=['office', 'district', 'year'])
votes['vote_share'] = votes['votes'].div(votes['total_votes']).mul(100)

In [9]:
def get_office(string):
    """Map an office name to its one-letter chamber code.

    Args:
        string: Office name; matched case-insensitively by substring.

    Returns:
        'S' if the name contains "senate", otherwise 'A' if it contains
        "assembly". "senate" is checked first, so a name containing both
        yields 'S'.

    Raises:
        ValueError: If the name contains neither word.
    """
    office = string.upper()  # normalise once rather than once per comparison
    if 'SENATE' in office:
        return 'S'
    if 'ASSEMBLY' in office:
        return 'A'
    # ValueError is still an Exception, so existing broad handlers keep working,
    # but the message now names the value that could not be classified.
    raise ValueError('Unrecognized office: {!r}'.format(string))

In [10]:
# Replace the full office names with the one-letter codes returned by get_office.
votes['office'] = votes['office'].map(get_office)

In [11]:
# Highest vote count in each race, keyed by (office, district, year).
# The column is selected before aggregating so .max() is not also evaluated on
# the string columns (candidate, party) only for the result to be thrown away.
winners = (
    votes.groupby(['office', 'district', 'year'], as_index=False)['votes']
    .max()
    .rename(columns={'votes': 'winning_votes'})
)

In [12]:
# Give every candidate row the winning vote count of its race.
votes = pd.merge(votes, winners, how='inner', on=['office', 'district', 'year'])

In [13]:
def won(row):
    """Return 1 when the row's vote count equals its race's winning count, else 0."""
    return int(row['votes'] == row['winning_votes'])

In [14]:
# 1 for every row whose vote count equals its race's maximum, else 0 (tied rows are
# all flagged). A vectorised comparison gives the same values as applying `won` row
# by row, without a Python-level loop, and also works on an empty frame.
votes['winner'] = (votes['votes'] == votes['winning_votes']).astype(int)

In [15]:
def is_incumbent(row):
    """Return 1 if this candidate won the same seat one term earlier, else 0.

    Looks in the module-level `votes` frame for a winning row with the same
    candidate, office and district, dated one term before `row['year']`:
    two years back when the office is 'A', four years back otherwise.
    """
    term_length = 2 if row['office'] == 'A' else 4
    held_seat = (
        (votes['candidate'] == row['candidate'])
        & (votes['office'] == row['office'])
        & (votes['district'] == row['district'])
        & (votes['winner'] == 1)
        & (votes['year'] == row['year'] - term_length)
    )
    return int(held_seat.any())

In [16]:
# Row-wise incumbency flag. is_incumbent filters the module-level `votes` frame
# (including its `winner` column) once per row, so this cell must run after
# `winner` has been assigned.
votes['incumbent'] = votes.apply(is_incumbent, axis=1)

In [17]:
# Incumbency is derived from the previous term's result, so the earliest races in the
# data (2002 Assembly; 2002 and 2004 Senate) have nothing to look back to and must be
# filled in by hand.
missing_incumbency = votes.loc[
    (votes['office'].eq('S') & votes['year'].le(2004))
    | (votes['office'].eq('A') & votes['year'].eq(2002))
]
len(missing_incumbency)

295

In [18]:
# Write the rows that need hand-entered incumbency, ordered by year, office and district.
(
    missing_incumbency
    .sort_values(by=['year', 'office', 'district'])
    .to_csv('../data/missing_incumbency_votes.csv', index=False)
)

In [19]:
# Load the hand-corrected rows; every column is read as text.
# NOTE(review): this reads manual_incumbency_votes.csv, not the missing_incumbency_votes.csv
# written earlier -- presumably the edited copy of that export; confirm.
manual_votes = pd.read_csv(
    '../data/manual_incumbency_votes.csv',
    dtype='str',
)
manual_votes.head()

Unnamed: 0,candidate,office,district,party,year,votes,total_votes,vote_share,winning_votes,winner,incumbent
0,Doug Thron,A,1,GRN,2002,15315,132039,11.5988458,64065,0,0
1,Patty Berg,A,1,DEM,2002,64065,132039,48.51975553,64065,1,0
2,Rob Brown,A,1,REP,2002,52659,132039,39.88139868,64065,0,0
3,Doug Kinyon,A,2,DEM,2002,34524,117881,29.28716248,79361,0,0
4,Doug La Malfa,A,2,REP,2002,79361,117881,67.32297826,79361,1,0


In [20]:
# Rows whose incumbency flag was computed: Senate races after 2004 and Assembly races after 2002.
has_incumbency = votes.loc[
    (votes['office'].eq('S') & votes['year'].gt(2004))
    | (votes['office'].eq('A') & votes['year'].gt(2002))
]

In [21]:
# Combine the computed rows with the hand-corrected ones.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
# manual_votes was read with dtype='str', so its values are strings while the
# has_incumbency rows keep their numeric dtypes.
final_votes = pd.concat([has_incumbency, manual_votes])
# The manual file must supply exactly the rows that were split off for hand editing.
assert len(final_votes) == len(votes), (
    'computed + manual rows ({}) do not match the full vote table ({})'.format(
        len(final_votes), len(votes)
    )
)

In [22]:
# Make district an integer everywhere (the hand-corrected rows were read as text), then save.
final_votes['district'] = final_votes['district'].map(int)
final_votes.to_csv('../data/votes.csv', index=False)