In [25]:
import psycopg2
import numpy as np
import pandas as pd
from datetime import date
from fuzzywuzzy import process

In [26]:
expenditures = pd.read_csv('../data/expenditures.csv')
expenditures.head()

Unnamed: 0,candidate_name,candidate_id,year,office,district,TOT,FND,COMM,ADS,INFO,...,CONTRIB,GENERAL,LOGTOT,LOGFND,LOGCOMM,LOGADS,LOGINFO,LOGOVERHEAD,LOGCONTRIB,LOGGENERAL
0,"GAINES, EDWARD T",1265444,2016,S,1,722100.8,53889.23,68719.09,0.0,222241.57,...,32700.0,344849.89,13.48992,10.894686,11.137782,0.0,12.31152,11.288125,10.39513,12.750864
1,"ROWEN, ROBERT J.",1383735,2016,S,1,16827.47,100.0,640.18,5128.0,400.0,...,0.0,6268.18,9.730768,4.60517,6.461749,8.542471,5.991465,8.856966,0.0,8.743241
2,"DODD, BILL",1359048,2016,S,3,4098682.81,181669.96,397477.07,886875.0,206297.6,...,74054.17,1672319.63,15.226176,12.109947,12.892893,13.695459,12.237075,14.230264,11.212552,14.329722
3,"YAMADA, MARIKO M.",1295701,2016,S,3,581241.54,5311.44,147757.78,500.0,258833.45,...,11750.0,412402.67,13.272922,8.577618,11.90333,6.214608,12.46394,11.694464,9.371609,12.929756
4,"GALGIANI, CATHLEEN",1273495,2016,S,5,1914684.12,47866.5,536197.19,577.46,374323.28,...,16110.0,958964.43,14.465063,10.776171,13.192257,6.358639,12.832875,13.077864,9.687195,13.773609


In [27]:
votes = pd.read_csv('../data/votes.csv', dtype='str')
votes['district'] = votes['district'].apply(lambda s: int(s.split('.')[0]))
votes['year'] = votes['year'].apply(lambda s: int(s))
votes.head()

Unnamed: 0,candidate,office,district,party,year,votes,total_votes,vote_share,winning_votes,winner,incumbent
0,Abel Guillen,A,18,DEM,2012,74422,150287,49.51991855582985,75865,0,0
1,Rob Bonta,A,18,DEM,2012,75865,150287,50.48008144417015,75865,1,0
2,Abel Maldonado,S,15,REP,2008,222617,353846,62.91352735370754,222617,1,1
3,Jim Fitzgerald,S,15,IND,2008,131229,353846,37.08647264629246,222617,0,0
4,Abigail Medina,A,40,DEM,2016,74589,151126,49.35550467821553,76537,0,0


## Match expenditures with votes

In [28]:
def guess_name(row):
    try:
        # Rearrange into 'First Last'
        # This probably isn't strictly necessary, but makes me feel better
        name_parts = row['candidate_name'].split(',')
        name = name_parts[1] + ' ' + name_parts[0]
        # Get rows from votes data for this race
        district = int(row['district'])
        d = votes[(votes['office'] == row['office']) & (votes['district'] == row['district']) & (votes['year'] == row['year'])]
        choices = d['candidate']
        guess = process.extractOne(name, choices)
        return guess[0]
    except:
        return ''

In [29]:
expenditures['votes_name'] = expenditures.apply(guess_name, axis=1)

In [30]:
expenditures.head()

Unnamed: 0,candidate_name,candidate_id,year,office,district,TOT,FND,COMM,ADS,INFO,...,GENERAL,LOGTOT,LOGFND,LOGCOMM,LOGADS,LOGINFO,LOGOVERHEAD,LOGCONTRIB,LOGGENERAL,votes_name
0,"GAINES, EDWARD T",1265444,2016,S,1,722100.8,53889.23,68719.09,0.0,222241.57,...,344849.89,13.48992,10.894686,11.137782,0.0,12.31152,11.288125,10.39513,12.750864,Ted Gaines
1,"ROWEN, ROBERT J.",1383735,2016,S,1,16827.47,100.0,640.18,5128.0,400.0,...,6268.18,9.730768,4.60517,6.461749,8.542471,5.991465,8.856966,0.0,8.743241,Rob Rowen
2,"DODD, BILL",1359048,2016,S,3,4098682.81,181669.96,397477.07,886875.0,206297.6,...,1672319.63,15.226176,12.109947,12.892893,13.695459,12.237075,14.230264,11.212552,14.329722,Bill Dodd
3,"YAMADA, MARIKO M.",1295701,2016,S,3,581241.54,5311.44,147757.78,500.0,258833.45,...,412402.67,13.272922,8.577618,11.90333,6.214608,12.46394,11.694464,9.371609,12.929756,Mariko Yamada
4,"GALGIANI, CATHLEEN",1273495,2016,S,5,1914684.12,47866.5,536197.19,577.46,374323.28,...,958964.43,14.465063,10.776171,13.192257,6.358639,12.832875,13.077864,9.687195,13.773609,Cathleen Galgiani


In [31]:
votes = votes.rename(columns={'candidate': 'votes_name'})
cand_exp_votes = expenditures.merge(votes, on=['votes_name','office','district','year'])
cand_exp_votes.head()

Unnamed: 0,candidate_name,candidate_id,year,office,district,TOT,FND,COMM,ADS,INFO,...,LOGCONTRIB,LOGGENERAL,votes_name,party,votes,total_votes,vote_share,winning_votes,winner,incumbent
0,"GAINES, EDWARD T",1265444,2016,S,1,722100.8,53889.23,68719.09,0.0,222241.57,...,10.39513,12.750864,Ted Gaines,REP,287314,448816,64.01598873480447,287314,1,1
1,"ROWEN, ROBERT J.",1383735,2016,S,1,16827.47,100.0,640.18,5128.0,400.0,...,0.0,8.743241,Rob Rowen,DEM,161502,448816,35.984011265195534,287314,0,0
2,"DODD, BILL",1359048,2016,S,3,4098682.81,181669.96,397477.07,886875.0,206297.6,...,11.212552,14.329722,Bill Dodd,DEM,207927,357628,58.14058183363719,207927,1,0
3,"YAMADA, MARIKO M.",1295701,2016,S,3,581241.54,5311.44,147757.78,500.0,258833.45,...,9.371609,12.929756,Mariko Yamada,DEM,149701,357628,41.85941816636281,207927,0,0
4,"GALGIANI, CATHLEEN",1273495,2016,S,5,1914684.12,47866.5,536197.19,577.46,374323.28,...,9.687195,13.773609,Cathleen Galgiani,DEM,174847,308451,56.68550272166406,174847,1,1


## Race totals

In [32]:
cand_exp_votes['race_total'] = cand_exp_votes.groupby(['office','district','year']).transform(sum)['TOT']
cand_exp_votes['opposing_total'] = cand_exp_votes['race_total'] - cand_exp_votes['TOT']

In [33]:
data = cand_exp_votes[(cand_exp_votes['party'] == 'DEM') | (cand_exp_votes['party'] == 'REP')]

In [34]:
# Was the race contested?
data['contested'] = data.duplicated(subset=['year','office','district'], keep=False).astype(int)
# How many contested races?
print('Total     : {}'.format(len(data)))
print('Contested : {}'.format(len(data[data['contested'] == True])))

Total     : 1518
Contested : 1438


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [35]:
data.to_csv('../data/all.csv', index=False)

In [62]:
data['duplicated'] = data.duplicated(['candidate_name','office','district'], keep=False).astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [63]:
duplicates = data[data['duplicated'] == 1]
len(duplicates)

741

In [64]:
def find_opposing_candidate(row):
    name = data[(data['year'] == row['year']) & (data['office'] == row['office']) \
               & (data['district'] == row['district']) & \ 
                (data['candidate_name'] != row['candidate_name'])]['candidate_name']
    return name.iloc[0]

In [65]:
contested = data[data['contested'] == 1]
contested['opposing_candidate'] = contested.apply(find_opposing_candidate, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [66]:
contested.head()

Unnamed: 0,candidate_name,candidate_id,year,office,district,TOT,FND,COMM,ADS,INFO,...,total_votes,vote_share,winning_votes,winner,incumbent,race_total,opposing_total,contested,duplicated,opposing_candidate
0,"GAINES, EDWARD T",1265444,2016,S,1,722100.8,53889.23,68719.09,100.0,222241.57,...,448816,64.01598873480447,287314,1,1,738928.0,16827.5,1,1,"ROWEN, ROBERT J."
1,"ROWEN, ROBERT J.",1383735,2016,S,1,16827.47,100.0,640.18,5128.0,400.0,...,448816,35.984011265195534,287314,0,0,738928.0,722101.0,1,0,"GAINES, EDWARD T"
2,"DODD, BILL",1359048,2016,S,3,4098682.81,181669.96,397477.07,886875.0,206297.6,...,357628,58.14058183363719,207927,1,0,4679920.0,581242.0,1,0,"YAMADA, MARIKO M."
3,"YAMADA, MARIKO M.",1295701,2016,S,3,581241.54,5311.44,147757.78,500.0,258833.45,...,357628,41.85941816636281,207927,0,0,4679920.0,4098680.0,1,0,"DODD, BILL"
4,"GALGIANI, CATHLEEN",1273495,2016,S,5,1914684.12,47866.5,536197.19,577.46,374323.28,...,308451,56.68550272166406,174847,1,1,2140790.0,226105.0,1,1,"NAKANISHI, ALAN S."


In [67]:
contested['repeat_challenger'] = contested.duplicated(['candidate_name','office','opposing_candidate'], keep=False).astype(int)
contested[contested['repeat_challenger'] == 1].sort_values('candidate_name').head(n=100)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,candidate_name,candidate_id,year,office,district,TOT,FND,COMM,ADS,INFO,...,vote_share,winning_votes,winner,incumbent,race_total,opposing_total,contested,duplicated,opposing_candidate,repeat_challenger
1418,"ALDANA JR., MANUEL",1239991,2004,A,46,13782.00,4930.12,2725.00,100.00,100.00,...,14.954109183887649,44570,0,0,971375,957593,1,1,"NUNEZ, FABIAN",1
1656,"ALDANA JR., MANUEL",1239991,2002,A,46,100.00,100.00,100.00,100.00,100.00,...,13.68564545,27227,0,0,920743,920643,1,1,"NUNEZ, FABIAN",1
491,"ALEJO, LUIS",1318708,2012,A,30,430343.13,44271.36,100.00,24010.58,60582.40,...,65.36634922732566,79141,1,0,450128,19785.3,1,1,"BERNOSKY, ROBERT E.",1
699,"ALEJO, LUIS",1318708,2010,A,28,545900.90,19277.81,20357.34,313558.47,5047.50,...,62.7761240795864,56098,1,0,546001,100,1,0,"BERNOSKY, ROBERT E.",1
1278,"ALQUIST, ELAINE",1004827,2004,S,13,1226849.82,148993.18,152939.66,183803.70,232445.60,...,68.53448843,156321,1,0,1.22705e+06,200,1,1,"CONNOLLY, SHANE PATRICK",1
829,"ALQUIST, ELAINE",1004827,2008,S,13,506250.50,107619.23,746.78,4587.00,58800.00,...,70.9355661869397,179855,1,1,506450,200,1,1,"CONNOLLY, SHANE PATRICK",1
584,"ATKINS, TONI G.",1314678,2012,A,78,803030.35,41767.80,50595.06,2610.00,148261.97,...,62.38874104728739,116987,1,0,811188,8157.79,1,1,"DENNEY, RALPH",1
804,"ATKINS, TONI G.",1314678,2010,A,76,643420.60,19137.62,62459.48,2028.00,266759.79,...,57.680296373405994,75357,1,0,667767,24346.1,1,0,"DENNEY, RALPH",1
1640,"AYAO, ELY DE LA CRUZ",1240984,2002,A,39,100.00,100.00,100.00,100.00,100.00,...,23.90287695,36449,0,0,484198,484098,1,1,"MONTANEZ, CINDY",1
1404,"AYAO, ELY DE LA CRUZ",1240984,2004,A,39,100.00,100.00,100.00,100.00,100.00,...,23.214946609460885,56017,0,0,488132,488032,1,1,"MONTANEZ, CINDY",1


In [68]:
contested.to_csv('data/contested.csv', index=False)

## Repeat candidates

In [69]:
identifier_cols = ['candidate_name','office','district','party']
exp_cols = ['CNS','FND','LIT','OFC','PET','PHO','POL','PRT','RAD','SAL','TEL','PRO','WEB']
pct_cols = [exp + '_pct' for exp in exp_cols]
data_cols = ['vote_share','TOT','opposing_total'] + pct_cols

differenced = pd.DataFrame(columns=(identifier_cols + data_cols))

def difference(df):
    """
    Takes a group of observations with the same name and office and
    and adds any *consecutive* repeat elections to a DataFrame.
    """
    
    def compute_difference(pair):
        """
        Compute the difference in expenditures and outcome
        between two consecutive elections.
        """
        
        year1 = pair.iloc[0]['year']
        year2 = pair.iloc[1]['year']
        
        incumbent1 = pair.iloc[0]['incumbent']
        
        if pair.iloc[0]['office'] == 'A' and (year1 - year2 == 2):
            data = pair.iloc[0][data_cols] - pair.iloc[1][data_cols]
            
            data = data.append(pair.iloc[0][identifier_cols])
            data['year1'] = year1
            data['year2'] = year2
            
            data['TOT1']= pair.iloc[0]['TOT']
            data['TOT2']= pair.iloc[1]['TOT']
            
            data['opposing_total1']= pair.iloc[0]['opposing_total']
            data['opposing_total2']= pair.iloc[1]['opposing_total']
            
            data['incumbent1'] = pair.iloc[0]['incumbent']
            data['incumbent2'] = pair.iloc[1]['incumbent']
            
            global differenced
            differenced = differenced.append(data, ignore_index=True)
        
    df = df.sort_values('year', ascending=False)
    
    if len(df) > 1:
        compute_difference(df.iloc[0:2])
    if len(df) > 2:
        compute_difference(df.iloc[1:3])
    if len(df) > 3:
        compute_difference(df.iloc[2:4])
    
    return None

In [70]:
data['vote_share'] = data['vote_share'].astype(float)
#data[data['contested'] == 1].groupby(['candidate_id','office']).apply(difference)
data.groupby(['candidate_id','office']).apply(difference)
differenced.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,candidate_name,office,district,party,vote_share,TOT,opposing_total,CNS_pct,FND_pct,LIT_pct,...,PRO_pct,WEB_pct,TOT1,TOT2,incumbent1,incumbent2,opposing_total1,opposing_total2,year1,year2
0,"CALDERON, CHARLES M.",A,58.0,DEM,-1.4986,162575.37,0.0,,,,...,,,1079675.44,917100.07,1,1,100.0,100.0,2010.0,2008.0
1,"CALDERON, CHARLES M.",A,58.0,DEM,0.954872,80923.58,0.0,,,,...,,,917100.07,836176.49,1,0,100.0,100.0,2008.0,2006.0
2,"LESLIE, R. TIM",A,4.0,REP,0.116988,496683.68,0.0,,,,...,,,504978.53,8294.85,1,1,100.0,100.0,2004.0,2002.0
3,"KARNETTE, BETTY",A,54.0,DEM,7.274579,-1494454.78,-1025161.27,,,,...,,,357810.03,1852264.81,1,0,100.0,1025261.27,2006.0,2004.0
4,"FOSTER, BEA",A,71.0,DEM,3.959265,0.0,259388.64,,,,...,,,100.0,100.0,0,0,612298.34,352909.7,2004.0,2002.0


In [71]:
print(len(differenced))
differenced.to_csv('data/differenced.csv', index=False)

425
