In [60]:
import psycopg2
import numpy as np
import pandas as pd

In [82]:
df = pd.read_csv('../data/all.csv')
# Only Assembly candidates
df = df[df['office'] == 'A']
# In contested races
df = df[df['contested'] == 1]

In [83]:
df.head()

Unnamed: 0,candidate_name,candidate_id,year,office,district,TOT,FND,COMM,ADS,INFO,...,party,votes,total_votes,vote_share,winning_votes,winner,incumbent,race_total,opposing_total,contested
41,"GALLAGHER, JAMES",1357069,2016,A,3,595937.61,70177.14,12917.05,2900.0,107178.32,...,REP,108910,172777,63.03501,108910,1,1,666466.05,70528.44,1
42,"RITCHIE, EDWARD H",1383126,2016,A,3,70528.44,1600.0,14003.45,0.0,14000.0,...,DEM,63867,172777,36.96499,108910,0,0,666466.05,595937.61,1
43,"AGUIAR-CURRY, CECILIA",1379566,2016,A,4,752510.86,57234.59,145978.52,5013.05,80725.0,...,DEM,118772,186942,63.534144,118772,1,0,797275.43,44764.57,1
44,"SCHAUPP, CHARLES E.",1301596,2016,A,4,44764.57,0.0,3503.62,975.0,11000.0,...,REP,68170,186942,36.465856,118772,0,0,797275.43,752510.86,1
45,"BIGELOW, FRANK",1342402,2016,A,5,1399553.97,39422.49,56974.95,104229.64,10042.26,...,REP,121644,188593,64.500803,121644,1,1,1399553.97,0.0,1


In [84]:
exp_cols = ['LOGTOT','LOGFND','LOGCOMM','LOGADS','LOGINFO','LOGOVERHEAD','LOGCONTRIB','LOGGENERAL']

In [88]:
races = pd.DataFrame()

for name, grouped in df.groupby(['district','year']):
    row = {}
    row['district'] = name[0]
    row['year'] = name[1]
    incumbent = grouped[grouped['incumbent'] == 1]
    
    if incumbent.empty:
        dem = grouped[grouped['party'] == 'DEM']
        rep = grouped[grouped['party'] == 'REP']
        
        if len(dem) != 1 or len(rep) != 1:
            continue
        
        dem = dem.iloc[0]
        rep = rep.iloc[0]
        
        row['dem_vote'] = dem['vote_share']
        row['dem_name'] = dem['candidate_name']
        row['rep_name'] = rep['candidate_name']
        row['incumbent'] = 0
        
        for col in exp_cols:
            row['open_' + col] = dem[col] - rep[col]
    else:
        try:
            incumbent = incumbent.iloc[0]
            challenger = grouped[grouped['incumbent'] == 0].iloc[0]
            for col in exp_cols:
                row['incumbent_' + col] = incumbent[col]
                row['challenger_' + col] = challenger[col]
                
            if incumbent['party'] == 'DEM':
                row['dem_vote'] = incumbent['vote_share']
                row['incumbent'] = 1
                row['dem_name'] = incumbent['candidate_name']
                row['rep_name'] = challenger['candidate_name']
            else:
                row['dem_vote'] = challenger['vote_share']
                row['incumbent'] = -1
                row['dem_name'] = challenger['candidate_name']
                row['rep_name'] = incumbent['candidate_name']
        except:
            pass
    races = races.append(row, ignore_index=True)

In [89]:
races.head(n=20)

Unnamed: 0,dem_name,dem_vote,district,incumbent,open_LOGADS,open_LOGCOMM,open_LOGCONTRIB,open_LOGFND,open_LOGGENERAL,open_LOGINFO,...,challenger_LOGOVERHEAD,challenger_LOGTOT,incumbent_LOGADS,incumbent_LOGCOMM,incumbent_LOGCONTRIB,incumbent_LOGFND,incumbent_LOGGENERAL,incumbent_LOGINFO,incumbent_LOGOVERHEAD,incumbent_LOGTOT
0,"BERG, PATTY",48.519756,1.0,0.0,3.774325,1.087902,7.517521,-0.654598,1.387931,1.816492,...,,,,,,,,,,
1,"BERG, PATTY",61.44221,1.0,1.0,,,,,,,...,0.0,0.0,11.422845,10.781044,11.655726,10.880901,12.237147,9.523325,11.441491,12.963296
2,"BERG, PATTY",64.740974,1.0,1.0,,,,,,,...,0.0,0.0,4.60517,7.859757,12.142334,11.133727,11.213601,8.006368,10.617812,12.652464
3,"CHESBRO, WESLEY",70.783007,1.0,0.0,11.437775,10.887677,11.902803,11.435481,12.424489,9.21034,...,,,,,,,,,,
4,"CHESBRO, WESLEY",61.535474,1.0,1.0,,,,,,,...,9.377464,10.953487,12.00351,8.25682,11.622139,11.716772,12.576918,0.0,11.600413,13.158832
5,"SMITH, BRIGHAM SAWYER",29.765487,1.0,-1.0,,,,,,,...,0.0,0.0,9.399868,9.823887,11.70911,10.840674,12.260787,11.772482,11.637373,13.090133
6,"KINYON, DOUGLAS J",29.287162,2.0,0.0,-10.227311,-12.140721,-8.216088,-9.346902,-12.521355,-10.772061,...,,,,,,,,,,
7,"MC IVER, BARBARA G.",35.134665,2.0,-1.0,,,,,,,...,9.748533,11.808039,11.412949,11.117184,10.798082,11.171209,12.701702,11.507384,11.495184,13.285304
8,"SMITH, MEL",29.458406,2.0,-1.0,,,,,,,...,6.913598,10.599506,10.025492,6.446862,11.384111,11.598762,12.190836,11.078863,11.318409,13.606732
9,"SINGH, PAUL R.",34.601099,2.0,0.0,-11.627626,-11.749787,-10.423768,-11.055056,-12.99745,-11.84326,...,,,,,,,,,,


In [91]:
races['incumbent'] = races['incumbent'].astype(int)
races['district'] = races['district'].astype(int)
races['year'] = races['year'].astype(int)

In [92]:
votes = pd.read_csv('../data/votes.csv')
votes = votes[(votes['office'] == 'A') & (votes['party'] == 'DEM')]
votes = votes.groupby(['district','year'])['vote_share'].sum().reset_index()
votes = votes[['district','vote_share']].groupby('district').mean().reset_index()

In [93]:
votes = votes.rename(columns={'vote_share': 'avg_vote'})
votes.head()

Unnamed: 0,district,avg_vote
0,1,56.131151
1,2,52.360746
2,3,38.684087
3,4,46.542772
4,5,37.511392


In [94]:
races = races.merge(votes, on='district')


In [96]:
races = races.fillna(0)

In [97]:
races.head()

Unnamed: 0,dem_name,dem_vote,district,incumbent,open_LOGADS,open_LOGCOMM,open_LOGCONTRIB,open_LOGFND,open_LOGGENERAL,open_LOGINFO,...,challenger_LOGTOT,incumbent_LOGADS,incumbent_LOGCOMM,incumbent_LOGCONTRIB,incumbent_LOGFND,incumbent_LOGGENERAL,incumbent_LOGINFO,incumbent_LOGOVERHEAD,incumbent_LOGTOT,avg_vote
0,"BERG, PATTY",48.519756,1,0,3.774325,1.087902,7.517521,-0.654598,1.387931,1.816492,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,56.131151
1,"BERG, PATTY",61.44221,1,1,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,11.422845,10.781044,11.655726,10.880901,12.237147,9.523325,11.441491,12.963296,56.131151
2,"BERG, PATTY",64.740974,1,1,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,4.60517,7.859757,12.142334,11.133727,11.213601,8.006368,10.617812,12.652464,56.131151
3,"CHESBRO, WESLEY",70.783007,1,0,11.437775,10.887677,11.902803,11.435481,12.424489,9.21034,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,56.131151
4,"CHESBRO, WESLEY",61.535474,1,1,0.0,0.0,0.0,0.0,0.0,0.0,...,10.953487,12.00351,8.25682,11.622139,11.716772,12.576918,0.0,11.600413,13.158832,56.131151


In [98]:
races.to_csv('../data/races.csv', index=False)