In [1]:
# Import Libraries 
import pandas as pd
import numpy as np
from fuzzy_match import match
import warnings
warnings.simplefilter('ignore')

# Settings
pd.set_option('max_columns', None)

### Clean Endorsements Data

In [2]:
# Read in Primary Candidates 2018 data
dem = pd.read_csv('dem_candidates.csv', error_bad_lines=False, encoding='latin-1')
rep = pd.read_csv('rep_candidates.csv', error_bad_lines=False, encoding='latin-1')

# Remove candidates not running in national elections 
rep = rep[rep['Office Type'] != 'Governor']
dem = dem[dem['Office Type'] != 'Governor']

# Set column for party affiliations 
dem['Party'] = 'D'
rep['Party'] = 'R'

# Fill values for partisan lean for Republican candidates
partisan_lean_by_district = dict(zip(dem['District'], dem['Partisan Lean']))
rep['Partisan Lean'] = rep['District'].map(partisan_lean_by_district)
prop_filled_v1 = len(rep['Partisan Lean'].dropna())/len(rep['Partisan Lean'])

# Impute missing partisan lean for Republican candidates based on Senate, then average of House districts within state
senate_leans = dem[dem['District'].str.contains('Senate')][['State', 'Partisan Lean']].drop_duplicates()
partisan_lean_by_state = dict(zip(senate_leans['State'], senate_leans['Partisan Lean']))
house_leans = dem[['Candidate','State','District','Partisan Lean']][
    dem['District'].str.contains('House')].groupby('State').mean().reset_index().dropna()
for state, lean in zip(house_leans['State'], house_leans['Partisan Lean']):
    if state not in partisan_lean_by_state.keys():
        partisan_lean_by_state[state] = lean
for index, row in rep.iterrows():
    state = rep.loc[index, 'State']
    if (pd.isna(rep.loc[index, 'Partisan Lean'])) and (state in partisan_lean_by_state):
        rep.loc[index, 'Partisan Lean'] = partisan_lean_by_state[state]
prop_filled_v2 = len(rep['Partisan Lean'].dropna())/len(rep['Partisan Lean'])

# Identify the states with Republican candidates who still have missing partisan leans 
missing_state_rep = rep[rep['Partisan Lean'].isna()]['State'].value_counts().index.values

# Select the Republican candidates who have missing partisan leans 
invalid_rep_cand = rep[rep['State'].isin(missing_state_rep)]

# Select the Republican candidates who do not have missing partisan leans 
valid_rep_cand = rep[~(rep['State'].isin(missing_state_rep))]

# Deal with missing values in endorsement data
final_dem = dem
final_rep = valid_rep_cand
dem_endorse_support_cols = [x for x in final_dem.columns if 'Support' in x or 'Endorsed' in x]
rep_endorse_support_cols = [x for x in final_rep.columns if 'Support' in x or 'Endorsed' in x]
endorse_map = {'Yes': 1, 'No': -1, np.nan: 0}
final_dem.loc[:, dem_endorse_support_cols] = final_dem.loc[:, dem_endorse_support_cols].replace(endorse_map)
final_rep.loc[:, rep_endorse_support_cols] = final_rep.loc[:, rep_endorse_support_cols].replace(endorse_map)
party_support = np.hstack([final_dem['Party Support?'].values, final_rep['Rep Party Support?'].values])
full_candidates = pd.concat([final_dem, final_rep]).drop(columns=['Party Support?', 'Rep Party Support?'])
full_candidates['Party Support?'] = party_support
cand_endorse_support_cols = [x for x in full_candidates.columns if 'Support' in x or 'Endorsed' in x]
full_candidates.loc[:, cand_endorse_support_cols] = full_candidates.loc[:, cand_endorse_support_cols].fillna(-1)
full_candidates = full_candidates.drop(columns=['General Status', 'Race', 'Veteran?',
                                                'LGBTQ?', 'Elected Official?', 'Self-Funder?',
                                               'STEM?', 'Obama Alum?', 'Guns Sense Candidate?',
                                               'Won Primary'])

# Set district numbers 
full_candidates['District_Num'] = [int(x[-1]) if len(x[-1])<=2 else 0 for x in full_candidates['District'].str.split()]

### Clean FEC Campaigner Dataset

In [3]:
# Read in FEC All Candidates data
fin16 = pd.read_csv('weball16.txt', sep="|", header=None)
fin18 = pd.read_csv('weball18.txt',  sep="|", header=None)

# Set column names  
fin_col = ['CAND_ID', 'CAND_NAME', 'CAND_ICI', 'PTY_CD', 'CAND_PTY_AFFILIATION',
           'TTL_RECEIPTS', 'TRANS_FROM_AUTH', 'TTL_DISB', 'TRANS_TO_AUTH', 'COH_BOP', 
           'COH_COP', 'CAND_CONTRIB', 'CAND_LOANS', 'OTHER_LOANS', 'CAND_LOAN_REPAY', 
           'OTHER_LOAN_REPAY', 'DEBTS_OWED_BY', 'TTL_INDIV_CONTRIB', 'CAND_OFFICE_ST', 
           'CAND_OFFICE_DISTRICT', 'SPEC_ELECTION', 'PRIM_ELECTION', 'RUN_ELECTION', 
           'GEN_ELECTION', 'GEN_ELECTION_PRECENT', 'OTHER_POL_CMTE_CONTRIB',
           'POL_PTY_CONTRIB', 'CVG_END_DT', 'INDIV_REFUNDS', 'CMTE_REFUNDS']
fin16.columns = fin_col
fin18.columns = fin_col

# Set year
fin18['YEAR'] = 18
fin16['YEAR'] = 16

# Combine datasets 
fin = pd.concat([fin16, fin18])

# Reformat party names 
fin['CAND_PTY_AFFILIATION'] = fin['CAND_PTY_AFFILIATION'].map({'REP':'R',
                                                               'DEM':'D'})

# Reformat candidate names  
fin['CAND_NAME'] = [" ".join(n.lower().title().split(", ")[::-1]) for n in fin['CAND_NAME']]

# Remove missing district numbers and re-code data 
fin = fin[~fin['CAND_OFFICE_DISTRICT'].isna()]
fin['CAND_OFFICE_DISTRICT']= fin['CAND_OFFICE_DISTRICT'].astype(int)

### Merge Endorsements Data with FEC Campaigner Data

In [17]:
# Fuzzy match names 
fuzzy_name = [match.extract(x, fin['CAND_NAME'], limit=1, score_cutoff=0.55) for x in full_candidates['Candidate'].values]
fuzzy_name_match = [x[0][0] if x!=[] else '' for x in fuzzy_name]
fuzzy_name_match_score = [x[0][1] if x!=[] else '' for x in fuzzy_name]
full_candidates['fuzzy_name_match'] = fuzzy_name_match
full_candidates['fuzzy_name_match_score'] = fuzzy_name_match_score

# Merge datasets based on fuzzy matched names, state, and party and drop duplicates running in special elections
cand_fin = pd.merge(fin[['CAND_ID', 'CAND_NAME', 'CAND_OFFICE_DISTRICT', 
                         'CAND_OFFICE_ST', 'CAND_ICI', 'PTY_CD', 
                         'CAND_PTY_AFFILIATION', 'TTL_RECEIPTS', 
                        'TRANS_FROM_AUTH','CAND_CONTRIB',
                        'TTL_INDIV_CONTRIB','OTHER_POL_CMTE_CONTRIB']], 
                    full_candidates, 
                    left_on=['CAND_NAME','CAND_OFFICE_ST', 'CAND_PTY_AFFILIATION'] , 
                    right_on=['fuzzy_name_match', 'State', 'Party'])
cand_fin = cand_fin[['Candidate', 'CAND_ID', 'District', 'Party', 'State', 'Emily Endorsed?',
                     'Partisan Lean', 'TTL_RECEIPTS', 'TRANS_FROM_AUTH', 'CAND_CONTRIB',
                     'TTL_INDIV_CONTRIB', 'OTHER_POL_CMTE_CONTRIB', 'Primary %',
                     'Biden Endorsed?', 'Warren Endorsed? ', 'Sanders Endorsed?',
                     'Our Revolution Endorsed?', 'Justice Dems Endorsed?', 
                     'PCCC Endorsed?', 'Indivisible Endorsed?', 'WFP Endorsed?',
                     'VoteVets Endorsed?', 'No Labels Support?', 'Party Support?', 
                     'Trump Endorsed?', 'Bannon Endorsed?', 'Great America Endorsed?',
                     'NRA Endorsed?', 'Right to Life Endorsed?', 'Susan B. Anthony Endorsed?',
                     'Club for Growth Endorsed?', 'Koch Support?', 'House Freedom Support?',
                     'Tea Party Endorsed?', 'Main Street Endorsed?', 'Chamber Endorsed?', 'Race Type', 'CAND_ICI']]

# Find duplicate candidates who are listed as both running open-seat elections and challengers, only consider challengers 
dupes = cand_fin.groupby('Candidate').count()[['CAND_ICI']].reset_index()
dupe_names = dupes[dupes['CAND_ICI'] > 1]['Candidate'].values
cand_fin = cand_fin[~(cand_fin['Candidate'].isin(dupe_names)) | (
    cand_fin['Candidate'].isin(dupe_names) & cand_fin['CAND_ICI'] == 'C')]
cand_fin.head()

Unnamed: 0,Candidate,CAND_ID,District,Party,State,Emily Endorsed?,Partisan Lean,TTL_RECEIPTS,TRANS_FROM_AUTH,CAND_CONTRIB,TTL_INDIV_CONTRIB,OTHER_POL_CMTE_CONTRIB,Primary %,Biden Endorsed?,Warren Endorsed?,Sanders Endorsed?,Our Revolution Endorsed?,Justice Dems Endorsed?,PCCC Endorsed?,Indivisible Endorsed?,WFP Endorsed?,VoteVets Endorsed?,No Labels Support?,Party Support?,Trump Endorsed?,Bannon Endorsed?,Great America Endorsed?,NRA Endorsed?,Right to Life Endorsed?,Susan B. Anthony Endorsed?,Club for Growth Endorsed?,Koch Support?,House Freedom Support?,Tea Party Endorsed?,Main Street Endorsed?,Chamber Endorsed?,Race Type,CAND_ICI
6,Rita Ramirez,H8CA41139,U.S. House California District 8,D,CA,0.0,-17.33,35743.12,0.0,0.0,5361.08,0.0,27.67,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,Regular,C
23,Patrick Malloy,H6CA50258,U.S. House California District 50,D,CA,0.0,-19.48,27447.89,0.0,200.0,17219.66,9596.63,16.129999,0.0,0.0,0.0,0.0,-1.0,-1.0,-1.0,0.0,0.0,0,0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,Regular,C
35,Mark Leyva,H8IN01096,U.S. House Indiana District 1,R,IN,-1.0,-29.125333,199.0,0.0,0.0,199.0,0.0,26.99,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Regular,C
71,Freddy Horne,H6NV01224,U.S. House Nevada District 1,R,NV,-1.0,0.95,73862.55,0.0,0.0,40466.28,0.0,44.82,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Regular,C
98,Robert Klepinger,H4OH10167,U.S. House Ohio District 10,D,OH,0.0,-8.479999,8434.5,0.0,4081.0,3252.0,1050.0,25.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,Regular,C


In [19]:
cand_fin.CAND_ICI.value_counts()

C    468
O    286
I      2
Name: CAND_ICI, dtype: int64

### Clean FEC Individual Contributions Data

In [20]:
dupe_names

array(['Alan LaPolice', 'Amanda Howland', 'Amie Hoeber',
       'Annette Teijeiro', 'Art Halvorson', 'Art Robinson', 'Ava Pate',
       "Beto O'Rourke", 'Beverly Goldstein', 'Bob Stump',
       'Bobby Mahendra', 'Brad Ashford', 'Brenda Jones', 'Bryan Caforio',
       "Carol O'Brien", 'Craig Keller', 'Cresent Hardy', 'Dale Mensing',
       'Danny Tarkanian', 'David McDevitt', 'David Trone',
       'Dennis Crawford', 'Douglas Applegate', 'DuWayne Gregory',
       'E.W. Jackson', 'Ed Albertson', 'Evan Jenkins',
       'Francisco Canseco', 'Gary Wegman', 'Geoff Young',
       'Gretchen Driskell', 'Hosea Cleveland', 'Ian Conyers',
       'J.D. Miniear', 'Jackie Patton', 'Jacky Rosen', 'Jacob Turk',
       'James Cargas', 'James Singer', 'James Veltmeyer', 'Jan McDowell',
       'Jay Sidie', 'Jeff Jones', 'Jesse Sbaih', 'Jim Gray', 'Jim Walz',
       'Jo Rae Perkins', 'Jody Ball', 'John Cullum', 'John Horst',
       'John Russell', 'Junius Rodriguez', 'Justin Fareed', 'Kelli Ward',
       'K

### Clean FEC Campaign-Committee Linkages Data

### Merge Individual Contributions with Campaign-Committee Linkages

### Merge Candidate Endorse

In [None]:
full_candidates.columns