In [1]:
from congress import Congress
import pandas as pd
import numpy as np 
import os
# Project working directory. Set the CONGRESS_PROJECT_DIR environment variable to run the
# notebook on another machine; the original author's location is kept as the fallback.
os.chdir(os.environ.get('CONGRESS_PROJECT_DIR',
                        'C:\\Users\\Sanata\\Dropbox\\01A_Data Science Project'))
# Cap the number of rows pandas prints when a DataFrame is displayed
pd.options.display.max_rows = 25

Using the ProPublica Congress API, I download the House membership of each Congress from the 106th through the 115th (Congresses beginning in 1999 through 2017) and append them into a single dataset.

In [2]:
#read in datasets by congress number
#the API key is read from the environment so the secret is never stored in the notebook
API_KEY = os.environ.get("PROPUBLICA_API_KEY")
if not API_KEY:
    raise RuntimeError("Set the PROPUBLICA_API_KEY environment variable to a ProPublica Congress API key")
congress = Congress(API_KEY)

list_dfs = []
for session in range(106, 116):  #Congress numbers 106 through 115
    #the first element of the response is a dict that holds the member records and the congress number
    members = congress.members.filter('house', congress=session)[0]
    df = pd.DataFrame(members['members']) #a dataframe, obs are each member of congress
    df['cong_num'] = members['congress'] #get the congress session number
    list_dfs.append(df)
#append all datasets into one
data = pd.concat(list_dfs)
#select only representatives
data = data[data.title == 'Representative']

In [3]:
#preview the first rows of the combined Representatives dataset
data.head()

Unnamed: 0,api_uri,at_large,cong_num,contact_form,crp_id,cspan_id,date_of_birth,district,dw_nominate,facebook_account,...,state,suffix,title,total_present,total_votes,twitter_account,url,votes_with_party_pct,votesmart_id,youtube_account
0,https://api.propublica.org/congress/v1/members...,False,106,,,,1938-06-26,1,,,...,HI,,Representative,5.0,1214.0,neilabercrombie,,90.06,,hawaiirep1
1,https://api.propublica.org/congress/v1/members...,False,106,,,1002061.0,1942-11-19,5,,,...,NY,,Representative,2.0,1214.0,repgaryackerman,,94.29,,RepAckerman
2,https://api.propublica.org/congress/v1/members...,False,106,,N00003028,45516.0,1965-07-22,4,,RobertAderholt,...,AL,,Representative,0.0,1214.0,Robert_Aderholt,https://aderholt.house.gov,88.73,441.0,RobertAderholt
3,https://api.propublica.org/congress/v1/members...,False,106,,,,1945-04-16,1,,,...,ME,,Representative,1.0,1214.0,,,94.21,,
4,https://api.propublica.org/congress/v1/members...,False,106,,,19670.0,1957-08-04,1,,,...,NJ,,Representative,0.0,1214.0,RepAndrews,http://andrews.house.gov/,89.95,,


Data Cleaning 

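After checking the variable types, I clean the dataset: I convert columns to appropriate types and derive new variables (age, a special-election flag, the president's party and whether the member matches it, a midterm indicator, a last-term indicator, and a leadership indicator).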

In [4]:
#list each column's dtype to see which columns need conversion
data.dtypes

api_uri                  object
at_large                   bool
cong_num                 object
contact_form             object
crp_id                   object
cspan_id                 object
date_of_birth            object
district                 object
dw_nominate              object
facebook_account         object
fax                      object
fec_candidate_id         object
                         ...   
seniority                object
short_title              object
state                    object
suffix                   object
title                    object
total_present           float64
total_votes             float64
twitter_account          object
url                      object
votes_with_party_pct    float64
votesmart_id             object
youtube_account          object
Length: 45, dtype: object

In [5]:
#change date_of_birth to date object
data['date_of_birth'] = pd.to_datetime(data['date_of_birth'])
#add start dates: the n-th Congress begins in year 1789 + 2*(n - 1) (106th -> 1999, 115th -> 2017).
#Deriving the year from the Congress number itself, instead of pairing a fixed list of years with
#data.cong_num.unique() by position, keeps the mapping correct for any order or subset of Congresses.
cong_nums = data['cong_num'].unique()
sdates = pd.DataFrame({'cong_num': cong_nums,
                       'start_date': [str(1789 + 2 * (int(num) - 1)) + "-01-01" for num in cong_nums]}
                      )
#merging on a column also gives data a fresh 0..n-1 index, which the label-based
#.loc assignments later in this cell rely on
data = data.merge(sdates, how='inner', on='cong_num')
data['start_date'] = pd.to_datetime(data['start_date'])
#calculate age (from session start date); approximate: 365-day years, rounded to the nearest year
data['age'] = ((data.start_date - data.date_of_birth).dt.days / 365).round()
#seniority to numeric
data['seniority'] = data['seniority'].astype('int64')
#fix dw_nom (replace 'None', 'nan' with NaN); assign the result back, because an inplace
#replace on a column selection does not reliably write through to the frame
data['dw_nominate'] = data['dw_nominate'].replace(['None', 'nan'], np.nan)
#make cong_num numeric
data['cong_num'] = data['cong_num'].astype('int64')

##find special elections 
def get_special_elections(df):
    """
    Locate the rows of df whose (state, district) pair appears more than once.

    A district listed with more than one representative is treated as having
    had a special election.

    Parameters
    ----------
    df : DataFrame with 'state' and 'district' columns (one Congress at a time).

    Returns
    -------
    list of the index labels of df for every row in such a district.
    """
    seat_cols = ['state', 'district']
    #rows whose state/district pair is shared with at least one other row
    shared_seat = df.duplicated(subset=seat_cols, keep=False)
    #a row with a missing state or district is never counted as a shared seat
    has_seat = df[seat_cols].notna().all(axis=1)
    flagged_rows = df.loc[shared_seat & has_seat]
    return flagged_rows.index.values.tolist()
#run get_special_elections on each Congress separately and gather the flagged row labels in one flat list
spec_elect_idx = [
    row_label
    for cong in data.cong_num.unique().tolist()
    for row_label in get_special_elections(data[data.cong_num == cong])
]
#special election variable: 1 for the flagged rows, 0 for every other row
data['spec_elect'] = 0
data.loc[spec_elect_idx, 'spec_elect'] = 1

#add president party in power, keyed by the year each Congress starts
party_yr = {'rep_pres': [2001, 2003, 2005, 2007, 2017],
            'dem_pres': [1999, 2009, 2011, 2013, 2015]}
start_year = data.start_date.dt.year
data['pres_party'] = None
data.loc[start_year.isin(party_yr['rep_pres']), 'pres_party'] = 'R'
data.loc[start_year.isin(party_yr['dem_pres']), 'pres_party'] = 'D'
#add president party match
data['pres_party_match'] = (data.party == data.pres_party)
#add midterm election year indicator
mid_yrs = [1999, 2003, 2007, 2011, 2015]
data['midterm_yr'] = start_year.isin(mid_yrs).astype('int64')
#add year
data['start_yr'] = start_year
#add last term indicator: for each rep, find the row where their seniority is highest.
#idxmax returns the index LABEL of that row (the first one on ties); np.argmax on a Series
#returns a position within the group on current pandas, which .loc would then misread as a label.
by_id = data.groupby('id')
where_max_seniority = by_id['seniority'].idxmax()
#float column so that the missing marker below fits without changing the column's dtype
data['last_term'] = 0.0
data.loc[where_max_seniority.values, 'last_term'] = 1
#rows of the Congress starting in 2017 are set to missing rather than 0 or 1
data.loc[data.start_yr == 2017, 'last_term'] = np.nan
#leadership role to binary: 1 when a leadership_role value is present, 0 when it is null
data['leadership_bin'] = data.leadership_role.notnull().astype('int64')
#save edits
#data.to_pickle('congress_members_pickle')

In [6]:
#preview the first rows again, now including the newly derived columns
data.head()

Unnamed: 0,api_uri,at_large,cong_num,contact_form,crp_id,cspan_id,date_of_birth,district,dw_nominate,facebook_account,...,youtube_account,start_date,age,spec_elect,pres_party,pres_party_match,midterm_yr,start_yr,last_term,leadership_bin
0,https://api.propublica.org/congress/v1/members...,False,106,,,,1938-06-26,1,,,...,hawaiirep1,1999-01-01,61.0,0,D,True,1,1999,0.0,0
1,https://api.propublica.org/congress/v1/members...,False,106,,,1002061.0,1942-11-19,5,,,...,RepAckerman,1999-01-01,56.0,0,D,True,1,1999,0.0,0
2,https://api.propublica.org/congress/v1/members...,False,106,,N00003028,45516.0,1965-07-22,4,,RobertAderholt,...,RobertAderholt,1999-01-01,33.0,0,D,False,1,1999,0.0,0
3,https://api.propublica.org/congress/v1/members...,False,106,,,,1945-04-16,1,,,...,,1999-01-01,54.0,0,D,True,1,1999,0.0,0
4,https://api.propublica.org/congress/v1/members...,False,106,,,19670.0,1957-08-04,1,,,...,,1999-01-01,41.0,0,D,True,1,1999,0.0,0
