# House Expenditures Data Cleaning

This script creates and cleans data from ProPublica's House Expenditures dataset. Covering 2009Q3 to 2018Q1, the data contain over 3.5 million observations of detailed expense items for every lawmaker and office in the House of Representatives. I first combine the quarterly Excel files, then do some data cleaning to produce the final dataset I use in my analysis of House spending (separate notebook). 

In [1]:
import pandas as pd
import numpy as np 
import os
import glob
# Cap the number of rows pandas prints when a frame or series is displayed.
pd.set_option('display.max_rows', 25)
# Work from the folder that holds the quarterly expenditure files so that the
# relative file names used further down resolve against it.
os.chdir("C:\\Users\\Sanata\\Dropbox\\01A_Data Science Project\\house-office-expenditures-with-readme")

In [2]:
# Read the quarterly detail files from the working directory.
# glob returns matches in arbitrary, OS-dependent order, so sort them: both the
# positional duplicate removal and the per-file key assigned below must be
# reproducible from run to run.
files = sorted(glob.glob('*detail*.csv'))
if not files:
    raise FileNotFoundError("no '*detail*.csv' files found in " + os.getcwd())
# Remove the duplicate 2015Q2 file: the entry listed right after the first
# 'updated' file is dropped.
# NOTE(review): this assumes the superseded file sorts immediately after the
# 'updated' one -- confirm against the actual file names.
dups = [i for i, s in enumerate(files) if 'updated' in s]
to_drop = [x + 1 for x in dups if x + 1 < len(files)]
if to_drop:  # nothing to remove when no 'updated' file is present
    del files[to_drop[0]]
# Concatenate the files (about 35) into one dataset. The keys are the positions
# of the files in `files`; reset_index(level=0) turns that key into a regular
# 'level_0' column so every row remembers which file it came from.
# latin1 can decode any byte sequence, so reading never fails on encoding.
# NOTE(review): the sample output shows mojibake in PAYEE, so some files may
# not really be latin1 -- confirm.
df = (pd.concat([pd.read_csv(f, encoding='latin1') for f in files],
                keys=list(range(len(files))))
        .reset_index(level=0)
     )
df.head()
#df.shape
#df.dtypes

  if self.run_code(code, result):


Unnamed: 0,level_0,AMOUNT,BIOGUIDE_ID,CATEGORY,DATE,END DATE,OFFICE,PAYEE,PROGRAM,PURPOSE,QUARTER,RECIP (orig.),RECORDID,SORT SEQUENCE,START DATE,TRANSCODE,TRANSCODELONG,YEAR
0,0,16799.25,,OTHER SERVICES,,10/04/06,COMMUNICATIONS,07ÃÂ­01 P2 OPR0900726A S...,,NON-TECHNOLOGY SERVICE CONTRCT,2009Q3,07ÃÂ­01 P2 OPR0900726A S...,,,10/04/06,,,FISCAL YEAR 2009
1,0,3876.75,,OTHER SERVICES,,10/04/06,COMMUNICATIONS,07ÃÂ­22 P2 OPR0900726B ...,,NON-TECHNOLOGY SERVICE CONTRCT,2009Q3,07ÃÂ­22 P2 OPR0900726B ...,,,10/04/06,,,FISCAL YEAR 2009
2,0,2132.0,,OTHER SERVICES,,07/18/06,COMMUNICATIONS,08ÃÂ­06 P2 FSS0000575A T...,,NON-TECHNOLOGY SERVICE CONTRCT,2009Q3,08ÃÂ­06 P2 FSS0000575A T...,,,07/18/06,,,FISCAL YEAR 2009
3,0,888.0,,OTHER SERVICES,,05/29/09,COMMUNICATIONS,08ÃÂ­25 P2 MFP0003163 A...,,NON-TECHNOLOGY SERVICE CONTRCT,2009Q3,08ÃÂ­25 P2 MFP0003163 A...,,,05/29/09,,,FISCAL YEAR 2009
4,0,590.18,,OTHER SERVICES,,10/04/06,COMMUNICATIONS,09ÃÂ­10 P2 OPR0900726C S...,,NON-TECHNOLOGY SERVICE CONTRCT,2009Q3,09ÃÂ­10 P2 OPR0900726C S...,,,10/04/06,,,FISCAL YEAR 2009


In [3]:
# Add the year: take the first 4 characters of every file name and convert
# them to integers. The series is indexed by file position, which is what the
# 'level_0' column of the data frame stores.
f = pd.Series(files, name='f_year').str.slice(stop=4).astype('int64')
# Attach the year to each row by looking up its file position.
df = df.join(f, on='level_0')
# Reformat the column names: lower case, with spaces replaced by underscores.
df.columns = [col.lower().replace(' ', '_') for col in df.columns]


In [4]:
##validate dates 
def is_valid_date(string):
    """Return True if *string* looks like a slash-separated numeric date.

    This is a cheap plausibility filter, not a parser. A value passes when,
    after conversion to text, it

    * contains no ASCII letters,
    * is between 6 and 10 characters long (inclusive), and
    * contains at least one '/'.

    Non-string input (None, NaN, numbers) is converted with ``str()`` first,
    so missing values come out as invalid rather than raising.
    """
    import re  # local import keeps this cell independent of the import cell

    text = string if isinstance(string, str) else str(string)
    # A regex search is required here: testing `'[A-z]' in text` would only
    # look for that literal five-character substring and never reject letters.
    if re.search(r'[A-Za-z]', text):
        return False
    if not 6 <= len(text) <= 10:
        return False
    return '/' in text

# For each date column: keep a copy of the original values under '<name>2',
# then blank out every entry that fails the plausibility check.
for col in ('start_date', 'end_date'):
    df[col + '2'] = df[col]
    looks_valid = df[col].apply(is_valid_date)
    df.loc[~looks_valid.astype(bool), col] = None

In [5]:
# Convert the start and end dates to datetimes.
# Parse from the validated columns (entries that failed validation were set to
# None earlier), not from the untouched '*_date2' copies; otherwise the
# validation step has no effect on the result.
for col in ('start_date', 'end_date'):
    # Hold on to the validated text: the column itself is overwritten below.
    raw = df[col].copy()
    # First pass: dates written with a two-digit year.
    df[col] = pd.to_datetime(raw, format='%m/%d/%y', errors='coerce')
    # Second pass: retry the rows that did not parse, this time expecting a
    # four-digit year. Whatever still fails stays missing.
    mask = df[col].isnull()
    df.loc[mask, col] = pd.to_datetime(raw[mask], format='%m/%d/%Y',
                                       errors='coerce')


In [6]:
##change amount
# Keep the untouched values so the cleaning can be checked afterwards.
df['amount2'] = df.amount
# Set all values to string so the text operations below apply to every row.
df['amount'] = df.amount.astype('str')
# Find and blank out entries containing a letter. The class is spelled
# [A-Za-z]: the range [A-z] would also match the punctuation characters that
# sit between 'Z' and 'a'. No capture group is used, which avoids the
# match-group warning from str.contains.
# NOTE(review): missing values presumably become the text 'nan' in the
# conversion above and are blanked again here -- confirm for the pandas
# version in use.
found_abc = df.amount.str.contains(r'[A-Za-z]', na=False)
df.loc[found_abc, 'amount'] = None
# Remove thousands separators and convert to numeric. Any remaining
# non-numeric text makes the conversion raise, which surfaces unexpected
# values instead of hiding them.
df['amount'] = df.amount.str.replace(',', '').astype('float64')

In [None]:
# Persist the cleaned frame as a pickle in the working directory.
output_name = 'congress_spending'
df.to_pickle(output_name)