In [2]:
# Terraform an RDS instance: rds.tf
# Output the RDS connection props to a config file
# SQL script to make tables in RDS instance
# Python script to pull data, write to file, and copy to RDS instance

In [3]:
import csv
import json
import os
import sys
import time
from functools import partial
from urllib.parse import quote

import requests

In [3]:
# Load the API key from a local file so the secret is not hardcoded in the notebook.
with open('api_key.txt') as f:
  # Strip surrounding whitespace: a trailing newline in the file would otherwise
  # be embedded in every request URL built from this key.
  API_KEY = f.read().strip()

BASE_URL = 'https://api.open.fec.gov/v1'
# Query parameters sent with every request unless the caller overrides them.
DEFAULT_PARAMS = {
  'per_page': 100,
  'page': 1
}
# Per-object-type settings:
#   endpoint   - path appended to BASE_URL ('{0}' placeholders are filled from `dynamic_endpoint`)
#   write_path - CSV file the pulled records are written to
#   attributes - record keys that are kept and used as the CSV columns
CONFIGS = {
  'candidates': {
    'endpoint': 'candidates',
    'write_path': '~/candidates.csv',
    'attributes': ['candidate_id','name', 'cycles', 'district_number', 'election_districts', 'incumbent_challenge', 'party', 'state', 'office_full']
  },
  'committees': {
    'endpoint': 'committees',
    'write_path': '~/committees.csv',
    # A missing comma previously fused 'party_full' and a repeated 'committee_type_full'
    # into the single bogus column 'party_fullcommittee_type_full'.
    'attributes': ['committee_id', 'committee_type_full', 'designation_full', 'name', 'organization_type_full', 'state', 'party_full']
  },
  'donations': {
    'endpoint': 'schedules/schedule_a',
    'write_path': '~/donations.csv',
    'attributes': ['contribution_receipt_date', 'contribution_receipt_amount', 'contributor_state', 'contributor_id', 'contributor_name', 'pdf_url', 'sub_id']
  },
  # No 'write_path' here, so this entry only works with get_data, not pull_and_write_data.
  'filings': {
    'endpoint': 'candidate/{0}/filings',
    'attributes': '*'
  }
}

# Confirm the key loaded without echoing the secret into the saved notebook output.
print('API key loaded ({} characters)'.format(len(API_KEY)))

[REDACTED: API key removed from saved notebook output]



In [None]:
# One shared HTTP session, reused by get_data for every API request.
session = requests.Session()

In [5]:
def format_params(param, value):
  """Render one query parameter as a list of 'name=value' strings.

  Args:
    param: The query parameter name.
    value: A single value, or a list/tuple of values. A sequence yields one
      'name=value' entry per element (a repeated query parameter).

  Returns:
    A list of 'name=value' strings; empty when `value` is an empty sequence.
  """
  values = value if isinstance(value, (list, tuple)) else [value]
  # Percent-encode each value so characters such as spaces, '&' or '=' cannot
  # corrupt the query string the entries are later joined into.
  return ['{}={}'.format(param, quote(str(item), safe='')) for item in values]

def get_url_params(params):
  """Merge `params` over DEFAULT_PARAMS and flatten them into 'name=value' strings.

  Caller-supplied entries win over the defaults; list values expand into one
  entry per element via format_params.
  """
  merged = dict(DEFAULT_PARAMS)
  merged.update(params)
  return [
    pair
    for name, setting in merged.items()
    for pair in format_params(name, setting)
  ]

In [6]:
def get_url(base_url, endpoint, api_key, **kwargs):
  """Build the full request URL for an endpoint.

  Keyword arguments become query parameters (filters, paging, ...); see the
  parameters for your endpoint in the docs: https://api.open.fec.gov/developers/
  """
  query_string = '&'.join(get_url_params(dict(kwargs)))
  resource = base_url + '/' + endpoint
  return '{}?api_key={}&{}'.format(resource, api_key, query_string)

In [19]:
def get_data(object_type, **kwargs):
  """Fetch one page of data for `object_type` and return the decoded JSON body.

  Args:
    object_type: Key into CONFIGS selecting the endpoint.
    **kwargs: Query parameters for the request. The special key
      `dynamic_endpoint` (a list) is not sent as a query parameter; its items
      are interpolated into the endpoint template instead.

  Returns:
    The parsed JSON response.

  Raises:
    requests.RequestException or ValueError: the error from the last attempt,
      once every attempt has failed (connection error, HTTP error status, or a
      body that is not valid JSON).
  """
  max_attempts = 100
  throttle_seconds = 10    # pause after each successful request
  backoff_seconds = 3600   # pause before retrying a failed request

  params = dict(kwargs)
  # Keep the endpoint interpolation values out of the query string.
  dynamic_endpoint = params.pop('dynamic_endpoint', [])
  endpoint = CONFIGS[object_type]['endpoint'].format(*dynamic_endpoint)
  url = get_url(BASE_URL, endpoint, API_KEY, **params)

  last_error = None
  for attempt in range(max_attempts):
    try:
      response = session.get(url)
      # Treat HTTP error statuses as failures instead of returning an error
      # payload that callers would then trip over.
      response.raise_for_status()
      payload = response.json()
    except (requests.RequestException, ValueError) as error:
      last_error = error
      if attempt < max_attempts - 1:
        time.sleep(backoff_seconds)
      continue
    time.sleep(throttle_seconds)
    return payload
  # A bare `raise` here had no active exception; surface the real failure instead.
  raise last_error

In [8]:
def format_record(record, attributes):
  """Return a copy of `record` restricted to the requested attributes.

  Args:
    record: Dict holding one API record.
    attributes: Collection of keys to keep, or the string '*' to keep every
      key (the form used by CONFIGS['filings']).

  Returns:
    A new dict with only the selected key/value pairs.
  """
  if attributes == '*':
    # Without this, `key in '*'` is a substring test and drops every field.
    return dict(record)
  return {key: value for key, value in record.items() if key in attributes}

In [9]:
def get_object_ids(id_key, records):
  """Collect the value stored under `id_key` from each record, in order."""
  return [entry[id_key] for entry in records]

In [10]:
def format_results(response, attributes):
  """Trim every record under response['results'] down to `attributes`."""
  return [format_record(item, attributes) for item in response['results']]

In [11]:
def write_results_to_file(object_type, writer, response, attributes):
  """Write one response's records through `writer` and return the written rows.

  `object_type` is accepted for signature compatibility but is not used.
  """
  rows = format_results(response, attributes)
  writer.writerows(rows)
  return rows

In [20]:
def pull_and_write_data(object_type, sub_pull=None, sub_pull_arg=None, **kwargs):
  """Pull every page for `object_type` and append the records to its CSV file.

  Args:
    object_type: Key into CONFIGS; the entry must define 'attributes' and
      'write_path'.
    sub_pull: Optional callable run after each page is written, used to chain
      a dependent pull (e.g. committees for the candidates just pulled).
    sub_pull_arg: Optional callable that turns a page's formatted records into
      the argument for `sub_pull`; when omitted the records are passed as-is.
    **kwargs: Query parameters forwarded to get_data.

  Raises:
    Exception: whatever the first-page request raised on its final attempt.
  """
  max_attempts = 100
  retry_wait_seconds = 3600

  print("pulling {}".format(object_type))
  attributes = CONFIGS[object_type]['attributes']
  file_path = CONFIGS[object_type]['write_path']

  for attempt in range(max_attempts):
    try:
      response = get_data(object_type, **kwargs)
      pages = response['pagination']['pages']
      time.sleep(10)
      break
    except Exception:
      # `Exception` rather than a bare except, so Ctrl-C can still stop the run.
      if attempt == max_attempts - 1:
        raise
      time.sleep(retry_wait_seconds)

  # newline='' is what the csv module expects of files it writes to.
  with open(os.path.expanduser(file_path), 'a', newline='') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=attributes)
    page = 1
    while True:
      formatted_data = write_results_to_file(object_type, writer, response, attributes)
      # Chain the dependent pull for every page, not just the first one, and
      # skip empty pages: an empty id list adds no filter to the request and
      # would pull the whole dataset.
      if sub_pull and formatted_data:
        sub_pull(sub_pull_arg(formatted_data) if sub_pull_arg else formatted_data)
      page += 1
      if page > pages:
        break
      response = get_data(object_type, page=page, **kwargs)

In [21]:
def pull_committee_donation_data(committee_ids=''):
  """Pull donations for the given committee ids and append them to the donations CSV.

  Args:
    committee_ids: A committee id or list of ids used to filter the request.
  """
  # The filter is sent as `committee_id` (singular), matching the other
  # donation pulls in this notebook. NOTE(review): confirm the parameter name
  # against the OpenFEC schedule_a documentation.
  pull_and_write_data('donations', committee_id=committee_ids)

In [22]:
def pull_candidate_committee_data(candidate_ids=''):
  """Pull committees for the given candidate ids, then donations for those committees.

  Args:
    candidate_ids: A candidate id or list of ids used to filter the request.
  """
  # Extracts the committee ids from each page of committees for the donation pull.
  donation_args = partial(get_object_ids, 'committee_id')
  # The filter is sent as `candidate_id` (singular), matching the other
  # committee pulls in this notebook. NOTE(review): confirm the parameter name
  # against the OpenFEC committees documentation.
  pull_and_write_data('committees',
                      sub_pull=pull_committee_donation_data,
                      sub_pull_arg=donation_args,
                      candidate_id=candidate_ids)

In [23]:
def pull_candidate_donation_data(**kwargs):
  """Run the full chained pull: candidates, their committees, and those committees' donations.

  Each of the three CSV files is first truncated and given a header row; the
  pulls then append to them.

  Args:
    **kwargs: Query parameters used to filter the candidates request.
  """
  for object_type in ('candidates', 'committees', 'donations'):
    config = CONFIGS[object_type]
    # newline='' is what the csv module expects of files it writes to.
    with open(os.path.expanduser(config['write_path']), 'w', newline='') as csvfile:
      csv.DictWriter(csvfile, fieldnames=config['attributes']).writeheader()
  # Extracts the candidate ids from each page of candidates for the committee pull.
  committee_args = partial(get_object_ids, 'candidate_id')
  pull_and_write_data('candidates',
                      sub_pull=pull_candidate_committee_data,
                      sub_pull_arg=committee_args,
                      **kwargs)

In [None]:
# This is the main function
# Starts the chained pull (candidates, then committees, then donations) with the
# filters below. The request helper sleeps between calls, so this cell can run
# for a long time.
pull_candidate_donation_data(election_year=2016, office='H', party='DEM')


pulling candidates
pulling committees
pulling donations


In [None]:
pull_and_write_data('candidates', election_year=2016, office='H', party='DEM')
# get_object_ids takes (id_key, records); the arguments were previously swapped.
# NOTE(review): candidates_data is not defined by any cell above this one, so
# this line still depends on state from another cell.
pull_and_write_data('committees', candidate_id=get_object_ids('candidate_id', candidates_data))

In [None]:
# This is probably too much data to store in memory. 
# Define a writer function and do it incrementally within pull_pages

# NOTE(review): pull_pages is not defined in the cells shown in this notebook;
# this cell cannot run until it exists.
candidates_data = pull_pages('candidates', election_year=2016, office='H', party='DEM')
# get_object_ids takes (id_key, records); the arguments were previously swapped.
committees_data = pull_pages('committees', candidate_id=get_object_ids('candidate_id', candidates_data))
donations_data = []

for committee_id in get_object_ids('committee_id', committees_data):
  # Attribute lists live in CONFIGS; there is no DONATION_ATTRIBUTES constant.
  donations_subset = [format_record(donation, CONFIGS['donations']['attributes']) for donation in pull_pages('donations', committee_id=committee_id, is_individual="false")]
  donations_data.extend(donations_subset)
  


In [None]:
####################################
#### EXAMPLE USAGE: CANDIDATES ####
####################################
response = get_data('candidates', election_year=2016, office='H', party='DEM')
# Was assigned to the misspelled name `candidate_date`, leaving `candidate_data` undefined below.
candidate_data = response['results']
# Attribute lists live in CONFIGS; there is no CANDIDATE_ATTRIBUTES constant.
candidates = [format_record(candidate, CONFIGS['candidates']['attributes']) for candidate in candidate_data]
response

In [None]:
####################################
####   EXAMPLE USAGE: FILINGS   ####
####################################

# Unable to differentiate general election from primaries
# Tried to use the filings endpoint to read 'general__primary_identifier', but it's always None
# Added a feature to pass dynamic endpoint params anyway, could be useful for other stuff. 
# Just pass as *args and denote the interpolation in the endpoint config
# test_filing = get_data('filings', dynamic_endpoint=['H6WY00167'])
# test_filing

In [None]:
####################################
####  EXAMPLE USAGE: COMMITTEES ####
####################################

# get_object_ids takes (id_key, records); the arguments were previously swapped.
committees_data = get_data('committees', candidate_id=get_object_ids('candidate_id', candidates))['results']
# Attribute lists live in CONFIGS; there is no COMMITTEE_ATTRIBUTES constant.
committees = [format_record(committee, CONFIGS['committees']['attributes']) for committee in committees_data]
committees[:5]

In [None]:
####################################
####  EXAMPLE USAGE: DONATIONS  ####
####################################
donation_data = []
# get_object_ids takes (id_key, records); the arguments were previously swapped.
for committee_id in get_object_ids('committee_id', committees):
  # Attribute lists live in CONFIGS; there is no DONATION_ATTRIBUTES constant.
  donations_subset = [format_record(donation, CONFIGS['donations']['attributes']) for donation in get_data('donations', committee_id=committee_id, is_individual="false")['results']]
  donation_data.extend(donations_subset)

In [None]:
# Write data to respective files and trigger copy from local file to RDS
# Write each of these files to S3 for easy access and backup

# Write data to file
def write_to_file(record):
  """Placeholder for writing a record to a file; currently does nothing."""
  return None

# Python script to read in the rds config file and run copy command 
def copy_to_rds(file):
  """Placeholder for copying a file into the RDS instance; currently does nothing."""
  return None

def push_to_s3(file):
  """Placeholder for uploading a file to S3; currently does nothing."""
  return None

In [None]:
### IGNORE EVERYTHING BELOW THIS FOR NOW
### JUST GET RAW DATA INTO THE DB USING THE CODE ABOVE. 
### WE CAN NORMALIZE AND UPDATE THE SCHEMA LATER IF WE NEED TO


In [None]:

def split_running_mates(office, campaign_name):
  """Split a 'CANDIDATE / RUNNING MATE' campaign name into its two parts.

  Returns a (candidate, running_mate) tuple. The running mate is 'N/A' unless
  the name contains exactly one '/'. `office` is accepted but not used.
  """
  parts = [piece.strip(' ') for piece in campaign_name.split('/')]
  if len(parts) == 2:
    return (parts[0], parts[1])
  return (parts[0], 'N/A')

In [None]:
def format_candidate(candidate_id, candidate):
  """Split a candidate name string into first, middle, and last name parts.

  Two layouts are handled:
    * 'LAST, FIRST MIDDLE ...' when the name contains a comma (only the text
      between the first and second comma is used for the given names);
    * 'FIRST MIDDLE ... LAST' otherwise.

  Args:
    candidate_id: Identifier stored under 'id' in the result.
    candidate: The name string to split.

  Returns:
    A dict with keys 'id', 'first_name', 'middle_names', and 'last_name'.
    Missing parts are returned as empty strings.
  """
  if ',' in candidate:
    names = candidate.split(',')
    last_name = names[0].strip()
    # split() with no argument collapses repeated spaces, so no empty tokens
    # end up in the given names.
    other_names = names[1].split()
    first_name = other_names[0] if other_names else ''
    middle_names = ' '.join(other_names[1:])
  else:
    names = candidate.split()
    first_name = names[0] if names else ''
    last_name = names[-1] if names else ''
    middle_names = ' '.join(names[1:-1])
  return {'id': candidate_id, 'first_name': first_name, 'middle_names': middle_names, 'last_name': last_name}
      

In [None]:
def format_campaign_data(candidate_data):
  """Expand candidate records into one campaign dict per election year.

  Each campaign carries an id of the form '<candidate_id>-<year>', the campaign
  name, the office, and the candidate / running mate split out of the name.
  Records without 'election_years' produce no campaigns.
  """
  results = []
  for entry in candidate_data:
    name = entry.get('name', 'Unknown')
    office_label = entry.get('office_full', 'N/A')
    base_id = entry.get('candidate_id', '')
    for year in entry.get('election_years', []):
      lead, mate = split_running_mates(office_label, name)
      results.append({
        'id': base_id + '-' + str(year),
        'campaign_name': name,
        'office': office_label,
        'candidate': lead,
        'running_mate': mate,
      })
  return results

In [None]:
# campaigns = format_campaign_data(candidates)
# unique_candidates = set([campaign['candidate'] for campaign in campaigns] + [campaign['running_mate'] for campaign in campaigns if campaign['running_mate'] != 'N/A'])
# labeled_candidates = [format_candidate(index, candidate) for index, candidate in enumerate(unique_candidates)]
# labeled_candidates