In [1]:
import pandas as pd
import numpy as np
import requests
import json
import os
import re
import time

# Root directory (with trailing slash) under which every dataset produced by
# this notebook is stored; sub-folder names are appended to it directly.
data_folder = "data/"

In [2]:
# Load the API key from a local text file so the secret never lives in the
# notebook itself.
with open('api_key.txt') as f:
    # Strip surrounding whitespace: text files usually end with a newline,
    # which would otherwise become part of the header value sent below.
    API_KEY = f.read().strip()

# Header dict passed to every request made to the API in later cells.
HEADERS = {"X-Api-Key": API_KEY}

In [3]:
def create_folder(path):
    """Create the directory `path` (including missing parents) if needed.

    Calling this for a directory that already exists is a no-op.

    Args:
        path: Directory path to create.

    Raises:
        OSError: If the directory cannot be created (e.g. `path` exists but
            is a regular file, or permissions are insufficient).
    """
    # exist_ok=True replaces the separate "exists?" check, which was open to a
    # race if the directory appeared between the check and the creation.
    os.makedirs(path, exist_ok=True)

In [4]:
# Make sure the output directory for the House roster CSVs exists.
house_members_dir = data_folder + 'house_members'
create_folder(house_members_dir)

### Members of the 115th Congress

In [None]:
# Download the member roster of each chamber for congress 115 and store it as
# data/<chamber>_members/<chamber>_members_<congress>.csv.
# Both chambers are fetched because a later cell reads the Senate CSV as well
# as the House one; fetching only the House left that file without a producer.
u = "https://api.propublica.org/congress/v1/{congress}/{chamber}/members.json"

# pandas >= 1.0 exposes json_normalize at top level; the pd.io.json alias is
# deprecated and absent from newer releases, so fall back to it only if needed.
json_normalize = getattr(pd, 'json_normalize', None) or pd.io.json.json_normalize

# (chamber segment used in the URL, prefix used for folder and file names).
# The House goes last so `results` / `df` end up holding the House data, as in
# the original single-chamber cell.
for chamber, prefix in (('senate', 'senate'), ('House', 'house')):
    create_folder(data_folder + prefix + '_members')
    for i in range(115, 115 + 1):
        results = requests.get(url=u.format(congress=i, chamber=chamber),
                               headers=HEADERS, timeout=30)
        # Fail loudly on HTTP errors (bad key, rate limit, ...) instead of
        # hitting a confusing KeyError while digging into the JSON below.
        results.raise_for_status()
        df = json_normalize(results.json()['results'][0]['members'])
        df.to_csv("data/{prefix}_members/{prefix}_members_{congress}.csv".format(prefix=prefix, congress=i))

In [5]:
# Locations of the per-chamber member rosters for congress 115.
senate_115_csv, house_115_csv = (
    'data/senate_members/senate_members_115.csv',
    'data/house_members/house_members_115.csv',
)

# Load both rosters (Senate first, then House) into their own frames.
senate_df, house_df = (
    pd.read_csv(roster_csv) for roster_csv in (senate_115_csv, house_115_csv)
)

In [6]:
# Columns kept from the raw member rosters.
MEMBER_COLUMNS = ['id', 'first_name', 'last_name', 'middle_name', 'party', 'state', 'title', 'url', 'in_office']

# Keep only the columns of interest and tag each row with its chamber
# ('s' = Senate, 'h' = House). `.assign` builds a new frame, so no column is
# written onto a slice of the original (which raises SettingWithCopyWarning).
senate_df = senate_df[MEMBER_COLUMNS].assign(chamber='s')
house_df = house_df[MEMBER_COLUMNS].assign(chamber='h')

In [7]:
# Sanity check: (rows, columns) of the Senate frame, then of the House frame.
for chamber_frame in (senate_df, house_df):
    print(chamber_frame.shape)

(105, 10)
(454, 10)


In [8]:
# Stack the Senate rows on top of the House rows and index the combined
# frame by member id.
members_df = pd.concat([senate_df, house_df]).set_index('id')

In [9]:
# Persist the combined member table, then show its size and first row.
members_dir = data_folder + 'members'
create_folder(members_dir)

members_csv = "data/members/members_{congress}.csv".format(congress=115)
members_df.to_csv(members_csv)

print(members_df.shape)
members_df.head(1)

(559, 9)


Unnamed: 0_level_0,first_name,last_name,middle_name,party,state,title,url,in_office,chamber
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
A000360,Lamar,Alexander,,R,TN,"Senator, 2nd Class",https://www.alexander.senate.gov/public,True,s


### Bills of the 115th Congress

In [10]:
# Make sure the output directory for the bill CSVs exists.
bills_dir = data_folder + 'bills'
create_folder(bills_dir)

In [11]:
# Fields kept for every bill. Note: a bill may have several cosponsors, but
# only a single sponsor id is exposed here.
BILLS_COLUMNS = [
    'bill_id',
    'title',
    'short_title',
    'sponsor_id',
    'bill_uri',
    'active',
]

In [None]:
# Page through the "introduced bills" endpoint for the Senate and append every
# page to data/bills/bills_<congress>.csv, then report how long it took.
start_time = time.time()

u = "https://api.propublica.org/congress/v1/{congress}/{chamber}/bills/introduced.json?offset={offset}"
initial_reg_offset = 0
# Step between consecutive offsets (presumably the API page size; confirm
# against the API documentation).
offset_step = 20

# pandas >= 1.0 exposes json_normalize at top level; the pd.io.json alias is
# deprecated and absent from newer releases, so fall back to it only if needed.
json_normalize = getattr(pd, 'json_normalize', None) or pd.io.json.json_normalize

for congress_num in range(115, 115 + 1):
    bills_csv = 'data/bills/bills_{congress}.csv'.format(congress=congress_num)

    # Start the file from scratch with only the header row; the pages fetched
    # below are appended without a header.
    bills_df = pd.DataFrame(columns=BILLS_COLUMNS).set_index('bill_id')
    bills_df.to_csv(bills_csv, sep=',', encoding='utf-8')

    query_offset = initial_reg_offset
    while True:
        print(query_offset)
        url_modified = u.format(congress=congress_num, chamber='senate', offset=query_offset)
        results = requests.get(url=url_modified, headers=HEADERS, timeout=30)
        # Fail loudly on HTTP errors (bad key, rate limit, ...) instead of
        # hitting a confusing KeyError while digging into the JSON below.
        results.raise_for_status()

        # Decode the body once and reuse it for both the count and the bills.
        page = results.json()['results'][0]
        if page['num_results'] == 0:
            # An empty page means every bill has been retrieved.
            break

        bills_df = json_normalize(page['bills'])[BILLS_COLUMNS].set_index('bill_id')
        bills_df.to_csv(bills_csv, sep=',', encoding='utf-8', mode='a', header=False)
        query_offset += offset_step

elapsed_time_seconds = round((time.time() - start_time))
elapsed_time_minutes = round(elapsed_time_seconds/60, 2)
print('--- Elapsed time: ---')
print(elapsed_time_seconds, 'seconds')
print(elapsed_time_minutes, 'minutes')

In [12]:
# Reload the complete bill list written by the download cell and preview it.
bills_115_csv = 'data/bills/bills_115.csv'
bills_df = pd.read_csv(bills_115_csv)

print(bills_df.shape)
bills_df.head(1)

(4466, 6)


Unnamed: 0,bill_id,title,short_title,sponsor_id,bill_uri,active
0,sres693-115,"A resolution celebrating October 25, 2018, as ...","A resolution celebrating October 25, 2018, as ...",R000584,https://api.propublica.org/congress/v1/115/bil...,True
