## Federal registry API pull


https://www.federalregister.gov/developers/api/v1  
https://www.federalregister.gov/agencies

In [393]:
import json
import requests
from bs4 import BeautifulSoup, SoupStrainer
import json
import re
import urllib
import pandas as pd
import numpy as np
from pandas.io.json import json_normalize

In [26]:
url = "https://www.federalregister.gov/api/v1/documents.json?per_page=20&order=relevance"
response = requests.get(url).json()
len(response)

5

In [27]:
response.keys()

dict_keys(['count', 'description', 'total_pages', 'next_page_url', 'results'])

In [18]:
list(response)

['count', 'description', 'total_pages', 'next_page_url', 'results']

In [19]:
print(response['count'])
print(response['description'])
print(response['total_pages'])
print(response['next_page_url'])

778626
All Documents
50
https://www.federalregister.gov/api/v1/documents.json?order=relevance&page=2&per_page=20


In [25]:
response['results'][0]

{'abstract': 'We are superseding Airworthiness Directive (AD) 2017-11-03 for DG Flugzeugbau GmbH Model DG-500MB gliders that are equipped with a Solo 2625 02 engine modified with a fuel injection system following the instructions of Solo Kleinmoteren GmbH Technische Mitteilung 4600-3 and identified as Solo 2625 02i. This AD results from mandatory continuing airworthiness information (MCAI) issued by an aviation authority of another country to identify and correct an unsafe condition on an aviation product. The MCAI describes the unsafe condition as failure of the connecting rod bearing resulting from too much load on the rod bearings from the engine control unit. This AD adds a model to the applicability. We are issuing this AD to require actions to address the unsafe condition on these products.',
 'agencies': [{'id': 492,
   'json_url': 'https://www.federalregister.gov/api/v1/agencies/492.json',
   'name': 'Transportation Department',
   'parent_id': None,
   'raw_name': 'DEPARTMENT 

In [45]:
len(response['results'][0])

10

In [40]:
response['results'][0].keys()

dict_keys(['title', 'type', 'abstract', 'document_number', 'html_url', 'pdf_url', 'public_inspection_pdf_url', 'publication_date', 'agencies', 'excerpts'])

In [48]:
response['results'][0]['agencies'][0].keys()

dict_keys(['raw_name', 'name', 'id', 'url', 'json_url', 'parent_id', 'slug'])

## With duplicates

In [145]:
# Flatten the dataset, creating duplicate rows when two agencies are listed.
df = json_normalize(response['results'], 'agencies', 
                     ['title', 'type', 'abstract', 'document_number', 'html_url', 'pdf_url',
                      'public_inspection_pdf_url', 'publication_date', 'excerpts'])

In [146]:
df.shape

(34, 16)

In [147]:
df.columns

Index(['id', 'json_url', 'name', 'parent_id', 'raw_name', 'slug', 'url',
       'title', 'type', 'abstract', 'document_number', 'html_url', 'pdf_url',
       'public_inspection_pdf_url', 'publication_date', 'excerpts'],
      dtype='object')

In [150]:
df.head(2)

Unnamed: 0,id,json_url,name,parent_id,raw_name,slug,url,title,type,abstract,document_number,html_url,pdf_url,public_inspection_pdf_url,publication_date,excerpts
0,492,https://www.federalregister.gov/api/v1/agencie...,Transportation Department,,DEPARTMENT OF TRANSPORTATION,transportation-department,https://www.federalregister.gov/agencies/trans...,Airworthiness Directives; DG Flugzeugbau GmbH ...,Rule,We are superseding Airworthiness Directive (AD...,2018-10583,https://www.federalregister.gov/documents/2018...,https://www.gpo.gov/fdsys/pkg/FR-2018-05-22/pd...,https://s3.amazonaws.com/public-inspection.fed...,2018-05-22,We are superseding Airworthiness Directive (AD...
1,159,https://www.federalregister.gov/api/v1/agencie...,Federal Aviation Administration,492.0,Federal Aviation Administration,federal-aviation-administration,https://www.federalregister.gov/agencies/feder...,Airworthiness Directives; DG Flugzeugbau GmbH ...,Rule,We are superseding Airworthiness Directive (AD...,2018-10583,https://www.federalregister.gov/documents/2018...,https://www.gpo.gov/fdsys/pkg/FR-2018-05-22/pd...,https://s3.amazonaws.com/public-inspection.fed...,2018-05-22,We are superseding Airworthiness Directive (AD...


In [160]:
df[df['id']==161]['html_url'][18]

'https://www.federalregister.gov/documents/2018/05/22/2018-10810/accelerating-wireline-broadband-deployment-by-removing-barriers-to-infrastructure-investment'

In [79]:
len(df['abstract'][0])

793

In [83]:
df['abstract'].describe()

count                                                    34
unique                                                   20
top       This notice is to invite applications for loan...
freq                                                      2
Name: abstract, dtype: object

In [99]:
# Do the documents have enough text to permit adequate tokenizing?
print('abstract')
print('ave length:', round(df['abstract'].str.len().mean(), 1))
print('min length:', df['abstract'].str.len().min())
print('max length:', df['abstract'].str.len().max())

abstract
ave length: 569.6
min length: 233
max length: 1028


## Without duplicates

In [161]:
# Flatten the dataset, WITHOUT creating duplicate rows when two agencies are listed.
df = json_normalize(response['results'])

In [162]:
df.shape

(20, 10)

In [107]:
df.head(2)

Unnamed: 0,abstract,agencies,document_number,excerpts,html_url,pdf_url,public_inspection_pdf_url,publication_date,title,type
0,We are superseding Airworthiness Directive (AD...,"[{'raw_name': 'DEPARTMENT OF TRANSPORTATION', ...",2018-10583,We are superseding Airworthiness Directive (AD...,https://www.federalregister.gov/documents/2018...,https://www.gpo.gov/fdsys/pkg/FR-2018-05-22/pd...,https://s3.amazonaws.com/public-inspection.fed...,2018-05-22,Airworthiness Directives; DG Flugzeugbau GmbH ...,Rule
1,The Commodity Futures Trading Commission (Comm...,[{'raw_name': 'COMMODITY FUTURES TRADING COMMI...,2018-10902,The Commodity Futures Trading Commission (Comm...,https://www.federalregister.gov/documents/2018...,https://www.gpo.gov/fdsys/pkg/FR-2018-05-22/pd...,https://s3.amazonaws.com/public-inspection.fed...,2018-05-22,Foreign Futures and Options Transactions,Rule


In [163]:
df2 = pd.DataFrame(df['agencies'].values.tolist(), index=df.index)
df2.columns=['agency1', 'agency2']
df2.shape

(20, 2)

In [164]:
df2.head()

Unnamed: 0,agency1,agency2
0,"{'raw_name': 'DEPARTMENT OF TRANSPORTATION', '...",{'raw_name': 'Federal Aviation Administration'...
1,{'raw_name': 'COMMODITY FUTURES TRADING COMMIS...,
2,"{'raw_name': 'Social Security Administration',...",
3,{'raw_name': 'DEPARTMENT OF HOMELAND SECURITY'...,"{'raw_name': 'Coast Guard', 'name': 'Coast Gua..."
4,{'raw_name': 'DEPARTMENT OF HOMELAND SECURITY'...,"{'raw_name': 'Coast Guard', 'name': 'Coast Gua..."


In [123]:
df2['agency2'].isnull().sum()

6

In [124]:
df3=pd.DataFrame(df2['agency1'].values.tolist(), index=df2.index)
df3.shape

(20, 7)

In [119]:
df3.head()

Unnamed: 0,id,json_url,name,parent_id,raw_name,slug,url
0,492,https://www.federalregister.gov/api/v1/agencie...,Transportation Department,,DEPARTMENT OF TRANSPORTATION,transportation-department,https://www.federalregister.gov/agencies/trans...
1,77,https://www.federalregister.gov/api/v1/agencie...,Commodity Futures Trading Commission,,COMMODITY FUTURES TRADING COMMISSION,commodity-futures-trading-commission,https://www.federalregister.gov/agencies/commo...
2,470,https://www.federalregister.gov/api/v1/agencie...,Social Security Administration,,Social Security Administration,social-security-administration,https://www.federalregister.gov/agencies/socia...
3,227,https://www.federalregister.gov/api/v1/agencie...,Homeland Security Department,,DEPARTMENT OF HOMELAND SECURITY,homeland-security-department,https://www.federalregister.gov/agencies/homel...
4,227,https://www.federalregister.gov/api/v1/agencie...,Homeland Security Department,,DEPARTMENT OF HOMELAND SECURITY,homeland-security-department,https://www.federalregister.gov/agencies/homel...


In [126]:
df2['agency2'].head()

0    {'raw_name': 'Federal Aviation Administration'...
1                                                 None
2                                                 None
3    {'raw_name': 'Coast Guard', 'name': 'Coast Gua...
4    {'raw_name': 'Coast Guard', 'name': 'Coast Gua...
Name: agency2, dtype: object

In [127]:
df2['agency2'][0]['name']

'Federal Aviation Administration'

In [137]:
# Extract the sub-agency
def extractor (col):
    try:
        return(col['name'])
    except:
        pass
df2['subagency']=df2['agency2'].apply(extractor)

## Okay, let's do that all in one blow (no duplicates, but extract both agencies)

In [217]:
# Pull the top 20 most recent registry items
response = requests.get("https://www.federalregister.gov/api/v1/documents.json?per_page=20&order=relevance").json()

In [None]:
# Flatten the dataset, WITHOUT creating duplicate rows when two agencies are listed.
df = json_normalize(response['results'])

In [218]:
# Flatten the "agencies" dictionary column into a new dataset
df2 = pd.DataFrame(df['agencies'].values.tolist(), index=df.index, columns=['agency1', 'agency2'])

In [219]:
# Extract the name of the sub-agency as a separate column
def extractor (col):
    try:
        return(col['name'])
    except:
        pass
df2['subagency']=df2['agency2'].apply(extractor)

In [220]:
# Flatten the first agency and its metadata into a separate dataset
df3=pd.DataFrame(df2['agency1'].values.tolist(), index=df2.index)

In [221]:
# Join the first-agency dataset with the subagency column
df4=df3.join(df2, how='outer').drop(['agency1', 'agency2'], axis=1)

In [222]:
# Join that dataset into the original dataset of Registry entries
df5=df.join(df4,how='outer').drop('agencies', axis=1)

In [223]:
print(df5.shape)
print(df5.columns)

(20, 17)
Index(['abstract', 'document_number', 'excerpts', 'html_url', 'pdf_url',
       'public_inspection_pdf_url', 'publication_date', 'title', 'type', 'id',
       'json_url', 'name', 'parent_id', 'raw_name', 'slug', 'url',
       'subagency'],
      dtype='object')


In [209]:
df5.head(2)

Unnamed: 0,abstract,document_number,excerpts,html_url,pdf_url,public_inspection_pdf_url,publication_date,title,type,id,json_url,name,parent_id,raw_name,slug,url,agency2
0,We are superseding Airworthiness Directive (AD...,2018-10583,We are superseding Airworthiness Directive (AD...,https://www.federalregister.gov/documents/2018...,https://www.gpo.gov/fdsys/pkg/FR-2018-05-22/pd...,https://s3.amazonaws.com/public-inspection.fed...,2018-05-22,Airworthiness Directives; DG Flugzeugbau GmbH ...,Rule,492,https://www.federalregister.gov/api/v1/agencie...,Transportation Department,,DEPARTMENT OF TRANSPORTATION,transportation-department,https://www.federalregister.gov/agencies/trans...,Federal Aviation Administration
1,The Commodity Futures Trading Commission (Comm...,2018-10902,The Commodity Futures Trading Commission (Comm...,https://www.federalregister.gov/documents/2018...,https://www.gpo.gov/fdsys/pkg/FR-2018-05-22/pd...,https://s3.amazonaws.com/public-inspection.fed...,2018-05-22,Foreign Futures and Options Transactions,Rule,77,https://www.federalregister.gov/api/v1/agencie...,Commodity Futures Trading Commission,,COMMODITY FUTURES TRADING COMMISSION,commodity-futures-trading-commission,https://www.federalregister.gov/agencies/commo...,


In [224]:
df5['subagency'].head(5)

0    Federal Aviation Administration
1                               None
2                               None
3                        Coast Guard
4                        Coast Guard
Name: subagency, dtype: object

## Pull the top result from a specific agency

In [232]:
string='defense-department'
url = "https://www.federalregister.gov/api/v1/documents.json?per_page=20&order=relevance&conditions%5Bagencies%5D%5B%5D=" + string
response = requests.get(url).json()['results'][0]
response

{'abstract': 'Army National Military Cemeteries (ANMC) is honoring the requests of four families to disinter the human remains of four Native American students from the Carlisle Barracks Post Cemetery, Carlisle, Pennsylvania. The decedent names are Little Plume (aka Hayes Vanderbilt Friday), George Ell (aka George Eli), Herbert Little Hawk (aka Herbert J. Littlehawk), and Her Pipe Woman (aka as Dora Brave Bull). These students died in the 1880s and 1890s while attending the Carlisle Indian Industrial School. At the request of the closest living relative for each decedent, ANMC will disinter, transfer custody, transport, and reinter the remains in private cemeteries chosen by the families. This disinterment will be conducted in accordance with Army Regulation 210- 190. This is not a Native American Graves Protection and Repatriation Act (NAGPRA) action because the remains are not part of a collection as they are interred in graves that are individually marked at the Carlisle Barracks Po

## Extract the complete list of agencies

In [305]:
response=requests.get('https://www.federalregister.gov/agencies')

In [306]:
response.text[:500]

'<!DOCTYPE html>\n  <head>\n    <title>\n      Federal Register\n       :: \n        \n  Agencies\n\n    </title>\n\n    <!-- Tell search engines to use the proper hostname and protocol -->\n    <link href="https://www.federalregister.gov/agencies" rel="canonical" />\n\n    <link rel="Shortcut Icon" href="/favicon.ico">\n    <meta name="viewport" content="width=device-width, initial-scale=1.0">\n\n    <script src="https://use.typekit.net/rgf3yox.js"></script>\n<script type="text/javascript">\n  try {\n    Typekit.l'

<a href="https://www.federalregister.gov/agencies/administrative-conference-of-the-united-states">Administrative Conference of the United States</a>

In [307]:
soup = BeautifulSoup(response.text, 'lxml')

In [410]:
agency_list=soup.find_all('ul', attrs={'class' : "col-xs-12 col-md-12 filterable-list agency-list"})

In [534]:
list=[]
for a in soup.find_all(href=True):
    list.append(a.get('href'))

In [535]:
links=pd.DataFrame(list)

In [536]:
links[0][350]

'https://www.federalregister.gov/agencies/office-of-the-chief-financial-officer-agriculture-department'

In [537]:
links[0][350].split('https://www.federalregister.gov/agencies/')[1]

'office-of-the-chief-financial-officer-agriculture-department'

In [538]:
def name_extractor (col):
    try:
        return col.split('https://www.federalregister.gov/agencies/')[1]
    except:
        return 'killme'

In [539]:
links['agency_name']=links[0].apply(name_extractor)

In [540]:
links['agency_name'][350]

'office-of-the-chief-financial-officer-agriculture-department'

In [541]:
links['agency_name'][0]

'killme'

In [542]:
links=links.loc[links['agency_name']!='killme']

In [543]:
len(links)

443

In [544]:
links.head()

Unnamed: 0,0,agency_name
16,https://www.federalregister.gov/agencies/secur...,securities-and-exchange-commission
26,https://www.federalregister.gov/agencies/natio...,national-oceanic-and-atmospheric-administration
36,https://www.federalregister.gov/agencies/coast...,coast-guard
46,https://www.federalregister.gov/agencies/natio...,national-oceanic-and-atmospheric-administration
56,https://www.federalregister.gov/agencies/posta...,postal-regulatory-commission


In [558]:
links=links['agency_name'].sort_values(ascending=True)

KeyError: 'agency_name'

In [559]:
links.to_csv('federal_agencies.csv', index=False, header=True)

In [561]:
pd.read_csv('federal_agencies.csv')

Unnamed: 0,agency_name
0,action
1,administration-office-executive-office-of-the-...
2,administrative-conference-of-the-united-states
3,administrative-office-of-united-states-courts
4,advisory-council-on-historic-preservation
5,advocacy-and-outreach-office
6,african-development-foundation
7,agency-for-healthcare-research-and-quality
8,agency-for-international-development
9,agency-for-toxic-substances-and-disease-registry
