In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

In [16]:
# step 1: get the data

# after 1994, orders are available as a bulk csv or json download at
# https://www.federalregister.gov/documents/search.csv?conditions%5Bcorrection%5D=0&conditions%5Bpresident%5D=&conditions%5Bpresidential_document_type%5D=executive_order&conditions%5Bsigning_date%5D%5Byear%5D=&conditions%5Btype%5D%5B%5D=PRESDOCU&fields%5B%5D=citation&fields%5B%5D=document_number&fields%5B%5D=end_page&fields%5B%5D=html_url&fields%5B%5D=pdf_url&fields%5B%5D=type&fields%5B%5D=subtype&fields%5B%5D=publication_date&fields%5B%5D=signing_date&fields%5B%5D=start_page&fields%5B%5D=title&fields%5B%5D=disposition_notes&fields%5B%5D=executive_order_number&order=executive_order&per_page=1000
# so this will only scrape 1937 (when the register starts) through 1994

# Orders are listed on one page per year. Years in which the presidency changed
# hands are split into two pages, '<year>-<outgoing>' and '<year>-<incoming>'.
years = [*range(1937, 1995)]
inaug_years = [1945, 1953, 1961, 1963, 1969, 1974, 1977, 1981, 1989, 1993]
presidents = ['roosevelt', 'truman', 'eisenhower', 'kennedy', 'johnson',
              'nixon', 'ford', 'carter', 'reagan', 'bush', 'clinton']

# Every transition year needs an outgoing and an incoming president, so the
# presidents list must be exactly one longer than the list of transition years.
assert len(presidents) == len(inaug_years) + 1, 'presidents/inaug_years mismatch'

# Build the two suffixes for each transition year by pairing every president
# with their successor (no hand-maintained index needed).
year_pres = []
for year, outgoing, incoming in zip(inaug_years, presidents, presidents[1:]):
    year_pres.append(str(year) + '-' + outgoing)
    year_pres.append(str(year) + '-' + incoming)

# Drop the transition years from the plain list with an order-preserving
# filter; a set difference would leave the iteration order unspecified and
# make the scrape order (and the row order of the CSV) non-reproducible.
transition_years = set(inaug_years)
years = [str(y) for y in years if y not in transition_years]

# Plain years first (chronological), then the split transition-year pages.
years = years + year_pres
print(years)

['1937', '1938', '1939', '1940', '1941', '1942', '1943', '1944', '1946', '1947', '1948', '1949', '1950', '1951', '1952', '1954', '1955', '1956', '1957', '1958', '1959', '1960', '1962', '1964', '1965', '1966', '1967', '1968', '1970', '1971', '1972', '1973', '1975', '1976', '1978', '1979', '1980', '1982', '1983', '1984', '1985', '1986', '1987', '1988', '1990', '1991', '1992', '1994', '1945-roosevelt', '1945-truman', '1953-truman', '1953-eisenhower', '1961-eisenhower', '1961-kennedy', '1963-kennedy', '1963-johnson', '1969-johnson', '1969-nixon', '1974-nixon', '1974-ford', '1977-ford', '1977-carter', '1981-carter', '1981-reagan', '1989-reagan', '1989-bush', '1993-bush', '1993-clinton']


In [19]:
def scraper(url, timeout=30):
    """Scrape one executive-orders listing page into a DataFrame.

    The element with id ``block-system-main`` is read as a flat sequence of
    ``<p>`` and ``<ul>`` elements. The text of each element is split once at
    its first newline. Even-positioned elements supply 'Order Number' and
    'Name'; odd-positioned elements supply 'Signed Date' and 'Notes'.
    NOTE(review): this alternation is assumed from the slicing below -- confirm
    against the live page layout.

    Parameters
    ----------
    url : str
        Address of the listing page to download.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    pandas.DataFrame
        Columns 'Order Number', 'Name', 'Signed Date', 'Notes'.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    ValueError
        If the page has no element with id 'block-system-main'.
    """
    # A timeout keeps one stalled connection from hanging the whole scrape.
    r = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page as data.
    r.raise_for_status()

    # Name the parser explicitly: without it bs4 picks whichever parser is
    # installed (and warns), so the parse tree can differ between machines.
    soup = BeautifulSoup(r.text, 'html.parser')
    mainblock = soup.find(id='block-system-main')
    if mainblock is None:
        raise ValueError("no element with id 'block-system-main' at " + url)

    subs = mainblock.find_all(['p', 'ul'])

    # Split each element's text once: [first line, everything after it].
    this_list = [sub.text.split('\n', 1) for sub in subs]

    df_titles = pd.DataFrame(this_list[0::2], columns=['Order Number', 'Name'])
    df_dates = pd.DataFrame(this_list[1::2], columns=['Signed Date', 'Notes'])
    # Both frames carry a default 0..n-1 index, so the join lines rows up by position.
    return df_titles.join(df_dates)

In [20]:
base_url = 'https://www.archives.gov/federal-register/executive-orders/'
columns = ['Order Number', 'Name', 'Signed Date', 'Notes']

# Collect one frame per page and concatenate once at the end.
# DataFrame.append was removed in pandas 2.0, and growing a frame inside a
# loop re-copies every previously scraped row on each iteration.
frames = []
for year in years:
    url = base_url + year
    frames.append(scraper(url))
    print(year)  # progress indicator: one line per page fetched

# pd.concat raises on an empty list, so fall back to an empty frame that
# still has the expected columns.
if frames:
    df_all = pd.concat(frames, ignore_index=True)
else:
    df_all = pd.DataFrame(columns=columns)

print(df_all.shape)
df_all.to_csv('scraped_data.csv', index=False)

(256, 4)
1937
(249, 4)
1938
(288, 4)
1939
(310, 4)
1940
(384, 4)
1941
(291, 4)
1942
(123, 4)
1943
(101, 4)
1944
(149, 4)
1946
(104, 4)
1947
(110, 4)
1948
(71, 4)
1949
(157, 4)
1950
(176, 4)
1951
(106, 4)
1952
(110, 4)
1954
(97, 4)
1955
(45, 4)
1956
(82, 4)
1957
(74, 4)
1958
(61, 4)
1959
(43, 4)
1960
(132, 4)
1962
(83, 4)
1964
(110, 4)
1965
(58, 4)
1966
(98, 4)
1967
(83, 4)
1968
(73, 4)
1970
(94, 4)
1971
(82, 4)
1972
(65, 4)
1973
(100, 4)
1975
(83, 4)
1976
(79, 4)
1978
(114, 4)
1979
(109, 4)
1980
(93, 4)
1982
(58, 4)
1983
(60, 4)
1984
(67, 4)
1985
(55, 4)
1986
(64, 4)
1987
(59, 4)
1988
(64, 4)
1990
(68, 4)
1991
(59, 4)
1992
(80, 4)
1994
(30, 4)
1945-roosevelt
(140, 4)
1945-truman
(12, 4)
1953-truman
(119, 4)
1953-eisenhower
(15, 4)
1961-eisenhower
(104, 4)
1961-kennedy
(82, 4)
1963-kennedy
(8, 4)
1963-johnson
(13, 4)
1969-johnson
(52, 4)
1969-nixon
(41, 4)
1974-nixon
(43, 4)
1974-ford
(25, 4)
1977-ford
(98, 4)
1977-carter
(26, 4)
1981-carter
(51, 4)
1981-reagan
(7, 4)
1989-reagan
(32, 4