In [7]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np

In [8]:
# step 1: get the data

# after 1994, orders are available as a bulk csv or json download at
# https://www.federalregister.gov/documents/search.csv?conditions%5Bcorrection%5D=0&conditions%5Bpresident%5D=&conditions%5Bpresidential_document_type%5D=executive_order&conditions%5Bsigning_date%5D%5Byear%5D=&conditions%5Btype%5D%5B%5D=PRESDOCU&fields%5B%5D=citation&fields%5B%5D=document_number&fields%5B%5D=end_page&fields%5B%5D=html_url&fields%5B%5D=pdf_url&fields%5B%5D=type&fields%5B%5D=subtype&fields%5B%5D=publication_date&fields%5B%5D=signing_date&fields%5B%5D=start_page&fields%5B%5D=title&fields%5B%5D=disposition_notes&fields%5B%5D=executive_order_number&order=executive_order&per_page=1000
# so this will only scrape 1937 (when the register starts) through 1994

# orders are on a list of pages, denoted by year (or year-president for inauguration years)
#base_url = 'https://www.archives.gov/federal-register/executive-orders/'
# likely not the most elegant way of doing it, but it gets the job done
years = [*range(1937, 1995)]
# Years that get two pages instead of one: each is listed once per president,
# as '<year>-<president>' (e.g. '1945-roosevelt' and '1945-truman').
inaug_years = [1945, 1953, 1961, 1963, 1969, 1974, 1977, 1981, 1989, 1993]
presidents = ['roosevelt', 'truman', 'eisenhower', 'kennedy', 'johnson',
             'nixon', 'ford', 'carter', 'reagan', 'bush', 'clinton']
# Every transition year pairs one president with the next one in the list, so
# there must be exactly one more president than transition years.
assert len(presidents) == len(inaug_years) + 1

# create the suffixes for inauguration years: walk the transition years together
# with each (outgoing, incoming) pair of neighbouring presidents
year_pres = []
for year, outgoing, incoming in zip(inaug_years, presidents, presidents[1:]):
    year_pres.append('{}-{}'.format(year, outgoing))
    year_pres.append('{}-{}'.format(year, incoming))

# Drop the transition years with an order-preserving filter. A set difference
# would also work, but set iteration order is not guaranteed, so the resulting
# page order (and therefore the row order of the scraped data) would only be
# chronological by accident.
split_years = set(inaug_years)
years = [str(y) for y in years if y not in split_years]
years = years + year_pres
print(years)

['1937', '1938', '1939', '1940', '1941', '1942', '1943', '1944', '1946', '1947', '1948', '1949', '1950', '1951', '1952', '1954', '1955', '1956', '1957', '1958', '1959', '1960', '1962', '1964', '1965', '1966', '1967', '1968', '1970', '1971', '1972', '1973', '1975', '1976', '1978', '1979', '1980', '1982', '1983', '1984', '1985', '1986', '1987', '1988', '1990', '1991', '1992', '1994', '1945-roosevelt', '1945-truman', '1953-truman', '1953-eisenhower', '1961-eisenhower', '1961-kennedy', '1963-kennedy', '1963-johnson', '1969-johnson', '1969-nixon', '1974-nixon', '1974-ford', '1977-ford', '1977-carter', '1981-carter', '1981-reagan', '1989-reagan', '1989-bush', '1993-bush', '1993-clinton']


In [9]:
def scraper(url, parser='html.parser', timeout=30):
    """Download one page and return the text of its <p>/<ul> elements.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    parser : str, optional
        Parser name passed to BeautifulSoup. It is pinned to the standard
        library parser by default so the result does not depend on which
        optional parsers happen to be installed.
    timeout : float, optional
        Seconds to wait for the server before giving up, so one stalled
        request cannot hang the whole scraping loop.

    Returns
    -------
    pandas.DataFrame
        A single column named 'data' holding the text of every <p> and <ul>
        element found inside the element with id 'block-system-main', in the
        order BeautifulSoup returns them.

    Raises
    ------
    requests.HTTPError
        If the server responds with an error status.
    ValueError
        If the page contains no element with id 'block-system-main'.
    """
    r = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page as if it were data.
    r.raise_for_status()

    soup = BeautifulSoup(r.text, parser)
    # Look the container up by id anywhere in the document rather than only
    # inside the first <div>, which breaks if the page layout shifts.
    mainblock = soup.find(id='block-system-main')
    if mainblock is None:
        raise ValueError("no element with id 'block-system-main' found at " + url)

    subs = mainblock.find_all(['p', 'ul'])
    this_list = [sub.text for sub in subs]

    return pd.DataFrame(this_list, columns=['data'])

In [12]:
base_url = 'https://www.archives.gov/federal-register/executive-orders/'
# Scrape every page into its own frame and concatenate once at the end.
# DataFrame.append was removed in pandas 2.0, and growing a frame inside the
# loop copies all previously collected rows on every iteration.
frames = []
for year in years:
    url = base_url + year
    frames.append(scraper(url))
    print(year)  # progress marker: one line per page fetched

# ignore_index renumbers the rows 0..n-1 across all pages
df_all = pd.concat(frames, ignore_index=True)

1937
1938
1939
1940
1941
1942
1943
1944
1946
1947
1948
1949
1950
1951
1952
1954
1955
1956
1957
1958
1959
1960
1962
1964
1965
1966
1967
1968
1970
1971
1972
1973
1975
1976
1978
1979
1980
1982
1983
1984
1985
1986
1987
1988
1990
1991
1992
1994
1945-roosevelt
1945-truman
1953-truman
1953-eisenhower
1961-eisenhower
1961-kennedy
1963-kennedy
1963-johnson
1969-johnson
1969-nixon
1974-nixon
1974-ford
1977-ford
1977-carter
1981-carter
1981-reagan
1989-reagan
1989-bush
1993-bush
1993-clinton


In [13]:
# row count before cleaning
print(df_all.shape)

#strip leading and trailing whitespaces for easier processing
df_all['data'] = df_all['data'].str.strip()

#replace empty strings and recurring non-data with NaN to drop
# (np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0)
df_all = df_all.replace(r'^\s*$', np.nan, regex=True)
df_all = df_all.replace('Top of Page', np.nan, regex=False)
# reassign instead of inplace=True so every cleaning step reads the same way
df_all = df_all.dropna(axis=0, how='all')

# row count after cleaning, then persist the result next to the notebook
print(df_all.shape)
df_all.to_csv('scraped_data.csv', index=False)

(12867, 1)
(10900, 1)
