In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
from selenium import webdriver
import time, os, re
from tqdm import tqdm

In [2]:
# --- Configuration -----------------------------------------------------------
# Path to the local chromedriver binary. Written as a raw string so that the
# Windows backslashes are never interpreted as escape sequences ("\G", "\S",
# "\c" are invalid escapes and trigger a warning on current Python versions).
selenium_filepath = r"C:\GIT\SELENIUM_DRIVERS\chromedriver_win32\chromedriver.exe"

# Date range typed into the search form.
# NOTE: despite the "yyyymmdd" suffix in the names, the values are written as
# MM/DD/YYYY.
start_yyyymmdd = "01/01/1990"
end_yyyymmdd = "09/29/2022"

# Root folder under which one sub-folder per year is created for the saved texts.
save_root_dir = './Minutes'

# Search page for FOMC meeting materials.
url = "https://www.federalreserve.gov/monetarypolicy/materials/"

In [3]:
# Start a Chrome session using the chromedriver binary at `selenium_filepath`
# and load the materials search page stored in `url`.
# NOTE(review): the driver path is passed positionally, which presumably relies on
# the older Selenium constructor signature — TODO confirm against the installed version.
driver = webdriver.Chrome(selenium_filepath)
driver.get(url)

In [4]:
# Fill in the search form on the materials page and submit it.

# set start date: clear whatever is pre-filled, then type the configured value
start_date = driver.find_element_by_name("startmodel")
start_date.clear()
start_date.send_keys(start_yyyymmdd)

# set end date: same procedure for the end of the range
end_date = driver.find_element_by_name("endmodel")
end_date.clear()
end_date.send_keys(end_yyyymmdd)

# tick the material-type checkbox whose label contains 'Minutes (1993-Present)'
# (the XPath picks the <input> that sits inside the matching <label>)
statement_checkbox = driver.find_element_by_xpath("//label/input[contains(..,'Minutes (1993-Present)')]")
statement_checkbox.click()

# apply filter by clicking the element matched by the ".btn.btn-primary" selector
submit = driver.find_element_by_css_selector(".btn.btn-primary")
submit.click()

In [5]:
# Determine how many result pages the filtered search produced.

# get the page control row
pagination = driver.find_element_by_class_name('pagination')

# go to the last page to find the largest page number
last_page = pagination.find_element_by_link_text('Last')
last_page.click()
# The visible text of the pagination row is split into one entry per line.
# NOTE(review): index -3 presumably works because the row ends with
# "<last page number>", "Next", "Last" — TODO confirm against the live page.
pages = pagination.text.split('\n')
largest_page = int(pages[-3])

### Scrape URLs

In [6]:
# Walk through every result page and collect the 'HTML' link of each row whose
# material type is 'Minutes'.
statement_url_list = []
# go back to first page and start the loop
first_page = pagination.find_element_by_link_text('First')
first_page.click()
next_page = pagination.find_element_by_link_text('Next')
# NOTE(review): nothing waits for the result table to refresh after a click.
# The recorded run collected 258 URLs but only 236 files were saved, so the list
# presumably contains duplicates — possibly pages read before they refreshed;
# TODO confirm and add an explicit wait if so.
for page_number in range(largest_page):
    # now to get the items inside
    main = driver.find_element_by_css_selector(".panel.panel-default")  # get the app panel
    type_cells = main.find_elements_by_css_selector(".fomc-meeting__month.col-xs-5.col-sm-3.col-md-4")  # get the 2nd col
    material_types = [cell.text for cell in type_cells]  # to get the words
    material_links = main.find_elements_by_css_selector(".fomc-meeting__month.col-xs-5.col-sm-3.col-md-2")  # get the 3rd col

    html_elements = []
    for link_cell in material_links:
        # The plural finder returns an empty list when the cell has no 'HTML'
        # link, so no exception has to be swallowed (the previous bare `except:`
        # also hid unrelated failures and keyboard interrupts).
        html_links = link_cell.find_elements_by_link_text('HTML')
        if html_links:
            html_elements.append(html_links[0])

    # add url to statement_url_list if it is a 'Minutes' row
    # NOTE(review): this pairs the n-th material type with the n-th 'HTML' link
    # found, which assumes every row carries an 'HTML' link — TODO confirm.
    statement_url_list.extend(
        html_elements[row_index].get_attribute('href')
        for row_index, material_type in enumerate(material_types)
        if material_type == 'Minutes'
    )
    next_page.click()
print('Number of URLs: {}'.format(len(statement_url_list)))

Number of URLs: 258


### Scrape documents

In [7]:
def get_text_for_a_statement_from_201201_to_202209(soup):
    """Extract the article text from the page layout used from 2012-01 onwards.

    Looks up the ``<div>`` whose class is ``'col-xs-12 col-sm-8 col-md-9'`` via
    ``soup.find`` and returns its text with surrounding whitespace removed.
    If ``find`` returns ``None`` the attribute access fails with AttributeError.
    """
    article_div = soup.find('div', class_='col-xs-12 col-sm-8 col-md-9')
    raw_text = article_div.text
    return raw_text.strip()

def get_text_for_a_statement_from_200710_to_201112(soup):
    """Extract the article text from the page layout used 2007-10 to 2011-12.

    Looks up the ``<div>`` with ``id="leftText"`` via ``soup.find`` and returns
    its text with surrounding whitespace removed. If ``find`` returns ``None``
    the attribute access fails with AttributeError.
    """
    left_text_div = soup.find('div', id="leftText")
    raw_text = left_text_div.text
    return raw_text.strip()

def get_text_for_a_statement_from_199601_to_200709(soup):
    """Extract the article text from the table-based layout used 1996-01 to 2007-09.

    Collects every element matched by the CSS selector ``'table td'``, strips
    the text of each one, and joins the pieces with newlines. Returns an empty
    string when the selector matches nothing.
    """
    cell_texts = []
    for cell in soup.select('table td'):
        cell_texts.append(cell.text.strip())
    return '\n'.join(cell_texts)

def get_text_for_a_statement_from_199401_to_199512(soup):
    """Extract the article text from the oldest page layout (before 1996-01).

    Looks up the ``<div>`` with ``id="content"`` via ``soup.find`` and returns
    its text with surrounding whitespace removed. If ``find`` returns ``None``
    the attribute access fails with AttributeError.
    """
    content_div = soup.find('div', id="content")
    raw_text = content_div.text
    return raw_text.strip()

In [8]:
# Download every collected URL, extract the article text with the parser that
# matches the page layout of its date, and save it as <root>/<YYYY>/<YYYYMMDD>.txt.
REQUEST_TIMEOUT_SECONDS = 30  # avoid hanging forever on a stalled connection

for statement_url in tqdm(statement_url_list):
    # The meeting date is the last stand-alone 8-digit number in the URL.
    # It is resolved afresh for each URL so that a URL without a date can never
    # silently reuse the date of the previous iteration (and overwrite its file).
    date_tokens = re.findall(r'(?<![0-9])[0-9]{8}(?![0-9])', statement_url)
    if not date_tokens:
        tqdm.write('Skipping URL without an 8-digit date: {}'.format(statement_url))
        continue
    yyyymmdd = date_tokens[-1]

    statement_resp = requests.get(statement_url, timeout=REQUEST_TIMEOUT_SECONDS)
    # Fail loudly on HTTP errors instead of parsing an error page.
    statement_resp.raise_for_status()
    statement_soup = BeautifulSoup(statement_resp.content, 'lxml')

    # Pick the extractor by year-month; anything older than 1996-01 falls
    # through to the oldest layout.
    yearmonth = int(yyyymmdd[:6])
    if yearmonth >= 201201:
        article = get_text_for_a_statement_from_201201_to_202209(statement_soup)
    elif yearmonth >= 200710:
        article = get_text_for_a_statement_from_200710_to_201112(statement_soup)
    elif yearmonth >= 199601:
        article = get_text_for_a_statement_from_199601_to_200709(statement_soup)
    else:
        article = get_text_for_a_statement_from_199401_to_199512(statement_soup)

    # One folder per year; exist_ok avoids the check-then-create race.
    save_dir = os.path.join(save_root_dir, yyyymmdd[:4])
    os.makedirs(save_dir, exist_ok=True)
    save_filepath = os.path.join(save_dir, '{}.txt'.format(yyyymmdd))
    # First line of the file is the source URL, the rest is the article text.
    # A later URL with the same date overwrites the earlier file.
    with open(save_filepath, "w", encoding='utf-8-sig') as file:
        file.write("{}\n".format(statement_url))
        file.write(article)

100%|████████████████████████████████████████████████████████████████████████████████| 258/258 [02:33<00:00,  1.68it/s]


### Validation of scraping results

In [9]:
import os
from glob import glob
from tqdm import tqdm
import pandas as pd

In [10]:
# Collect every saved document (./Minutes/<year>/<date>.txt).
# glob returns paths in an arbitrary, platform-dependent order, so sort them to
# keep the row order of the DataFrame built from this list reproducible.
filepaths = sorted(glob('./Minutes/*/*.txt'))
print('Number of saved documents (excluding duplicated data): {}'.format(len(filepaths)))

Number of saved documents (excluding duplicated data): 236


In [11]:
# Load every saved document into one DataFrame for inspection.
records = []
for filepath in tqdm(filepaths):
    filename = os.path.basename(filepath)
    with open(filepath, 'r', encoding='utf-8-sig') as file:
        # The first line holds the source URL written by the scraping step.
        # It is stored under its own name so the configuration variable `url`
        # (the search page address) is not overwritten by this loop.
        source_url = file.readline().replace('\n', '')
        # Everything after the first line is the document body.
        doc = file.read()
    num_of_characters = len(doc)

    records.append((filename, source_url, num_of_characters, doc))

df = pd.DataFrame(records, columns=['filename', 'url', 'num_of_characters', 'document'])
df

100%|████████████████████████████████████████████████████████████████████████████████| 236/236 [00:04<00:00, 56.48it/s]


Unnamed: 0,filename,url,num_of_characters,document
0,19930203.txt,https://www.federalreserve.gov/fomc/MINUTES/19...,74824,Minutes of the Federal Open Market Committee\n...
1,19930323.txt,https://www.federalreserve.gov/fomc/MINUTES/19...,42105,Minutes of the Federal Open Market Committee \...
2,19930518.txt,https://www.federalreserve.gov/fomc/MINUTES/19...,36539,Minutes of the Federal Open Market Committee\n...
3,19930707.txt,https://www.federalreserve.gov/fomc/MINUTES/19...,50646,Minutes of the Federal Open Market Committee \...
4,19930817.txt,https://www.federalreserve.gov/fomc/MINUTES/19...,33768,Minutes of the Federal Open Market Committee\n...
...,...,...,...,...
231,20211215.txt,https://www.federalreserve.gov/monetarypolicy/...,62888,Minutes of the Federal Open Market Committee\n...
232,20220126.txt,https://www.federalreserve.gov/monetarypolicy/...,90001,Minutes of the Federal Open Market Committee\n...
233,20220316.txt,https://www.federalreserve.gov/monetarypolicy/...,57810,Minutes of the Federal Open Market Committee\n...
234,20220504.txt,https://www.federalreserve.gov/monetarypolicy/...,52473,Minutes of the Federal Open Market Committee\n...


In [12]:
# Order the documents from shortest to longest so that suspiciously short
# (possibly truncated) scrapes surface at the top.
# Rebinding instead of `inplace=True` leaves the frame object displayed by the
# earlier cell untouched while keeping the name `df` for the following cells.
df = df.sort_values(by=['num_of_characters'])
df

Unnamed: 0,filename,url,num_of_characters,document
100,20050809.txt,https://www.federalreserve.gov/fomc/minutes/20...,21795,Minutes of the Federal Open Market Committee\n...
79,20021210.txt,https://www.federalreserve.gov/fomc/minutes/20...,22287,Minutes of the Federal Open Market Committee\n...
101,20050920.txt,https://www.federalreserve.gov/fomc/minutes/20...,23214,Minutes of the Federal Open Market Committee\n...
94,20041110.txt,https://www.federalreserve.gov/fomc/minutes/20...,23575,Minutes of the Federal Open Market Committee\n...
102,20051101.txt,https://www.federalreserve.gov/fomc/minutes/20...,24501,Minutes of the Federal Open Market Committee\n...
...,...,...,...,...
96,20050202.txt,https://www.federalreserve.gov/fomc/minutes/20...,88402,Minutes of the Federal Open Market Committee\n...
232,20220126.txt,https://www.federalreserve.gov/monetarypolicy/...,90001,Minutes of the Federal Open Market Committee\n...
184,20160127.txt,https://www.federalreserve.gov/monetarypolicy/...,90638,Minutes of the Federal Open Market Committee\n...
208,20190130.txt,https://www.federalreserve.gov/monetarypolicy/...,94502,Minutes of the Federal Open Market Committee\n...


In [13]:
# Print the URL and full text of the first five rows of `df` for a manual check.
first_five = df[:5]
for doc_url, doc_text in zip(first_five['url'], first_five['document']):
    print(doc_url)
    print(doc_text)
    print('===\n')

https://www.federalreserve.gov/fomc/minutes/20050809.htm
Minutes of the Federal Open Market Committee

August 9, 2005


A meeting of the Federal Open Market Committee was held in the offices of the Board of Governors of the Federal Reserve System in Washington, D.C., on Tuesday, August 9, 2005 at 9:00 a.m.
Present:

Mr. Greenspan, Chairman

Mr. Geithner, Vice Chairman

		Ms. Bies

		Mr. Ferguson

		Mr. Fisher		

		Mr. Kohn

		Mr. Moskow

		Mr. Olson

		Mr. Santomero

		Mr. Stern


Messrs. Guynn and Lacker, Mses. Pianalto and Yellen, 

       Alternate Members of the Federal Open Market Committee


Mr. Hoenig, Ms. Minehan, and Mr. Poole, Presidents of the Federal Reserve 

Banks of Kansas City, Boston, and St. Louis, respectively 





		Ms. Danker, Deputy Secretary

		Ms. Smith, Assistant Secretary

		Mr. Alvarez, General Counsel

		Ms. Johnson, Economist	

		Mr. Stockton, Economist
Messrs. Connors and Madigan, Ms. Mester, Messrs. Rosenblum, Tracy, and 

  	Wilcox, Associate Economists