In [1]:
import time, os, re

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from tqdm import tqdm

In [2]:
# Local path to the ChromeDriver executable. Written as a raw string so the
# backslashes of the Windows path are never interpreted as escape sequences
# (sequences such as "\G" or "\c" are invalid escapes and trigger warnings).
selenium_filepath = r"C:\GIT\SELENIUM_DRIVERS\chromedriver_win32\chromedriver.exe"

# Date range typed into the site's filter form. Despite the variable names,
# the values are written as MM/DD/YYYY.
start_yyyymmdd = "01/01/1990"
end_yyyymmdd = "09/29/2022"
# Root folder under which the scraped statements are saved.
save_root_dir = './Statements'

# Page that is opened in the browser and filtered for policy statements.
url = "https://www.federalreserve.gov/monetarypolicy/materials/"

In [3]:
# Start a Chrome session using the driver binary at `selenium_filepath` and
# load the page stored in `url`.
# NOTE(review): passing the driver path positionally depends on the installed
# Selenium version; newer releases are believed to expect a Service object
# instead — confirm before upgrading Selenium.
driver = webdriver.Chrome(selenium_filepath)
driver.get(url)

In [4]:
# Fill in the filter form on the loaded page and submit it.
# Elements are located with find_element(By.<strategy>, value): the older
# find_element_by_* helpers are no longer available in recent Selenium
# releases, while this form works on both old and new versions.

# set start date
start_date = driver.find_element(By.NAME, "startmodel")
start_date.clear()
start_date.send_keys(start_yyyymmdd)

# set end date
end_date = driver.find_element(By.NAME, "endmodel")
end_date.clear()
end_date.send_keys(end_yyyymmdd)

# select policy statements: the <input> whose parent <label> contains the text
statement_checkbox = driver.find_element(By.XPATH, "//label/input[contains(..,'Policy Statements')]")
statement_checkbox.click()

# apply filter
submit = driver.find_element(By.CSS_SELECTOR, ".btn.btn-primary")
submit.click()

In [5]:
# Determine how many result pages the filtered listing has.
# Uses find_element(By.<strategy>, value) because the find_element_by_*
# helpers are no longer available in recent Selenium releases.

# get the page control row
pagination = driver.find_element(By.CLASS_NAME, 'pagination')

# go to the last page to find the largest page number
last_page = pagination.find_element(By.LINK_TEXT, 'Last')
last_page.click()
# The pagination text is split into one entry per line; the third entry from
# the end is read as the highest page number.
# NOTE(review): presumably the last two entries are the "Next"/"Last"
# controls — verify against the rendered page if the site layout changes.
pages = pagination.text.split('\n')
largest_page = int(pages[-3])

### Scrape URLs

In [6]:
# Walk through every result page and collect the HTML link of each row whose
# material type is exactly 'Statement'.
# Uses find_element(s)(By.<strategy>, value) because the find_element(s)_by_*
# helpers are no longer available in recent Selenium releases.
statement_url_list = []
# go back to first page and start the loop
first_page = pagination.find_element(By.LINK_TEXT, 'First')
first_page.click()
next_page = pagination.find_element(By.LINK_TEXT, 'Next')
for page_index in range(largest_page):
    # now to get the items inside
    main = driver.find_element(By.CSS_SELECTOR, ".panel.panel-default") # get the app panel
    type_cells = main.find_elements(By.CSS_SELECTOR, ".fomc-meeting__month.col-xs-5.col-sm-3.col-md-4") # get the 2nd col
    link_cells = main.find_elements(By.CSS_SELECTOR, ".fomc-meeting__month.col-xs-5.col-sm-3.col-md-2") # get the 3rd col
    # Rows are paired by position: the n-th type cell belongs to the n-th link cell.
    for type_cell, link_cell in zip(type_cells, link_cells):
        if type_cell.text != 'Statement':
            continue
        # Only look up the HTML link on statement rows, so a non-statement
        # row without an HTML link cannot abort the whole scrape.
        html_link = link_cell.find_element(By.LINK_TEXT, 'HTML')
        statement_url_list.append(html_link.get_attribute('href'))
    # There is nothing to advance to after the final page has been read.
    if page_index < largest_page - 1:
        next_page.click()
print('Number of URLs: {}'.format(len(statement_url_list)))

Number of URLs: 222


### Scrape documents

In [7]:
def get_text_for_a_statement_from_2006_to_2022(soup):
    """Return the statement text of a page using the 2006-2022 layout.

    Looks up the ``div`` whose class is ``col-xs-12 col-sm-8 col-md-8`` in
    ``soup`` and returns its text with surrounding whitespace stripped.
    """
    body_div = soup.find('div', class_='col-xs-12 col-sm-8 col-md-8')
    raw_text = body_div.text
    return raw_text.strip()

def get_text_for_a_statement_from_1996_to_2005(soup):
    """Return the statement text of a page using the 1996-2005 layout.

    Collects every element matched by the CSS selector ``table td``, strips
    the text of each one, and joins the pieces with newlines.
    """
    cell_texts = []
    for cell in soup.select('table td'):
        cell_texts.append(cell.text.strip())
    return '\n'.join(cell_texts)

def get_text_for_a_statement_from_1994_to_1995(soup):
    """Return the statement text of a page using the 1994-1995 layout.

    Looks up the ``div`` with ``id="content"`` in ``soup`` and returns its
    text with surrounding whitespace stripped.
    """
    content_div = soup.find('div', id="content")
    raw_text = content_div.text
    return raw_text.strip()

In [8]:
# Download every statement page, extract its text with the parser that matches
# the page layout of its year, and save it as
# <save_root_dir>/<yyyy>/<yyyymmdd>.txt (first line = source URL).
# NOTE(review): the file name is derived from the date only, so two URLs with
# the same date write to the same file and the later one wins. The recorded
# run reports 222 URLs but 213 saved files — confirm whether that is intended.
doc_count = 0
for statement_url in tqdm(statement_url_list):
    # The date is the 8-digit run inside the URL; when several are present the
    # last one is used. It is recomputed for every URL so that a URL without a
    # date can never silently reuse (and overwrite) the previous document.
    date_tokens = [token for token in re.findall('[0-9]+', statement_url) if len(token) == 8]
    if not date_tokens:
        print('Skipping URL without an 8-digit date: {}'.format(statement_url))
        continue
    yyyymmdd = date_tokens[-1]

    # Bound the wait for a response and fail loudly on HTTP errors instead of
    # parsing (and saving) an error page as if it were a statement.
    statement_resp = requests.get(statement_url, timeout=30)
    statement_resp.raise_for_status()
    statement_soup = BeautifulSoup(statement_resp.content, 'lxml')

    # Pick the extractor by year, since the page layout differs between periods.
    year = int(yyyymmdd[:4])
    if year >= 2006:
        article = get_text_for_a_statement_from_2006_to_2022(statement_soup)
    elif year >= 1996:
        article = get_text_for_a_statement_from_1996_to_2005(statement_soup)
    else:
        article = get_text_for_a_statement_from_1994_to_1995(statement_soup)

    save_dir = os.path.join(save_root_dir, yyyymmdd[:4])
    os.makedirs(save_dir, exist_ok=True)
    save_filepath = os.path.join(save_dir, '{}.txt'.format(yyyymmdd))
    with open(save_filepath, "w", encoding='utf-8-sig') as file:
        file.write("{}\n".format(statement_url))
        file.write(article)
    doc_count += 1

print('Number of documents: {}'.format(doc_count))

100%|████████████████████████████████████████████████████████████████████████████████| 222/222 [02:02<00:00,  1.81it/s]

Number of documents: 222





### Validation of scraping results

In [9]:
import os
from glob import glob
from tqdm import tqdm
import pandas as pd

In [11]:
# List every saved statement file. glob() returns paths in an OS-dependent
# order, so the result is sorted to keep the row order of the validation
# table reproducible across machines.
filepaths = sorted(glob('./Statements/*/*.txt'))
print('Number of saved documents (excluding duplicated data): {}'.format(len(filepaths)))

Number of saved documents (excluding duplicated data): 213


In [39]:
# Read every saved file back into one table: the first line of a file is the
# source URL, the remainder is the statement text.
records = []
for filepath in tqdm(filepaths):
    filename = os.path.basename(filepath)
    with open(filepath, 'r', encoding='utf-8-sig') as file:
        # Named `doc_url` (not `url`) so this loop does not overwrite the
        # notebook-level `url` that the browser cell loads.
        doc_url = file.readline().rstrip('\n')
        doc = file.read()
    num_of_characters = len(doc)

    records.append((filename, doc_url, num_of_characters, doc))

df = pd.DataFrame(records, columns=['filename', 'url', 'num_of_characters', 'document'])
df

100%|██████████████████████████████████████████████████████████████████████████████| 213/213 [00:00<00:00, 3258.15it/s]


Unnamed: 0,filename,url,num_of_characters,document
0,19940204.txt,https://www.federalreserve.gov/fomc/19940204de...,779,"Release Date: February 4, 1994 \nFor immediate..."
1,19940322.txt,https://www.federalreserve.gov/fomc/19940322de...,396,"Release Date: March 22, 1994 \nFor immediate ..."
2,19940418.txt,https://www.federalreserve.gov/fomc/19940418de...,375,"Release Date: April 18, 1994 \nFor immediate ..."
3,19940517.txt,https://www.federalreserve.gov/fomc/19940517de...,1265,"Release Date: May 17, 1994 \nFor immediate re..."
4,19940816.txt,https://www.federalreserve.gov/fomc/19940816de...,1334,"Release Date: August 16, 1994 \nFor immediate ..."
...,...,...,...,...
208,20220126.txt,https://www.federalreserve.gov/newsevents/pres...,2808,Indicators of economic activity and employment...
209,20220316.txt,https://www.federalreserve.gov/newsevents/pres...,2307,Indicators of economic activity and employment...
210,20220504.txt,https://www.federalreserve.gov/newsevents/pres...,2464,Although overall economic activity edged down ...
211,20220615.txt,https://www.federalreserve.gov/newsevents/pres...,2447,Overall economic activity appears to have pick...


In [40]:
# Order the documents from shortest to longest so suspiciously short scrapes
# surface first. The sorted frame is rebound to `df` rather than sorted with
# inplace=True, so the object displayed by the earlier cell is not mutated.
df = df.sort_values(by=['num_of_characters'])
df

Unnamed: 0,filename,url,num_of_characters,document
2,19940418.txt,https://www.federalreserve.gov/fomc/19940418de...,375,"Release Date: April 18, 1994 \nFor immediate ..."
1,19940322.txt,https://www.federalreserve.gov/fomc/19940322de...,396,"Release Date: March 22, 1994 \nFor immediate ..."
84,20070810.txt,https://www.federalreserve.gov/newsevents/pres...,527,The Federal Reserve is providing liquidity to ...
7,19950706.txt,https://www.federalreserve.gov/fomc/19950706de...,578,"Release Date: July 6, 1995\n \nFor immediate r..."
8,19951219.txt,https://www.federalreserve.gov/fomc/19951219de...,671,"Release Date: December 19, 1995\n \nFor immedi..."
...,...,...,...,...
141,20140129.txt,https://www.federalreserve.gov/newsevents/pres...,5786,Information received since the Federal Open Ma...
140,20131218.txt,https://www.federalreserve.gov/newsevents/pres...,5998,Information received since the Federal Open Ma...
145,20140730.txt,https://www.federalreserve.gov/newsevents/pres...,6002,Information received since the Federal Open Ma...
142,20140319.txt,https://www.federalreserve.gov/newsevents/pres...,6188,Information received since the Federal Open Ma...


In [50]:
# Print the URL and full text of the first five rows of `df`, each followed
# by a '===' separator, for manual inspection.
first_five = df[:5]
for doc_url, doc_text in zip(first_five['url'], first_five['document']):
    print(doc_url)
    print(doc_text)
    print('===\n')

https://www.federalreserve.gov/fomc/19940418default.htm
Release Date: April 18, 1994  
For immediate release


Chairman Alan Greenspan announced today that the Federal Reserve will increase slightly the degree of pressure on reserve positions. This action is expected to be associated with a small increase in short-term money market interest rates. 



Home | Press releases
Accessibility | Contact Us

Last update: April 20, 2007
===

https://www.federalreserve.gov/fomc/19940322default.htm
Release Date: March 22, 1994  
For immediate release


 Chairman Alan Greenspan announced today that the Federal Open Market Committee decided to increase slightly the degree of pressure on reserve positions. This action is expected to be associated with a small increase in short-term money market interest rates.




Home | Press releases
Accessibility | Contact Us

Last update: April 20, 2007
===

https://www.federalreserve.gov/newsevents/press/monetary/20070810a.htm
The Federal Reserve is providing l