In [1]:
import time, os, re, pickle, argparse, shutil
from bs4 import BeautifulSoup
from datetime import datetime
from glob import glob
from tqdm import tqdm
tqdm.pandas()
import pandas as pd
import requests
from daterangeparser import parse
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Date window (MM/DD/YYYY) typed into the filter form of the materials page.
start_mmddyyyy = "01/01/1993"
end_mmddyyyy = "08/22/2023"

# Raw string so the backslashes of the Windows path are never interpreted as
# escape sequences (``\G``, ``\S`` and ``\c`` are invalid escapes otherwise).
chromedriver_filepath = r"C:\GIT\SELENIUM_DRIVERS\chromedriver-win64\chromedriver.exe"
# Root directory under which the scraped minutes are written, one folder per year.
save_root_dir = './Minutes'

# Page that is opened in the browser and filtered to list the minutes.
url = "https://www.federalreserve.gov/monetarypolicy/materials/"


def extract_begin_end_dates(date_range):
    """Turn a scraped meeting-date string into a ``(begin, end)`` pair.

    Three input shapes are handled:

    * no hyphen: a single day. The first value returned by ``parse`` is
      used for both ends of the range.
    * hyphen and slash, e.g. ``"Jan/Feb 31-1, 2023"``: the five tokens
      (begin month, end month, begin day, end day, year) are rearranged
      into ``"Jan 31-Feb 1, 2023"`` before being handed to ``parse``.
    * hyphen only: passed to ``parse`` unchanged.

    Args:
        date_range: Date or date-range text as scraped from the page.

    Returns:
        The two-element result of ``parse`` (imported from daterangeparser),
        or the same start value twice for a single day.

    Raises:
        ValueError: If the slash form does not split into exactly five tokens.
    """
    if '-' not in date_range:
        begin, _ = parse(date_range)
        return begin, begin

    if '/' in date_range:
        # split() without arguments tolerates doubled, leading/trailing and
        # non-breaking whitespace, which scraped text frequently contains and
        # which made the former split(' ') produce empty tokens.
        tokens = date_range.replace(',', ' ').replace('-', ' ').replace('/', ' ').split()
        begin_month, end_month, begin_date, end_date, year = tokens
        return parse(f'{begin_month} {begin_date}-{end_month} {end_date}, {year}')

    return parse(date_range)


def _start_chrome(chromedriver_filepath):
    """Launch Chrome with the given chromedriver binary on Selenium 3 or 4."""
    from selenium.webdriver.chrome.service import Service

    try:
        # Selenium 4 receives the driver path through a Service object.
        return webdriver.Chrome(service=Service(chromedriver_filepath))
    except TypeError:
        # Selenium 3 has no ``service`` keyword; use the legacy argument.
        return webdriver.Chrome(executable_path=chromedriver_filepath)


def prepare_resources_for_scraping(chromedriver_filepath, url, start_mmddyyyy, end_mmddyyyy):
    """Open the page in Chrome, apply the minutes filter and find the page count.

    The function fills the ``startmodel`` / ``endmodel`` inputs, ticks the
    checkbox whose label contains 'Minutes (1993-Present)', clicks the
    ``.btn.btn-primary`` button, then jumps to the last result page to read
    the largest page number from the pagination control.

    Args:
        chromedriver_filepath: Path to the chromedriver executable.
        url: Page to open.
        start_mmddyyyy: Start date typed into the ``startmodel`` input.
        end_mmddyyyy: End date typed into the ``endmodel`` input.

    Returns:
        ``(driver, pagination, largest_page)``: the live WebDriver, the
        pagination element and the number of result pages as an int. The
        caller owns the driver and is responsible for quitting it.

    Raises:
        Whatever Selenium raises when an element is missing; the browser is
        closed before the exception propagates.
    """
    # find_element(By.X, ...) replaces the find_element_by_* helpers, which
    # were removed in Selenium 4; this form is available in Selenium 3 too.
    from selenium.webdriver.common.by import By

    driver = _start_chrome(chromedriver_filepath)
    try:
        driver.get(url)
        time.sleep(5)  # fixed pause before interacting with the form

        # set start date
        start_date = driver.find_element(By.NAME, "startmodel")
        start_date.clear()
        start_date.send_keys(start_mmddyyyy)

        # set end date
        end_date = driver.find_element(By.NAME, "endmodel")
        end_date.clear()
        end_date.send_keys(end_mmddyyyy)

        # select items
        xpath_strings = "//label/input[contains(..,'Minutes (1993-Present)')]"
        driver.find_element(By.XPATH, xpath_strings).click()

        # apply filter
        driver.find_element(By.CSS_SELECTOR, ".btn.btn-primary").click()

        # get the page control row
        pagination = driver.find_element(By.CLASS_NAME, 'pagination')

        # go to the last page to find the largest page number
        pagination.find_element(By.LINK_TEXT, 'Last').click()
        pages = pagination.text.split('\n')
        # The third entry from the end is read as the page number; presumably
        # the two entries after it are the 'Next' and 'Last' links.
        largest_page = int(pages[-3])
    except BaseException:
        # Do not leave an orphaned browser behind when setup fails or is
        # interrupted; the exception is re-raised unchanged.
        driver.quit()
        raise

    return driver, pagination, largest_page

def extract_meetingdate_documentdate_minuteurl(soup):
    """Pull the meeting date, document date and HTML minutes URL from one row.

    Args:
        soup: Parsed element for a single result row. It must support
            ``select``; the matched elements must expose ``.text`` and the
            anchors must support ``['href']``.

    Returns:
        ``(meeting_date, document_date, minute_url)`` where the dates are the
        text of the first ``strong`` and first ``em`` element, and the URL is
        the href of the first anchor whose text contains 'HTML', resolved
        against https://www.federalreserve.gov/.

    Raises:
        IndexError: If the row has no ``strong``, ``em`` or 'HTML' anchor.
        KeyError: If the 'HTML' anchor has no ``href``.
    """
    from urllib.parse import urljoin

    meeting_date = soup.select('strong')[0].text
    document_date = soup.select('em')[0].text
    html_links = [anchor for anchor in soup.select('a') if 'HTML' in anchor.text]
    # urljoin copes with root-relative ('/x'), relative ('x') and absolute
    # hrefs alike, where plain string formatting produced 'gov//x' or glued
    # the host in front of an already absolute URL.
    minute_url = urljoin('https://www.federalreserve.gov/', html_links[0]['href'])
    return meeting_date, document_date, minute_url

def scrape_URLs_and_meeting_dates_and_document_dates(driver, pagination, largest_page):
    """Walk every result page and collect the entries labelled 'Minutes'.

    Args:
        driver: Live Selenium WebDriver showing the filtered result list.
        pagination: The pagination element of that page; its 'First' and
            'Next' links are used for navigation.
        largest_page: Number of result pages to visit.

    Returns:
        ``(minute_url_list, meeting_date_list, document_date_list)``, three
        parallel lists with one entry per collected row.
    """
    # find_element(By.X, ...) replaces the find_element_by_* helpers, which
    # were removed in Selenium 4; this form is available in Selenium 3 too.
    from selenium.webdriver.common.by import By

    meeting_date_list, document_date_list, minute_url_list = [], [], []
    # go back to first page and start the loop
    pagination.find_element(By.LINK_TEXT, 'First').click()
    next_page = pagination.find_element(By.LINK_TEXT, 'Next')

    for _ in range(largest_page):
        # Result is unused: the lookup only raises if the panel is missing.
        driver.find_element(By.CSS_SELECTOR, ".panel.panel-default")
        soup = BeautifulSoup(driver.page_source, "html.parser")

        # The first matching row is skipped (presumably a header row).
        rows = soup.select('div.row.fomc-meeting')[1:]
        for one_row in rows:
            try:
                label = one_row.select('.fomc-meeting__month.col-xs-5.col-sm-3.col-md-4')[0].text.strip()
                if label != 'Minutes':
                    continue
                # Extract minutes written in HTML format
                meeting_date, document_date, minute_url = extract_meetingdate_documentdate_minuteurl(one_row)
            except Exception:
                # Best effort: rows lacking the expected elements are skipped.
                # Unlike a bare ``except``, this lets KeyboardInterrupt and
                # SystemExit stop the scrape.
                continue
            meeting_date_list.append(meeting_date)
            document_date_list.append(document_date)
            minute_url_list.append(minute_url)

        next_page.click()
    print('Number of URLs: {}'.format(len(minute_url_list)))

    return minute_url_list, meeting_date_list, document_date_list

def get_text_for_a_minute_from_201201_to_202209(soup):
    """Return the stripped text of the div with class 'col-xs-12 col-sm-8 col-md-9'.

    Raises AttributeError when ``soup.find`` returns ``None`` (no such div).
    """
    content_div = soup.find('div', class_='col-xs-12 col-sm-8 col-md-9')
    raw_text = content_div.text
    return raw_text.strip()

def get_text_for_a_minute_from_200710_to_201112(soup):
    """Return the stripped text of the div whose id is 'leftText'.

    Raises AttributeError when ``soup.find`` returns ``None`` (no such div).
    """
    left_text_div = soup.find('div', id="leftText")
    raw_text = left_text_div.text
    return raw_text.strip()

def get_text_for_a_minute_from_199601_to_200709(soup):
    """Return the stripped text of every 'table td' cell, joined by newlines.

    Yields an empty string when the selector matches nothing.
    """
    cell_texts = []
    for cell in soup.select('table td'):
        cell_texts.append(cell.text.strip())
    return '\n'.join(cell_texts)

def get_text_for_a_minute_from_199401_to_199512(soup):
    """Return the stripped text of the div whose id is 'content'.

    Raises AttributeError when ``soup.find`` returns ``None`` (no such div).
    """
    content_div = soup.find('div', id="content")
    raw_text = content_div.text
    return raw_text.strip()

# One or more whitespace characters. Raw string so that ``\s`` reaches the
# regex engine untouched instead of being an invalid string escape.
doublespace_pattern = re.compile(r'\s+')


def remove_doublespaces(document):
    """Collapse each whitespace run in *document* to one space and trim the ends."""
    return doublespace_pattern.sub(' ', document).strip()

In [2]:
# Collect the minutes URLs with the browser, then download and save each one.
driver, pagination, largest_page = prepare_resources_for_scraping(chromedriver_filepath, url, start_mmddyyyy, end_mmddyyyy)
try:
    minute_url_list, meeting_date_list, document_date_list = scrape_URLs_and_meeting_dates_and_document_dates(driver, pagination, largest_page)
finally:
    # The browser is only needed to gather the URLs; always close it so no
    # Chrome/chromedriver process is left running.
    driver.quit()

doc_count = 0
error_list = []
records = zip(minute_url_list, meeting_date_list, document_date_list)
# total= lets tqdm draw a real progress bar; zip() has no length of its own.
for minute_url, meeting_date, document_date in tqdm(records, total=len(minute_url_list)):

    # Scrape minutes
    try:
        # A timeout prevents one stalled connection from hanging the whole run,
        # and raise_for_status() keeps HTTP error pages from being parsed and
        # saved as if they were minutes.
        minute_resp = requests.get(minute_url, timeout=30)
        minute_resp.raise_for_status()
    except requests.RequestException:
        error_list.append((minute_url, meeting_date, document_date))
        continue
    minute_soup = BeautifulSoup(minute_resp.content, 'lxml')

    document_date_yyyymmdd = datetime.strftime(datetime.strptime(document_date, "%B %d, %Y"), "%Y%m%d")
    yearmonth = int(document_date_yyyymmdd[:6])
    # The extractor is chosen by the document's upload month (YYYYMM).
    try:
        if yearmonth >= 201201:
            doc = get_text_for_a_minute_from_201201_to_202209(minute_soup)
        elif yearmonth >= 200710:
            doc = get_text_for_a_minute_from_200710_to_201112(minute_soup)
        elif yearmonth >= 199601:
            doc = get_text_for_a_minute_from_199601_to_200709(minute_soup)
        else:
            doc = get_text_for_a_minute_from_199401_to_199512(minute_soup)
    except Exception:
        # Best effort: a page without the expected container is recorded and
        # skipped. Unlike a bare ``except``, Ctrl-C still stops the loop.
        error_list.append((minute_url, meeting_date, document_date))
        continue

    # Clean
    doc = remove_doublespaces(doc)

    meeting_date_start, meeting_date_end = extract_begin_end_dates(meeting_date)
    meeting_date_start_string = meeting_date_start.strftime("%Y-%m-%d")
    meeting_date_end_string = meeting_date_end.strftime("%Y-%m-%d")

    # Save data, grouped into one folder per upload year
    save_dir = os.path.join(save_root_dir, document_date_yyyymmdd[:4])
    os.makedirs(save_dir, exist_ok=True)
    save_filepath = os.path.join(save_dir, 'MeetingDate={}-{}_UploadedOn={}.txt'\
                                 .format(meeting_date_start_string, meeting_date_end_string, document_date_yyyymmdd))
    with open(save_filepath, "w", encoding='utf-8-sig') as file:
        file.write(doc)
    doc_count += 1

# Counts the files on disk, so entries that map to the same filename count once.
print('Saved {} unique documents under {}'.format(len(glob('{}/*/*.txt'.format(save_root_dir))), save_root_dir))

# Save errors
if error_list:
    save_filepath = 'ScrapingErrors.csv'
    pd.DataFrame(error_list, columns=['url', 'meeting_date', 'document_date']).to_csv(save_filepath, index=False)
    print('Created {}'.format(save_filepath))

Number of URLs: 266


266it [03:14,  1.37it/s]

Saved 254 unique documents under ./Minutes
Created ScrapingErrors.csv



