# Scrape Federal Reserve calendar events

This notebook scrapes the Fed's calendar for June through September 2017. Events include speeches, FOMC meetings, testimonies, Beige Book releases, and statistical releases; each event type is counted per day. I would have generalized this process into a function or class; however, in the interest of time, I resorted to copying and pasting each month's scraping process. Additionally, no two months had the same composition of events, which made the process even harder to generalize.

### Import relevant libraries

In [None]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
from datetime import datetime
import re

from collections import Counter
import itertools
import string

### Get urls for each month (federalreserve.gov/newsevents)

In [None]:
# Lower-case month names exactly as they appear in the calendar page URLs.
months = ['june', 'july', 'august', 'september']

# One 2017 calendar page URL per month. The month cells below hard-code their
# own URL, so this list serves as a reference of the pages being scraped.
urls = ['https://www.federalreserve.gov/newsevents/2017-' + m + '.htm' for m in months]

# Accumulates one date-indexed DataFrame per scraped month. Every month cell
# appends to this list, so it has to exist before any of them run.
dflist = []

The process for each month is fairly straightforward and uniform. The body of the page is converted to text, and then each div tag is iterated through. Each subheading encountered increments a counter, and that counter determines which list the subsequent events are added to. From here, each list is converted to a pandas dataframe, and all of these are merged into one monthly dataframe of all events.

## September

In [None]:
# Scrape the September 2017 calendar page and append a date-indexed frame of
# per-day event counts (one column per event type) to `dflist`.
month = 'September'
url = 'https://www.federalreserve.gov/newsevents/2017-september.htm'

# Row texts that mark the start of a new calendar section.
headings = ('Speeches ', 'FOMC Meetings ', 'Beige Book ', 'Statistical Releases ', 'Other ')
# Output column for the 1st, 2nd, ... section encountered on the page.
columns = ['speeches', 'fomc', 'beige', 'releases']
# Bucket index of the last tracked section (statistical releases).
last = len(columns)

# A timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status surfaces HTTP errors before any parsing is attempted.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')
calendar_body = soup.find(class_='col-xs-12 col-sm-8 col-md-8')
textlist = [row.text.replace('\n', '') for row in calendar_body.find_all(class_='row')]

# page[0] is the month name; page[k] collects the day strings of section k.
page = [month] + [[] for _ in columns]
section = 0
for ind, text in enumerate(textlist):
    parts = text.replace('.', '').split('-')

    if parts[0] in headings:
        section += 1
    elif section > last:
        # Everything after the last tracked section is ignored.
        break
    elif 'Time' in parts[0]:
        continue
    elif ind % 2 == 0:
        # Only even-indexed rows are read for dates.
        digits = re.findall(r'\d+', parts[-1])
        if section < last:
            # One event per row: the last digit run is taken as the day.
            page[section].append(digits[-1])
        else:
            # In the releases section every digit run counts as a day.
            page[section].extend(digits)

# Build one small frame per event type: distinct days and how often each occurs.
dfs = []
for column, days in zip(columns, page[1:]):
    counts = Counter(days)
    dfs.append(pd.DataFrame({
        'date': [month + ' ' + day + ' 2017' for day in counts],
        column: list(counts.values()),
    }))

# Collapse to one row per date; event types absent on a date end up as 0.
df = pd.concat(dfs).groupby('date').sum().fillna(0).reset_index()
df['date'] = df['date'].apply(lambda x: datetime.strptime(x, '%B %d %Y'))

df = df.set_index('date')

dflist.append(df)

## August

In [None]:
# Scrape the August 2017 calendar page and append a date-indexed frame of
# per-day event counts (one column per event type) to `dflist`.
month = 'August'
url = 'https://www.federalreserve.gov/newsevents/2017-august.htm'

# Row texts that mark the start of a new calendar section.
headings = ('Speeches ', 'FOMC Meetings ', 'Beige Book ', 'Statistical Releases ', 'Other ')
# Output column for the 1st, 2nd, ... section encountered on the page.
columns = ['speeches', 'fomc', 'releases']
# Bucket index of the last tracked section (statistical releases).
last = len(columns)
# Malformed date strings produced by the digit extraction, and their
# corrected forms.
date_fixes = {'August 201716 2017': 'August 16 2017'}

# A timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status surfaces HTTP errors before any parsing is attempted.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')
calendar_body = soup.find(class_='col-xs-12 col-sm-8 col-md-8')
textlist = [row.text.replace('\n', '') for row in calendar_body.find_all(class_='row')]

# page[0] is the month name; page[k] collects the day strings of section k.
page = [month] + [[] for _ in columns]
section = 0
for ind, text in enumerate(textlist):
    parts = text.replace('.', '').split('-')

    if parts[0] in headings:
        section += 1
    elif section > last:
        # Everything after the last tracked section is ignored.
        break
    elif 'Time' in parts[0]:
        continue
    elif ind % 2 == 0:
        # Only even-indexed rows are read for dates.
        digits = re.findall(r'\d+', parts[-1])
        if section < last:
            # One event per row: this month takes the FIRST digit run as the day.
            page[section].append(digits[0])
        else:
            # In the releases section every digit run counts as a day.
            page[section].extend(digits)

# Build one small frame per event type: distinct days and how often each occurs.
dfs = []
for column, days in zip(columns, page[1:]):
    counts = Counter(days)
    dfs.append(pd.DataFrame({
        'date': [month + ' ' + day + ' 2017' for day in counts],
        column: list(counts.values()),
    }))

# Correct the malformed dates BEFORE aggregating so that a corrected date is
# merged with any existing row for the same day. Assigning the result back
# (rather than an in-place replace on the column accessor) also keeps the
# correction effective under pandas Copy-on-Write.
events = pd.concat(dfs)
events['date'] = events['date'].replace(date_fixes)

# Collapse to one row per date; event types absent on a date end up as 0.
df = events.groupby('date').sum().fillna(0).reset_index()
df['date'] = df['date'].apply(lambda x: datetime.strptime(x, '%B %d %Y'))

df = df.set_index('date')

dflist.append(df)

## July

In [None]:
# Scrape the July 2017 calendar page and append a date-indexed frame of
# per-day event counts (one column per event type) to `dflist`.
month = 'July'
url = 'https://www.federalreserve.gov/newsevents/2017-july.htm'

# Row texts that mark the start of a new calendar section.
headings = ('Speeches ', 'Testimony ', 'FOMC Meetings ', 'Beige Book ', 'Statistical Releases ', 'Other ')
# Output column for the 1st, 2nd, ... section encountered on the page.
# NOTE(review): the June cell labels the section after Speeches as
# 'testimonies', while this cell labels it 'fomc' and puts 'testimonies'
# fourth. Confirm this order matches the section order on the July page.
columns = ['speeches', 'fomc', 'beige', 'testimonies', 'releases']
# Bucket index of the last tracked section (statistical releases).
last = len(columns)
# Malformed date strings produced by the digit extraction, and their
# corrected forms.
date_fixes = {'July 20175 2017': 'July 5 2017', 'July 2626 2017': 'July 26 2017'}

# A timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status surfaces HTTP errors before any parsing is attempted.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')
calendar_body = soup.find(class_='col-xs-12 col-sm-8 col-md-8')
textlist = [row.text.replace('\n', '') for row in calendar_body.find_all(class_='row')]

# page[0] is the month name; page[k] collects the day strings of section k.
page = [month] + [[] for _ in columns]
section = 0
for ind, text in enumerate(textlist):
    parts = text.replace('.', '').split('-')

    if parts[0] in headings:
        section += 1
    elif section > last:
        # Everything after the last tracked section is ignored.
        break
    elif 'Time' in parts[0]:
        continue
    elif ind % 2 == 0:
        # Only even-indexed rows are read for dates.
        digits = re.findall(r'\d+', parts[-1])
        if section < last:
            # One event per row: the last digit run is taken as the day.
            page[section].append(digits[-1])
        else:
            # In the releases section every digit run counts as a day.
            page[section].extend(digits)

# Build one small frame per event type: distinct days and how often each occurs.
dfs = []
for column, days in zip(columns, page[1:]):
    counts = Counter(days)
    dfs.append(pd.DataFrame({
        'date': [month + ' ' + day + ' 2017' for day in counts],
        column: list(counts.values()),
    }))

# Correct the malformed dates BEFORE aggregating so that a corrected date is
# merged with any existing row for the same day. Assigning the result back
# (rather than an in-place replace on the column accessor) also keeps the
# correction effective under pandas Copy-on-Write.
events = pd.concat(dfs)
events['date'] = events['date'].replace(date_fixes)

# Collapse to one row per date; event types absent on a date end up as 0.
df = events.groupby('date').sum().fillna(0).reset_index()
df['date'] = df['date'].apply(lambda x: datetime.strptime(x, '%B %d %Y'))

df = df.set_index('date')

dflist.append(df)

## June

In [None]:
# Scrape the June 2017 calendar page and append a date-indexed frame of
# per-day event counts (one column per event type) to `dflist`.
month = 'June'
url = 'https://www.federalreserve.gov/newsevents/2017-june.htm'

# Row texts that mark the start of a new calendar section.
headings = ('Speeches ', 'Testimony ', 'FOMC Meetings ', 'Beige Book ', 'Statistical Releases ', 'Other ')
# Output column for the 1st, 2nd, ... section encountered on the page.
columns = ['speeches', 'testimonies', 'fomc', 'releases']
# Bucket index of the last tracked section (statistical releases).
last = len(columns)

# A timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status surfaces HTTP errors before any parsing is attempted.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')
calendar_body = soup.find(class_='col-xs-12 col-sm-8 col-md-8')
textlist = [row.text.replace('\n', '') for row in calendar_body.find_all(class_='row')]

# page[0] is the month name; page[k] collects the day strings of section k.
page = [month] + [[] for _ in columns]
section = 0
for ind, text in enumerate(textlist):
    parts = text.replace('.', '').split('-')

    if parts[0] in headings:
        section += 1
    elif section > last:
        # Everything after the last tracked section is ignored.
        break
    elif 'Time' in parts[0]:
        continue
    elif ind % 2 == 1:
        # For this month the ODD-indexed rows are the ones read for dates.
        digits = re.findall(r'\d+', parts[-1])
        if section < last:
            # One event per row: the last digit run is taken as the day.
            page[section].append(digits[-1])
        else:
            # In the releases section every digit run counts as a day.
            page[section].extend(digits)

# Build one small frame per event type: distinct days and how often each occurs.
dfs = []
for column, days in zip(columns, page[1:]):
    counts = Counter(days)
    dfs.append(pd.DataFrame({
        'date': [month + ' ' + day + ' 2017' for day in counts],
        column: list(counts.values()),
    }))

# Collapse to one row per date; event types absent on a date end up as 0.
df = pd.concat(dfs).groupby('date').sum().fillna(0).reset_index()
df['date'] = df['date'].apply(lambda x: datetime.strptime(x, '%B %d %Y'))

df = df.set_index('date')

dflist.append(df)

In [None]:
# Stack the monthly frames into a single calendar. Not every month has every
# event column, so concatenation leaves gaps; those become a count of 0.
combined = pd.concat(dflist)
calendar = combined.fillna(0)

### Save calendar data

In [None]:
import pickle

# Destination of the serialized calendar DataFrame (an absolute local path).
calendar_path = '/Users/samfunk/ds/metis/project_luther/calendar.pkl'

# Write the combined calendar to disk in binary pickle format.
with open(calendar_path, mode='wb') as out_file:
    pickle.dump(calendar, out_file)