In [None]:
import requests
from bs4 import BeautifulSoup
from dateutil import parser
import pandas as pd
import datetime
import pytz

def federal_reserve_datetime_conversion(date_string, time_string):
    """
    Convert date and time strings from the Federal Reserve press release format to a UTC datetime.
    eg.. date_string = 'January 06, 2021'
         time_string = '2:00 p.m. EST'

    Parameters
    ----------
    date_string : str
        Release date in '%B %d, %Y' form, e.g. 'January 06, 2021'.
    time_string : str
        Release time as '<H:MM> <a.m.|p.m.> <zone>'. The meridiem and zone tokens are optional.
        The zone abbreviation is not used to pick the offset: the wall-clock time is always
        localized to 'US/Eastern', so pytz chooses the standard/daylight offset from the date.

    Returns
    -------
    datetime.datetime
        Timezone-aware datetime in UTC. Its 'YYYY-MM-DD HH:MM:SS' form is also printed,
        which is the only thing earlier versions of this function did.

    Raises
    ------
    ValueError
        If the time is empty, non-numeric or out of range, or the date does not match '%B %d, %Y'.
    """
    # Imported locally under an alias: another cell in this notebook runs
    # `from datetime import datetime`, which rebinds the global name `datetime`
    # to the class and would break `datetime.datetime...` lookups here.
    import datetime as dt

    tokens = time_string.split()
    if not tokens:
        raise ValueError(f'Empty time string: {time_string!r}')

    clock = tokens[0]
    # Normalise 'p.m.' / 'PM' / 'pm' to 'pm'; a missing token means a 24-hour clock.
    meridiem = tokens[1].lower().replace('.', '') if len(tokens) > 1 else ''

    hour_text, _sep, minute_text = clock.partition(':')
    hour = int(hour_text)
    minute = int(minute_text) if minute_text else 0

    # 12-hour -> 24-hour. Noon and midnight are the special cases:
    # 12 p.m. stays 12, 12 a.m. becomes 0.
    if meridiem == 'pm' and hour != 12:
        hour += 12
    elif meridiem == 'am' and hour == 12:
        hour = 0

    # dt.time validates the ranges (raises ValueError for e.g. hour 25).
    time_obj = dt.time(hour, minute)

    # Interpret the wall-clock time in the Eastern time zone
    timezone = pytz.timezone('US/Eastern')
    date_obj = dt.datetime.strptime(date_string, '%B %d, %Y').date()

    # Combine the date and time objects
    datetime_obj = timezone.localize(dt.datetime.combine(date_obj, time_obj))

    # Convert to UTC
    utc_obj = datetime_obj.astimezone(pytz.utc)

    # Print the UTC datetime without the +00:00 suffix (kept for backward compatibility)
    print(utc_obj.strftime('%Y-%m-%d %H:%M:%S'))

    return utc_obj

# Example call using the sample values from the function's docstring.
federal_reserve_datetime_conversion(
    date_string='January 06, 2021',
    time_string='2:00 p.m. EST',
)


In [None]:
import requests
from bs4 import BeautifulSoup
from dateutil import parser
import pandas as pd
from datetime import datetime

# Column order of the scraped dataset; also gives an empty result a stable schema.
df_columns = ['date', 'url', 'title', 'text', 'summary', 'source', 'tags', 'type', 'language', 'country']

# One entry per listing to scrape. Every key except 'url' is copied verbatim onto each scraped row.
parse_urls = [
    {   'url': 'https://www.federalreserve.gov/newsevents/pressreleases/',
        'source': 'Federal Reserve',
        'tags': 'general',
        'type': 'press release',
        'language': 'en',
        'country': 'US'
    },
        ]

# Seconds to wait for a single HTTP response; without a timeout requests can block indefinitely.
REQUEST_TIMEOUT = 30

# strptime layout keyed by the number of comma-separated parts in the date text:
# 3 parts -> 'Friday, January 8, 2021', 2 parts -> 'January 8, 2021'.
DATE_LAYOUTS = {3: '%A, %B %d, %Y', 2: '%B %d, %Y'}

url_parsed_dict = dict()  # URLs already turned into a row (used as a set)
scraped_rows = []         # one dict per press release; converted to a DataFrame once, after the loop
debugging = False

for parse_url in parse_urls:

    if parse_url['source'] != 'Federal Reserve':
        continue

    # NOTE(review): the '<base>NNN.html' URL pattern and the selectors below mirror the FDIC
    # scraper in this notebook. Other cells here request Federal Reserve pages as
    # '<category><YYYYMMDD><letter>.htm' and read 'p.article__time' / 'p.releaseTime', so this
    # loop may stop on its very first request - confirm before relying on the result.
    i = 1
    while True:
        page_url = parse_url['url'] + str(i).zfill(3) + '.html'
        response = requests.get(page_url, timeout=REQUEST_TIMEOUT)
        if response.status_code != 200:
            # The first non-200 page is treated as the end of the numbered sequence.
            if debugging:
                print(f'{page_url} not found. Breaking loop.')
            break

        i += 1

        # Skip pages already recorded (e.g. two numbers redirecting to the same final URL).
        if response.url in url_parsed_dict:
            continue
        url_parsed_dict[response.url] = True

        soup = BeautifulSoup(response.content, 'html.parser')

        date_tag = soup.find('span', class_='prdate')
        title_block = soup.find('div', class_='prtitle')
        title_tag = title_block.find('h1') if title_block is not None else None
        body_tag = soup.find('article', class_='order-2')
        if date_tag is None or title_tag is None or body_tag is None:
            # Unexpected layout: report it and move on instead of aborting the
            # whole scrape with an AttributeError.
            print(f'Skipping {response.url}: expected elements not found.')
            continue

        date_string = date_tag.text.strip()
        date_layout = DATE_LAYOUTS.get(date_string.count(',') + 1)

        processed_row = {
            # None when the date text matches neither known layout.
            'date': (datetime.strptime(date_string, date_layout).strftime('%Y-%m-%d %H:%M:%S')
                     if date_layout else None),
            'url': response.url,
            'title': title_tag.text.strip(),
            # One line per paragraph, each terminated by a newline.
            'text': ''.join(paragraph.text.strip() + '\n' for paragraph in body_tag.find_all('p')),
            'summary': '',
            'source': parse_url['source'],
            'tags': parse_url['tags'],
            'type': parse_url['type'],
            'language': parse_url['language'],
            'country': parse_url['country'],
        }

        if debugging:
            print(processed_row)

        scraped_rows.append(processed_row)

# Build the frame once; concatenating inside the loop copies every earlier row on each iteration.
regulatory_news_df = pd.DataFrame(scraped_rows, columns=df_columns)

regulatory_news_df

In [None]:
# Imported under an alias so this cell works whether the global name `datetime`
# currently refers to the module or to the class (another cell runs
# `from datetime import datetime`, after which `datetime.date(2021, 1, 1)` raises).
import datetime as dt

# Probe which Federal Reserve press-release URLs exist: one request per
# category / day / letter suffix, printing every URL that answers 200.
base_url = 'https://www.federalreserve.gov/newsevents/pressreleases/'
categories = ['monetary', 'other', 'orders', 'bcreg', 'enforcement']
start_date = dt.date(2021, 1, 1)
end_date = dt.date(2023, 1, 1)  # exclusive: range() below stops the day before
for category in categories:
    print(f"Category: {category}")
    for n in range((end_date - start_date).days):
        release_day = (start_date + dt.timedelta(n)).strftime('%Y%m%d')

        # Several releases can share a day and are told apart by a letter suffix
        # (a, b, c, ...). Walk the suffixes until the first one that does not exist.
        # Previously an unconditional `break` ended the loop after suffix 'a'.
        for char_append in 'abcdefghijklmnopqrstuvwxyz':
            response = requests.get(base_url + category + release_day + char_append + '.htm', timeout=30)
            if response.status_code != 200:
                break

            print(f"Valid: {response.url}")
            


In [None]:
# Scratch check of the time normalisation on a sample release time:
# split into [clock, meridiem, zone] and rewrite the clock token in place.
d = '2:00 p.m. EST'.split(' ')
if 'p.m.' == d[1]:
    # Shift the hour into 24-hour form, keeping the minutes text untouched.
    d[0] = '{}:{}'.format(int(d[0].split(':')[0]) + 12, d[0].split(':')[1])
if 'EST' == d[2]:
    # Eastern Standard Time is UTC-5.
    d[0] = d[0] + '-0500'
d

In [None]:
def federal_reserve_datetime_conversion(date_string, time_string):
    """
    Convert date and time strings from the Federal Reserve press release format to a UTC datetime.
    eg.. date_string = 'January 06, 2021'
         time_string = '2:00 p.m. EST'

    Parameters
    ----------
    date_string : str
        Release date in '%B %d, %Y' form, e.g. 'January 06, 2021'.
    time_string : str
        Release time as '<H:MM> <a.m.|p.m.> <zone>'. The meridiem and zone tokens are optional.
        The zone abbreviation is not used to pick the offset: the wall-clock time is always
        localized to 'US/Eastern', so pytz chooses the standard/daylight offset from the date.

    Returns
    -------
    datetime.datetime
        Timezone-aware datetime in UTC. Its 'YYYY-MM-DD HH:MM:SS' form is also printed,
        which is the only thing earlier versions of this function did.

    Raises
    ------
    ValueError
        If the time is empty, non-numeric or out of range, or the date does not match '%B %d, %Y'.
    """
    # Imported locally under an alias: another cell in this notebook runs
    # `from datetime import datetime`, which rebinds the global name `datetime`
    # to the class and would break `datetime.datetime...` lookups here.
    import datetime as dt

    tokens = time_string.split()
    if not tokens:
        raise ValueError(f'Empty time string: {time_string!r}')

    clock = tokens[0]
    # Normalise 'p.m.' / 'PM' / 'pm' to 'pm'; a missing token means a 24-hour clock.
    meridiem = tokens[1].lower().replace('.', '') if len(tokens) > 1 else ''

    hour_text, _sep, minute_text = clock.partition(':')
    hour = int(hour_text)
    minute = int(minute_text) if minute_text else 0

    # 12-hour -> 24-hour. Noon and midnight are the special cases:
    # 12 p.m. stays 12, 12 a.m. becomes 0.
    if meridiem == 'pm' and hour != 12:
        hour += 12
    elif meridiem == 'am' and hour == 12:
        hour = 0

    # dt.time validates the ranges (raises ValueError for e.g. hour 25).
    time_obj = dt.time(hour, minute)

    # Interpret the wall-clock time in the Eastern time zone
    timezone = pytz.timezone('US/Eastern')
    date_obj = dt.datetime.strptime(date_string, '%B %d, %Y').date()

    # Combine the date and time objects
    datetime_obj = timezone.localize(dt.datetime.combine(date_obj, time_obj))

    # Convert to UTC
    utc_obj = datetime_obj.astimezone(pytz.utc)

    # Print the UTC datetime without the +00:00 suffix (kept for backward compatibility)
    print(utc_obj.strftime('%Y-%m-%d %H:%M:%S'))

    return utc_obj

# Example call using the sample values from the function's docstring.
federal_reserve_datetime_conversion(
    date_string='January 06, 2021',
    time_string='2:00 p.m. EST',
)


In [None]:
# convert January 06, 2021 to 2021-01-06
# The module is imported under an alias so this cell gives the same result whether
# the global name `datetime` is currently the module (`import datetime`) or the
# class (`from datetime import datetime`); both bindings occur in this notebook.
import datetime as dt

dt.datetime.strptime('January 06, 2021', '%B %d, %Y').date()


In [None]:
import re  # only needed for the release-time pattern below

# Fetch one sample press release and build '<date>T<time zone>' from its header.
response = requests.get(
    'https://www.federalreserve.gov/newsevents/pressreleases/monetary20210106a.htm',
    timeout=30,
)
# Fail with a clear HTTP error instead of an AttributeError on a missing tag further down.
response.raise_for_status()

soup = BeautifulSoup(response.content, 'html.parser')

release_date = soup.find('p', class_='article__time').text.strip()
release_text = soup.find('p', class_='releaseTime').text

# Pull out the '<H:MM> <a.m.|p.m.> <ZONE>' part directly. The earlier approach cut the
# text at 'ST' and re-appended 'ST', which turns a daylight-time stamp such as
# '2:00 p.m. EDT' into '2:00 p.m. EDTST'.
time_match = re.search(r'\d{1,2}:\d{2}\s*[ap]\.m\.\s*[A-Z]{2,4}', release_text)
if time_match:
    release_time = time_match.group(0)
else:
    # Legacy fallback for text that does not contain the expected time pattern.
    release_time = release_text.split('at')[-1].split('ST')[0].strip() + 'ST'

# Note: this string cannot be fed to strptime with '%p' as-is, because '%p'
# matches 'AM'/'PM', not 'a.m.'/'p.m.'.
d = release_date + 'T' + release_time
d

In [None]:
# Show the raw <p class="releaseTime"> element of the page parsed into `soup`.
soup.find('p', attrs={'class': 'releaseTime'})

In [None]:
import requests
from bs4 import BeautifulSoup
from dateutil import parser
import pandas as pd
import datetime

# Column order of the scraped dataset; also gives an empty result a stable schema.
df_columns = ['date', 'url', 'title', 'text', 'summary', 'source', 'tags', 'type', 'language', 'country']

# One entry per listing to scrape. Every key except 'url' is copied verbatim onto each scraped row.
# Order matters for SEC: the crypto-filtered listing of a year comes before the general one, so a
# release found in both keeps the 'crypto' tag (the general pass skips URLs already seen).
parse_urls = [
    {   'url': "https://www.fdic.gov/news/press-releases/2021/pr21",
        'source': 'FDIC',
        'tags': 'general',
        'type': 'press release',
        'language': 'en',
        'country': 'US'
    },
    {   'url': "https://www.fdic.gov/news/press-releases/2022/pr22",
        'source': 'FDIC',
        'tags': 'general',
        'type': 'press release',
        'language': 'en',
        'country': 'US'
    },
    {   'url': "https://www.sec.gov/news/pressreleases?aId=&combine=crypto&year=2021&month=All",
        'source': 'SEC',
        'tags': 'crypto',
        'type': 'press release',
        'language': 'en',
        'country': 'US'
    },
    {   'url': "https://www.sec.gov/news/pressreleases?aId=&combine=&year=2021&month=All",
        'source': 'SEC',
        'tags': 'general',
        'type': 'press release',
        'language': 'en',
        'country': 'US'
    },
    {   'url': "https://www.sec.gov/news/pressreleases?aId=&combine=crypto&year=2022&month=All",
        'source': 'SEC',
        'tags': 'crypto',
        'type': 'press release',
        'language': 'en',
        'country': 'US'
    },
    {   'url': "https://www.sec.gov/news/pressreleases?aId=&combine=&year=2022&month=All",
        'source': 'SEC',
        'tags': 'general',
        'type': 'press release',
        'language': 'en',
        'country': 'US'
    },
        ]

# Seconds to wait for a single HTTP response; without a timeout requests can block indefinitely.
REQUEST_TIMEOUT = 30

# strptime layout keyed by the number of comma-separated parts in the FDIC date text:
# 3 parts -> 'Friday, January 8, 2021', 2 parts -> 'January 8, 2021'.
DATE_LAYOUTS = {3: '%A, %B %d, %Y', 2: '%B %d, %Y'}

url_parsed_dict = dict()  # URLs already turned into a row (used as a set)
scraped_rows = []         # one dict per press release; converted to a DataFrame once, after the loop
debugging = False

for parse_url in parse_urls:

    if parse_url['source'] == 'FDIC':

        # FDIC releases are numbered pages: <url>001.html, <url>002.html, ...
        i = 1
        while True:
            page_url = parse_url['url'] + str(i).zfill(3) + '.html'
            response = requests.get(page_url, timeout=REQUEST_TIMEOUT)
            if response.status_code != 200:
                # The first non-200 page is treated as the end of that year's sequence.
                if debugging:
                    print(f'{page_url} not found. Breaking loop.')
                break

            i += 1

            # Skip pages already recorded (e.g. two numbers redirecting to the same final URL).
            if response.url in url_parsed_dict:
                continue
            url_parsed_dict[response.url] = True

            soup = BeautifulSoup(response.content, 'html.parser')

            date_tag = soup.find('span', class_='prdate')
            title_block = soup.find('div', class_='prtitle')
            title_tag = title_block.find('h1') if title_block is not None else None
            body_tag = soup.find('article', class_='order-2')
            if date_tag is None or title_tag is None or body_tag is None:
                # Unexpected layout: report it and move on instead of aborting the
                # whole scrape with an AttributeError.
                print(f'Skipping {response.url}: expected elements not found.')
                continue

            date_string = date_tag.text.strip()
            date_layout = DATE_LAYOUTS.get(date_string.count(',') + 1)

            processed_row = {
                # None when the date text matches neither known layout.
                'date': (datetime.datetime.strptime(date_string, date_layout).strftime('%Y-%m-%d %H:%M:%S')
                         if date_layout else None),
                'url': response.url,
                'title': title_tag.text.strip(),
                # One line per paragraph, each terminated by a newline.
                'text': ''.join(paragraph.text.strip() + '\n' for paragraph in body_tag.find_all('p')),
                'summary': '',
                'source': parse_url['source'],
                'tags': parse_url['tags'],
                'type': parse_url['type'],
                'language': parse_url['language'],
                'country': parse_url['country'],
            }

            if debugging:
                print(processed_row)

            scraped_rows.append(processed_row)

    elif parse_url['source'] == 'SEC':

        # SEC releases come from a listing table; each row links to the article page.
        response = requests.get(parse_url['url'], timeout=REQUEST_TIMEOUT)
        soup = BeautifulSoup(response.content, 'html.parser')

        articles = soup.find_all('tr', class_='pr-list-page-row')

        # Walk the table bottom-up so rows are appended in reverse listing order.
        for article in reversed(articles):

            row = article.find_all('td')
            link = row[2].find('a')

            # 1. Time
            article_date = parser.parse(row[1].find('time')['datetime']).strftime('%Y-%m-%d %H:%M:%S')
            # 2. URL
            article_url = "https://www.sec.gov" + link['href']
            if article_url in url_parsed_dict:
                continue
            url_parsed_dict[article_url] = True

            # 4. Text - fetched from the article page itself
            article_response = requests.get(article_url, timeout=REQUEST_TIMEOUT)
            article_soup = BeautifulSoup(article_response.content, 'html.parser')
            article_body = article_soup.find('div', class_='article-body')

            processed_row = {
                'date': article_date,
                'url': article_url,
                # 3. Title
                'title': link.text.strip(),
                # Empty when the article page has no 'article-body' container
                # (e.g. an error page), rather than raising AttributeError.
                'text': article_body.text.strip() if article_body is not None else '',
                # 5. Summary
                'summary': '',
                # 6.-10. Listing metadata
                'source': parse_url['source'],
                'tags': parse_url['tags'],
                'type': parse_url['type'],
                'language': parse_url['language'],
                'country': parse_url['country'],
            }

            if debugging:
                print(processed_row)

            scraped_rows.append(processed_row)

# Build the frame once; concatenating inside the loop copies every earlier row on each iteration.
regulatory_news_df = pd.DataFrame(scraped_rows, columns=df_columns)

regulatory_news_df


In [None]:
# Display the column labels of the scraped DataFrame.
regulatory_news_df.columns

SEC = 507 rows \
FDIC = 197 rows

In [None]:
# Persist the scraped rows to a CSV in the working directory, leaving out the row index.
regulatory_news_df.to_csv(
    path_or_buf='regulatory_news.csv',
    index=False,
)

In [None]:
# Scratch note: the Federal Reserve press-release base URL (same value as `base_url`
# in the URL-probing cell). As a bare expression it is only displayed, not stored.
'https://www.federalreserve.gov/newsevents/pressreleases/'