In [6]:
import re
from urllib.parse import urljoin
from datetime import datetime
import io

import requests
import pandas as pd
from bs4 import BeautifulSoup

# Detect which PDF text-extraction backend is importable.
# PyPDF2 is preferred; pdfplumber is the fallback. Nothing is installed here:
# when neither is present, PDF statements are skipped and a warning is shown.
# Both flags get a value on every path so later code can always read PDF_LIB.
PDF_AVAILABLE = False
PDF_LIB = None
try:
    import PyPDF2
    PDF_AVAILABLE = True
    PDF_LIB = 'PyPDF2'
except ImportError:
    try:
        import pdfplumber
        PDF_AVAILABLE = True
        PDF_LIB = 'pdfplumber'
    except ImportError:
        print("Warning: PDF libraries not found. Install with: pip install PyPDF2 or pip install pdfplumber")

# Site root and the two index pages that list FOMC statements.
BASE = "https://www.federalreserve.gov"
CAL_URL = BASE + "/monetarypolicy/fomccalendars.htm"
HISTORICAL_URL = BASE + "/monetarypolicy/fomc_historical.htm"

# One shared HTTP session so every request carries the same User-Agent header.
HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}
SESSION = requests.Session()
SESSION.headers.update(HEADERS)

# Inclusive date window applied to the parsed statements.
START_DATE = datetime(year=2020, month=1, day=1)
END_DATE = datetime(year=2025, month=10, day=29)

# Rate-extraction regexes, tried in order from most specific to least specific.
# Every pattern captures the lower bound in group 1 and the upper bound in
# group 2. The shared fragments below are concatenated so the number syntax is
# written once; the compiled pattern strings are unchanged.
_NUM = r"([\d\-\s¼½¾/\.]+)"  # digits, hyphen, whitespace, unicode/ASCII fractions, dot
_FFR = r"federal\s+funds\s+rate"
_TARGET_FFR = r"target\s+range\s+for\s+the\s+" + _FFR
_RANGE = _NUM + r"\s+to\s+" + _NUM + r"\s+percent"

RATE_PATTERNS = [
    re.compile(source, re.IGNORECASE)
    for source in (
        # "raise/lower the target range for the federal funds rate to X to Y percent"
        r"(?:raise|lower|raised|lowered|set|decided to raise|decided to lower)\s+the\s+"
        + _TARGET_FFR + r"\s+to\s+" + _RANGE,
        # "target range for the federal funds rate to X to Y percent"
        _TARGET_FFR + r"\s+to\s+" + _RANGE,
        # "target range for the federal funds rate (remains) (at) X to Y percent"
        _TARGET_FFR + r"(?: remains)?(?: at)?\s*(?:the )?(?:level of )?\s+"
        + _NUM + r"\s*to\s*" + _NUM + r"\s*percent",
        # "target range of X to Y percent"
        r"target\s+range\s+(?:for the federal funds rate\s+)?(?:of\s+)?" + _RANGE,
        # "federal funds rate at X to Y percent"
        _FFR + r"\s+(?:at\s+)?" + _RANGE,
        # "raised/lowered ... federal funds rate ... to X to Y percent"
        r"(?:raised|lowered|set|maintained).*?" + _FFR + r".*?to\s+" + _RANGE,
    )
]

def clean_rate(raw):
    """Convert a rate string to a float, handling fractions and special characters.

    Accepted forms include plain decimals ("4.75"), simple fractions ("1/4"),
    mixed numbers separated by a hyphen or whitespace ("5-1/4", "5 1/4") and
    the unicode glyphs ¼, ½ and ¾, either alone or after a whole part ("5¼").

    Args:
        raw: The captured rate text, or an already numeric value.

    Returns:
        The rate as a float, or None when `raw` is missing (None, NaN, empty)
        or no number can be recovered from it.
    """
    if raw is None:
        return None
    try:
        if pd.isna(raw):
            return None
    except (TypeError, ValueError):
        # pd.isna on a non-scalar is ambiguous; treat the value as text below.
        pass

    # A numeric 0 is a valid rate and must not be mistaken for "missing".
    if isinstance(raw, (int, float)) and not isinstance(raw, bool):
        return float(raw)

    text = str(raw)
    # Spell the unicode glyphs as ASCII fractions. The leading space keeps a
    # preceding whole part separate ("5¼" -> "5 1/4").
    for glyph, ascii_fraction in (("¼", " 1/4"), ("½", " 1/2"), ("¾", " 3/4")):
        text = text.replace(glyph, ascii_fraction)
    # Collapse whitespace runs but keep single spaces: the space is what
    # separates the whole part from the fraction in "5 1/4".
    text = " ".join(text.split())
    if not text:
        return None

    number = r"\d+(?:\.\d+)?"
    # Optional whole part that must be followed by a hyphen or whitespace, then
    # numerator/denominator. Requiring the separator keeps "51/4" meaning 51/4.
    mixed = re.fullmatch(
        rf"(?:({number})?\s*[-\s]\s*)?({number})\s*/\s*({number})", text
    )
    if mixed:
        whole, numerator, denominator = mixed.groups()
        if float(denominator) == 0:
            return None
        return (float(whole) if whole else 0.0) + float(numerator) / float(denominator)

    # Plain decimals; stray inner spaces are tolerated here as before.
    try:
        return float(text.replace(" ", ""))
    except ValueError:
        pass

    # Last resort: take the first number that appears in the text.
    first_number = re.search(r'\d+\.?\d*', text)
    return float(first_number.group()) if first_number else None

def extract_text_from_pdf(pdf_url):
    """Download a PDF and return its text, one page per line-separated chunk.

    Args:
        pdf_url: Absolute URL of the PDF to download.

    Returns:
        The concatenated page text, or None when no PDF backend is available
        or when the download or extraction fails (the error is printed).
    """
    if not PDF_AVAILABLE:
        return None

    # PDF_LIB is looked up defensively: if it was never assigned, fall through
    # to the PyPDF2 branch instead of raising NameError.
    backend = globals().get("PDF_LIB")

    try:
        response = SESSION.get(pdf_url, timeout=30)
        response.raise_for_status()
        pdf_bytes = io.BytesIO(response.content)

        if backend == 'pdfplumber':
            import pdfplumber
            with pdfplumber.open(pdf_bytes) as pdf:
                pages = [page.extract_text() or "" for page in pdf.pages]
        else:
            # Use PyPDF2
            import PyPDF2
            reader = PyPDF2.PdfReader(pdf_bytes)
            # `or ""` mirrors the pdfplumber branch so a page with no
            # extractable text cannot break the join.
            pages = [page.extract_text() or "" for page in reader.pages]
        return "\n".join(pages)
    except Exception as e:
        # Best effort: one unreadable PDF must not abort the whole scrape.
        print(f"Error reading PDF {pdf_url}: {e}")
        return None

def parse_date(date_str):
    """Parse various date formats to datetime.

    Recognized, in this order, anywhere inside the text:
      * "January 29, 2025" (full English month name, comma optional)
      * "01/29/2025"       (month/day/year)
      * "2025-01-29"       (ISO year-month-day)
    Anything else is handed to pandas as a last resort.

    Args:
        date_str: Text containing a date; None, NaN and empty values are allowed.

    Returns:
        A `datetime`, or None when no valid date can be recovered.
    """
    if date_str is None:
        return None
    try:
        if pd.isna(date_str):
            return None
    except (TypeError, ValueError):
        # Non-scalar input; fall through and work on its string form.
        pass

    text = str(date_str)
    if not text.strip():
        return None

    month_numbers = {
        'january': 1, 'february': 2, 'march': 3, 'april': 4,
        'may': 5, 'june': 6, 'july': 7, 'august': 8,
        'september': 9, 'october': 10, 'november': 11, 'december': 12
    }

    # Text format. Every "Word 12, 2025" candidate is checked so that a
    # capitalized non-month word earlier in the text does not hide the date.
    for found in re.finditer(r"([A-Z][a-z]+)\s+(\d{1,2}),?\s+(\d{4})", text):
        month_name, day, year = found.groups()
        month = month_numbers.get(month_name.lower())
        if month is None:
            continue
        try:
            return datetime(int(year), month, int(day))
        except ValueError:
            continue

    # Numeric formats. Each regex is paired with the order of its groups,
    # rather than being inferred from characters in the regex source.
    numeric_formats = (
        (r"(\d{1,2})/(\d{1,2})/(\d{4})", ("month", "day", "year")),  # 01/29/2025
        (r"(\d{4})-(\d{2})-(\d{2})", ("year", "month", "day")),      # 2025-01-29
    )
    for regex, field_order in numeric_formats:
        found = re.search(regex, text)
        if not found:
            continue
        fields = {name: int(value) for name, value in zip(field_order, found.groups())}
        try:
            return datetime(fields["year"], fields["month"], fields["day"])
        except ValueError:
            # Out-of-range month/day; try the next format.
            continue

    # Try pandas parsing as fallback
    try:
        parsed = pd.to_datetime(text, errors='coerce')
    except (ValueError, TypeError, OverflowError):
        return None
    # errors='coerce' yields NaT for unparseable text; report that as None.
    if pd.isna(parsed):
        return None
    return parsed.to_pydatetime()

def fetch_all_statement_links():
    """Collect FOMC statement links from the calendar and historical pages.

    Returns:
        A list of dicts with keys "meeting_date_str", "statement_url" and
        "is_pdf", one per unique absolute URL, calendar links first. A failure
        on either page is printed and does not stop the other page.
    """
    collected = []
    known_urls = set()

    def absolute(href):
        # Hrefs that already start with "http" are kept verbatim.
        if href.startswith("http"):
            return href
        return urljoin(BASE, href)

    # --- Calendar page: links are grouped in panels with a heading ---
    print("Fetching links from calendar page...")
    try:
        page = SESSION.get(CAL_URL, timeout=20)
        page.raise_for_status()
        document = BeautifulSoup(page.text, "html.parser")

        panels = document.select(".panel.panel-default")
        print(f"Found {len(panels)} panels on calendar page")

        for panel in panels:
            # First heading found among h5, h4, h3 labels every link in the panel.
            heading = next(
                (tag for tag in (panel.find(level) for level in ("h5", "h4", "h3")) if tag),
                None,
            )
            if not heading:
                continue
            heading_text = heading.get_text(strip=True)

            for anchor in panel.select("a"):
                href = anchor.get("href", "")
                href_lc = href.lower()
                label = anchor.get_text(strip=True).lower()

                looks_like_statement = (
                    ("statement" in label and "fomc" in href_lc)
                    or "pressreleases/monetary" in href_lc
                    or (
                        "fomcstatements" in href_lc
                        and ("statement" in label or "htm" in href_lc or "pdf" in href_lc)
                    )
                )
                if not looks_like_statement:
                    continue
                # Calendar and minutes links are never statements.
                if "calendar" in href_lc or "minutes" in href_lc:
                    continue

                target = absolute(href)
                if target in known_urls:
                    continue
                known_urls.add(target)
                collected.append({
                    "meeting_date_str": heading_text,
                    "statement_url": target,
                    "is_pdf": href_lc.endswith(".pdf")
                })
    except Exception as e:
        print(f"Error fetching calendar page: {e}")

    # --- Historical page: flat list of anchors (for older statements) ---
    print("Fetching links from historical page...")
    try:
        page = SESSION.get(HISTORICAL_URL, timeout=20)
        page.raise_for_status()
        document = BeautifulSoup(page.text, "html.parser")

        for anchor in document.select("a"):
            href = anchor.get("href", "")
            href_lc = href.lower()
            label = anchor.get_text(strip=True)

            in_statement_path = "fomcstatements" in href_lc or "pressreleases/monetary" in href_lc
            if not in_statement_path:
                continue
            if not ("statement" in label.lower() or "htm" in href_lc or "pdf" in href_lc):
                continue

            target = absolute(href)
            if target in known_urls:
                continue

            # Date label: YYYYMMDD in the href, else a written date in the
            # link text, else the first 50 characters of the link text.
            stamp = re.search(r"(\d{4})(\d{2})(\d{2})", href)
            if stamp:
                year, month, day = stamp.groups()
                date_label = f"{int(month)}/{int(day)}/{year}"
            else:
                written = re.search(r"([A-Z][a-z]+ \d{1,2},? \d{4})", label)
                date_label = written.group(1) if written else label[:50]

            known_urls.add(target)
            collected.append({
                "meeting_date_str": date_label,
                "statement_url": target,
                "is_pdf": href_lc.endswith(".pdf")
            })
    except Exception as e:
        print(f"Error fetching historical page: {e}")

    print(f"Total unique statement links found: {len(collected)}")
    return collected

def parse_statement(entry):
    """Parse a single FOMC statement (HTML or PDF) and extract rate information.

    Args:
        entry: Dict with "statement_url", an optional "is_pdf" flag and
            "meeting_date_str" (used only when no better date is found).

    Returns:
        A dict with keys "meeting_date", "meeting_date_parsed",
        "statement_url", "target_lower", "target_upper" and "is_pdf", or None
        when the text cannot be fetched, is shorter than 100 characters,
        matches no rate pattern, or a bound cannot be converted. Any exception
        is printed and also results in None.
    """
    try:
        url = entry["statement_url"]
        is_pdf = entry.get("is_pdf", False)

        def fallback_release_date():
            # A YYYYMMDD stamp in the URL if present, else the entry's label.
            stamp = re.search(r"(\d{4})(\d{2})(\d{2})", url)
            if stamp:
                year, month, day = stamp.groups()
                return f"{int(month)}/{int(day)}/{year}"
            return entry["meeting_date_str"]

        # Extract text based on file type
        if is_pdf:
            body_text = extract_text_from_pdf(url)
            if not body_text:
                return None
            # PDFs have no date element to read, so the date must come from
            # the URL or the entry; without this the PDF path had no date.
            release_date = fallback_release_date()
        else:
            res = SESSION.get(url, timeout=20)
            res.raise_for_status()
            soup = BeautifulSoup(res.text, "html.parser")

            # Get release date
            release_date_elem = soup.select_one(".article__date, .releaseDate, time, .col-xs-12.col-sm-8 time")
            if release_date_elem:
                release_date = release_date_elem.get_text(strip=True)
            else:
                release_date = fallback_release_date()

            # Extract body text: first selector that yields any paragraphs,
            # from most specific container to any <p> on the page.
            paragraph_nodes = []
            for selector in ("div.col-xs-12.col-sm-8 p", "article p", "div.panel-body p", "p"):
                paragraph_nodes = soup.select(selector)
                if paragraph_nodes:
                    break

            body_text = " ".join(p.get_text(" ", strip=True) for p in paragraph_nodes)

        if not body_text or len(body_text) < 100:
            return None

        # Defensive normalization: the rate patterns only accept the ASCII
        # hyphen, so typographic hyphens/dashes (e.g. a non-breaking hyphen in
        # "1‑1/2") would otherwise prevent a match. NOTE(review): whether the
        # source pages actually contain these characters is not confirmed.
        body_text = body_text.translate({
            0x2010: "-", 0x2011: "-", 0x2012: "-", 0x2013: "-", 0x2212: "-",
            0x00A0: " ",
        })

        # Try each rate pattern; the list is ordered most specific first.
        match = next(
            (found for found in (pattern.search(body_text) for pattern in RATE_PATTERNS) if found),
            None,
        )
        if not match:
            return None

        # Extract and clean rates
        lower = clean_rate(match.group(1).strip())
        upper = clean_rate(match.group(2).strip())
        if lower is None or upper is None:
            return None

        return {
            "meeting_date": release_date,
            "meeting_date_parsed": parse_date(release_date),
            "statement_url": url,
            "target_lower": lower,
            "target_upper": upper,
            "is_pdf": is_pdf
        }
    except Exception as e:
        print(f"Error parsing {entry['statement_url']}: {e}")
        return None

# Main execution: scrape, parse, filter to the date window, flag rate changes.
print("=" * 60)
print("FOMC Statement Scraper - January 2020 to October 29, 2025")
print("=" * 60)

# Fetch all links
all_links = fetch_all_statement_links()

# Parse all statements, reporting progress after every tenth link
print(f"\nParsing {len(all_links)} statements...")
rows = []
for i, link in enumerate(all_links, 1):
    if i % 10 == 0:
        print(f"  Processed {i}/{len(all_links)}...")
    parsed = parse_statement(link)
    if not parsed:
        continue
    rows.append(parsed)

print(f"\nSuccessfully parsed {len(rows)} statements")

# Nothing to build a table from: stop here.
if len(rows) == 0:
    raise ValueError("No statements parsed successfully!")

df = pd.DataFrame(rows)

# Keep only rows with a valid date inside [START_DATE, END_DATE].
# Unparseable dates become NaT, and NaT never falls inside the window.
df['meeting_date_parsed'] = pd.to_datetime(df['meeting_date_parsed'], errors='coerce')
df = df[df['meeting_date_parsed'].between(START_DATE, END_DATE)]

# Chronological order, so each delta is relative to the previous statement
df = df.sort_values('meeting_date_parsed')

# Change in each bound versus the previous row (NaN on the first row)
df['delta_lower'] = df['target_lower'].diff()
df['delta_upper'] = df['target_upper'].diff()

# Binary flag: 1 when either bound moved, 0 otherwise.
# A NaN delta is treated as "no change" by replacing it with 0.
df['rate_changed'] = (
    df['delta_lower'].fillna(0).ne(0) | df['delta_upper'].fillna(0).ne(0)
).astype(int)

print(f"\nFinal dataset: {len(df)} statements from {df['meeting_date_parsed'].min()} to {df['meeting_date_parsed'].max()}")
print(f"\nPDF statements: {df['is_pdf'].sum()}")
print(f"HTML statements: {(~df['is_pdf']).sum()}")
print(f"Rate changes: {df['rate_changed'].sum()} out of {len(df)} meetings")

df.head(20)

FOMC Statement Scraper - January 2020 to October 29, 2025
Fetching links from calendar page...
Found 8 panels on calendar page
Fetching links from historical page...
Total unique statement links found: 107

Parsing 107 statements...
  Processed 10/107...
  Processed 20/107...
  Processed 30/107...
  Processed 40/107...
  Processed 50/107...
  Processed 60/107...
  Processed 70/107...
  Processed 80/107...
  Processed 90/107...
  Processed 100/107...

Successfully parsed 45 statements

Final dataset: 45 statements from 2020-01-29 00:00:00 to 2025-07-30 00:00:00

PDF statements: 0
HTML statements: 45
Rate changes: 14 out of 45 meetings


Unnamed: 0,meeting_date,meeting_date_parsed,statement_url,target_lower,target_upper,is_pdf,delta_lower,delta_upper,rate_changed
34,1/29/2020,2020-01-29,https://www.federalreserve.gov/newsevents/pres...,1.5,1.75,False,,,0
35,3/3/2020,2020-03-03,https://www.federalreserve.gov/newsevents/pres...,1.0,1.25,False,-0.5,-0.5,1
37,3/15/2020,2020-03-15,https://www.federalreserve.gov/newsevents/pres...,0.0,0.25,False,-1.0,-1.0,1
36,3/15/2020,2020-03-15,https://www.federalreserve.gov/newsevents/pres...,0.0,0.25,False,0.0,0.0,0
38,3/23/2020,2020-03-23,https://www.federalreserve.gov/newsevents/pres...,0.0,0.25,False,0.0,0.0,0
39,4/29/2020,2020-04-29,https://www.federalreserve.gov/newsevents/pres...,0.0,0.25,False,0.0,0.0,0
40,6/10/2020,2020-06-10,https://www.federalreserve.gov/newsevents/pres...,0.0,0.25,False,0.0,0.0,0
41,7/29/2020,2020-07-29,https://www.federalreserve.gov/newsevents/pres...,0.0,0.25,False,0.0,0.0,0
42,9/16/2020,2020-09-16,https://www.federalreserve.gov/newsevents/pres...,0.0,0.25,False,0.0,0.0,0
43,11/5/2020,2020-11-05,https://www.federalreserve.gov/newsevents/pres...,0.0,0.25,False,0.0,0.0,0


In [10]:
# Statements whose parsed meeting date falls in calendar year 2022
df.loc[df['meeting_date_parsed'].dt.year.eq(2022)]


Unnamed: 0,meeting_date,meeting_date_parsed,statement_url,target_lower,target_upper,is_pdf,delta_lower,delta_upper,rate_changed
19,1/26/2022,2022-01-26,https://www.federalreserve.gov/newsevents/pres...,0.0,0.25,False,0.0,0.0,0
20,3/16/2022,2022-03-16,https://www.federalreserve.gov/newsevents/pres...,0.25,0.5,False,0.25,0.25,1
21,5/4/2022,2022-05-04,https://www.federalreserve.gov/newsevents/pres...,0.75,1.0,False,0.5,0.5,1
22,7/27/2022,2022-07-27,https://www.federalreserve.gov/newsevents/pres...,2.25,2.5,False,1.5,1.5,1
23,9/21/2022,2022-09-21,https://www.federalreserve.gov/newsevents/pres...,3.0,3.25,False,0.75,0.75,1
24,11/2/2022,2022-11-02,https://www.federalreserve.gov/newsevents/pres...,3.75,4.0,False,0.75,0.75,1
25,12/14/2022,2022-12-14,https://www.federalreserve.gov/newsevents/pres...,4.25,4.5,False,0.5,0.5,1


In [11]:
# Debug: run the rate patterns against the March 16, 2022 wording, then
# confirm that meeting is present in the parsed table.
test_text = "In support of these goals, the Committee decided to raise the target range for the federal funds rate to 1/4 to 1/2 percent"

print("Testing rate extraction from sample text:")
print(f"Text: {test_text}\n")

match = None
for i, pattern in enumerate(RATE_PATTERNS, 1):
    match = pattern.search(test_text)
    if not match:
        continue
    # First pattern that matches wins; show raw captures and parsed values.
    print(f"✓ Pattern {i} MATCHED!")
    print(f"  Lower: '{match.group(1)}'")
    print(f"  Upper: '{match.group(2)}'")

    lower = clean_rate(match.group(1))
    upper = clean_rate(match.group(2))
    print(f"  Parsed Lower: {lower}")
    print(f"  Parsed Upper: {upper}")
    break
else:
    # The loop finished without a break, so nothing matched.
    print("✗ No pattern matched!")

# Also check if March 16, 2022 is in the dataframe
print("\n" + "="*60)
print("Checking for March 16, 2022 in dataframe:")
summary_columns = ['meeting_date_parsed', 'target_lower', 'target_upper', 'rate_changed']
march_2022 = df[df['meeting_date_parsed'].dt.strftime('%Y-%m-%d') == '2022-03-16']
if march_2022.empty:
    print("March 16, 2022 NOT found in dataframe")
    print("\nAll 2022 statements:")
    print(df[df['meeting_date_parsed'].dt.year == 2022][summary_columns])
else:
    print("Found March 16, 2022 statement:")
    print(march_2022[summary_columns])


Testing rate extraction from sample text:
Text: In support of these goals, the Committee decided to raise the target range for the federal funds rate to 1/4 to 1/2 percent

✓ Pattern 1 MATCHED!
  Lower: '1/4'
  Upper: '1/2'
  Parsed Lower: 0.25
  Parsed Upper: 0.5

Checking for March 16, 2022 in dataframe:
Found March 16, 2022 statement:
   meeting_date_parsed  target_lower  target_upper  rate_changed
20          2022-03-16          0.25           0.5             1


In [12]:
# Rows flagged as a change in the target range
df.loc[df['rate_changed'].eq(1)]

Unnamed: 0,meeting_date,meeting_date_parsed,statement_url,target_lower,target_upper,is_pdf,delta_lower,delta_upper,rate_changed
35,3/3/2020,2020-03-03,https://www.federalreserve.gov/newsevents/pres...,1.0,1.25,False,-0.5,-0.5,1
37,3/15/2020,2020-03-15,https://www.federalreserve.gov/newsevents/pres...,0.0,0.25,False,-1.0,-1.0,1
20,3/16/2022,2022-03-16,https://www.federalreserve.gov/newsevents/pres...,0.25,0.5,False,0.25,0.25,1
21,5/4/2022,2022-05-04,https://www.federalreserve.gov/newsevents/pres...,0.75,1.0,False,0.5,0.5,1
22,7/27/2022,2022-07-27,https://www.federalreserve.gov/newsevents/pres...,2.25,2.5,False,1.5,1.5,1
23,9/21/2022,2022-09-21,https://www.federalreserve.gov/newsevents/pres...,3.0,3.25,False,0.75,0.75,1
24,11/2/2022,2022-11-02,https://www.federalreserve.gov/newsevents/pres...,3.75,4.0,False,0.75,0.75,1
25,12/14/2022,2022-12-14,https://www.federalreserve.gov/newsevents/pres...,4.25,4.5,False,0.5,0.5,1
11,2/1/2023,2023-02-01,https://www.federalreserve.gov/newsevents/pres...,4.5,4.75,False,0.25,0.25,1
12,3/22/2023,2023-03-22,https://www.federalreserve.gov/newsevents/pres...,4.75,5.0,False,0.25,0.25,1


In [19]:
# Write the parsed meeting dates of the rate-change rows to a one-column CSV
rate_change_dates = df.loc[df['rate_changed'].eq(1), ['meeting_date_parsed']]
rate_change_dates.to_csv('FOMC_Dates_with_Rate_Changes.csv', index=False)
