In [6]:
# === FOMC STATEMENT SCRAPER: AUTOMATED DATA COLLECTION ===
# This script performs web scraping to extract historical Federal Reserve data
# Purpose: Automate collection of FOMC meeting dates and interest rate decisions
# Alternative: Manual data entry (error-prone, time-consuming, not reproducible)

# LIBRARY IMPORTS:
# re: Regular expression module for pattern matching in text (interest rate extraction)
import re
# urllib.parse.urljoin: Handles relative URL resolution (converts href="/path" to full URL)
from urllib.parse import urljoin
# datetime: Date parsing and manipulation for temporal data handling
from datetime import datetime
# io: In-memory binary streams for PDF processing (avoids disk I/O)
import io

# WEB SCRAPING STACK:
# requests: HTTP library for fetching web pages (simpler than urllib, handles sessions)
import requests
# pandas: Data structure for organizing scraped data into tabular format
import pandas as pd
# BeautifulSoup: HTML/XML parser for navigating DOM tree and extracting data
#   - Handles malformed HTML gracefully (unlike strict XML parsers)
#   - Provides Pythonic API for tree traversal
from bs4 import BeautifulSoup

# PDF PARSING LIBRARIES (OPTIONAL):
# FOMC statements exist in both HTML and PDF formats
# We attempt to import PDF libraries with fallback handling
# Optional PDF backends. PDF_AVAILABLE says whether any backend loaded and
# PDF_LIB names the one that did ('PyPDF2', 'pdfplumber' or None).
# PDF_LIB is bound up front so it is defined on every path, including the one
# where PyPDF2 imports successfully.
PDF_LIB = None
try:
    # PyPDF2: Pure-Python PDF library, tried first
    import PyPDF2
    PDF_AVAILABLE = True
    PDF_LIB = 'PyPDF2'
except ImportError:
    try:
        # pdfplumber: used only when PyPDF2 is not installed
        import pdfplumber
        PDF_AVAILABLE = True
        PDF_LIB = 'pdfplumber'
    except ImportError:
        # Graceful degradation: the script continues without PDF support,
        # so only HTML statements will be processed.
        PDF_AVAILABLE = False
        PDF_LIB = None
        print("Warning: PDF libraries not found. Install with: pip install PyPDF2 or pip install pdfplumber")

# === SCRAPING CONFIGURATION ===
# WEB SCRAPING BEST PRACTICES:
# 1. Send an explicit User-Agent header (a browser-like string is used below because the default python-requests agent is often rejected)
# 2. Respect robots.txt (Federal Reserve allows this)
# 3. Use sessions to maintain connection pooling (efficiency)

# Base URL for Federal Reserve website
# Site root; every endpoint below is built from it.
BASE = "https://www.federalreserve.gov"

# Pages that are crawled for statement links:
#   CAL_URL        - FOMC calendar page
#   HISTORICAL_URL - archive page for earlier FOMC material
CAL_URL = BASE + "/monetarypolicy/fomccalendars.htm"
HISTORICAL_URL = BASE + "/monetarypolicy/fomc_historical.htm"

# Default request headers (browser-like User-Agent string).
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
}

# One shared session so every request reuses connections, cookies and
# the default headers above.
SESSION = requests.Session()
SESSION.headers.update(HEADERS)

# Inclusive analysis window; statements dated outside it are dropped later.
START_DATE = datetime(year=2020, month=1, day=1)
END_DATE = datetime(year=2025, month=10, day=29)

# === REGULAR EXPRESSION PATTERNS FOR RATE EXTRACTION ===
# NATURAL LANGUAGE PROCESSING CHALLENGE:
# FOMC statements are written in formal English with varying phrasings
# We need to extract structured data (interest rates) from unstructured text
# Solution: Multiple regex patterns ordered by specificity (waterfall matching)

# REGEX DESIGN PRINCIPLES:
# 1. Most specific patterns first (avoid false positives)
# 2. Case-insensitive matching (FOMC writing style varies)
# 3. Flexible whitespace handling (\s+ matches 1+ spaces/newlines)
# 4. Capture groups () to extract rate values

# CHARACTER CLASS: [\d\-\s¼½¾/\.]+ matches:
#   \d: digits 0-9
#   \-: dash (for formats like "5-1/4")
#   \s: whitespace
#   ¼½¾: Unicode fraction characters (FOMC uses these)
#   /: slash (for fractional notation like "1/4")
#   \.: decimal point
#   +: one or more occurrences (greedy quantifier)

# Ordered list of compiled, case-insensitive regexes. Callers try them in
# order and use the first one that matches; each exposes two capture groups:
# group(1) = lower bound text, group(2) = upper bound text.
RATE_PATTERNS = [
    re.compile(source, re.IGNORECASE)
    for source in (
        # 1. Action verb + "the target range for the federal funds rate to X to Y percent"
        r"(?:raise|lower|raised|lowered|set|decided to raise|decided to lower)\s+the\s+target\s+range\s+for\s+the\s+federal\s+funds\s+rate\s+to\s+([\d\-\s¼½¾/\.]+)\s+to\s+([\d\-\s¼½¾/\.]+)\s+percent",
        # 2. Same phrase without a leading verb
        r"target\s+range\s+for\s+the\s+federal\s+funds\s+rate\s+to\s+([\d\-\s¼½¾/\.]+)\s+to\s+([\d\-\s¼½¾/\.]+)\s+percent",
        # 3. Status-quo wording: "... rate [remains] [at] [the] [level of] X to Y percent"
        r"target\s+range\s+for\s+the\s+federal\s+funds\s+rate(?: remains)?(?: at)?\s*(?:the )?(?:level of )?\s+([\d\-\s¼½¾/\.]+)\s*to\s*([\d\-\s¼½¾/\.]+)\s*percent",
        # 4. Abbreviated: "target range [for the federal funds rate] [of] X to Y percent"
        r"target\s+range\s+(?:for the federal funds rate\s+)?(?:of\s+)?([\d\-\s¼½¾/\.]+)\s+to\s+([\d\-\s¼½¾/\.]+)\s+percent",
        # 5. Minimal: "federal funds rate [at] X to Y percent"
        r"federal\s+funds\s+rate\s+(?:at\s+)?([\d\-\s¼½¾/\.]+)\s+to\s+([\d\-\s¼½¾/\.]+)\s+percent",
        # 6. Verb ... "federal funds rate" ... "to X to Y percent" with
        #    arbitrary (non-greedy) text in between
        r"(?:raised|lowered|set|maintained).*?federal\s+funds\s+rate.*?to\s+([\d\-\s¼½¾/\.]+)\s+to\s+([\d\-\s¼½¾/\.]+)\s+percent",
    )
]

def clean_rate(raw):
    """
    Convert an extracted rate string to a float.

    Supported spellings:
    - Decimal:          "0.25", "4.50"
    - Simple fraction:  "1/4", "3/4"
    - Mixed number:     "5-1/4", "5 1/4"
    - Unicode fraction: "¼", "1¼", "1-¼"

    Args:
        raw: Text captured by one of the rate regexes (None/NaN allowed).

    Returns:
        float, or None when the input is missing/empty or contains no number.
    """
    # Missing values: None, NaN-like scalars, empty strings.
    if raw is None:
        return None
    if not isinstance(raw, str):
        try:
            if pd.isna(raw):
                return None
        except (TypeError, ValueError):
            pass  # non-scalar input; fall through and let str() handle it

    # Rewrite Unicode fractions as " n/d". The leading space keeps them apart
    # from a preceding whole number, so "1¼" becomes "1 1/4" and not "11/4".
    text = str(raw)
    for glyph, ascii_fraction in (("¼", " 1/4"), ("½", " 1/2"), ("¾", " 3/4")):
        text = text.replace(glyph, ascii_fraction)
    # Tolerate "1 / 4" but keep the whitespace that separates whole and fraction.
    text = re.sub(r"\s*/\s*", "/", text.strip())
    if not text:
        return None

    # Fractions, optionally preceded by a whole number and a dash/space
    # separator. The separator must survive until here: stripping spaces first
    # would turn "5 1/4" into "51/4".
    if "/" in text:
        mixed = re.fullmatch(
            r"[\s\-]*(?:(\d+(?:\.\d*)?)[\s\-]+)?(\d+(?:\.\d+)?)/(\d+(?:\.\d+)?)",
            text,
        )
        if mixed:
            whole, num, den = mixed.groups()
            try:
                return (float(whole) if whole else 0.0) + float(num) / float(den)
            except ZeroDivisionError:
                pass  # e.g. "1/0": fall through to the best-effort parse below

    # Plain decimal, then best effort: first digit run found in the text.
    compact = re.sub(r"\s+", "", text)
    try:
        return float(compact)
    except ValueError:
        numbers = re.findall(r"\d+\.?\d*", compact)
        if numbers:
            return float(numbers[0])

    # Nothing numeric could be recovered.
    return None

def extract_text_from_pdf(pdf_url):
    """
    Download a PDF and return its text, one page per line-separated chunk.

    The PDF is fetched with the shared SESSION, held in memory (io.BytesIO)
    and read with pdfplumber when PDF_LIB == 'pdfplumber', otherwise with
    PyPDF2.

    Args:
        pdf_url: Absolute URL of the PDF.

    Returns:
        str with the text of all pages joined by newlines, or None when no
        PDF library is available or the download/parsing fails (the error is
        printed, not raised).
    """
    # EARLY RETURN: no PDF backend was importable
    if not PDF_AVAILABLE:
        return None

    # Look PDF_LIB up defensively: it is not guaranteed to be bound when
    # PyPDF2 is the backend, and an unbound name here would be swallowed by
    # the broad except below and silently disable PDF support.
    pdf_lib = globals().get("PDF_LIB")

    try:
        # timeout=30 keeps a stalled download from hanging the run
        response = SESSION.get(pdf_url, timeout=30)
        response.raise_for_status()
        pdf_stream = io.BytesIO(response.content)

        if pdf_lib == 'pdfplumber':
            import pdfplumber
            # Context manager closes the document once the text is extracted
            with pdfplumber.open(pdf_stream) as pdf:
                page_texts = [page.extract_text() or "" for page in pdf.pages]
        else:
            pdf_reader = PyPDF2.PdfReader(pdf_stream)
            # "or ''" guards pages for which no text is returned, so the
            # join below cannot fail on a None entry
            page_texts = [page.extract_text() or "" for page in pdf_reader.pages]

        return "\n".join(page_texts)
    except Exception as e:
        # Best effort: network errors and unreadable PDFs are reported and
        # the statement is skipped by the caller.
        print(f"Error reading PDF {pdf_url}: {e}")
        return None

# Month-name lookup shared by the regex below and the datetime construction.
_MONTH_NUMBERS = {
    'january': 1, 'february': 2, 'march': 3, 'april': 4,
    'may': 5, 'june': 6, 'july': 7, 'august': 8,
    'september': 9, 'october': 10, 'november': 11, 'december': 12,
}

# (compiled regex, format tag) pairs, tried in order. The tag is explicit:
# inspecting the regex source for "/" or "-" cannot identify the format,
# because character classes such as [A-Z] contain "-" themselves.
_DATE_PATTERNS = (
    # "January 29, 2025" / "january 29 2025", anywhere in the text
    (re.compile(r"\b(" + "|".join(_MONTH_NUMBERS) + r")\s+(\d{1,2}),?\s+(\d{4})", re.IGNORECASE), "month_name"),
    # "01/29/2025", "1/29/2025" (month first)
    (re.compile(r"(\d{1,2})/(\d{1,2})/(\d{4})"), "us_numeric"),
    # "2025-01-29"
    (re.compile(r"(\d{4})-(\d{2})-(\d{2})"), "iso"),
)


def parse_date(date_str):
    """
    Parse a date found in free text into a datetime.

    Recognized spellings, searched anywhere in the input:
    - Month name: "January 29, 2025" (comma optional, case-insensitive)
    - Numeric:    "01/29/2025", "1/29/2025" (month/day/year)
    - ISO:        "2025-01-29"
    Anything else is handed to pd.to_datetime(errors='coerce').

    Args:
        date_str: Text containing a date (None/NaN/empty allowed).

    Returns:
        datetime (a pandas Timestamp, which is a datetime subclass, when the
        pandas fallback produced the value) or None when no date is found.
    """
    # NULL HANDLING: missing or empty input
    try:
        if pd.isna(date_str) or not date_str:
            return None
    except (TypeError, ValueError):
        return None  # non-scalar input cannot be a single date

    text = str(date_str)
    for regex, kind in _DATE_PATTERNS:
        match = regex.search(text)
        if not match:
            continue
        first, second, third = match.groups()
        try:
            if kind == "month_name":
                return datetime(int(third), _MONTH_NUMBERS[first.lower()], int(second))
            if kind == "us_numeric":
                return datetime(int(third), int(first), int(second))
            return datetime(int(first), int(second), int(third))
        except ValueError:
            # Impossible calendar date (e.g. February 30): try the next format
            continue

    # FALLBACK: pandas' parser; errors='coerce' yields NaT instead of raising
    try:
        fallback = pd.to_datetime(date_str, errors='coerce')
    except Exception:
        return None
    # Map NaT to None so callers get the single documented "no date" value
    return None if pd.isna(fallback) else fallback

def fetch_all_statement_links():
    """
    Collect candidate FOMC statement links from the calendar and historical pages.

    Both pages are fetched with the shared SESSION. A failure on one page is
    printed and does not stop the other page from being scraped. Links are
    de-duplicated by their absolute URL across both pages.

    Returns:
        list: Dictionaries with keys: meeting_date_str, statement_url, is_pdf.
        meeting_date_str is
        - calendar page: the text of the enclosing panel's h5/h4/h3 header;
        - historical page: "M/D/YYYY" built from the first 8-digit run in the
          href, else a "Month D, YYYY" date found in the link text, else the
          first 50 characters of the link text.
    """
    all_meetings = []  # Accumulator for all discovered statements
    seen_urls = set()  # Absolute URLs already recorded (either page)

    def _remember(page_url, href, date_text):
        """Resolve href against the page it came from and record it once."""
        # urljoin leaves absolute hrefs untouched and resolves page-relative
        # ones against the page URL rather than the site root.
        full_url = urljoin(page_url, href)
        if full_url in seen_urls:
            return
        seen_urls.add(full_url)
        all_meetings.append({
            "meeting_date_str": date_text,
            "statement_url": full_url,
            "is_pdf": href.lower().endswith(".pdf"),
        })

    # === SCRAPING SOURCE 1: CALENDAR PAGE ===
    print("Fetching links from calendar page...")
    try:
        resp = SESSION.get(CAL_URL, timeout=20)
        resp.raise_for_status()
        page_url = resp.url or CAL_URL
        soup = BeautifulSoup(resp.text, "html.parser")

        blocks = soup.select(".panel.panel-default")
        print(f"Found {len(blocks)} panels on calendar page")

        for block in blocks:
            # Header fallback chain: h5, then h4, then h3
            header = block.find("h5") or block.find("h4") or block.find("h3")
            if not header:
                continue  # Skip panels without a recognizable header

            date_text = header.get_text(strip=True)

            for link in block.select("a"):
                href = link.get("href", "")
                href_lc = href.lower()
                text = link.get_text(strip=True).lower()

                is_statement = (
                    # Link text says "statement" and the URL mentions FOMC
                    ("statement" in text and "fomc" in href_lc)
                    # URL path of a monetary-policy press release
                    or "pressreleases/monetary" in href_lc
                    # URL in the statements directory with a relevant hint
                    or ("fomcstatements" in href_lc
                        and ("statement" in text or "htm" in href_lc or "pdf" in href_lc))
                )
                # Exclude links back to the calendar and to meeting minutes
                if is_statement and "calendar" not in href_lc and "minutes" not in href_lc:
                    _remember(page_url, href, date_text)
    except Exception as e:
        # Best effort: report and continue with the historical page
        print(f"Error fetching calendar page: {e}")

    # === SCRAPING SOURCE 2: HISTORICAL ARCHIVE ===
    print("Fetching links from historical page...")
    try:
        resp = SESSION.get(HISTORICAL_URL, timeout=20)
        resp.raise_for_status()
        page_url = resp.url or HISTORICAL_URL
        soup = BeautifulSoup(resp.text, "html.parser")

        # No panel structure is assumed here: every link on the page is examined
        for link in soup.select("a"):
            href = link.get("href", "")
            href_lc = href.lower()
            text = link.get_text(strip=True)

            # URL-path check AND (text or extension hint) must both hold
            in_statement_path = "fomcstatements" in href_lc or "pressreleases/monetary" in href_lc
            has_hint = "statement" in text.lower() or "htm" in href_lc or "pdf" in href_lc
            if not (in_statement_path and has_hint):
                continue

            # Date: YYYYMMDD in the href first, then "Month D, YYYY" in the
            # link text, then the first 50 characters of the link text.
            url_date = re.search(r"(\d{4})(\d{2})(\d{2})", href)
            if url_date:
                year, month, day = url_date.groups()
                date_text = f"{int(month)}/{int(day)}/{year}"
            else:
                text_date = re.search(r"([A-Z][a-z]+ \d{1,2},? \d{4})", text)
                date_text = text_date.group(1) if text_date else text[:50]

            _remember(page_url, href, date_text)
    except Exception as e:
        print(f"Error fetching historical page: {e}")

    # SUMMARY OUTPUT:
    print(f"Total unique statement links found: {len(all_meetings)}")
    return all_meetings

def _release_date_from_url(url, default=None):
    """Return "M/D/YYYY" built from the first YYYYMMDD run in url, else default."""
    stamp = re.search(r"(\d{4})(\d{2})(\d{2})", url)
    if not stamp:
        return default
    year, month, day = stamp.groups()
    return f"{int(month)}/{int(day)}/{year}"


def parse_statement(entry):
    """
    Fetch one FOMC statement and extract its date and target range.

    Steps:
    1. Fetch the statement (PDF via extract_text_from_pdf, otherwise HTML)
    2. Determine the release date (page element, else URL, else entry)
    3. Build one text string from the statement body
    4. Apply RATE_PATTERNS in order and take the first match
    5. Convert both captured bounds with clean_rate
    6. Return the structured record

    Args:
        entry: Dict with keys 'statement_url', 'is_pdf', 'meeting_date_str'

    Returns:
        Dict with keys meeting_date, meeting_date_parsed, statement_url,
        target_lower, target_upper, is_pdf; or None when the document cannot
        be fetched, is shorter than 100 characters, contains no recognizable
        rate range, or any step raises (the error is printed).
    """
    try:
        url = entry["statement_url"]
        is_pdf = entry.get("is_pdf", False)

        # Date used whenever the document itself does not supply one:
        # YYYYMMDD from the URL, else the date recorded during link discovery.
        fallback_date = _release_date_from_url(url, entry.get("meeting_date_str"))

        # === STEP 1-3: CONTENT AND DATE EXTRACTION ===
        if is_pdf:
            body_text = extract_text_from_pdf(url)
            if not body_text:
                return None  # PDF extraction failed
            # No date is extracted from PDF content, so the fallback is the
            # only source. Without this assignment release_date would be
            # unbound further down and every PDF statement would be dropped.
            release_date = fallback_date
        else:
            res = SESSION.get(url, timeout=20)
            res.raise_for_status()
            soup = BeautifulSoup(res.text, "html.parser")

            # First element matching any of the known date selectors
            release_date_elem = soup.select_one(".article__date, .releaseDate, time, .col-xs-12.col-sm-8 time")
            page_date = release_date_elem.get_text(strip=True) if release_date_elem else ""
            # An empty date element is treated like a missing one
            release_date = page_date or fallback_date

            # Paragraph selectors, most specific first; the first selector
            # that yields any paragraph wins.
            paragraph_nodes = []
            for selector in ("div.col-xs-12.col-sm-8 p", "article p", "div.panel-body p", "p"):
                paragraph_nodes = soup.select(selector)
                if paragraph_nodes:
                    break

            # One searchable string for regex matching
            body_text = " ".join(p.get_text(" ", strip=True) for p in paragraph_nodes)

        # DATA QUALITY CHECK: skip empty or near-empty documents
        if not body_text or len(body_text) < 100:
            return None

        # === STEP 4: RATE EXTRACTION VIA REGEX ===
        # First pattern (in list order) that matches wins
        match = None
        for pattern in RATE_PATTERNS:
            match = pattern.search(body_text)
            if match:
                break

        if not match:
            return None  # No rate information found

        # === STEP 5: RATE CLEANING AND CONVERSION ===
        # group(1) = lower bound text, group(2) = upper bound text
        lower = clean_rate(match.group(1).strip())
        upper = clean_rate(match.group(2).strip())

        # Both bounds must parse
        if lower is None or upper is None:
            return None

        # === STEP 6: DATE PARSING AND OUTPUT ===
        return {
            "meeting_date": release_date,  # Original string
            "meeting_date_parsed": parse_date(release_date),
            "statement_url": url,
            "target_lower": lower,
            "target_upper": upper,
            "is_pdf": is_pdf
        }
    except Exception as e:
        # Best effort: report the failure and let the caller continue with
        # the remaining statements. .get keeps the handler itself from
        # raising when the entry has no URL.
        print(f"Error parsing {entry.get('statement_url', '<unknown>')}: {e}")
        return None

# === MAIN EXECUTION: DATA COLLECTION PIPELINE ===
# This orchestrates the complete scraping workflow:
# 1. Discover statement URLs
# 2. Parse each statement
# 3. Clean and structure data
# 4. Compute derived features (rate changes)

print("="*60)
print("FOMC Statement Scraper - January 2020 to October 29, 2025")
print("="*60)

# === PHASE 1: LINK DISCOVERY ===
# Crawl Federal Reserve website to find all FOMC statement URLs
# Returns list of dictionaries with URL, date, format metadata
all_links = fetch_all_statement_links()

# === PHASE 2: CONTENT PARSING ===
# Extract structured data from each statement
# This is the most time-consuming phase (HTTP requests + parsing)
print(f"\nParsing {len(all_links)} statements...")
rows = []  # Accumulator for successfully parsed statements
for i, link in enumerate(all_links, 1):
    # PROGRESS REPORTING: Print status every 10 statements
    # Helps monitor long-running scraping jobs
    if i % 10 == 0:
        print(f"  Processed {i}/{len(all_links)}...")
    
    # PARSE INDIVIDUAL STATEMENT:
    # Returns dict or None (if parsing failed)
    parsed = parse_statement(link)
    if parsed:
        rows.append(parsed)  # Only keep successful parses

print(f"\nSuccessfully parsed {len(rows)} statements")

# === PHASE 3: DATA STRUCTURING ===
# Convert list of dictionaries to pandas DataFrame for analysis

# ERROR HANDLING: Ensure we got at least some data
if not rows:
    raise ValueError("No statements parsed successfully!")

# CREATE DATAFRAME:
# Each row = one FOMC meeting
# Columns: meeting_date, statement_url, target_lower, target_upper, is_pdf
df = pd.DataFrame(rows)

# === PHASE 4: DATA CLEANING ===

# DATE FILTERING:
# Restrict to analysis timeframe (2020-2025) to match Bitcoin data
# pd.to_datetime with errors='coerce': Invalid dates → NaT (Not a Time)
df['meeting_date_parsed'] = pd.to_datetime(df['meeting_date_parsed'], errors='coerce')
# Drop rows with unparseable dates
df = df[df['meeting_date_parsed'].notna()]
# Boolean indexing: Keep only dates within range
df = df[(df['meeting_date_parsed'] >= START_DATE) & (df['meeting_date_parsed'] <= END_DATE)]

# CHRONOLOGICAL SORTING:
# Essential for time-series analysis (rate change calculation requires order)
df = df.sort_values('meeting_date_parsed')

# === PHASE 5: FEATURE ENGINEERING ===

# MEETING-TO-MEETING RATE MOVES:
# diff() subtracts the previous row's value from the current one, so each
# delta is the move relative to the preceding row in the sorted frame.
# The first row has nothing before it and therefore gets NaN.
for bound in ('lower', 'upper'):
    df[f'delta_{bound}'] = df[f'target_{bound}'].diff()

# RATE-CHANGE FLAG (1 = target range moved, 0 = unchanged):
# A bound counts as "moved" only when its delta is present AND non-zero;
# the notna() guard matters because NaN != 0 evaluates to True, which would
# otherwise flag the first row. A move in either bound sets the flag.
lower_moved = df['delta_lower'].notna() & df['delta_lower'].ne(0)
upper_moved = df['delta_upper'].notna() & df['delta_upper'].ne(0)
df['rate_changed'] = (lower_moved | upper_moved).astype(int)  # bool → 0/1

# === PHASE 6: SUMMARY STATISTICS ===
# Quick data-quality report: coverage window, source format mix, change count
meeting_dates = df['meeting_date_parsed']
pdf_flags = df['is_pdf']
n_statements = len(df)

print(f"\nFinal dataset: {n_statements} statements from {meeting_dates.min()} to {meeting_dates.max()}")
print(f"\nPDF statements: {pdf_flags.sum()}")
# ~ inverts the boolean flag, so this counts the non-PDF rows
print(f"HTML statements: {(~pdf_flags).sum()}")
print(f"Rate changes: {df['rate_changed'].sum()} out of {n_statements} meetings")

# PREVIEW OUTPUT:
# Last expression of the cell: Jupyter renders the first 20 rows as a table
df.head(20)

FOMC Statement Scraper - January 2020 to October 29, 2025
Fetching links from calendar page...
Found 8 panels on calendar page
Fetching links from historical page...
Total unique statement links found: 107

Parsing 107 statements...
  Processed 10/107...
  Processed 20/107...
  Processed 30/107...
  Processed 40/107...
  Processed 50/107...
  Processed 60/107...
  Processed 70/107...
  Processed 80/107...
  Processed 90/107...
  Processed 100/107...

Successfully parsed 45 statements

Final dataset: 45 statements from 2020-01-29 00:00:00 to 2025-07-30 00:00:00

PDF statements: 0
HTML statements: 45
Rate changes: 14 out of 45 meetings


Unnamed: 0,meeting_date,meeting_date_parsed,statement_url,target_lower,target_upper,is_pdf,delta_lower,delta_upper,rate_changed
34,1/29/2020,2020-01-29,https://www.federalreserve.gov/newsevents/pres...,1.5,1.75,False,,,0
35,3/3/2020,2020-03-03,https://www.federalreserve.gov/newsevents/pres...,1.0,1.25,False,-0.5,-0.5,1
37,3/15/2020,2020-03-15,https://www.federalreserve.gov/newsevents/pres...,0.0,0.25,False,-1.0,-1.0,1
36,3/15/2020,2020-03-15,https://www.federalreserve.gov/newsevents/pres...,0.0,0.25,False,0.0,0.0,0
38,3/23/2020,2020-03-23,https://www.federalreserve.gov/newsevents/pres...,0.0,0.25,False,0.0,0.0,0
39,4/29/2020,2020-04-29,https://www.federalreserve.gov/newsevents/pres...,0.0,0.25,False,0.0,0.0,0
40,6/10/2020,2020-06-10,https://www.federalreserve.gov/newsevents/pres...,0.0,0.25,False,0.0,0.0,0
41,7/29/2020,2020-07-29,https://www.federalreserve.gov/newsevents/pres...,0.0,0.25,False,0.0,0.0,0
42,9/16/2020,2020-09-16,https://www.federalreserve.gov/newsevents/pres...,0.0,0.25,False,0.0,0.0,0
43,11/5/2020,2020-11-05,https://www.federalreserve.gov/newsevents/pres...,0.0,0.25,False,0.0,0.0,0


In [10]:
# === DATA VALIDATION: 2022 FOMC MEETINGS ===
# Spot-check one year of scraped data against the known policy path.
# 2022 is a useful reference year: the Fed raised rates 7 times (0% → 4.5%).
#
# NOTE(review): a single-row delta above 0.75 in this view means a meeting
# between two rows was not captured. The recorded output jumps from
# 0.75-1.00 (5/4) straight to 2.25-2.50 (7/27) with no June row, so the
# scraped data currently holds 6 of the 7 hikes; treat this view as a
# check to inspect, not as proof the cycle was fully captured.

# .dt.year pulls the year out of each datetime; .eq(2022) builds the boolean
# mask used to select that year's rows
is_2022 = df['meeting_date_parsed'].dt.year.eq(2022)
df.loc[is_2022]


Unnamed: 0,meeting_date,meeting_date_parsed,statement_url,target_lower,target_upper,is_pdf,delta_lower,delta_upper,rate_changed
19,1/26/2022,2022-01-26,https://www.federalreserve.gov/newsevents/pres...,0.0,0.25,False,0.0,0.0,0
20,3/16/2022,2022-03-16,https://www.federalreserve.gov/newsevents/pres...,0.25,0.5,False,0.25,0.25,1
21,5/4/2022,2022-05-04,https://www.federalreserve.gov/newsevents/pres...,0.75,1.0,False,0.5,0.5,1
22,7/27/2022,2022-07-27,https://www.federalreserve.gov/newsevents/pres...,2.25,2.5,False,1.5,1.5,1
23,9/21/2022,2022-09-21,https://www.federalreserve.gov/newsevents/pres...,3.0,3.25,False,0.75,0.75,1
24,11/2/2022,2022-11-02,https://www.federalreserve.gov/newsevents/pres...,3.75,4.0,False,0.75,0.75,1
25,12/14/2022,2022-12-14,https://www.federalreserve.gov/newsevents/pres...,4.25,4.5,False,0.5,0.5,1


In [11]:
# === REGEX PATTERN TESTING: UNIT TEST ===
# Exercise the rate-extraction regexes and clean_rate on one known sentence,
# independently of the scraping pipeline.

# TEST CASE: wording from the March 16, 2022 statement, which writes the
# target range with fractions ("1/4 to 1/2") rather than decimals
test_text = "In support of these goals, the Committee decided to raise the target range for the federal funds rate to 1/4 to 1/2 percent"

print("Testing rate extraction from sample text:")
print(f"Text: {test_text}\n")

# Walk the patterns in order and stop at the first one that matches.
# The for/else branch runs only when the loop finishes without a break,
# i.e. when none of the patterns matched the sample sentence.
match = None
for pattern_number, rate_pattern in enumerate(RATE_PATTERNS, start=1):
    match = rate_pattern.search(test_text)
    if match is None:
        continue

    print(f"✓ Pattern {pattern_number} MATCHED!")
    print(f"  Lower: '{match.group(1)}'")  # raw captured text, before cleaning
    print(f"  Upper: '{match.group(2)}'")

    # Run the captures through clean_rate to check fraction → decimal handling
    lower = clean_rate(match.group(1))
    upper = clean_rate(match.group(2))
    print(f"  Parsed Lower: {lower}")  # expected 0.25
    print(f"  Parsed Upper: {upper}")  # expected 0.5
    break
else:
    # No pattern recognised the sentence: the pattern list needs extending
    print("✗ No pattern matched!")
    
# === CROSS-VALIDATION: is the March 16, 2022 meeting in the scraped data? ===
print("\n" + "="*60)
print("Checking for March 16, 2022 in dataframe:")

# Compare on ISO-formatted date strings so only the calendar day is matched
iso_dates = df['meeting_date_parsed'].dt.strftime('%Y-%m-%d')
march_2022 = df[iso_dates == '2022-03-16']

# Columns shown in either branch of the report
report_columns = ['meeting_date_parsed', 'target_lower', 'target_upper', 'rate_changed']

if march_2022.empty:
    # DEBUGGING AID: list every 2022 row that was scraped instead
    print("March 16, 2022 NOT found in dataframe")
    print("\nAll 2022 statements:")
    print(df[df['meeting_date_parsed'].dt.year == 2022][report_columns])
else:
    print("Found March 16, 2022 statement:")
    # Show the parsed values so they can be checked by eye
    print(march_2022[report_columns])


Testing rate extraction from sample text:
Text: In support of these goals, the Committee decided to raise the target range for the federal funds rate to 1/4 to 1/2 percent

✓ Pattern 1 MATCHED!
  Lower: '1/4'
  Upper: '1/2'
  Parsed Lower: 0.25
  Parsed Upper: 0.5

Checking for March 16, 2022 in dataframe:
Found March 16, 2022 statement:
   meeting_date_parsed  target_lower  target_upper  rate_changed
20          2022-03-16          0.25           0.5             1


In [12]:
# === FILTERING FOR MONETARY POLICY ACTIONS ===
# Rows flagged rate_changed == 1 are the meetings where the target range
# moved relative to the previous row, i.e. actual policy actions rather than
# status-quo announcements.
# Working hypothesis for the wider project: Bitcoin volatility is higher on
# rate-change days.
#
# What to look for in the result:
# - 2020: emergency cuts during COVID (March 3, March 15)
# - 2022-2023: the tightening cycle (7 hikes expected in 2022)
# - 2024-2025: cuts as inflation moderates

# Boolean mask on the flag column selects the policy-action rows
policy_action = df['rate_changed'].eq(1)
df.loc[policy_action]

Unnamed: 0,meeting_date,meeting_date_parsed,statement_url,target_lower,target_upper,is_pdf,delta_lower,delta_upper,rate_changed
35,3/3/2020,2020-03-03,https://www.federalreserve.gov/newsevents/pres...,1.0,1.25,False,-0.5,-0.5,1
37,3/15/2020,2020-03-15,https://www.federalreserve.gov/newsevents/pres...,0.0,0.25,False,-1.0,-1.0,1
20,3/16/2022,2022-03-16,https://www.federalreserve.gov/newsevents/pres...,0.25,0.5,False,0.25,0.25,1
21,5/4/2022,2022-05-04,https://www.federalreserve.gov/newsevents/pres...,0.75,1.0,False,0.5,0.5,1
22,7/27/2022,2022-07-27,https://www.federalreserve.gov/newsevents/pres...,2.25,2.5,False,1.5,1.5,1
23,9/21/2022,2022-09-21,https://www.federalreserve.gov/newsevents/pres...,3.0,3.25,False,0.75,0.75,1
24,11/2/2022,2022-11-02,https://www.federalreserve.gov/newsevents/pres...,3.75,4.0,False,0.75,0.75,1
25,12/14/2022,2022-12-14,https://www.federalreserve.gov/newsevents/pres...,4.25,4.5,False,0.5,0.5,1
11,2/1/2023,2023-02-01,https://www.federalreserve.gov/newsevents/pres...,4.5,4.75,False,0.25,0.25,1
12,3/22/2023,2023-03-22,https://www.federalreserve.gov/newsevents/pres...,4.75,5.0,False,0.25,0.25,1


In [19]:
# === DATA EXPORT: Save for Bitcoin Volatility Analysis ===
# Writes the dates of rate-change meetings to a CSV that the main notebook
# (Final_Project_.ipynb) loads to build a binary "rate change day" feature.

# DATA SELECTION:
# Only the meeting_date_parsed column of the rate_changed == 1 rows is kept;
# the downstream lookup needs the dates, not the size of each move.
# The double brackets keep the selection a one-column DataFrame.
rate_change_dates = df.loc[df['rate_changed'] == 1, ['meeting_date_parsed']]

# EXPORT:
# index=False leaves the row labels out, so the file is a single date column
rate_change_dates.to_csv('FOMC_Dates_with_Rate_Changes.csv', index=False)

# OUTPUT FILE USAGE (in the main analysis):
#   pd.read_csv('FOMC_Dates_with_Rate_Changes.csv')
#   data['Is_FOMC_Day'] = data.index.isin(fomc_dates).astype(int)
# which yields the binary feature for the supervised learning model
