In [None]:
#!git clone https://github.com/sjiang83/fomc-ood-stress-test.git
#%cd fomc-ood-stress-test
#!pip install -r requirements.txt

In [9]:
"""
Notebook 1: Data Collection & Cleaning (2024-2025 FOMC Statements)
Purpose: Load, parse, and segment FOMC statements for model-ready processing
Author: Shanhuizi (Mia) Jiang
GitHub: https://github.com/sjiang83/fomc-ood-stress-test
"""

# ============================================================
# SETUP: Import Dependencies
# ============================================================

import os
import re
import json
from pathlib import Path
from datetime import datetime
import pandas as pd

# Optional: For future parsing improvements
# from bs4 import BeautifulSoup  # If scraping HTML versions

print("=" * 60)
print("Notebook 1: Data Collection & Cleaning")
print("Target: 2024-2025 FOMC Statements (OOD Data)")
print("=" * 60)
print()


# ============================================================
# CONFIGURATION
# ============================================================

# Project paths
PROJECT_ROOT = Path("..").resolve()  # Assumes notebook is in notebooks/
RAW_DATA_DIR = PROJECT_ROOT / "data" / "raw"
PROCESSED_DATA_DIR = PROJECT_ROOT / "data" / "processed"

# Ensure processed directory exists
PROCESSED_DATA_DIR.mkdir(parents=True, exist_ok=True)

print(f"üìÅ Raw data directory: {RAW_DATA_DIR}")
print(f"üìÅ Processed data directory: {PROCESSED_DATA_DIR}")
print()


# ============================================================
# STEP 1: Load Raw FOMC Statements
# ============================================================

def load_fomc_statements(data_dir):
    """
    Load all FOMC statement text files from raw data directory.

    Args:
        data_dir (Path): Directory containing raw FOMC .txt files

    Returns:
        dict: {filename: text_content}
    """
    statements = {}
    txt_files = sorted(data_dir.glob("fomc_*.txt"))

    if not txt_files:
        print("‚ö†Ô∏è  No FOMC statement files found!")
        print(f"   Expected pattern: fomc_YYYY_MM_DD_statement.txt in {data_dir}")
        return statements

    print(f"Found {len(txt_files)} FOMC statement files:")

    for txt_file in txt_files:
        try:
            with open(txt_file, 'r', encoding='utf-8') as f:
                content = f.read()
            statements[txt_file.name] = content

            # Extract date from filename for display
            date_match = re.search(r'(\d{4})_(\d{2})_(\d{2})', txt_file.name)
            if date_match:
                year, month, day = date_match.groups()
                print(f"  ‚úì {year}-{month}-{day}: {len(content)} characters")
            else:
                print(f"  ‚úì {txt_file.name}: {len(content)} characters")

        except Exception as e:
            print(f"  ‚úó Failed to load {txt_file.name}: {e}")

    return statements

# Load statements
raw_statements = load_fomc_statements(RAW_DATA_DIR)
print(f"\n‚úì Loaded {len(raw_statements)} statements successfully")
print()


# ============================================================
# STEP 2: Basic Text Cleaning
# ============================================================

def clean_fomc_text(text):
    """
    Basic cleaning for FOMC statement text.

    Operations:
    - Remove extra whitespace
    - Normalize line breaks
    - Remove page numbers/headers (if present)

    Args:
        text (str): Raw FOMC statement text

    Returns:
        str: Cleaned text
    """
    # Remove common headers/footers patterns
    text = re.sub(r'For (immediate )?release.*?\n', '', text, flags=re.IGNORECASE)
    text = re.sub(r'Page \d+ of \d+', '', text)

    # Normalize whitespace
    text = re.sub(r'\n\s*\n', '\n\n', text)  # Multiple newlines -> double newline
    text = re.sub(r' +', ' ', text)  # Multiple spaces -> single space

    # Remove leading/trailing whitespace
    text = text.strip()

    return text


print("=" * 60)
print("STEP 2: Cleaning Text")
print("=" * 60)

cleaned_statements = {}
for filename, content in raw_statements.items():
    cleaned = clean_fomc_text(content)
    cleaned_statements[filename] = cleaned

    # Show cleaning stats
    original_lines = len(content.split('\n'))
    cleaned_lines = len(cleaned.split('\n'))
    print(f"{filename}: {original_lines} ‚Üí {cleaned_lines} lines")

print(f"\n‚úì Cleaned {len(cleaned_statements)} statements")
print()


# ============================================================
# STEP 3: Sentence Segmentation
# ============================================================

def segment_into_sentences(text):
    """
    Segment FOMC statement into sentences for model inference.

    Uses simple rule-based approach:
    - Split on periods followed by space and capital letter
    - Handle common abbreviations (e.g., "U.S.", "etc.")

    Args:
        text (str): Cleaned FOMC statement text

    Returns:
        list: List of sentence strings
    """
    # Protect common abbreviations
    text = text.replace('U.S.', 'U_S_')
    text = text.replace('e.g.', 'e_g_')
    text = text.replace('i.e.', 'i_e_')

    # Simple sentence split: period + space + capital letter
    sentences = re.split(r'\.\s+(?=[A-Z])', text)

    # Restore abbreviations and clean up
    sentences = [s.replace('U_S_', 'U.S.').replace('e_g_', 'e.g.').replace('i_e_', 'i.e.').strip()
                 for s in sentences]

    # Filter out very short fragments (likely formatting artifacts)
    sentences = [s for s in sentences if len(s) > 20]

    return sentences


print("=" * 60)
print("STEP 3: Sentence Segmentation")
print("=" * 60)

segmented_data = {}

for filename, content in cleaned_statements.items():
    sentences = segment_into_sentences(content)
    segmented_data[filename] = sentences

    # Extract date from filename
    date_match = re.search(r'(\d{4}_\d{2}_\d{2})', filename)
    date_str = date_match.group(1) if date_match else filename

    print(f"{date_str}: {len(sentences)} sentences")

    # Show first sentence as preview
    if sentences:
        preview = sentences[0][:100] + "..." if len(sentences[0]) > 100 else sentences[0]
        print(f"  Preview: {preview}")

print(f"\n‚úì Segmented {len(segmented_data)} statements into sentences")
print()


# ============================================================
# STEP 4: Create Model-Ready DataFrame
# ============================================================

print("=" * 60)
print("STEP 4: Creating Model-Ready Dataset")
print("=" * 60)

# Build dataset with metadata
dataset_rows = []

for filename, sentences in segmented_data.items():
    # Extract date from filename
    date_match = re.search(r'(\d{4})_(\d{2})_(\d{2})', filename)
    if date_match:
        year, month, day = date_match.groups()
        date_str = f"{year}-{month}-{day}"
    else:
        date_str = "unknown"

    for idx, sentence in enumerate(sentences):
        dataset_rows.append({
            'statement_date': date_str,
            'filename': filename,
            'sentence_id': idx,
            'text': sentence,
            'char_length': len(sentence),
            'word_count': len(sentence.split())
        })

# Create DataFrame
df_fomc = pd.DataFrame(dataset_rows)

# Display summary statistics
print(f"Total sentences: {len(df_fomc)}")
print(f"Date range: {df_fomc['statement_date'].min()} to {df_fomc['statement_date'].max()}")
print(f"\nDataset shape: {df_fomc.shape}")
print("\nFirst few rows:")
print(df_fomc.head())

print("\nüìä Text Length Statistics:")
print(df_fomc[['char_length', 'word_count']].describe())


# ============================================================
# STEP 5: Save Processed Data
# ============================================================

print("\n" + "=" * 60)
print("STEP 5: Saving Processed Data")
print("=" * 60)

# Save as CSV
csv_path = PROCESSED_DATA_DIR / "fomc_2024_2025_sentences.csv"
df_fomc.to_csv(csv_path, index=False)
print(f"‚úì Saved CSV: {csv_path}")

# Save as JSON (useful for some NLP workflows)
json_path = PROCESSED_DATA_DIR / "fomc_2024_2025_sentences.json"
df_fomc.to_json(json_path, orient='records', indent=2)
print(f"‚úì Saved JSON: {json_path}")

# Save summary metadata
metadata = {
    'processing_date': datetime.now().isoformat(),
    'num_statements': len(segmented_data),
    'num_sentences': len(df_fomc),
    'date_range': {
        'start': df_fomc['statement_date'].min(),
        'end': df_fomc['statement_date'].max()
    },
    'files_processed': list(raw_statements.keys())
}

metadata_path = PROCESSED_DATA_DIR / "processing_metadata.json"
with open(metadata_path, 'w') as f:
    json.dump(metadata, f, indent=2)
print(f"‚úì Saved metadata: {metadata_path}")


# ============================================================
# STEP 6: Data Quality Checks
# ============================================================

print("\n" + "=" * 60)
print("STEP 6: Data Quality Checks")
print("=" * 60)

# Check 1: No empty sentences
empty_count = (df_fomc['text'].str.len() == 0).sum()
print(f"Empty sentences: {empty_count}")

# Check 2: Date parsing success
unknown_dates = (df_fomc['statement_date'] == 'unknown').sum()
print(f"Unknown dates: {unknown_dates}")

# Check 3: Sentence length distribution
print("\nSentence length (words) distribution:")
print(df_fomc['word_count'].describe())

# Check 4: Look for potential issues
print("\nSample of shortest sentences (potential artifacts):")
print(df_fomc.nsmallest(3, 'word_count')[['statement_date', 'text']])

print("\nSample of longest sentences (check for segmentation errors):")
print(df_fomc.nlargest(3, 'word_count')[['statement_date', 'text', 'word_count']])


# ============================================================
# SUMMARY & NEXT STEPS
# ============================================================

print("\n" + "=" * 60)
print("‚úì DATA COLLECTION & CLEANING COMPLETE")
print("=" * 60)

print(f"""
üìã Summary:
  ‚Ä¢ Processed {len(raw_statements)} FOMC statements from 2024-2025
  ‚Ä¢ Extracted {len(df_fomc)} sentences total
  ‚Ä¢ Output saved to: {PROCESSED_DATA_DIR}

üìÅ Output Files:
  1. fomc_2024_2025_sentences.csv - Main dataset
  2. fomc_2024_2025_sentences.json - JSON format
  3. processing_metadata.json - Processing details

üîç Data Quality:
  ‚Ä¢ Average sentence length: {df_fomc['word_count'].mean():.1f} words
  ‚Ä¢ Min: {df_fomc['word_count'].min()} words
  ‚Ä¢ Max: {df_fomc['word_count'].max()} words

üìù Next Steps:
  1. ‚úì Data collected and cleaned
  2. ‚Üí Proceed to notebooks/2_ood_generalization_cases.ipynb
     Load FOMC-RoBERTa model and run inference on these sentences
  3. ‚Üí Identify failure cases where model misinterprets 2024-2025 narratives
  4. ‚Üí Connect sentiment scores to market data in notebook 3

üí° Notes for FSIL Review:
  ‚Ä¢ This pipeline handles raw Fed website text ‚Üí model-ready sentences
  ‚Ä¢ Segmentation is intentionally simple (rule-based) to avoid introducing bias
  ‚Ä¢ More sophisticated NLP parsing (spaCy, etc.) can be added if needed
  ‚Ä¢ Current approach prioritizes transparency and reproducibility
""")

print("\n" + "=" * 60)
print("Ready for OOD Analysis!")
print("=" * 60)

Notebook 1: Data Collection & Cleaning
Target: 2024-2025 FOMC Statements (OOD Data)

üìÅ Raw data directory: /content/fomc-ood-stress-test/data/raw
üìÅ Processed data directory: /content/fomc-ood-stress-test/data/processed

Found 3 FOMC statement files:
  ‚úì 2024-01-31: 1883 characters
  ‚úì 2024-03-20: 1849 characters
  ‚úì 2024-05-01: 2049 characters

‚úì Loaded 3 statements successfully

STEP 2: Cleaning Text
fomc_2024_01_31_statement.txt: 5 ‚Üí 4 lines
fomc_2024_03_20_statement.txt: 5 ‚Üí 4 lines
fomc_2024_05_01_statement.txt: 5 ‚Üí 4 lines

‚úì Cleaned 3 statements

STEP 3: Sentence Segmentation
2024_01_31: 14 sentences
  Preview: Recent indicators suggest that economic activity has been expanding at a solid pace
2024_03_20: 14 sentences
  Preview: Recent indicators suggest that economic activity has been expanding at a solid pace
2024_05_01: 16 sentences
  Preview: Recent indicators suggest that economic activity has continued to expand at a solid pace

‚úì Segmented 3 stateme