## Step 1: Import Libraries

In [1]:
%run ../make_clean_names.py

In [2]:
import concurrent.futures
from concurrent.futures import ThreadPoolExecutor
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
import logging

import polars as pl
from datetime import datetime, timedelta
from dotenv import load_dotenv
import os

# Load environment variables
load_dotenv()

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def create_session():
    session = requests.Session()
    retries = Retry(
        total=3,
        backoff_factor=1,
        status_forcelist=[429, 500, 502, 503, 504]
    )
    session.mount('https://', HTTPAdapter(max_retries=retries, pool_maxsize=10))
    return session

# Get API key from environment variables
FRED_API_KEY = os.getenv('FRED_API_KEY')
if not FRED_API_KEY:
    raise ValueError("FRED_API_KEY not found in environment variables")

## Step 2: Extract Data from FRED into Polars

In [3]:
# Constants
FRED_API_BASE_URL = "https://api.stlouisfed.org/fred/series/observations"
REQUEST_TIMEOUT = 10

# Series configurations with direct mapping
FRED_SERIES_DESCRIPTIONS = {
    'UNRATE': 'Unemployment Rate',
    'GDP': 'Gross Domestic Product',
    'FPCPITOTLZGUSA': 'Inflation Rate',
    'DFF': 'Federal Funds Rate',
    'DEXUSEU': 'USD/EUR Exchange Rate'
}

start_date = '2020-01-01'
end_date = datetime.today().strftime('%Y-%m-%d')

def fetch_data(series_id: str, api_key: str, session: requests.Session) -> Optional[List[Dict]]:
    """
    Fetch economic data from FRED API for a single series.
    """
    params = {
        "api_key": api_key,
        "series_id": series_id,
        "file_type": "json",
        "sort_order": "desc",
        "observation_start": start_date,
    }
    
    try:
        response = session.get(
            FRED_API_BASE_URL, 
            params=params, 
            timeout=REQUEST_TIMEOUT
        )
        response.raise_for_status()
        
        observations = response.json().get('observations', [])
        description = FRED_SERIES_DESCRIPTIONS.get(series_id, 'Unknown')
        
        for obs in observations:
            obs['series_id'] = series_id
            obs['series_description'] = description
                
        return observations
        
    except requests.exceptions.RequestException as e:
        logger.error(f"Error fetching {series_id}: {str(e)}")
        return None
    
def fetch_all_data(series_ids: List[str], api_key: str) -> List[Dict]:
    """Fetch data for multiple series concurrently"""
    session = requests.Session()
    
    with ThreadPoolExecutor(max_workers=min(10, len(series_ids))) as executor:
        futures = [
            executor.submit(fetch_data, series_id, api_key, session)
            for series_id in series_ids
        ]
        results = []
        for f in concurrent.futures.as_completed(futures):
            if f.result():
                results.extend(f.result())
    
    return results

In [4]:
# Execute fetching and create DataFrame
try:
    data = fetch_all_data(list(FRED_SERIES_DESCRIPTIONS.keys()), FRED_API_KEY)
    if data:
        df = pl.DataFrame(data)
        logger.info(f"Successfully created DataFrame with {len(df)} rows")
        print(df)
    else:
        logger.error("No data received from FRED API")
except Exception as e:
    logger.error(f"Error processing data: {str(e)}")

INFO:__main__:Successfully created DataFrame with 3223 rows


shape: (3_223, 6)
┌────────────────┬──────────────┬────────────┬───────┬───────────┬────────────────────┐
│ realtime_start ┆ realtime_end ┆ date       ┆ value ┆ series_id ┆ series_description │
│ ---            ┆ ---          ┆ ---        ┆ ---   ┆ ---       ┆ ---                │
│ str            ┆ str          ┆ str        ┆ str   ┆ str       ┆ str                │
╞════════════════╪══════════════╪════════════╪═══════╪═══════════╪════════════════════╡
│ 2025-01-08     ┆ 2025-01-08   ┆ 2024-11-01 ┆ 4.2   ┆ UNRATE    ┆ Unemployment Rate  │
│ 2025-01-08     ┆ 2025-01-08   ┆ 2024-10-01 ┆ 4.1   ┆ UNRATE    ┆ Unemployment Rate  │
│ 2025-01-08     ┆ 2025-01-08   ┆ 2024-09-01 ┆ 4.1   ┆ UNRATE    ┆ Unemployment Rate  │
│ 2025-01-08     ┆ 2025-01-08   ┆ 2024-08-01 ┆ 4.2   ┆ UNRATE    ┆ Unemployment Rate  │
│ 2025-01-08     ┆ 2025-01-08   ┆ 2024-07-01 ┆ 4.3   ┆ UNRATE    ┆ Unemployment Rate  │
│ …              ┆ …            ┆ …          ┆ …     ┆ …         ┆ …                  │
│ 2025-01-08  

## Step 4: Clean Column Names

In [5]:
df = make_clean_names(df)

In [6]:
df = df.select("date", "series_id", "series_description", "value")

In [7]:
df

date,series_id,series_description,value
str,str,str,str
"""2024-11-01""","""UNRATE""","""Unemployment Rate""","""4.2"""
"""2024-10-01""","""UNRATE""","""Unemployment Rate""","""4.1"""
"""2024-09-01""","""UNRATE""","""Unemployment Rate""","""4.1"""
"""2024-08-01""","""UNRATE""","""Unemployment Rate""","""4.2"""
"""2024-07-01""","""UNRATE""","""Unemployment Rate""","""4.3"""
…,…,…,…
"""2020-01-05""","""DFF""","""Federal Funds Rate""","""1.55"""
"""2020-01-04""","""DFF""","""Federal Funds Rate""","""1.55"""
"""2020-01-03""","""DFF""","""Federal Funds Rate""","""1.55"""
"""2020-01-02""","""DFF""","""Federal Funds Rate""","""1.55"""


## Step 5: Write Polars to Parquet

In [8]:
output_dir = "../../../data/finance"

# Write DataFrame to Parquet
df.write_parquet(f'{output_dir}/fred_macro_economy.parquet')

## Step 6: Read Parquet (Validate)

In [9]:
pl.scan_parquet(f'{output_dir}/fred_macro_economy.parquet').head().collect()

date,series_id,series_description,value
str,str,str,str
"""2024-11-01""","""UNRATE""","""Unemployment Rate""","""4.2"""
"""2024-10-01""","""UNRATE""","""Unemployment Rate""","""4.1"""
"""2024-09-01""","""UNRATE""","""Unemployment Rate""","""4.1"""
"""2024-08-01""","""UNRATE""","""Unemployment Rate""","""4.2"""
"""2024-07-01""","""UNRATE""","""Unemployment Rate""","""4.3"""
