In [0]:
# FRED Daily Pipeline: 01_fred_daily_pipeline

import os
import time
from datetime import datetime
from datetime import timezone

import pandas as pd
import requests
from pyspark.sql.functions import col, lit
from pyspark.sql.types import StructType, StructField, StringType, TimestampType, LongType

# Wall-clock start of the run; used later to compute the logged duration.
start_time = time.time()

# === Config ===
# Prefer a key injected through the FRED_API_KEY environment variable so the
# credential does not have to live in source control.
# SECURITY: the literal fallback below is a credential committed in plain
# text. It is kept only so existing runs keep working; rotate it and delete
# the fallback once the environment variable (or a secret scope) is set up.
API_KEY = os.environ.get("FRED_API_KEY") or "0edfc0525246965667057e6f44062902"

def get_series(series_id, label, timeout=30):
    """Fetch one FRED series and return it as a two-column DataFrame.

    Args:
        series_id: FRED series identifier, sent as the ``series_id`` query
            parameter.
        label: Name given to the numeric value column in the result.
        timeout: Seconds to wait on the HTTP request before giving up, so a
            stalled connection cannot hang the pipeline indefinitely.

    Returns:
        DataFrame with a datetime ``date`` column and a numeric ``label``
        column. Values that cannot be parsed as numbers become NaN.

    Raises:
        requests.HTTPError: If the API answers with an error status code.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
        KeyError: If the JSON payload has no ``observations`` entry.
    """
    url = "https://api.stlouisfed.org/fred/series/observations"
    params = {
        "series_id": series_id,
        "api_key": API_KEY,
        "file_type": "json"
    }
    response = requests.get(url, params=params, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of tripping over a missing key below.
    response.raise_for_status()
    observations = response.json()['observations']
    # Passing `columns` keeps only the two fields we need and still yields a
    # well-formed (empty) frame when the series has no observations.
    df = pd.DataFrame(observations, columns=['date', 'value'])
    df['date'] = pd.to_datetime(df['date'])
    df[label] = pd.to_numeric(df['value'], errors='coerce')
    return df[['date', label]]

# === Indicators (Daily) ===
# Column label -> FRED series id. Every entry must be a daily-frequency
# series: the frames are inner-joined on date, so a monthly series (FEDFUNDS
# and GS10 are the monthly versions of the two rates) would reduce the result
# to the few first-of-month dates that are also oil trading days.
daily_indicators = {
    "FederalFundsRate": "DFF",
    "OilPriceWTI": "DCOILWTICO",
    "10YrTreasury": "DGS10",
}

# === Build df_daily ===
# Download every indicator first, then fold the frames together, keeping only
# the dates that are present in all of them.
indicator_frames = [
    get_series(fred_id, column_name)
    for column_name, fred_id in daily_indicators.items()
]
df_daily = indicator_frames[0] if indicator_frames else None
for indicator_frame in indicator_frames[1:]:
    df_daily = df_daily.merge(indicator_frame, on="date", how="inner")

# === Clean ===
def clean_fred_df(df: pd.DataFrame, value_columns: list) -> pd.DataFrame:
    """Return a cleaned, date-sorted copy of a FRED observation frame.

    The input is left untouched. ``date`` is parsed to datetimes and each
    column in ``value_columns`` to numbers; unparseable entries become
    NaT/NaN. Rows where all value columns are NaN are dropped, and the
    result is ordered by ``date`` with a fresh 0..n-1 index.
    """
    cleaned = df.copy()
    cleaned['date'] = pd.to_datetime(cleaned['date'], errors='coerce')
    numeric_values = {
        name: pd.to_numeric(cleaned[name], errors='coerce')
        for name in value_columns
    }
    return (
        cleaned.assign(**numeric_values)
        .dropna(subset=value_columns, how='all')
        .sort_values("date")
        .reset_index(drop=True)
    )

# Every indicator label doubles as a value column to be cleaned.
daily_cols = [*daily_indicators]
df_daily = clean_fred_df(df=df_daily, value_columns=daily_cols)

# === Enrich ===
# Handles on the three base series; every derived column is built from these.
oil_price = df_daily['OilPriceWTI']
treasury_10y = df_daily['10YrTreasury']
fed_funds = df_daily['FederalFundsRate']

# Row-over-row moves and the long/short rate spread.
df_daily['OilPctChange'] = oil_price.pct_change()
df_daily['10YrPctChange'] = treasury_10y.pct_change()
df_daily['FedFundsChange'] = fed_funds.diff()
df_daily['RateSpread'] = treasury_10y - fed_funds

# Rolling statistics; window sizes count rows, not calendar days.
oil_window_7 = oil_price.rolling(7)
df_daily['Oil_MA_7'] = oil_window_7.mean()
df_daily['FedFunds_MA_14'] = fed_funds.rolling(14).mean()
df_daily['OilVolatility_7d'] = oil_window_7.std()

# Direction flags; a NaN change compares as False.
df_daily['IsRateHike'] = df_daily['FedFundsChange'] > 0
df_daily['IsOilRising'] = df_daily['OilPctChange'] > 0

# Calendar parts of the observation date (Weekday: Monday=0).
observation_date = df_daily['date'].dt
df_daily['Year'] = observation_date.year
df_daily['Month'] = observation_date.month
df_daily['Weekday'] = observation_date.dayofweek

# === Convert to Spark & Save with Schema Merge ===
# Enable Delta schema auto-merge for the session before writing.
spark.conf.set("spark.databricks.delta.schema.autoMerge.enabled", "true")
spark_df_daily = spark.createDataFrame(df_daily)

# Replace the stored daily table with this run's result.
daily_writer = (
    spark_df_daily.write
    .format("delta")
    .mode("overwrite")
    .option("mergeSchema", "true")
)
daily_writer.save("/mnt/datalake/fred/daily")

# === Metadata Logging ===
end_time = time.time()
# Whole seconds elapsed since start_time was captured at the top of the run.
duration = int(end_time - start_time)
# The Spark frame is built row-for-row from df_daily, so the pandas length is
# the row count; this avoids launching a Spark job just to count.
row_count = len(df_daily)

# One record describing this run; field order must match log_schema.
log_data = [(
    "fred_daily_pipeline",
    "fact_macro_daily",
    # Timezone-aware UTC instant. A naive datetime.utcnow() value is read by
    # PySpark as driver-local time, which shifts run_date on any driver whose
    # timezone is not UTC (utcnow() is also deprecated since Python 3.12).
    datetime.now(timezone.utc),
    row_count,
    "success",
    duration,
    "daily",
)]

# Layout of the job-metadata log table; every column is nullable.
log_schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("job_name", StringType()),
        ("table_name", StringType()),
        ("run_date", TimestampType()),
        ("row_count", LongType()),
        ("status", StringType()),
        ("duration_sec", LongType()),
        ("frequency", StringType()),
    )
])

log_df = spark.createDataFrame(log_data, schema=log_schema)

# Append this run's record to the metadata log, partitioned by frequency.
log_writer = (
    log_df.write
    .format("delta")
    .mode("append")
    .partitionBy("frequency")
)
log_writer.save("/mnt/datalake/fred/logs/job_metadata")