In [0]:
# FRED Quarterly Pipeline: 03_fred_quarterly_pipeline

import os
import time
from datetime import datetime, timezone

import pandas as pd
import requests
from pyspark.sql.functions import col
from pyspark.sql.types import StructType, StructField, StringType, TimestampType, LongType

# Wall-clock start of the run; the metadata-logging step subtracts this from
# its own timestamp to report the job duration.
start_time = time.time()

# === Config ===
# Prefer a key supplied through the FRED_API_KEY environment variable so the
# credential does not have to live in source control. The literal below is kept
# only as a backward-compatible fallback.
# NOTE(review): this key is committed in plain text -- rotate it and remove the
# fallback once the environment variable (or a secret scope) is configured.
API_KEY = os.environ.get("FRED_API_KEY", "0edfc0525246965667057e6f44062902")

def get_series(series_id, label, timeout=30):
    """Fetch one FRED series and return it as a two-column DataFrame.

    Parameters
    ----------
    series_id : str
        Identifier sent to the API as the ``series_id`` query parameter.
    label : str
        Name of the value column in the returned frame.
    timeout : float or tuple, optional
        Forwarded to ``requests.get``. Without a timeout a stalled connection
        would block the pipeline indefinitely.

    Returns
    -------
    pandas.DataFrame
        Columns ``date`` (datetime64) and ``label`` (numeric; values that
        cannot be parsed as numbers become NaN). When the response carries no
        observations, a warning is printed and an empty frame with the same
        columns and dtypes is returned.
    """
    url = "https://api.stlouisfed.org/fred/series/observations"
    params = {
        "series_id": series_id,
        "api_key": API_KEY,
        "file_type": "json"
    }
    response = requests.get(url, params=params, timeout=timeout)
    data = response.json()

    # Treat a missing key and an empty list alike: selecting ['date', 'value']
    # from a frame built out of [] would raise KeyError.
    if "observations" not in data or not data["observations"]:
        print(f"\u26a0\ufe0f No observations found for series_id: {series_id}")
        print("Response:", data)
        # Typed empty columns keep the dtypes consistent with the normal path.
        return pd.DataFrame({
            "date": pd.Series(dtype="datetime64[ns]"),
            label: pd.Series(dtype="float64"),
        })

    df = pd.DataFrame(data['observations'])[['date', 'value']]
    df['date'] = pd.to_datetime(df['date'])
    # errors='coerce' turns non-numeric placeholders into NaN instead of raising.
    df[label] = pd.to_numeric(df['value'], errors='coerce')
    return df[['date', label]]

# === Indicators (Quarterly) ===
# Maps the output column name to the FRED series id that feeds it.
quarterly_indicators = {
    "GDP": "GDPC1",
    "CorporateProfits": "CP",
    "GrossPrivateDomesticInvestment": "GPDI",
}

# === Build df_quarterly ===
# Fetch the series one at a time and outer-join them on "date", so a date that
# appears in any one series is kept in the combined frame.
df_quarterly = None
for label, series_id in quarterly_indicators.items():
    df = get_series(series_id, label)
    if df_quarterly is None:
        # The first series fetched seeds the combined frame.
        df_quarterly = df
    else:
        df_quarterly = df_quarterly.merge(df, how="outer", on="date")

# === Clean ===
def clean_fred_df(df: pd.DataFrame, value_columns: list) -> pd.DataFrame:
    """Return a cleaned, date-sorted copy of *df*; the input is not modified.

    The ``date`` column is parsed to datetimes and every column named in
    *value_columns* is converted to numeric, with unparseable entries coerced
    to NaT/NaN. Rows in which all of *value_columns* are missing are dropped,
    and the result is sorted by ``date`` with a fresh 0..n-1 index.
    """
    cleaned = df.copy()
    cleaned['date'] = pd.to_datetime(cleaned['date'], errors='coerce')
    for column_name in value_columns:
        cleaned[column_name] = pd.to_numeric(cleaned[column_name], errors='coerce')
    return (
        cleaned.dropna(subset=value_columns, how='all')
        .sort_values("date")
        .reset_index(drop=True)
    )

# Value columns are exactly the labels declared in quarterly_indicators.
quarterly_cols = list(quarterly_indicators)
df_quarterly = clean_fred_df(df_quarterly, quarterly_cols)

# === Enrich ===
# GDP-derived features: change versus the previous row, change versus four
# rows back, and a trailing four-row mean.
gdp_values = df_quarterly['GDP']
df_quarterly['GDP_QoQ_Growth'] = gdp_values.pct_change()
df_quarterly['GDP_YoY_Growth'] = gdp_values.pct_change(periods=4)
df_quarterly['GDP_MA_4'] = gdp_values.rolling(window=4).mean()

# Calendar parts of the observation date.
observation_dates = df_quarterly['date'].dt
df_quarterly['Year'] = observation_dates.year
df_quarterly['Quarter'] = observation_dates.quarter

# === Convert to Spark & Save with Schema Merge ===
# Turn on Delta schema auto-merge for the session, then replace the contents of
# the quarterly table with the freshly built frame.
spark.conf.set("spark.databricks.delta.schema.autoMerge.enabled", "true")
spark_df_quarterly = spark.createDataFrame(df_quarterly)
(
    spark_df_quarterly.write
    .format("delta")
    .mode("overwrite")
    .option("mergeSchema", "true")
    .save("/mnt/datalake/fred/quarterly")
)

# === Metadata Logging ===
# Record one row describing this run (job, target table, timestamp, size,
# status, duration, frequency) in the shared job-metadata Delta table.
end_time = time.time()
duration = int(end_time - start_time)
# spark_df_quarterly was created directly from df_quarterly, so the pandas
# length is the row count; this avoids launching a Spark job just to count.
row_count = len(df_quarterly)

log_schema = StructType([
    StructField("job_name", StringType(), True),
    StructField("table_name", StringType(), True),
    StructField("run_date", TimestampType(), True),
    StructField("row_count", LongType(), True),
    StructField("status", StringType(), True),
    StructField("duration_sec", LongType(), True),
    StructField("frequency", StringType(), True)
])

log_data = [(
    "fred_quarterly_pipeline",
    "fact_macro_quarterly",
    # Timezone-aware UTC timestamp: datetime.utcnow() is deprecated and returns
    # a naive value, which PySpark interprets in the driver's local time zone.
    datetime.now(timezone.utc),
    row_count,
    "success",
    duration,
    "quarterly"
)]

log_df = spark.createDataFrame(log_data, schema=log_schema)

# Append rather than overwrite: overwriting would erase every previously
# logged run (including rows in other "frequency" partitions) on each run.
(
    log_df.write
    .format("delta")
    .mode("append")
    .partitionBy("frequency")
    .save("/mnt/datalake/fred/logs/job_metadata")
)
    