In [0]:
# --- Credentials -----------------------------------------------------------
# Every secret is read from one Databricks secret scope so that no credential
# value is ever written into the notebook itself.
secret_scope_name = "fmdp-secrets"
client_id = dbutils.secrets.get(
    scope=secret_scope_name, key="fmdp-databricks-sp-client-id"
)
client_secret = dbutils.secrets.get(
    scope=secret_scope_name, key="fmdp-databricks-sp-client-secret"
)
tenant_id = dbutils.secrets.get(scope=secret_scope_name, key="tenant-id")
alpha_vantage_api_key = dbutils.secrets.get(
    scope=secret_scope_name, key="fmdp-alpha-vantage-api-key"
)

# --- Storage locations -----------------------------------------------------
# The bronze/silver/gold folders all live in the "financial-data" container
# of the same storage account; only the trailing folder differs.
storage_account_name = "fmdpstg2"
_lake_root = f"abfss://financial-data@{storage_account_name}.dfs.core.windows.net"
bronze_path = f"{_lake_root}/bronze"
silver_path = f"{_lake_root}/silver"
gold_path = f"{_lake_root}/gold"

In [0]:
# ABFS OAuth (client-credentials) settings. Each key is suffixed with the
# storage account's DFS host name, so the host is built once and reused.
_abfs_host = f"{storage_account_name}.dfs.core.windows.net"

configs = {
    f"fs.azure.account.auth.type.{_abfs_host}": "OAuth",
    f"fs.azure.account.oauth.provider.type.{_abfs_host}": "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
    f"fs.azure.account.oauth2.client.id.{_abfs_host}": client_id,
    f"fs.azure.account.oauth2.client.secret.{_abfs_host}": client_secret,
    f"fs.azure.account.oauth2.client.endpoint.{_abfs_host}": f"https://login.microsoftonline.com/{tenant_id}/oauth2/token",
}

# Apply every setting to the active Spark session.
for conf_key, conf_value in configs.items():
    spark.conf.set(conf_key, conf_value)

In [0]:
from pyspark.sql.functions import current_timestamp, current_date, col, lit, to_date
import requests
import json
import time
from datetime import datetime
import hashlib

In [0]:
def av_time_series_daily_src_to_brz(symbol, outputsize="compact"):
    """
    Incrementally ingest TIME_SERIES_DAILY data from Alpha Vantage API to bronze layer.

    Each API response is stored as a single bronze row holding the full raw JSON
    plus ingestion metadata. If a row with the same symbol, response hash and
    last-refreshed value already exists in the bronze table, nothing is written.

    Any error text printed by this function has the Alpha Vantage API key
    replaced with "[REDACTED]", because the key is part of the request URL and
    request errors embed that URL in their message.

    Args:
        symbol (str): Stock ticker symbol
        outputsize (str): 'compact' (last 100 data points) or 'full' (20+ years of data)

    Returns:
        bool: True if the data was appended or was already present unchanged;
            False on an API error payload, rate limiting, an unexpected payload
            shape, or any exception raised while fetching or writing.
    """
    import sys
    import traceback

    # AnalysisException is exposed from pyspark.errors on newer PySpark and from
    # pyspark.sql.utils on older releases; support both.
    try:
        from pyspark.errors import AnalysisException
    except ImportError:
        from pyspark.sql.utils import AnalysisException

    def _redact(text):
        # Guard against an empty key: str.replace("", x) would mangle the text.
        if alpha_vantage_api_key:
            return text.replace(alpha_vantage_api_key, "[REDACTED]")
        return text

    function = "TIME_SERIES_DAILY"

    # Read the clock once so batch_id and ingestion_date can never disagree
    # (two separate now() calls could straddle midnight).
    run_ts = datetime.now()
    batch_id = run_ts.strftime('%Y%m%d%H%M%S')
    ingestion_date = run_ts.strftime('%Y-%m-%d')

    # Let requests build and URL-encode the query string instead of
    # interpolating raw values into the URL.
    endpoint = "https://www.alphavantage.co/query"
    query_params = {
        "function": function,
        "symbol": symbol,
        "outputsize": outputsize,
        "apikey": alpha_vantage_api_key,
    }

    try:
        # Make API request with retry logic: up to 3 attempts, 5 s apart
        for attempt in range(3):
            try:
                response = requests.get(endpoint, params=query_params, timeout=30)
                response.raise_for_status()
                raw_data = response.json()
                break
            except requests.exceptions.RequestException:
                if attempt == 2:  # Last attempt - let the outer handler report it
                    raise
                print(f"Retrying request for {symbol} (attempt {attempt+1}/3)")
                time.sleep(5)  # Wait before retry

        # Check for errors or rate limiting reported inside a 200 response
        if "Error Message" in raw_data:
            print(f"API Error for {symbol}: {raw_data['Error Message']}")
            return False

        if "Note" in raw_data and "API call frequency" in raw_data["Note"]:
            print(f"Rate limited for {symbol}: {raw_data['Note']}")
            time.sleep(15)  # Back off before the caller moves on; no retry here
            return False

        # Verify data structure
        if "Meta Data" not in raw_data or "Time Series (Daily)" not in raw_data:
            print(f"Unexpected data structure for {symbol}")
            return False

        meta = raw_data["Meta Data"]

        # Get last refreshed date from metadata (falls back to today's date)
        last_refreshed = meta.get("3. Last Refreshed", ingestion_date)

        # Hash of the key-sorted JSON, used for change detection
        response_hash = hashlib.md5(
            json.dumps(raw_data, sort_keys=True).encode()
        ).hexdigest()

        # Bronze table path
        bronze_table = f"{bronze_path}/brz_av_time_series_daily"

        # Check if we already have this exact data. Only an AnalysisException
        # is treated as "nothing to compare against yet"; any other failure
        # (auth, network, ...) propagates to the outer handler so that we do
        # not append blindly when the check itself could not run.
        try:
            already_ingested = (
                spark.read.format("delta").load(bronze_table)
                .filter(
                    (col("symbol") == symbol) &
                    (col("response_hash") == response_hash) &
                    (col("last_refreshed") == last_refreshed)
                )
                .limit(1)  # existence is enough; no need to count every match
                .count()
            ) > 0
        except AnalysisException:
            # Table doesn't exist yet - first run
            already_ingested = False

        if already_ingested:
            print(f"Skipping {symbol}: Data unchanged since last ingestion")
            return True

        # Create a one-row DataFrame: metadata columns plus the raw payload
        df_brz = spark.createDataFrame([
            (
                symbol,                             # Symbol
                batch_id,                           # Batch ID
                outputsize,                         # Data volume requested
                last_refreshed,                     # Date of last data point
                response_hash,                      # Hash for change detection
                meta.get("1. Information", ""),     # API info
                meta.get("2. Symbol", symbol),      # Symbol from API
                json.dumps(raw_data)                # Full raw JSON
            )
        ], ["symbol", "batch_id", "outputsize", "last_refreshed", "response_hash",
            "information", "api_symbol", "raw_data"])

        # Add ingestion metadata
        df_brz = df_brz.withColumn("ingestion_timestamp", current_timestamp())
        df_brz = df_brz.withColumn("ingestion_date", to_date(lit(ingestion_date)))

        # Append to the Delta table, partitioned by symbol and ingestion date
        df_brz.write \
            .format("delta") \
            .mode("append") \
            .option("mergeSchema", "true") \
            .partitionBy("symbol", "ingestion_date") \
            .save(bronze_table)

        print(f"Successfully ingested {symbol} TIME_SERIES_DAILY data (batch {batch_id})")
        return True

    except Exception as e:
        # Request errors embed the full URL (including apikey) in their
        # message, so scrub both the summary line and the traceback.
        print(f"Error ingesting {symbol}: {_redact(str(e))}")
        print(_redact(traceback.format_exc()), file=sys.stderr)
        return False

In [0]:
symbols = ["AAPL", "MSFT", "AMZN", "META", "NVDA", "TSLA", "GOOGL", "QQQ", "SPY"]
function = "TIME_SERIES_DAILY"

# Pacing used to stay under the free-tier request limit: a long pause after
# every `requests_per_window` calls, a short gap between the others.
requests_per_window = 5
window_pause_seconds = 60
request_gap_seconds = 5

successful_symbols = []
failed_symbols = []

# Add timestamp for logging
run_start_time = datetime.now()
print(f"Starting data ingestion at {run_start_time}")

for i, symbol in enumerate(symbols):
    print(f"Processing {i+1}/{len(symbols)}: {symbol}")

    try:
        success = av_time_series_daily_src_to_brz(symbol)
    except Exception as e:
        print(f"Error processing {symbol}: {str(e)}")
        success = False

    if success:
        successful_symbols.append(symbol)
    else:
        failed_symbols.append(symbol)

    # Nothing follows the last symbol, so there is nothing to throttle for;
    # skipping the sleep also keeps the reported duration honest.
    if i == len(symbols) - 1:
        break

    if (i + 1) % requests_per_window == 0:
        # Pause after each full window of requests to avoid hitting the free tier limit
        print(f"API limit reached, sleeping for {window_pause_seconds} seconds... ({i+1}/{len(symbols)} completed)")
        time.sleep(window_pause_seconds)
    else:
        # Small delay between individual requests
        time.sleep(request_gap_seconds)

# Summarize results
run_end_time = datetime.now()
duration = (run_end_time - run_start_time).total_seconds()

print("\n--- Ingestion Summary ---")
print(f"Run completed at: {run_end_time}")
print(f"Total duration: {duration:.2f} seconds")
print(f"Successful: {len(successful_symbols)}/{len(symbols)} ({', '.join(successful_symbols) if successful_symbols else 'None'})")
print(f"Failed: {len(failed_symbols)}/{len(symbols)} ({', '.join(failed_symbols) if failed_symbols else 'None'})")