In [0]:
# Credentials are fetched from the secret scope in the order the target names
# are listed on the left-hand side.
secret_scope_name = "fmdp-secrets"
client_id, client_secret, tenant_id, alpha_vantage_api_key = (
    dbutils.secrets.get(scope=secret_scope_name, key=secret_key)
    for secret_key in (
        "fmdp-databricks-sp-client-id",
        "fmdp-databricks-sp-client-secret",
        "tenant-id",
        "fmdp-alpha-vantage-api-key",
    )
)

# Root locations of the three layers inside the "financial-data" container.
storage_account_name = "fmdpstg2"
bronze_path = f"abfss://financial-data@{storage_account_name}.dfs.core.windows.net/bronze"
silver_path = f"abfss://financial-data@{storage_account_name}.dfs.core.windows.net/silver"
gold_path = f"abfss://financial-data@{storage_account_name}.dfs.core.windows.net/gold"

In [0]:
# OAuth client-credentials settings for the storage account, keyed by the
# account's dfs endpoint.
configs = {
    f"fs.azure.account.auth.type.{storage_account_name}.dfs.core.windows.net": "OAuth",
    f"fs.azure.account.oauth.provider.type.{storage_account_name}.dfs.core.windows.net": "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
    f"fs.azure.account.oauth2.client.id.{storage_account_name}.dfs.core.windows.net": client_id,
    f"fs.azure.account.oauth2.client.secret.{storage_account_name}.dfs.core.windows.net": client_secret,
    f"fs.azure.account.oauth2.client.endpoint.{storage_account_name}.dfs.core.windows.net": f"https://login.microsoftonline.com/{tenant_id}/oauth2/token",
}

# Register every setting on the active Spark session.
for conf_key, conf_value in configs.items():
    spark.conf.set(conf_key, conf_value)

In [0]:
from pyspark.sql.functions import current_timestamp, current_date, col, lit, to_date, row_number, from_json, schema_of_json, explode, map_keys
from pyspark.sql.types import StructType, StructField, StringType, DateType, DoubleType, LongType, BooleanType, TimestampType
from pyspark.sql.window import Window
import requests
import json
import time
from datetime import datetime
import hashlib

In [0]:
def _parse_optional_number(raw_value, caster):
    """Cast a raw API field with ``caster``; missing or empty values become None."""
    return caster(raw_value) if raw_value else None


def _sql_string_literal(value):
    """Quote ``value`` as a Spark SQL string literal, escaping backslashes and quotes."""
    escaped = str(value).replace("\\", "\\\\").replace("'", "\\'")
    return f"'{escaped}'"


def _build_daily_rows(symbol, batch_id, time_series_data, processing_date, processing_timestamp):
    """Turn one symbol's ``{date: {field: value}}`` mapping into silver-schema tuples.

    A date whose values cannot be parsed is logged and skipped; the remaining
    dates of the symbol are still returned.
    """
    rows = []
    for date_str, daily_data in time_series_data.items():
        try:
            open_price = _parse_optional_number(daily_data.get("1. open"), float)
            high_price = _parse_optional_number(daily_data.get("2. high"), float)
            low_price = _parse_optional_number(daily_data.get("3. low"), float)
            close_price = _parse_optional_number(daily_data.get("4. close"), float)
            volume = _parse_optional_number(daily_data.get("5. volume"), int)

            # A record is valid only when every price field and the volume are present.
            is_valid = None not in (open_price, high_price, low_price, close_price, volume)

            rows.append((
                symbol,                                          # symbol
                datetime.strptime(date_str, "%Y-%m-%d").date(),  # date
                open_price,                                      # open
                high_price,                                      # high
                low_price,                                       # low
                close_price,                                     # close
                volume,                                          # volume
                batch_id,                                        # source_batch_id
                is_valid,                                        # is_valid
                processing_date,                                 # processing_date
                processing_timestamp,                            # processing_timestamp
            ))
        except Exception as e:
            print(f"Error processing {symbol} for date {date_str}: {str(e)}")
    return rows


def av_time_series_daily_brz_to_sil(symbols=None, start_date=None, end_date=None):
    """
    Transform TIME_SERIES_DAILY data from bronze to silver layer.

    For every symbol the most recent bronze snapshot (by ``ingestion_timestamp``)
    is parsed from its ``raw_data`` JSON into one silver row per trading date.

    Write semantics:
        * No filters: the silver table is fully overwritten (schema included).
        * Any filter given: only the partitions of the symbols written by this
          run are replaced (Delta ``replaceWhere``), so silver data of symbols
          outside the run is preserved.

    Args:
        symbols (list, optional): List of stock symbols to process. If None, process all symbols.
        start_date (str, optional): Lower bound on bronze ``ingestion_date`` in YYYY-MM-DD format
        end_date (str, optional): Upper bound on bronze ``ingestion_date`` in YYYY-MM-DD format

    Returns:
        bool: True if transformation successful (including "nothing to do"), False otherwise
    """
    try:
        # Aliased so Spark's aggregates do not shadow the Python builtins.
        from pyspark.sql.functions import min as min_func, max as max_func

        # Define table paths
        bronze_table = f"{bronze_path}/brz_av_time_series_daily"
        silver_table = f"{silver_path}/sil_av_time_series_daily"

        # Create silver directory if it doesn't exist
        dbutils.fs.mkdirs(silver_path)

        # Start time for performance tracking; also stamped on every row so the
        # whole batch shares one processing date/timestamp.
        start_time = datetime.now()
        print(f"Starting silver transformation at {start_time}")
        processing_timestamp = start_time
        processing_date = start_time.date()

        # Define explicit schema for silver layer
        silver_schema = StructType([
            StructField("symbol", StringType(), False),
            StructField("date", DateType(), False),
            StructField("open", DoubleType(), True),
            StructField("high", DoubleType(), True),
            StructField("low", DoubleType(), True),
            StructField("close", DoubleType(), True),
            StructField("volume", LongType(), True),
            StructField("source_batch_id", StringType(), True),
            StructField("is_valid", BooleanType(), False),
            StructField("processing_date", DateType(), False),
            StructField("processing_timestamp", TimestampType(), False)
        ])

        # Read from bronze layer
        print("Reading from bronze layer...")
        bronze_df = spark.read.format("delta").load(bronze_table)

        # Filter by symbols if provided
        if symbols:
            bronze_df = bronze_df.filter(col("symbol").isin(symbols))

        # Filter by ingestion date if provided
        if start_date:
            bronze_df = bronze_df.filter(col("ingestion_date") >= start_date)
        if end_date:
            bronze_df = bronze_df.filter(col("ingestion_date") <= end_date)

        # Get latest data for each symbol to avoid processing old snapshots
        print("Identifying latest data for each symbol...")
        window_spec = Window.partitionBy("symbol").orderBy(col("ingestion_timestamp").desc())
        latest_df = bronze_df.withColumn("row_num", row_number().over(window_spec)) \
                            .filter(col("row_num") == 1) \
                            .drop("row_num")

        # The window keeps exactly one row per symbol, so the collected row
        # count equals the number of symbols being processed.
        snapshot_rows = latest_df.collect()
        symbol_count = len(snapshot_rows)
        if symbol_count == 0:
            print("No data to process. Exiting.")
            return True

        print(f"Processing latest data for {symbol_count} symbols")

        # Accumulate rows of all symbols and build a single DataFrame afterwards
        # instead of chaining one union per symbol.
        all_rows = []
        written_symbols = []
        for snapshot in snapshot_rows:
            symbol = snapshot.symbol
            raw_data = json.loads(snapshot.raw_data)

            print(f"Processing symbol: {symbol}")

            # Extract time series data
            time_series_data = raw_data.get("Time Series (Daily)", {})

            symbol_rows = _build_daily_rows(
                symbol, snapshot.batch_id, time_series_data,
                processing_date, processing_timestamp,
            )
            if symbol_rows:
                all_rows.extend(symbol_rows)
                written_symbols.append(symbol)

        # Count the records before writing
        total_records = len(all_rows)
        print(f"Processed {total_records} records for {symbol_count} symbols")

        if total_records == 0:
            print("No records to write. Exiting.")
            return True

        silver_df = spark.createDataFrame(all_rows, silver_schema)

        # Write to silver layer
        print("Writing to silver layer...")

        writer = silver_df.write \
            .format("delta") \
            .mode("overwrite") \
            .partitionBy("symbol")

        if symbols or start_date or end_date:
            # Filtered run: replace only the partitions of the symbols written
            # here; a plain overwrite would erase every other symbol's data.
            symbol_literals = ", ".join(_sql_string_literal(s) for s in written_symbols)
            writer = writer.option("replaceWhere", f"symbol IN ({symbol_literals})")
        else:
            # Full load: replace the whole table, including its schema.
            writer = writer.option("overwriteSchema", "true")

        writer.save(silver_table)

        print("Successfully wrote data to silver layer")

        # Calculate statistics for reporting
        end_time = datetime.now()
        duration = (end_time - start_time).total_seconds()

        # Get record counts
        valid_records = silver_df.filter(col("is_valid") == True).count()
        invalid_records = total_records - valid_records

        # Date range in a single aggregation pass
        date_min, date_max = silver_df.agg(min_func("date"), max_func("date")).collect()[0]

        # Print summary
        print("\n--- Silver Transformation Summary ---")
        print(f"Completed at: {end_time}")
        print(f"Duration: {duration:.2f} seconds")
        print(f"Symbols processed: {symbol_count}")
        print(f"Total records: {total_records}")
        print(f"Valid records: {valid_records}")
        print(f"Invalid records: {invalid_records}")
        print(f"Date range: {date_min} to {date_max}")

        return True

    except Exception as e:
        print(f"Error in silver transformation: {str(e)}")
        import traceback
        traceback.print_exc()
        return False

In [0]:
def process_all_bronze_to_silver():
    """
    Process all data from the bronze layer to the silver layer for TIME_SERIES_DAILY.
    Handles all available symbols and dates without filtering, then reads the
    silver table back and prints a summary (totals, date range, per-symbol counts).

    Returns:
        bool: True if the transformation and verification succeeded, False otherwise
    """
    try:
        # Aliased so Spark's aggregates do not shadow the Python builtins.
        from pyspark.sql.functions import min as min_func, max as max_func

        print("Starting full bronze to silver transformation for all historical data...")

        # Read the bronze table to get all unique symbols
        bronze_table = f"{bronze_path}/brz_av_time_series_daily"
        bronze_df = spark.read.format("delta").load(bronze_table)
        all_symbols = [row.symbol for row in bronze_df.select("symbol").distinct().collect()]

        symbol_count = len(all_symbols)
        # str() keeps the log line from failing if bronze holds a null symbol.
        symbol_list = ', '.join(str(s) for s in all_symbols)
        print(f"Found {symbol_count} symbols in bronze layer: {symbol_list}")

        # Call the transformation function with no filters to process everything
        if not av_time_series_daily_brz_to_sil():
            print("Transformation failed. Check logs for details.")
            return False

        # Verify the data in silver layer
        silver_table = f"{silver_path}/sil_av_time_series_daily"
        silver_df = spark.read.format("delta").load(silver_table)

        total_records = silver_df.count()
        symbols_processed = silver_df.select("symbol").distinct().count()
        min_date, max_date = silver_df.agg(min_func("date"), max_func("date")).collect()[0]

        print("\n--- Silver Layer Data Summary ---")
        print(f"Total symbols: {symbols_processed}/{symbol_count}")
        print(f"Total records: {total_records}")
        print(f"Date range: {min_date} to {max_date}")

        # Print record counts by symbol
        print("\nRecords by symbol:")
        symbol_counts = silver_df.groupBy("symbol").count().orderBy("symbol")
        for row in symbol_counts.collect():
            # Row subclasses tuple, so `row.count` is the tuple method, not the
            # "count" column; the column must be read by key.
            print(f"  {row['symbol']}: {row['count']} records")

        return True

    except Exception as e:
        print(f"Error in full bronze to silver processing: {str(e)}")
        import traceback
        traceback.print_exc()
        return False

In [0]:
# Run the unfiltered bronze -> silver load; the call returns True on success
# and False on failure.
process_all_bronze_to_silver()