In [0]:
# Databricks secret scope that every credential below is read from.
secret_scope_name = "fmdp-secrets"

# Client id / client secret / tenant id: consumed by the OAuth storage
# configuration that is applied to the Spark session in the next cell.
client_id = dbutils.secrets.get(
    scope=secret_scope_name,
    key="fmdp-databricks-sp-client-id",
)
client_secret = dbutils.secrets.get(
    scope=secret_scope_name,
    key="fmdp-databricks-sp-client-secret",
)
tenant_id = dbutils.secrets.get(
    scope=secret_scope_name,
    key="tenant-id",
)

# Alpha Vantage API key (not referenced by the cells visible in this notebook).
alpha_vantage_api_key = dbutils.secrets.get(
    scope=secret_scope_name,
    key="fmdp-alpha-vantage-api-key",
)

# Storage layout: one "financial-data" container with a folder per
# medallion layer (bronze / silver / gold).
storage_account_name = "fmdpstg2"
container_root = "abfss://financial-data@" + storage_account_name + ".dfs.core.windows.net"
bronze_path = container_root + "/bronze"
silver_path = container_root + "/silver"
gold_path = container_root + "/gold"

In [0]:
# Host name of the storage account; every ABFS setting below is scoped to it
# by appending this host to the setting name.
account_host = f"{storage_account_name}.dfs.core.windows.net"

# OAuth settings (client-credentials token provider) keyed by their
# un-suffixed Hadoop setting name.
oauth_settings = {
    "fs.azure.account.auth.type": "OAuth",
    "fs.azure.account.oauth.provider.type": "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
    "fs.azure.account.oauth2.client.id": client_id,
    "fs.azure.account.oauth2.client.secret": client_secret,
    "fs.azure.account.oauth2.client.endpoint": f"https://login.microsoftonline.com/{tenant_id}/oauth2/token",
}

# Final per-account settings: "<setting name>.<account host>" -> value.
configs = {
    f"{setting}.{account_host}": value
    for setting, value in oauth_settings.items()
}

# Apply every setting to the active Spark session.
for k, v in configs.items():
    spark.conf.set(k, v)

In [0]:
from pyspark.sql.functions import current_timestamp, current_date, col, lit, to_date, lag, avg, stddev, first, lit, min, max, datediff, when, abs, row_number, expr
from pyspark.sql.window import Window
import requests
import json
import time
from datetime import datetime
import hashlib

In [0]:
def av_time_series_daily_sil_to_gld(benchmark_symbol="SPY"):
    """
    Transform silver stock data to gold layer with financial metrics.

    Reads the Delta table ``{silver_path}/sil_av_time_series_daily``, adds
    per-symbol metrics ordered by ``date`` (daily return, 5/20/50/200-row
    moving averages, 5/20-row return volatility, close normalized to the
    first close = 100, a moving-average crossover signal and the return
    relative to a benchmark symbol) and overwrites the Delta table
    ``{gold_path}/gld_av_time_series_daily`` (schema overwrite enabled).

    ``silver_path``, ``gold_path`` and ``spark`` are NOT parameters: they are
    read from the notebook-level variables of the same name at call time.

    Args:
        benchmark_symbol (str): Symbol whose ``daily_return_pct`` is used as
            the market baseline for ``relative_strength``. Defaults to "SPY".
            The joined baseline column is always named ``spy_return`` so the
            gold schema does not depend on this argument.

    Returns:
        bool: Always True once the write and the follow-up count completed.
    """
    print("Starting silver to gold transformation...")

    # Load silver data
    silver_table = f"{silver_path}/sil_av_time_series_daily"
    df_sil = spark.read.format("delta").load(silver_table)

    # One aggregation yields both reporting figures: the number of groups is
    # the number of symbols and the per-group counts add up to the row total,
    # so no second full scan (df_sil.count()) is needed.
    symbol_counts = df_sil.groupBy("symbol").count().collect()
    symbol_count = len(symbol_counts)
    # Explicit loop instead of sum(): this notebook imports min/max/abs from
    # pyspark.sql.functions over the builtins, so builtin aggregates are
    # deliberately avoided here.
    total_records = 0
    for row in symbol_counts:
        total_records += row["count"]
    print(f"Processing {total_records} records for {symbol_count} symbols")

    # Define window for time-based calculations
    symbol_window = Window.partitionBy("symbol").orderBy("date")

    def _trailing(rows):
        # Frame covering the current row and the (rows - 1) rows before it.
        return symbol_window.rowsBetween(-(rows - 1), Window.currentRow)

    # Daily return (%); null on each symbol's first row (no previous close)
    prev_close = lag("close").over(symbol_window)
    df = df_sil.withColumn(
        "daily_return_pct",
        when(prev_close.isNull(), None)
        .otherwise(((col("close") - prev_close) / prev_close) * 100)
    )

    # Moving averages of close. Frames are row-based, so early rows average
    # over however many rows exist so far rather than yielding null.
    df = df.withColumn("ma_5", avg("close").over(_trailing(5)))
    df = df.withColumn("ma_20", avg("close").over(_trailing(20)))
    df = df.withColumn("ma_50", avg("close").over(_trailing(50)))
    df = df.withColumn("ma_200", avg("close").over(_trailing(200)))

    # Rolling volatility (stddev of daily return) over 5 and 20 rows
    df = df.withColumn("volatility_5d", stddev("daily_return_pct").over(_trailing(5)))
    df = df.withColumn("volatility_20d", stddev("daily_return_pct").over(_trailing(20)))

    # Normalized close price (base = 100 at each symbol's first row by date)
    first_close = first("close").over(
        symbol_window.rowsBetween(Window.unboundedPreceding, Window.currentRow)
    )
    df = df.withColumn("normalized_close", (col("close") / first_close) * 100)

    # Trading signal: BUY when ma_5 moves above ma_20 after being at or below
    # it on the previous row, SELL for the opposite move, HOLD otherwise
    # (including rows where the previous values are null).
    prev_ma_5 = lag("ma_5").over(symbol_window)
    prev_ma_20 = lag("ma_20").over(symbol_window)
    df = df.withColumn(
        "signal_ma_crossover",
        when((col("ma_5") > col("ma_20")) & (prev_ma_5 <= prev_ma_20), "BUY")
        .when((col("ma_5") < col("ma_20")) & (prev_ma_5 >= prev_ma_20), "SELL")
        .otherwise("HOLD")
    )

    # Relative strength versus the benchmark symbol: take the benchmark's
    # daily return per date and left-join it onto every row, so dates without
    # a benchmark row get a null spy_return / relative_strength.
    spy_data = (
        df.filter(col("symbol") == benchmark_symbol)
        .select("date", "daily_return_pct")
        .withColumnRenamed("daily_return_pct", "spy_return")
    )
    df = df.join(spy_data, "date", "left")
    df = df.withColumn("relative_strength", col("daily_return_pct") - col("spy_return"))

    # Add metadata columns
    df = df.withColumn("processing_timestamp", current_timestamp())

    # Write to gold zone (full overwrite, schema replaced if it changed)
    gold_table = f"{gold_path}/gld_av_time_series_daily"
    df.write \
      .format("delta") \
      .mode("overwrite") \
      .option("overwriteSchema", "true") \
      .save(gold_table)

    # Read the table back so the reported count reflects what was persisted
    gold_df = spark.read.format("delta").load(gold_table)
    gold_count = gold_df.count()

    print("Silver to gold transformation complete.")
    print(f"Wrote {gold_count} records to gold layer.")

    return True

In [0]:
# Run the silver -> gold transformation; this overwrites the gold Delta table.
av_time_series_daily_sil_to_gld()

In [0]:
# Load the gold Delta table from storage for inspection.
df = spark.read.format('delta').load(f'{gold_path}/gld_av_time_series_daily')

In [0]:
# Render the gold DataFrame in the notebook output.
display(df)