###### Using Service Principal Instead of Storage Credential and External Location

In [0]:
# storage_account = "stnyctaxigreen"

# # Fetching from Azure Key Vault
# client_id = dbutils.secrets.get(scope="kv-nyctaxi-scope", key="sp-client-id")
# tenant_id = dbutils.secrets.get(scope="kv-nyctaxi-scope", key="sp-tenant-id")
# client_secret = dbutils.secrets.get(scope="kv-nyctaxi-scope", key="sp-client-secret")

# # Service Principal Authentication
# spark.conf.set(f"fs.azure.account.auth.type.{storage_account}.dfs.core.windows.net", "OAuth")
# spark.conf.set(f"fs.azure.account.oauth.provider.type.{storage_account}.dfs.core.windows.net", "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider")
# spark.conf.set(f"fs.azure.account.oauth2.client.id.{storage_account}.dfs.core.windows.net", client_id)
# spark.conf.set(f"fs.azure.account.oauth2.client.secret.{storage_account}.dfs.core.windows.net", client_secret)
# spark.conf.set(f"fs.azure.account.oauth2.client.endpoint.{storage_account}.dfs.core.windows.net", f"https://login.microsoftonline.com/{tenant_id}/oauth2/token")

##### LOADING FROM LANDING TO BRONZE

In [0]:
import re
import sys
import os
import importlib
from pyspark.sql.functions import current_timestamp, input_file_name, col, expr, regexp_extract

# Path for module imports: make the directory two levels above the current
# working directory importable so the project-local `modules` package resolves.
root = os.path.abspath(os.path.join(os.getcwd(), "../.."))
if root not in sys.path:
    sys.path.append(root)

# Force reload modules to ensure the latest version is used without restarting
# the Python session.
# importlib.reload() only rebinds names inside the module it reloads; names that
# other modules already imported from it keep pointing at the old objects.
# Because the function is imported from the *package* below, the package has to
# be reloaded too -- and after its submodule -- otherwise it would keep handing
# out the function object created before the reload.
# NOTE(review): assumes add_processed_timestamp lives in the `metadata`
# submodule and is re-exported by the package __init__ -- confirm.
for _module_name in ("modules.transformations.metadata", "modules.transformations"):
    if _module_name in sys.modules:
        importlib.reload(sys.modules[_module_name])

from modules.transformations import add_processed_timestamp

In [0]:
# Paths and Configuration
volume_path = "/Volumes/nyctaxi/00_landing/data_sources/*/*"
landing_base = "/Volumes/nyctaxi/00_landing/data_sources/"
checkpoint_path = "abfss://bronze@stnyctaxigreen.dfs.core.windows.net/_checkpoints/green_taxi"
table_name = "nyctaxi.01_bronze.green_trips_raw"


# Marker used by the logging section to identify the rows written by THIS run.
# It is taken from Spark's clock (the same clock the logging filter compared
# against before) and kept as epoch microseconds, so no time-zone conversion
# happens on the way through Python.
run_start_micros = spark.sql(
    "SELECT unix_micros(current_timestamp()) AS us"
).first()["us"]


# Using Auto Loader with Volume path.
# The checkpoint directory doubles as the Auto Loader schema location.
df_stream = (
    spark.readStream
    .format("cloudFiles")
    .option("cloudFiles.format", "parquet")
    .option("pathGlobFilter", "*.parquet")
    .option("cloudFiles.schemaLocation", checkpoint_path)
    .load(volume_path)
)


# Metadata columns
# add_processed_timestamp is a project helper; the rename below presumes it adds
# a `processed_timestamp` column -- verify against modules.transformations.
df_time = add_processed_timestamp(df_stream)
# NOTE(review): Databricks documents input_file_name() as deprecated in favour
# of the `_metadata.file_path` column; consider migrating once it is confirmed
# that the stored path format stays compatible with existing rows.
df_final = (
    df_time
    .withColumnRenamed("processed_timestamp", "load_timestamp")
    .withColumn("source_file", input_file_name())
)


# Writing data
# availableNow processes everything currently pending and then stops, so
# awaitTermination() returns once the backlog has been written.
query = (
    df_final.writeStream
    .option("checkpointLocation", checkpoint_path)
    .trigger(availableNow=True)
    .toTable(table_name)
)

query.awaitTermination()


# --- LOGGING ----

# Rows whose load_timestamp is at or after the start marker belong to this run.
# A fixed look-back window (e.g. "last 5 minutes") would miss rows from a run
# that takes longer than the window and would re-report rows from a previous
# run that finished shortly before this one.
# NOTE(review): presumes load_timestamp is a processing-time timestamp produced
# from Spark's clock -- confirm in add_processed_timestamp.
last_batch_df = spark.read.table(table_name).filter(
    col("load_timestamp") >= expr(f"timestamp_micros({run_start_micros})")
)

# Check if any rows were actually written
if last_batch_df.limit(1).count() == 0:
    print("No new data to load")
else:
    # Extract unique months (YYYY-MM) from the source_file path
    loaded_months = (
        last_batch_df
        .select(regexp_extract(col("source_file"), r"(\d{4}-\d{2})", 1).alias("m"))
        .distinct()
        .orderBy("m")
        .collect()
    )

    # regexp_extract yields "" when a path has no YYYY-MM token (and the value
    # is null for a null path); drop those so they cannot become a range bound.
    months = [r["m"] for r in loaded_months if r["m"]]

    # Format the range string
    if not months:
        date_range = "unknown period"
    elif len(months) == 1:
        date_range = f"{months[0]}"
    else:
        date_range = f"{months[0]} to {months[-1]}"

    print(f"Successfully loaded: {date_range} to {table_name}")

    print(f"Total records in {table_name}: {spark.read.table(table_name).count()}")