In [0]:
import dlt
import json
from pyspark.sql.functions import from_json, expr, lit, col, to_timestamp, regexp_extract, input_file_name
from pyspark.sql.types import StructType, StructField, StringType, ArrayType

In [0]:
# Pipeline parameters, read from the Spark configuration in one pass.
# Each lookup has no default, so a missing key fails here rather than downstream.
catalog_name, schema_name, landing_volume_name = (
  spark.conf.get(param_key)
  for param_key in ("catalog_name", "schema_name", "landing_volume_name")
)

In [0]:
# Landing location for this feed: the directory inside the landing volume
# that the bronze table streams from (the trailing slash is kept).
directory_name = "bike_point"
landing_volume_root = f"/Volumes/{catalog_name}/{schema_name}/{landing_volume_name}"
source_volume = f"{landing_volume_root}/{directory_name}/"

In [0]:
@dlt.table(
  comment="Loads bike point JSON files from landing into bronze"
)
def bronze_bike_point():
  """Stream bike point JSON files from the landing volume into bronze.

  Reads `source_volume` incrementally with Auto Loader (`cloudFiles`) in JSON
  format, relying on Auto Loader's schema inference, then appends audit
  columns describing where and when each row was ingested.

  Returns:
    A streaming DataFrame holding the ingested columns plus:
      - source_system: constant "tfl_api"
      - source_file: `_metadata.file_name` of the originating file
      - ingestion_user: result of CURRENT_USER() for the pipeline run
      - landing_timestamp: `_metadata.file_modification_time` of the file
      - bronze_timestamp: CURRENT_TIMESTAMP() at processing time
  """
  # Deliberately no "inferSchema"/"header" options here: those are CSV reader
  # options and have no effect on a JSON Auto Loader source.
  # NOTE(review): if typed (non-string) columns are wanted in bronze, the
  # Auto Loader switch is "cloudFiles.inferColumnTypes" - confirm before
  # enabling, since it changes the table schema.
  raw_stream = (
    spark.readStream
    .format("cloudFiles")
    .option("cloudFiles.format", "json")
    .load(source_volume)
  )

  # Add audit information to the table
  return (
    raw_stream
    .withColumn("source_system", lit("tfl_api"))
    .withColumn("source_file", col("_metadata.file_name"))
    .withColumn("ingestion_user", expr("CURRENT_USER()"))
    .withColumn("landing_timestamp", col("_metadata.file_modification_time"))
    .withColumn("bronze_timestamp", expr("CURRENT_TIMESTAMP()"))
  )