In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import * 

In [0]:
import logging
# Root logger configuration: INFO level, records formatted as
# "<timestamp> | <LEVEL> | <message>".
logging.basicConfig(format="%(asctime)s | %(levelname)s | %(message)s", level=logging.INFO)

In [0]:
# Get (or create) the Spark session used by every cell below, under the
# application name "spark_consumer".
spark = (
    SparkSession.builder
    .appName("spark_consumer")
    .getOrCreate()
)

In [0]:
# Shared access key for the Event Hubs namespace, fetched from the
# "himalayanjava" secret scope so it is not written in the notebook source.
sharedaccesskey = dbutils.secrets.get(scope="himalayanjava", key="eventhubkey")

# Event Hubs connection string. It is passed below as the SASL PLAIN password.
connection_string = (
    "Endpoint=sb://himalayanjavahubstd.servicebus.windows.net/;"
    "SharedAccessKeyName=kafka_producer;"
    f"SharedAccessKey={sharedaccesskey};"
    "EntityPath=coffeehouse_orders"
)

# JAAS login configuration for the Kafka client.
# Note the "kafkashaded." prefix on the login module class name, and the literal
# user name "$ConnectionString" paired with the connection string as password.
jaas_config = (
    "kafkashaded.org.apache.kafka.common.security.plain.PlainLoginModule required "
    'username="$ConnectionString" '
    f'password="{connection_string}";'
)

# Options handed to Spark's Kafka source.
eh_kafka_options = {
    # Kafka endpoint of the Event Hubs namespace and the topic to read.
    "kafka.bootstrap.servers": "himalayanjavahubstd.servicebus.windows.net:9093",
    "subscribe": "coffeehouse_orders",
    # SASL PLAIN authentication over TLS.
    "kafka.security.protocol": "SASL_SSL",
    "kafka.sasl.mechanism": "PLAIN",
    "kafka.sasl.jaas.config": jaas_config,
    # Begin at the earliest available offsets when no checkpoint exists yet.
    "startingOffsets": "earliest",
    # Do not fail the query if previously tracked offsets are no longer available.
    "failOnDataLoss": "false",
}

# Streaming DataFrame over the topic (despite the name, this is a readStream).
df_batch = spark.readStream.format("kafka").options(**eh_kafka_options).load()


In [0]:
%sql
-- Create the hj_orders catalog if it does not exist yet; its managed storage
-- location is the metadata/ folder of the "raw" container.
CREATE CATALOG IF NOT EXISTS hj_orders
MANAGED LOCATION 'abfss://raw@himalayanjavadl.dfs.core.windows.net/metadata/';

In [0]:
%sql
-- Make hj_orders the current catalog so the schema and volume statements that
-- follow (which do not name a catalog) resolve inside it.
USE CATALOG hj_orders;

In [0]:
%sql
-- Create the bronze schema in the current catalog if it does not exist yet;
-- its managed storage location is the bronze/ folder of the "raw" container.
CREATE SCHEMA IF NOT EXISTS bronze
MANAGED LOCATION 'abfss://raw@himalayanjavadl.dfs.core.windows.net/bronze/';

In [0]:
%sql
-- Create the external volume bronze.checkpoints (if missing) over the
-- checkpoints/bronze_ingest/ folder of the "raw" container. The streaming
-- write's checkpoint path under /Volumes/hj_orders/bronze/checkpoints/ lives here.
CREATE EXTERNAL VOLUME IF NOT EXISTS bronze.checkpoints
LOCATION 'abfss://raw@himalayanjavadl.dfs.core.windows.net/checkpoints/bronze_ingest/';

In [0]:
# Checkpoint directory for the bronze ingest stream, inside the
# hj_orders.bronze.checkpoints volume.
checkpoint_path = "/Volumes/hj_orders/bronze/checkpoints/bronze_ingest"

# Add an `ingestion_timestamp` column populated by current_timestamp() to the
# raw Kafka stream.
df_bronze = df_batch.withColumn("ingestion_timestamp", current_timestamp())

# Append the stream to the Delta table hj_orders.bronze.orders_raw.
# trigger(availableNow=True) processes the data available at start and then
# stops, so this behaves like an incremental batch load.
query_bronze = (
    df_bronze.writeStream
    .format("delta")
    .trigger(availableNow=True)
    .option("checkpointLocation", checkpoint_path)
    .outputMode("append")
    .toTable("hj_orders.bronze.orders_raw")
)

print("Bronze Layer: Raw kafka being ingested to bronze schema")

# toTable() only *starts* the query and returns immediately. Block here until
# the availableNow run has finished so that later cells reading
# hj_orders.bronze.orders_raw see the committed data (and the table exists on a
# first run). awaitTermination() also re-raises any failure from the stream
# instead of letting it pass silently in the background.
query_bronze.awaitTermination()

In [0]:
# Render the contents of the bronze orders table.
orders_raw_table = spark.table("hj_orders.bronze.orders_raw")
display(orders_raw_table)

In [0]:
# NOTE(review): no cell in this notebook assigns `df_final`, so on a fresh
# Restart & Run All this cell raised NameError. Guard the call so the notebook
# runs to completion; either define `df_final` in an earlier cell or delete
# this cell once the intended DataFrame is confirmed.
if "df_final" in globals():
    display(df_final)
else:
    print("Skipping display: df_final is not defined in this notebook")