In [0]:
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, LongType, StringType, IntegerType
import time

# ------------------------------------------------------------------
# Storage locations used by this notebook:
#   SOURCE_DIR     - folder the streaming reader loads files from
#   PROCESSED_DIR  - folder the streaming writer outputs to
#   CHECKPOINT_DIR - checkpoint folder handed to the streaming writer
# ------------------------------------------------------------------
SOURCE_DIR = "/Volumes/otc/volumn/landingfiles/streaming_data/generated_data"
PROCESSED_DIR = "/Volumes/otc/volumn/landingfiles/streaming_data/processed_data"
CHECKPOINT_DIR = "/Volumes/otc/volumn/landingfiles/streaming_data/_checkpoints/processed_data"

# Make sure every folder exists before any stream is defined
# (same order as the declarations above).
for _dir in (SOURCE_DIR, PROCESSED_DIR, CHECKPOINT_DIR):
    dbutils.fs.mkdirs(_dir)

In [0]:
# Explicit column layout for the streamed input; every field is nullable.
# Built incrementally with StructType.add, which appends one field per call.
schema = (
    StructType()
    .add("emp_id", LongType(), True)
    .add("emp_code", StringType(), True)
    .add("name", StringType(), True)
    .add("salary", IntegerType(), True)
)

In [0]:
# Streaming CSV source over SOURCE_DIR.
# - the explicit `schema` is supplied to the reader
# - header='true': the first line of each file is a header row
# - maxFilesPerTrigger=1: at most one new file is consumed per micro-batch
raw = (
    spark.readStream
    .format('csv')
    .schema(schema)
    .options(header='true', maxFilesPerTrigger=1)
    .load(SOURCE_DIR)
)

In [0]:
# Shape the raw stream into the output layout:
#   emp_id (long), emp_code, name, salary (int), event_ts, source_file
#
# - The casts pin the numeric column types regardless of how `raw` was read.
# - event_ts is the processing-time timestamp from current_timestamp().
# - source_file is taken from the file source's hidden `_metadata` column
#   rather than F.input_file_name(): input_file_name is deprecated on recent
#   Databricks runtimes and is not supported on Unity Catalog standard/shared
#   access mode clusters, which matters here because the data lives under a
#   /Volumes path. `_metadata` has to be referenced in the same select that
#   reads from the file source, because it is not carried through a
#   projection that omits it.
processed = raw.select(
    F.col("emp_id").cast("long").alias("emp_id"),
    F.col("emp_code"),
    F.col("name"),
    F.col("salary").cast("int").alias("salary"),
    F.current_timestamp().alias("event_ts"),
    F.col("_metadata.file_path").alias("source_file"),
)

In [0]:
# Start the streaming write of `processed` to PROCESSED_DIR as a Delta sink in
# append mode, tracking progress in CHECKPOINT_DIR. The sink settings are
# passed through DataStreamWriter.start's keyword arguments; `q` is the
# resulting StreamingQuery handle.
q = processed.writeStream.start(
    path=PROCESSED_DIR,
    format='delta',
    outputMode='append',
    checkpointLocation=CHECKPOINT_DIR,
)