### Ingest Order Reviews Data Into Bronze Layer With Auto Loader

Define Schema

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, TimestampType

# Explicit schema applied to the order-reviews CSV files.
# Besides the review fields it declares `source_file` and
# `source_file_timestamp` (filled in from `_metadata` further down in this
# notebook) and `_rescued_data`
# (presumably Auto Loader's rescued-data column -- confirm against the
# cloudFiles settings used by the reader).
schema = (
    StructType()
    .add("review_id", StringType())
    .add("order_id", StringType())
    .add("review_score", IntegerType())
    .add("review_comment_title", StringType())
    .add("review_comment_message", StringType())
    .add("review_creation_date", TimestampType())
    .add("review_answer_timestamp", TimestampType())
    .add("source_file", StringType())
    .add("source_file_timestamp", TimestampType())
    .add("_rescued_data", StringType())
)




In [0]:
# Base directory for this stream's bookkeeping: the reader appends "/schema"
# for the Auto Loader schema location and the writer appends "/_checkpoint"
# for the streaming checkpoint.
# NOTE(review): the value ends with "/", so those derived paths contain "//".
# Confirm the storage layer normalizes this before changing the literal --
# pointing an existing stream at a different checkpoint path would re-ingest
# every file.
checkpoint = "/Volumes/mycatalog/olist_ecommerce_bronze/checkpoints/order_reviews/"

Stream Read

In [0]:
# Streaming read of the landing folder through Auto Loader ("cloudFiles").
# The CSV files are parsed with a header row and the explicit `schema`;
# schema evolution mode is "rescue" and the schema-tracking directory lives
# under the checkpoint base path. `_metadata` is selected explicitly so the
# next cell can derive the source-file columns from it.
df = (
    spark.readStream
    .format("cloudFiles")
    .option("cloudFiles.format", "csv")
    .option("cloudFiles.schemaEvolutionMode", "rescue")
    .option("cloudFiles.schemaLocation", f"{checkpoint}/schema")
    .option("header", True)
    .schema(schema)
    .load("/Volumes/mycatalog/olist_ecommerce/olist_landing/order_reviews")
    .selectExpr("*", "_metadata")
)


In [0]:
# Record lineage for every row: the name of the file it came from and that
# file's modification time (cast to timestamp), both taken from the
# `_metadata` struct. The struct itself is dropped afterwards.
# NOTE(review): this cell rebinds `df` and removes `_metadata`, so it is
# expected to be run once after the read cell; re-running it on its own
# would no longer find `_metadata`.
df = (
    df
    .withColumn("source_file", df["_metadata"]["file_name"])
    .withColumn(
        "source_file_timestamp",
        df["_metadata"]["file_modification_time"].cast("timestamp"),
    )
    .drop("_metadata")
)

Stream Write

In [0]:
# Append the stream to the bronze Delta table as an incremental batch run.
#
# - `availableNow=True` replaces the deprecated `once=True` trigger: it still
#   processes everything that is available when the query starts and then
#   stops, but may split the work into several micro-batches.
# - `awaitTermination()` blocks this cell until the ingest has finished, so
#   that cells executed afterwards (e.g. on "Run All") see the completed
#   table instead of racing the still-running query.
# - The checkpoint directory under the shared base path tracks which files
#   have already been ingested.
(
    df.writeStream
    .format("delta")
    .outputMode("append")
    .trigger(availableNow=True)
    .option("mergeSchema", "true")
    .option("checkpointLocation", f"{checkpoint}/_checkpoint")
    .toTable("mycatalog.olist_ecommerce_bronze.order_reviews")
    .awaitTermination()
)

Validate

In [0]:
%sql
-- Preview five rows of the bronze table to confirm the ingest wrote data.
SELECT * FROM mycatalog.olist_ecommerce_bronze.order_reviews LIMIT 5