### Ingest Geolocation Data Into the Bronze Layer With Auto Loader

Define Schema

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType

# Explicit schema for the geolocation CSV files; every column is nullable.
# NOTE(review): the zip code prefix is read as an integer, so any leading
# zeros in the source files would not survive parsing — confirm this is
# acceptable for downstream joins before relying on it.
geolocation_schema = (
    StructType()
    .add("geolocation_zip_code_prefix", IntegerType(), True)
    .add("geolocation_lat", DoubleType(), True)
    .add("geolocation_lng", DoubleType(), True)
    .add("geolocation_city", StringType(), True)
    .add("geolocation_state", StringType(), True)
)

In [0]:
# Root directory holding this stream's state. The read and write cells append
# "/schema" and "/_checkpoint" to it, so the value must NOT end with a slash;
# otherwise the derived paths contain a doubled separator ("geolocation//schema").
geolocation_checkpoint = "/Volumes/mycatalog/olist_ecommerce_bronze/checkpoints/geolocation"

Stream Read

In [0]:
# Incrementally read CSV files from the landing volume with Auto Loader
# ("cloudFiles"), applying the explicit geolocation schema instead of inference.
# NOTE(review): "rescue" mode is meant to keep unexpected columns in a rescued
# data column; with a user-supplied schema that column may only be emitted when
# the `rescuedDataColumn` option is set — confirm against the Auto Loader docs.
df = (
    spark.readStream
    .format("cloudFiles")
    .schema(geolocation_schema)
    .option("header", True)
    .option("cloudFiles.format", "csv")
    .option("cloudFiles.schemaEvolutionMode", "rescue")
    .option("cloudFiles.schemaLocation", f"{geolocation_checkpoint}/schema")
    .load("/Volumes/mycatalog/olist_ecommerce/olist_landing/geolocation")
)


Stream Write

In [0]:
# Append the streamed geolocation rows to the bronze Delta table.
#
# `availableNow=True` replaces the deprecated `once=True` trigger: it still
# processes everything available at start time and then stops, but may split
# the backlog across several micro-batches instead of forcing a single one.
geolocation_query = (
    df.writeStream
    .format("delta")
    .outputMode("append")
    .trigger(availableNow=True)
    .option("mergeSchema", "true")
    .option("checkpointLocation", f"{geolocation_checkpoint}/_checkpoint")
    .toTable("mycatalog.olist_ecommerce_bronze.geolocation")
)

# The query runs asynchronously; wait for it to finish so that cells executed
# afterwards (e.g. the validation query) see the freshly written data.
geolocation_query.awaitTermination()

Validate

In [0]:
%sql
-- Quick sanity check: preview a handful of rows from the bronze table.
-- No ORDER BY is given, so which rows come back is not guaranteed.
SELECT *
FROM mycatalog.olist_ecommerce_bronze.geolocation
LIMIT 5