In [0]:
# MAGIC %load_ext autoreload
# MAGIC %autoreload 2

In [0]:
import sys 
import os 
import pyspark.sql.functions as f 
# Put the parent directory (resolved to an absolute path) on the module search path.
# Presumably this is what lets the `ingestion.*` imports in the next cell resolve — TODO confirm.
sys.path.append(os.path.abspath('..'))

In [0]:
from ingestion.data_load_autoloader import load_raw_data_with_schema_evolution
from ingestion.data_load_excel import load_raw_data

#### Start Loading

In [0]:
# Catalog name interpolated into the bronze table names and the /Volumes checkpoint paths below.
CATALOG = "pei"
# ABFSS URI of the landing area; the orders/products source globs and bad-record paths are built from it.
STORAGE_BASE = "abfss://retail@peiretailstorageaccount.dfs.core.windows.net/landing"

##### 1. Load Raw Orders

In [0]:
# Extra reader options handed to the orders loader through `additional_options`.
# How the loader applies them is not visible in this notebook; the key names suggest
# a bad-record sink and an Auto Loader schema-tracking location — TODO confirm.
# NOTE(review): "badRecordsPath" is the same directory the order files are read from
# (`{STORAGE_BASE}/orders`) — confirm that writing bad records there is intended.
order_load_options = {
    "badRecordsPath": f"{STORAGE_BASE}/orders",
    "cloudFiles.schemaLocation": f"/Volumes/{CATALOG}/default/checkpoint_volume/raw_orders_schema_checkpoint"
}

In [0]:
# Ingest the raw order JSON files into `{CATALOG}.bronze.raw_orders`.
#
# The loader returns a streaming DataFrame (it is consumed via `.writeStream`); each row is
# tagged with its source file path and an ingestion timestamp before being appended to the
# bronze table. `trigger(availableNow=True)` processes what is currently available and stops.
#
# Failures are reported with a print and do not abort the notebook, so the later
# ingestion cells still run (best-effort per source).
try:
  orders_df = load_raw_data_with_schema_evolution(
    spark_session=spark,
    source_path=f"{STORAGE_BASE}/orders/*.json",
    file_format="json",
    additional_options=order_load_options,
  )

  orders_query = (
    orders_df
    .withColumn("file_path", f.col("_metadata.file_path"))
    .withColumn("ingestion_timestamp", f.current_timestamp())
    .writeStream
    .option("checkpointLocation", f"/Volumes/{CATALOG}/default/checkpoint_volume/raw_orders_file_checkpoint")
    .outputMode("append")
    .trigger(availableNow=True)
    .toTable(f"{CATALOG}.bronze.raw_orders")
  )

  # toTable() only *starts* the query and returns a StreamingQuery handle. Block until the
  # run finishes so a failure inside the stream is raised here (and reaches the handler
  # below) rather than being lost after the cell has already completed.
  orders_query.awaitTermination()
except Exception as e:
  print(f"FAILED: Orders Ingestion. Error: {str(e)}")
        

##### 2. Load Raw Products

In [0]:
# Extra reader options handed to the products loader through `additional_options`.
# How the loader applies them is not visible in this notebook; the key names suggest
# a bad-record sink and an Auto Loader schema-tracking location — TODO confirm.
# NOTE(review): "badRecordsPath" is the same directory the product files are read from
# (`{STORAGE_BASE}/products`) — confirm that writing bad records there is intended.
product_load_options = {
    "badRecordsPath": f"{STORAGE_BASE}/products",
    "cloudFiles.schemaLocation": f"/Volumes/{CATALOG}/default/checkpoint_volume/raw_products_schema_checkpoint"
}

In [0]:
# Ingest the raw product CSV files into `{CATALOG}.bronze.raw_products`.
#
# The loader returns a streaming DataFrame (it is consumed via `.writeStream`); each row is
# tagged with its source file path and an ingestion timestamp before being appended to the
# bronze table. `trigger(availableNow=True)` processes what is currently available and stops.
#
# Failures are reported with a print and do not abort the notebook, so the later
# ingestion cells still run (best-effort per source).
try:
  products_df = load_raw_data_with_schema_evolution(
    spark_session=spark,
    source_path=f"{STORAGE_BASE}/products/*.csv",
    file_format="csv",
    additional_options=product_load_options,
  )

  products_query = (
    products_df
    .withColumn("file_path", f.col("_metadata.file_path"))
    .withColumn("ingestion_timestamp", f.current_timestamp())
    .writeStream
    .option("checkpointLocation", f"/Volumes/{CATALOG}/default/checkpoint_volume/raw_products_file_checkpoint")
    .outputMode("append")
    .trigger(availableNow=True)
    .toTable(f"{CATALOG}.bronze.raw_products")
  )

  # toTable() only *starts* the query and returns a StreamingQuery handle. Block until the
  # run finishes so a failure inside the stream is raised here (and reaches the handler
  # below) rather than being lost after the cell has already completed.
  products_query.awaitTermination()
except Exception as e:
  print(f"FAILED: Products Ingestion. Error: {str(e)}")

##### 3. Load Raw Customers

In [0]:
# Ingest the customer Excel workbook into `{CATALOG}.bronze.raw_customers`.
#
# Flow: if the workbook is present in the landing volume, load it, tag each row with the
# source path and an ingestion timestamp, append it to the bronze Delta table (allowing
# schema merge), then move the file into the `processed/` sub-folder.
#
# All locations are derived from CATALOG so this cell follows the same catalog as the
# orders/products cells instead of a separately hard-coded name.
data_path = f"/Volumes/{CATALOG}/default/landing_volume/customers/"
processed_path = f"/Volumes/{CATALOG}/default/landing_volume/customers/processed/"
file_name = "Customer.xlsx"
full_path = os.path.join(data_path, file_name)

if os.path.exists(full_path):
    try:
        customer_df = load_raw_data(spark_session=spark, file_path=full_path)

        (
            customer_df
            .withColumn("file_path", f.lit(full_path))
            .withColumn("ingestion_timestamp", f.current_timestamp())
            .write
            .format("delta")
            .option("mergeSchema", "true")
            .mode("append")
            .saveAsTable(f"{CATALOG}.bronze.raw_customers")
        )

        # Move the workbook out of the landing folder only after the append has succeeded.
        # NOTE(review): if this move fails, the rows are already appended and the file is
        # still in place, so the next run would append them again — confirm this is acceptable.
        dbutils.fs.mv(full_path, os.path.join(processed_path, file_name))
    except Exception as e:
        # Best-effort: report the failure and let the notebook finish.
        print(f"FAILED: Customers Ingestion (Excel). Error: {str(e)}")
else:
    print(f"SKIPPED: {file_name} not found in {data_path}")