In [0]:
%run ../ecommerceproject/utils

# Rename Columns to Follow Delta Standard and Deduplicate Records

### Ingest Customers

In [0]:
# Ingest the customer workbook into the raw customers Delta table.
# Steps: read the first sheet as text -> build a Spark DataFrame with an
# explicit all-string schema -> rename columns via transform_col_case (defined
# outside this cell) -> drop exact duplicate rows -> overwrite the table.
logger.info("Reading Customers Excel")

customers_table = "ecommerceproject.default.raw_customers"

pdf = pd.read_excel(
    "/Volumes/ecommerceproject/default/customer/Customer.xlsx",
    dtype=str,
    sheet_name=0,
    engine="openpyxl"
)

# dtype=str converts populated cells to text but blank cells still come back
# as float NaN. Replace them with None so Spark receives real nulls instead of
# a str/float mix inside one column.
pdf = pdf.astype(object).where(pdf.notna(), None)

# Declare every column as a nullable string instead of relying on type
# inference, which cannot cope with empty sheets or columns that are entirely
# blank. Backticks in a header are doubled so the DDL string stays valid.
customers_schema = ", ".join(
    "`{}` string".format(str(c).replace("`", "``")) for c in pdf.columns
)
customers_df = spark.createDataFrame(pdf, schema=customers_schema)

new_cols = [transform_col_case(c) for c in customers_df.columns]
customers_df = customers_df.toDF(*new_cols).dropDuplicates()

# NOTE(review): mergeSchema combined with overwrite only allows additive
# schema changes as far as I know; confirm this is the intended policy.
(
    customers_df.write
    .format("delta")
    .mode("overwrite")
    .option("mergeSchema", "true")
    .saveAsTable(customers_table)
)

# Count the table that was just written rather than customers_df: calling
# count() on the DataFrame would re-run the rename/dedup lineage a second time.
logger.info(f"Customers loaded: {spark.table(customers_table).count()}")


### Ingest Orders

In [0]:
# Ingest the orders JSON file into the raw orders Delta table.
# Steps: read the multi-line JSON document -> rename columns via
# transform_col_case (defined outside this cell) -> drop exact duplicate rows
# -> overwrite the table.
logger.info("Reading Orders JSON")

orders_table = "ecommerceproject.default.raw_orders"

# NOTE(review): 'inferSchema' is documented as a CSV reader option; the JSON
# reader appears to infer the schema regardless. Kept as-is; confirm whether
# it can be dropped.
orders_df = (
    spark.read
    .option("multiLine", True)
    .option('inferSchema', 'true')
    .json("/Volumes/ecommerceproject/default/orders/Orders.json")
)

new_cols = [transform_col_case(c) for c in orders_df.columns]
orders_df = orders_df.toDF(*new_cols).dropDuplicates()

(
    orders_df.write
    .format("delta")
    .mode("overwrite")
    .option("mergeSchema", "true")
    .saveAsTable(orders_table)
)

# Count the persisted table instead of orders_df: the DataFrame is not cached,
# so orders_df.count() would re-read the JSON file and repeat the dedup
# just to produce a log line.
logger.info(f"Orders loaded: {spark.table(orders_table).count()}")

### Ingest Products

In [0]:
# Ingest the products CSV file into the raw products Delta table.
# Steps: read the CSV with header and inferred types -> rename columns via
# transform_col_case (defined outside this cell) -> drop exact duplicate rows
# -> overwrite the table.
logger.info("Reading Products CSV")

products_table = "ecommerceproject.default.raw_products"

# Quote and escape are both '"', and multiLine is on, so quoted fields may
# contain doubled quotes and embedded newlines.
# NOTE(review): with an inferred schema the corrupt-record column is probably
# never materialised (Spark's CSV docs describe it as requiring a user-supplied
# schema that contains the field); confirm malformed rows are not lost.
products_df = (
    spark.read
         .option("header", True)
         .option("inferSchema", True)
         .option("delimiter", ",")
         .option("multiLine", "true")
         .option("quote", '"')
         .option("escape", '"')
         .option("mode", "PERMISSIVE")
         .option("columnNameOfCorruptRecord", "_corrupt_record")
         .csv(f"{RAW_PATH}/products/Products.csv")
)

# NOTE(review): this schema is defined but not applied to the read above or to
# anything else in this cell. It is kept because later cells may reference it;
# confirm whether price_per_product was meant to be forced to double.
products_schema = StructType([
    StructField("price_per_product", DoubleType(), True)
])

new_cols = [transform_col_case(c) for c in products_df.columns]
products_df = products_df.toDF(*new_cols).dropDuplicates()

(
    products_df.write
    .format("delta")
    .mode("overwrite")
    .option("mergeSchema", "true")
    .saveAsTable(products_table)
)

# Count the persisted table instead of products_df: the DataFrame is not
# cached, so products_df.count() would re-read the CSV file and repeat the
# dedup just to produce a log line.
logger.info(f"Products loaded: {spark.table(products_table).count()}")
