# Install libraries

In [0]:
%pip install -r requirements.txt
# Restart the Python process after the install above -- presumably so the
# freshly installed packages are the ones picked up by later imports (confirm
# against the Databricks notebook-scoped library docs).
dbutils.library.restartPython()

# Manage imports

In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql import Window
from delta.tables import DeltaTable
import pandas as pd
import re
import unittest

## Spark configs

In [0]:
# Session-level Spark SQL settings, applied in the order listed.
_SPARK_SQL_SETTINGS = {
    "spark.sql.shuffle.partitions": "auto",
    "spark.sql.adaptive.enabled": "true",
    "spark.sql.adaptive.skewJoin.enabled": "true",
}
for _conf_key, _conf_value in _SPARK_SQL_SETTINGS.items():
    spark.conf.set(_conf_key, _conf_value)

## Logger

In [0]:
import logging

def get_logger(name, level=logging.INFO):
    """Return the logger called *name*, configured with a formatted stream handler.

    Args:
        name: Logger name passed to ``logging.getLogger``.
        level: Threshold set on the logger. Defaults to ``logging.INFO``,
            which is the level this helper used before it was configurable.

    Returns:
        The configured ``logging.Logger`` instance.
    """
    logger = logging.getLogger(name)
    logger.setLevel(level)
    # Only attach a handler when the logger has none yet, so calling this
    # helper again for the same name does not add a second handler.
    if not logger.handlers:
        handler = logging.StreamHandler()
        handler.setFormatter(
            logging.Formatter(
                "%(asctime)s | %(levelname)s | %(name)s | %(message)s"
            )
        )
        logger.addHandler(handler)
    return logger

# Notebook-wide logger; the file-availability checks below log through it.
logger = get_logger("ecommerce_pipeline")

### Storage Path

In [0]:
# Root storage location for the project.
BASE_PATH = "/Volumes/ecommerceproject"
# Sub-path holding the raw input files (customer, orders, products).
RAW_PATH = "/".join((BASE_PATH, "default"))

# Create Volume If Not Exists

## Check file availability in Volumes

In [0]:
# Stop the notebook early when the customer workbook cannot be listed.
_customer_file_path = f"{RAW_PATH}/customer/Customer.xlsx"
try:
    # Keep only the listing call inside the try so that nothing else can be
    # misreported as a missing file.
    dbutils.fs.ls(_customer_file_path)
except Exception:
    # Include the path so this failure is distinguishable from the other
    # file checks, then re-raise to abort the run.
    logger.error("File not found: %s", _customer_file_path)
    raise
else:
    logger.info("File exists: %s", _customer_file_path)

In [0]:
# Stop the notebook early when the orders JSON file cannot be listed.
_orders_file_path = f"{RAW_PATH}/orders/Orders.json"
try:
    # Keep only the listing call inside the try so that nothing else can be
    # misreported as a missing file.
    dbutils.fs.ls(_orders_file_path)
except Exception:
    # Include the path so this failure is distinguishable from the other
    # file checks, then re-raise to abort the run.
    logger.error("File not found: %s", _orders_file_path)
    raise
else:
    logger.info("File exists: %s", _orders_file_path)

In [0]:
# Stop the notebook early when the products CSV file cannot be listed.
_products_file_path = f"{RAW_PATH}/products/Products.csv"
try:
    # Keep only the listing call inside the try so that nothing else can be
    # misreported as a missing file.
    dbutils.fs.ls(_products_file_path)
except Exception:
    # Same wording as the customer/orders checks ("File not found"), plus the
    # path so the failing file is identifiable; re-raise to abort the run.
    logger.error("File not found: %s", _products_file_path)
    raise
else:
    logger.info("File exists: %s", _products_file_path)

# Create Hash Record

In [0]:
def add_hash(df, cols, hash_col="record_hash"):
    """Return *df* with an extra column holding a SHA-2 (256-bit) hash of *cols*.

    The values of ``cols`` are joined with ``"||"`` via ``concat_ws`` and the
    joined string is passed to ``sha2(..., 256)``.

    Args:
        df: DataFrame to extend.
        cols: Columns whose values feed the hash, in this order.
        hash_col: Name of the column that receives the hash.

    Returns:
        The result of ``df.withColumn(hash_col, ...)``.
    """
    # NOTE(review): per the Spark docs concat_ws skips NULL inputs, so rows
    # that differ only in *which* column is NULL may hash identically --
    # confirm whether that matters before relying on this for change detection.
    joined_values = concat_ws("||", *cols)
    digest = sha2(joined_values, 256)
    return df.withColumn(hash_col, digest)

# Verify table exists under the catalog

In [0]:
def table_exists(table_name: str) -> bool:
    """Return True when ``DESCRIBE TABLE <table_name>`` succeeds, else False.

    Args:
        table_name: Table identifier interpolated verbatim into the SQL
            statement, so it must come from trusted code, not user input.

    Returns:
        ``True`` if the statement runs without raising, ``False`` otherwise.
    """
    try:
        spark.sql(f"DESCRIBE TABLE {table_name}")
    except Exception as exc:
        # Still best-effort: any failure is reported as "not there". Log the
        # cause so that errors other than a missing table (permissions,
        # malformed identifier, ...) are not swallowed silently.
        logger.info("Table %s treated as missing: %s", table_name, exc)
        return False
    return True

# Normalize column-name case

In [0]:
def transform_col_case(col_name: str) -> str:
    """Normalize a column name to lower case with underscores for spaces.

    Steps, in order:
      1. Drop every character that is not an ASCII letter, digit or space
         (this also removes underscores and hyphens already in the name).
      2. Trim surrounding whitespace, lower-case, and turn spaces into ``_``.
      3. Collapse runs of ``_`` into a single ``_``.

    Args:
        col_name: Original column name.

    Returns:
        The normalized name; may be empty if nothing survives step 1.
    """
    letters_digits_spaces = re.sub(r"[^a-zA-Z0-9 ]", "", col_name)
    underscored = letters_digits_spaces.strip().lower().replace(" ", "_")
    return re.sub(r"_+", "_", underscored)