In [0]:
%sql
-- Step 1: Create the catalog that holds every object defined below.
CREATE CATALOG IF NOT EXISTS customer_360;

-- Step 2: Create one schema per layer (source, bronze, silver, gold).
CREATE SCHEMA IF NOT EXISTS customer_360.customer_360_source;
CREATE SCHEMA IF NOT EXISTS customer_360.customer_360_bronze;
CREATE SCHEMA IF NOT EXISTS customer_360.customer_360_silver;
CREATE SCHEMA IF NOT EXISTS customer_360.customer_360_gold;

-- Separate schema for audit objects.
CREATE SCHEMA IF NOT EXISTS customer_360.audit;

-- Step 3: Create "Volumes" for each table (each volume corresponds to one table path).
-- Every statement uses IF NOT EXISTS, so re-running this cell is safe.
-- Source Layer Volumes
CREATE VOLUME IF NOT EXISTS customer_360.customer_360_source.source_product_volume;
CREATE VOLUME IF NOT EXISTS customer_360.customer_360_source.source_customer_volume;
CREATE VOLUME IF NOT EXISTS customer_360.customer_360_source.source_sales_volume;

-- Bronze Layer Volumes
CREATE VOLUME IF NOT EXISTS customer_360.customer_360_bronze.bronze_product_volume;
CREATE VOLUME IF NOT EXISTS customer_360.customer_360_bronze.bronze_customer_volume;
CREATE VOLUME IF NOT EXISTS customer_360.customer_360_bronze.bronze_sales_volume;

-- Silver Layer Volumes
CREATE VOLUME IF NOT EXISTS customer_360.customer_360_silver.silver_product_volume;
CREATE VOLUME IF NOT EXISTS customer_360.customer_360_silver.silver_customer_volume;
CREATE VOLUME IF NOT EXISTS customer_360.customer_360_silver.silver_sales_volume;

-- Gold Layer Volumes
CREATE VOLUME IF NOT EXISTS customer_360.customer_360_gold.gold_sales_fact_volume;
CREATE VOLUME IF NOT EXISTS customer_360.customer_360_gold.gold_product_dim_volume;
CREATE VOLUME IF NOT EXISTS customer_360.customer_360_gold.gold_customer_dim_volume;

-- Audit
-- This CREATE SCHEMA repeats the one issued earlier in this cell; IF NOT EXISTS
-- makes the second call a no-op.
CREATE SCHEMA IF NOT EXISTS customer_360.audit;
CREATE VOLUME IF NOT EXISTS customer_360.audit.audit_volume;
-- Create the ETL audit table as a path-based Delta table located inside the audit volume.
-- NOTE(review): the table is addressed by path (delta.`...`) under a volume rather than
-- registered as customer_360.audit.<table> -- confirm this is intended and supported
-- in the target workspace.
CREATE TABLE IF NOT EXISTS delta.`/Volumes/customer_360/audit/audit_volume/etl_audit` (
    layer STRING,                  -- bronze / silver / gold
    table_name STRING,             -- e.g., bronze_customer
    load_time TIMESTAMP,           -- ETL run timestamp
    records_loaded BIGINT,         -- number of rows loaded in this run
    max_data_timestamp TIMESTAMP   -- max data_arrival_timestamp of rows loaded
)
USING DELTA;



-- Remove the schema named "default" from the catalog together with everything in it (CASCADE).
-- Presumably this is the auto-created default schema and is unused by this project -- verify before running.
DROP SCHEMA IF EXISTS customer_360.default CASCADE;


In [0]:
from pyspark.sql import SparkSession
import pandas as pd
import random
from faker import Faker
from datetime import datetime
from datetime import timedelta


spark = SparkSession.builder.getOrCreate()
fake = Faker()

# Read the clock exactly once. Both stamps below must describe the same
# instant; two separate datetime.now() calls could straddle a second boundary
# and make the file-name stamp disagree with the row-level timestamp.
RUN_STARTED_AT = datetime.now()
CURRENT_TS = RUN_STARTED_AT.strftime("%Y%m%d_%H%M%S")           # suffix of the CSV file names written by this run
DATA_ARRIVAL_TS = RUN_STARTED_AT.strftime("%Y-%m-%d %H:%M:%S")  # value stored in data_arrival_timestamp

# -------------------------
# Helper Functions
# -------------------------
def rare_count(min_val, max_val, mostly_zero=True):
    """
    Draw a random integer from the inclusive range [min_val, max_val].

    Args:
        min_val: Smallest value that can be returned.
        max_val: Largest value that can be returned.
        mostly_zero: When True (default) the draw is weighted: the value 0
            gets weight 0.8 and the remaining 0.2 is split evenly over the
            span of the range, so 0 comes back about 80% of the time when the
            range contains 0 and at least one other value. If 0 is not in the
            range all candidates end up equally likely. When False the draw
            is uniform.

    Returns:
        int: The selected value.
    """
    if not mostly_zero:
        return random.randint(min_val, max_val)

    candidates = range(min_val, max_val + 1)
    span = max_val - min_val
    # A single-value range has no span to divide 0.2 by; without this guard a
    # call such as rare_count(3, 3) would raise ZeroDivisionError.
    non_zero_weight = 0.2 / span if span else 1.0
    weights = [0.8 if value == 0 else non_zero_weight for value in candidates]
    return random.choices(candidates, weights=weights)[0]

def load_existing_csv(path):
    """
    Load every ``*.csv`` file directly under ``path`` into one DataFrame.

    Files are read in sorted file-name order and concatenated. Rows are then
    de-duplicated on the first column (treated as the record ID), keeping the
    first occurrence, so for duplicate IDs the row from the file that sorts
    first wins.

    Args:
        path: Directory to scan. Sub-directories are not searched.

    Returns:
        pandas.DataFrame: The combined rows, or an empty DataFrame when the
        directory contains no CSV files.
    """
    import glob

    # glob returns matches in arbitrary order; sorting makes the
    # "first occurrence wins" de-duplication below reproducible.
    files = sorted(glob.glob(f"{path}/*.csv"))
    if not files:
        return pd.DataFrame()

    combined = pd.concat([pd.read_csv(f) for f in files], ignore_index=True)
    # Remove duplicates by ID (the first column).
    return combined.drop_duplicates(subset=[combined.columns[0]])

def generate_ids(prefix, n, existing_ids):
    """
    Build ``n`` new sequential IDs of the form ``<prefix><number>``.

    Numbering continues after the highest number found among the entries of
    ``existing_ids`` that start with ``prefix``; when there is none it starts
    at 1. Numbers are zero-padded to at least four digits.

    Args:
        prefix: Text placed in front of every number, e.g. "CUST".
        n: How many IDs to create.
        existing_ids: IDs already in use. May be empty or None. Entries that
            are not strings, do not start with ``prefix``, or whose remainder
            is not an integer are ignored.

    Returns:
        list[str]: The new IDs in ascending order; empty when ``n <= 0``.
    """
    if n <= 0:
        return []

    highest = 0
    for existing in existing_ids or ():
        # Skip non-string entries (e.g. a NaN from a blank CSV cell) instead
        # of failing on the missing .startswith attribute.
        if not isinstance(existing, str) or not existing.startswith(prefix):
            continue
        try:
            # Slice off only the leading prefix; str.replace would also strip
            # the prefix text if it happened to reappear later in the ID.
            number = int(existing[len(prefix):])
        except ValueError:
            continue
        highest = max(highest, number)

    first = highest + 1
    return [f"{prefix}{str(i).zfill(4)}" for i in range(first, first + n)]

def save_unique_csv(df_new, df_existing, path, file_prefix):
    """
    Write the rows of ``df_new`` whose ID is not already in ``df_existing``.

    The ID is the first column of each frame. Surviving rows are written
    (without the index) to ``<path>/<file_prefix>_<CURRENT_TS>.csv``. When
    nothing is left to write, a message is printed and no file is created.

    Args:
        df_new: Candidate rows.
        df_existing: Rows already stored; may be empty.
        path: Target directory.
        file_prefix: Leading part of the output file name.

    Returns:
        None
    """
    fresh_rows = df_new
    # Filtering only matters when both sides actually contain rows.
    if not fresh_rows.empty and not df_existing.empty:
        known_ids = set(df_existing[df_existing.columns[0]].tolist())
        already_stored = fresh_rows[fresh_rows.columns[0]].isin(known_ids)
        fresh_rows = fresh_rows[~already_stored]

    if fresh_rows.empty:
        print(f"‚òëÔ∏è No new rows to save for {file_prefix}")
        return

    file_path = f"{path}/{file_prefix}_{CURRENT_TS}.csv"
    fresh_rows.to_csv(file_path, index=False)
    print(f"‚úÖ Saved {len(fresh_rows)} new rows to {file_path}")

# -------------------------
# Configurable Random Counts
# -------------------------
# Set to True for the very first run so that fixed, non-zero volumes are
# generated for customers, products and sales. Leave False afterwards so each
# run adds a small random increment instead.
INITIAL_LOAD = False

# Incremental volumes: customers and products are weighted towards 0,
# sales are drawn uniformly from 0-21.
NUM_CUSTOMERS = rare_count(0, 4, mostly_zero=True)  # 0 about 80% of the time, otherwise 1-4
NUM_PRODUCTS = rare_count(0, 2, mostly_zero=True)   # 0 about 80% of the time, otherwise 1-2
NUM_SALES = rare_count(0, 21, mostly_zero=False)

if INITIAL_LOAD:
    # Fixed volumes for the initial load; they replace the random draws above.
    NUM_CUSTOMERS = 3
    NUM_PRODUCTS = 2
    NUM_SALES = 10


# -------------------------
# Paths
# -------------------------
CUSTOMER_VOLUME = "/Volumes/customer_360/customer_360_source/source_customer_volume"
PRODUCT_VOLUME = "/Volumes/customer_360/customer_360_source/source_product_volume"
SALES_VOLUME = "/Volumes/customer_360/customer_360_source/source_sales_volume"

# -------------------------
# CUSTOMER
# -------------------------
# Customers already present in the source volume (an empty frame when there are none).
df_customer_existing = load_existing_csv(CUSTOMER_VOLUME)
if df_customer_existing.empty:
    existing_cust_ids = []
else:
    existing_cust_ids = df_customer_existing["customer_id"].tolist()
customer_ids = generate_ids("CUST", NUM_CUSTOMERS, existing_cust_ids)

# One synthetic record per new ID; every row carries this run's arrival timestamp.
customer_data = [
    {
        "customer_id": cid,
        "customer_name": fake.name(),
        "segment": random.choice(["Consumer","Corporate","Home Office"]),
        "age": random.randint(18,70),
        "country": fake.country(),
        "city": fake.city(),
        "state": fake.state(),
        "postal_code": fake.postcode(),
        "region": random.choice(["East","West","South","North"]),
        "data_arrival_timestamp": DATA_ARRIVAL_TS
    }
    for cid in customer_ids
]
df_customer_new = pd.DataFrame(customer_data)

# df_customer = all known customers (existing + new), unique by customer_id.
if df_customer_existing.empty:
    df_customer = df_customer_new
else:
    df_customer = pd.concat([df_customer_existing, df_customer_new], ignore_index=True)
df_customer = df_customer.drop_duplicates(subset=["customer_id"])

# Only the new rows are written out.
save_unique_csv(df_customer_new, df_customer_existing, CUSTOMER_VOLUME, "customer")

# -------------------------
# PRODUCT
# -------------------------
# Products already present in the source volume (an empty frame when there are none).
df_product_existing = load_existing_csv(PRODUCT_VOLUME)
if df_product_existing.empty:
    existing_prod_ids = []
else:
    existing_prod_ids = df_product_existing["product_id"].tolist()
# New IDs continue the PROD numbering after the existing ones.
product_ids = generate_ids("PROD", NUM_PRODUCTS, existing_prod_ids)

# Product nouns available per category; the keys are the category names.
categories = {
    "Technology": ["Laptop","Phone","Tablet","Camera","Speaker","Monitor","Smartwatch","Printer","Router","Headphones","Projector","Drone","Keyboard","Mouse"],
    "Furniture": ["Chair","Table","Desk","Shelf","Sofa","Cabinet","Bed","Stool","Couch","Wardrobe","Nightstand","Bench"],
    "Office Supplies": ["Pen","Notebook","Binder","Stapler","Paper","Envelope","Marker","Folder","Tape","Calculator","Highlighter","Clipboard"],
    "Sports & Outdoors": ["Bicycle","Treadmill","Dumbbell","Tent","Backpack","Yoga Mat","Helmet","Running Shoes","Kayak","Golf Club","Hiking Boots","Soccer Ball"]
}

# Name prefixes; an entry ending in "-" is attached directly to the noun.
adjectives = ["Ultra","Pro","Max","Mini","Smart","Eco","Advanced","Premium","Deluxe","Compact","Portable","Elite","I-"]

def generate_product_name_with_category(category):
    """
    Build a random product name for ``category``, e.g. "Pro Laptop 42".

    Args:
        category: A key of ``categories``; its noun list is sampled.

    Returns:
        str: "<adjective> <noun> <number>" where the number is in 10-99.
        An adjective ending in "-" (such as "I-") is joined to the noun
        without a space, giving "I-Laptop 42" rather than "I- Laptop 42".

    Raises:
        KeyError: If ``category`` is not a key of ``categories``.
    """
    noun = random.choice(categories[category])
    adj = random.choice(adjectives)
    number = random.randint(10,99)
    # A hyphenated prefix already carries its own joiner.
    separator = "" if adj.endswith("-") else " "
    return f"{adj}{separator}{noun} {number}"

# One synthetic record per new product ID; the product name is drawn from the
# noun list of the same randomly chosen category.
product_data = []
for pid in product_ids:
    category = random.choice(list(categories))
    product_name = generate_product_name_with_category(category)
    product_data.append({
        "product_id": pid,
        "category": category,
        "product_name": product_name,
        "data_arrival_timestamp": DATA_ARRIVAL_TS
    })
df_product_new = pd.DataFrame(product_data)

# df_product = all known products (existing + new), unique by product_id.
if df_product_existing.empty:
    df_product = df_product_new
else:
    df_product = pd.concat([df_product_existing, df_product_new], ignore_index=True)
df_product = df_product.drop_duplicates(subset=["product_id"])

# Only the new rows are written out.
save_unique_csv(df_product_new, df_product_existing, PRODUCT_VOLUME, "product")

# -------------------------
# SALES
# -------------------------
# Orders already present in the source volume (an empty frame when there are none).
df_sales_existing = load_existing_csv(SALES_VOLUME)
if df_sales_existing.empty:
    existing_sales_ids = []
else:
    existing_sales_ids = df_sales_existing["order_id"].tolist()
sales_ids = generate_ids("ORDER", NUM_SALES, existing_sales_ids)

# Every order must point at a known customer and a known product.
available_customers = [] if df_customer.empty else df_customer["customer_id"].tolist()
available_products = [] if df_product.empty else df_product["product_id"].tolist()

sales_data = []
if available_customers and available_products:
    for oid in sales_ids:
        order_date = fake.date_between(start_date='-1y', end_date='today')
        # Ship date 1-10 days after order_date
        ship_date = order_date + timedelta(days=random.randint(1, 10))
        sales_data.append({
            "order_id": oid,
            "order_date": order_date,
            "ship_date": ship_date,
            "ship_mode": random.choice(["Standard Class","Second Class","First Class","Same Day"]),
            "customer_id": random.choice(available_customers),
            "product_id": random.choice(available_products),
            "sales": round(random.uniform(20,1000),2),
            "quantity": random.randint(1,5),
            "discount": round(random.uniform(0,0.5),2),
            "profit": round(random.uniform(5,200),2),
            "data_arrival_timestamp": DATA_ARRIVAL_TS
        })
else:
    # Without both reference lists no order can be built; sales_data stays empty.
    print("‚ö†Ô∏è Customers or products not available. Skipping sales generation.")

df_sales_new = pd.DataFrame(sales_data)

# df_sales = all known orders (existing + new), unique by order_id.
if df_sales_existing.empty:
    df_sales = df_sales_new
else:
    df_sales = pd.concat([df_sales_existing, df_sales_new], ignore_index=True)
df_sales = df_sales.drop_duplicates(subset=["order_id"])

# Only the new rows are written out.
save_unique_csv(df_sales_new, df_sales_existing, SALES_VOLUME, "sales")

print("üü¢ Data generation completed successfully!")


In [0]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# Source volume folders, one per dataset
customer_folder = "/Volumes/customer_360/customer_360_source/source_customer_volume"
product_folder = "/Volumes/customer_360/customer_360_source/source_product_volume"
sales_folder = "/Volumes/customer_360/customer_360_source/source_sales_volume"

# Load every CSV file in each folder, treating the first line of each file as
# the header. No schema is supplied to the reader.
df_customers = spark.read.csv(customer_folder + "/*.csv", header=True)
df_products = spark.read.csv(product_folder + "/*.csv", header=True)
df_sales = spark.read.csv(sales_folder + "/*.csv", header=True)

# Show sample
# df_customers.display(5)
# df_products.display(5)
# df_sales.display(5)
