In [0]:
%run "../0 - SETUP/0 - Setup"

### 1. Create The Sales Table Schema

In [0]:
# Source location (Lakebase / Postgres side) that the inventory tables are read from.
# NOTE(review): CATALOG_NAME, POSTGRES_SCHEMA and the ANALYTICS_* constants are not
# defined in this notebook — presumably they come from the setup notebook; confirm.
lakebase_catalog_name, lakebase_schema_name = CATALOG_NAME, POSTGRES_SCHEMA

# Target location (analytics side) that the generated table is written to.
catalog_name, schema_name = ANALYTICS_CATALOG_NAME, ANALYTICS_SCHEMA_NAME

In [0]:

# Create the analytics catalog and schema if they are missing and make them the
# session defaults. The statements run strictly in this order: the schema is
# created inside whichever catalog the preceding USE CATALOG selected.
bootstrap_statements = (
    f"CREATE CATALOG IF NOT EXISTS {catalog_name}",
    f"USE CATALOG {catalog_name}",
    f"CREATE SCHEMA IF NOT EXISTS {schema_name}",
    f"USE SCHEMA {schema_name}",
)
for bootstrap_statement in bootstrap_statements:
    spark.sql(bootstrap_statement)


### 2. Generate synthetic sales data

In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window
import random
from datetime import datetime, timedelta

# Seed for the synthetic quantities. The DataFrame below is lazy, so every action
# on it (the preview in this cell, any later write) re-evaluates rand(); without a
# seed the previewed values and the persisted values can differ.
# NOTE: Spark only repeats the same values when the input partitioning is the same.
RANDOM_SEED = 42

# Distinct (warehouse_id, sku_id, category_id) combinations, obtained by joining
# inventory_items to inventory_sku on sku_id.
items_df = spark.sql(f"""
SELECT DISTINCT warehouse_id, ii.sku_id, category_id
FROM {lakebase_catalog_name}.{lakebase_schema_name}.inventory_items ii
JOIN {lakebase_catalog_name}.{lakebase_schema_name}.inventory_sku isk
ON ii.sku_id = isk.sku_id
""")

# Hourly timestamps from three years ago up to 24 hours ago (inclusive).
# "Now" is captured once and truncated to the hour so that both bounds derive
# from the same instant and every generated timestamp is hour-aligned.
current_hour = datetime.now().replace(minute=0, second=0, microsecond=0)
start_date = current_hour - timedelta(days=3*365)
end_date = current_hour - timedelta(days=1)
total_hours = (end_date - start_date) // timedelta(hours=1) + 1
datetime_list = [start_date + timedelta(hours=x) for x in range(total_hours)]
datetimes_df = spark.createDataFrame([(dt,) for dt in datetime_list], ["transaction_ts"])

# Cross join items_df with datetimes_df: one row per item combination per hour
sales_base_df = items_df.crossJoin(datetimes_df)

# Add a seeded random quantity (1-100) and a "date" column.
# "date" is produced by date_format, i.e. it is a 'yyyy-MM-dd' string, not a DATE.
sales_df = sales_base_df.withColumn(
    "quantity", F.expr(f"CAST(FLOOR(rand({RANDOM_SEED}) * 100) + 1 AS INT)")
).withColumn("date", F.date_format(F.col("transaction_ts"), "yyyy-MM-dd"))

# Select and order the output columns
synthetic_sales_df = sales_df.select(
    "transaction_ts", "date", "warehouse_id", "category_id", "sku_id", "quantity",
)

display(synthetic_sales_df)

### 3. Save the data as a table

In [0]:
# Fully qualified name of the table that receives the synthetic sales rows.
store_sales_table = f"{catalog_name}.{schema_name}.store_sales"

# Drop any previous copy first, then write the DataFrame as a managed table.
spark.sql(f"DROP TABLE IF EXISTS {store_sales_table}")
synthetic_sales_df.write.mode("overwrite").saveAsTable(store_sales_table)

### Next Steps
- Synthetic data generation is now complete. Proceed to the DemandForecasting notebook.