In [0]:
# Directory that the raw CSV files are read from (joined with "/<name>.csv" below).
RAW_PATH = "/Volumes/ecommerce/ecommerce/data"

# Google Cloud identifiers. They are not referenced in the cells visible here;
# presumably a later export step consumes them -- TODO confirm.
GCP_PROJECT, BQ_DATASET = "regal-elf-481622-u5", "ecommerce"
TEMP_GCS_BUCKET = "ecom-databricks-temp"

# Names of a secret scope and key only -- no secret value is stored in the notebook.
GCP_SECRET_SCOPE, GCP_SECRET_KEY = "gcp-secrets", "gcp-sa-key"

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, TimestampType
from pyspark.sql.functions import lit

def _nullable_struct(*columns):
    """Build a StructType from (name, data_type) pairs, marking every field nullable."""
    return StructType([StructField(name, data_type, True) for name, data_type in columns])


# Explicit schemas for the six CSV sources. Column order matters: it is the
# order in which the fields are declared to the CSV reader.

# transactions.csv
txn_schema = _nullable_struct(
    ("transaction_id", IntegerType()),
    ("customer_id", IntegerType()),
    ("product_id", IntegerType()),
    ("store_id", IntegerType()),
    ("promotion_id", IntegerType()),
    ("quantity", IntegerType()),
    ("total_amount", DoubleType()),
    ("transaction_date", TimestampType()),
)

# customers.csv
cust_schema = _nullable_struct(
    ("customer_id", IntegerType()),
    ("first_name", StringType()),
    ("last_name", StringType()),
    ("email", StringType()),
    ("join_date", TimestampType()),
    ("country", StringType()),
)

# products.csv
prod_schema = _nullable_struct(
    ("product_id", IntegerType()),
    ("product_name", StringType()),
    ("category", StringType()),
    ("price", DoubleType()),
    ("supplier_name", StringType()),
)

# stores.csv
store_schema = _nullable_struct(
    ("store_id", IntegerType()),
    ("store_name", StringType()),
    ("city", StringType()),
    ("country", StringType()),
)

# promotions.csv
promo_schema = _nullable_struct(
    ("promotion_id", IntegerType()),
    ("product_id", IntegerType()),
    ("promotion_method", StringType()),
    ("discount_percent", DoubleType()),
    ("start_date", TimestampType()),
    ("end_date", TimestampType()),
)

# feedback.csv
feedback_schema = _nullable_struct(
    ("feedback_id", IntegerType()),
    ("customer_id", IntegerType()),
    ("product_id", IntegerType()),
    ("rating", IntegerType()),
    ("review_date", TimestampType()),
    ("comments", StringType()),
)

def _load_csv(relative_path, schema):
    """Read the CSV at RAW_PATH + relative_path with a header row and the given schema."""
    reader = spark.read.option("header", True).schema(schema)
    return reader.csv(RAW_PATH + relative_path)


# One DataFrame per source file, each read with its explicit schema.
txn_df = _load_csv("/transactions.csv", txn_schema)
cust_df = _load_csv("/customers.csv", cust_schema)
prod_df = _load_csv("/products.csv", prod_schema)
store_df = _load_csv("/stores.csv", store_schema)
promo_df = _load_csv("/promotions.csv", promo_schema)
feedback_df = _load_csv("/feedback.csv", feedback_schema)

# Quick sanity output: row count and the applied schema of the transactions frame.
transaction_rows = txn_df.count()
print("Transactions: ", transaction_rows)
txn_df.printSchema()

In [0]:
# Number of distinct transaction_id values in the transactions frame.
unique_txn_counts = txn_df.selectExpr("count(distinct transaction_id) as unique_transactions")
unique_txn_counts.show()

# Rows where transaction_id or total_amount is NULL.
rows_missing_key_fields = txn_df.where("transaction_id is NULL or total_amount is NULL")
rows_missing_key_fields.show()

In [0]:
%sql
-- Create the bronze, silver and gold schemas under `ecommerce`.
-- IF NOT EXISTS leaves an already-existing schema untouched, so this cell can be re-run.
-- ecommerce.bronze is the schema the Delta table writes in the next cell target.
CREATE SCHEMA IF NOT EXISTS ecommerce.bronze;
CREATE SCHEMA IF NOT EXISTS ecommerce.silver;
CREATE SCHEMA IF NOT EXISTS ecommerce.gold;


In [0]:
# Target table name paired with the DataFrame that is saved into it.
bronze_targets = (
    ("ecommerce.bronze.transactions", txn_df),
    ("ecommerce.bronze.customers", cust_df),
    ("ecommerce.bronze.products", prod_df),
    ("ecommerce.bronze.stores", store_df),
    ("ecommerce.bronze.promotions", promo_df),
    ("ecommerce.bronze.feedback", feedback_df),
)

# Each table is saved in Delta format with overwrite mode, and overwriteSchema
# enabled, so a re-run replaces both the data and the table schema.
for bronze_table, source_df in bronze_targets:
    writer = source_df.write.mode("overwrite").format("delta")
    writer = writer.option("overwriteSchema", "true")
    writer.saveAsTable(bronze_table)
