## Leer datos Delta de la capa Silver

In [32]:
from pyspark.sql.functions import col
from pyspark.sql import SparkSession

# Storage layout of the data lake container.
container_name = "datalake"
root_silver_folder = "silver"
root_processed_folder = "processed"

# Set to True to print a small sample of the DataFrames built by this notebook.
show_debug = False

# Obtain the Spark session (getOrCreate reuses an active session if one exists).
spark = SparkSession.builder.appName("ReadDeltaData").getOrCreate()

# ADLS Gen2 endpoint, defined once so that switching storage account or
# container only requires a change in one place.
storage_account_host = "adlsstoresproject.dfs.core.windows.net"
adls_root = f"abfss://{container_name}@{storage_account_host}"

# Delta table locations. Sales and customers are under the silver folder;
# products and suppliers are under the processed folder.
sales_delta_path = f"{adls_root}/{root_silver_folder}/sales/"
customers_delta_path = f"{adls_root}/{root_silver_folder}/customers/"
products_delta_path = f"{adls_root}/{root_processed_folder}/products/"
suppliers_delta_path = f"{adls_root}/{root_processed_folder}/suppliers/"

def _load_delta(path):
    """Return the Delta table stored at ``path`` as a DataFrame."""
    return spark.read.format("delta").load(path)


# Products and suppliers are loaded in full.
df_products = _load_delta(products_delta_path)
df_suppliers = _load_delta(suppliers_delta_path)

# Sales and customers are restricted to the records that have not been
# validated yet. `==` is required here: it builds a Spark Column expression,
# so it cannot be swapped for `is` / `not`.
# NOTE(review): the flag is matched against both False and 0, presumably to
# cover boolean and integer encodings of the column - confirm.
pending_validation = (col("is_validated") == False) | (col("is_validated") == 0)

df_sales = _load_delta(sales_delta_path).where(pending_validation)
df_customers = _load_delta(customers_delta_path).where(pending_validation)

# Debug aid: print a label followed by the first three rows of each
# input DataFrame. Skipped entirely unless show_debug is enabled.
if show_debug:
    for label, frame in (
        ("Sales:", df_sales),
        ("Customers:", df_customers),
        ("Products:", df_products),
        ("Suppliers:", df_suppliers),
    ):
        print(label)
        frame.show(3)

StatementMeta(sparkpoolnew, 61, 33, Finished, Available, Finished)

## Creación de las tablas de la capa Gold

In [33]:
from pyspark.sql import functions as F

# Both store/year tables share the same grouping and sort keys.
store_year_keys = ["store", "year"]

# Gold table: number of sales per store and year.
sales_count = F.count("*").alias("sales_count")
gold_sales_count_by_store_year = (
    df_sales.groupBy(*store_year_keys).agg(sales_count).orderBy(*store_year_keys)
)

# Gold table: summed total_amount per store and year.
total_sales = F.sum("total_amount").alias("total_sales")
gold_sales_total_by_store_year = (
    df_sales.groupBy(*store_year_keys).agg(total_sales).orderBy(*store_year_keys)
)

# Gold table: the 15 products with the highest summed quantity.
quantity_by_product = df_sales.groupBy("product_id").agg(
    F.sum("quantity").alias("total_quantity")
)
product_names = df_products.select("product_id", "product_name")
# Left join keeps products that have sales but no matching product row.
ranked_products = quantity_by_product.join(
    product_names, on="product_id", how="left"
).orderBy(F.col("total_quantity").desc())
gold_top15_products = ranked_products.limit(15)

# Debug aid: display the gold tables. Skipped unless show_debug is enabled.
if show_debug:
    gold_sales_count_by_store_year.show()
    gold_sales_total_by_store_year.show()
    # Fix the column order for display; the join puts product_name last.
    gold_top15_products.select("product_id", "product_name", "total_quantity").show()

StatementMeta(sparkpoolnew, 61, 34, Finished, Available, Finished)

## Configuración Base de Datos
#### NOTA: Lo ideal es que estos valores se guarden en Key Vault

In [None]:
# JDBC connection settings for the SQL Server database used by the gold tables.
# NOTE: the credentials are hard-coded; they should be read from a secret
# store such as Key Vault instead of being kept in the notebook.
usuario = "useruser"
password = "********"

# Server host, then the full JDBC URL (port 1433, database stores-sql-db-gold).
server_db = "sqlserver-stores.database.windows.net"
jdbc_url = f"jdbc:sqlserver://{server_db}:1433;database=stores-sql-db-gold"

StatementMeta(sparkpoolnew, 61, 35, Finished, Available, Finished)

In [35]:
# Persist every gold DataFrame to the database through JDBC.
# Keys are the destination table names; insertion order is the write order.
# NOTE(review): "overwrite" replaces any existing table contents, and df_sales
# is filtered upstream to rows not yet validated - confirm that replacing the
# tables with aggregates of only those rows is intended.
gold_tables = {
    "gold_sales_count_by_store_year": gold_sales_count_by_store_year,
    "gold_sales_total_by_store_year": gold_sales_total_by_store_year,
    "gold_top15_products": gold_top15_products,
}

for table_name, gold_df in gold_tables.items():
    (
        gold_df.write.format("jdbc")
        .option("url", jdbc_url)
        .option("dbtable", table_name)
        .option("user", usuario)
        .option("password", password)
        .mode("overwrite")
        .save()
    )

StatementMeta(sparkpoolnew, 61, 36, Finished, Available, Finished)