## Conexión al Storage Gen2

In [None]:
from azure.storage.filedatalake import DataLakeServiceClient
import pytz

# Data Lake layout: container name plus the source and processed root folders
container_name = "datalake"
root_source_folder = "landing"
root_processed_folder = "processed"

# Access settings for the storage account
# NOTE(review): the account key is embedded in the notebook — consider loading
# it from a secret store instead; confirm what the workspace offers.
storage_account_name = "adlsstoresproject"
storage_account_key = "************"

# --- Data Lake connection ---
# The DFS endpoint URL is derived from the storage account name.
account_url = f"https://{storage_account_name}.dfs.core.windows.net"
service_client = DataLakeServiceClient(account_url=account_url, credential=storage_account_key)

StatementMeta(sparkpoolnew, 52, 5, Finished, Available, Finished)

## Identificar archivos **sales.csv** y **customers.csv**

In [38]:
file_system_client = service_client.get_file_system_client(container_name)

# --- Find the folders directly under the source root whose name starts with 'store-' ---
# recursive=False restricts the listing to immediate children. The default
# recursive listing also returns nested directories, which would add the same
# store more than once. Taking the last path segment (instead of index 1) keeps
# this correct even if root_source_folder has more than one path segment.
store_folders = [
    entry.name.split("/")[-1]
    for entry in file_system_client.get_paths(path=root_source_folder, recursive=False)
    if entry.is_directory and entry.name.split("/")[-1].startswith("store-")
]

print(f"🔍 Tiendas encontradas: {store_folders}")

# Full abfss:// paths of the files to ingest, filled in by the loop below
csv_sales_paths = []
csv_customers_paths = []

# --- Walk every store folder ---
for store in store_folders:
    print(f"\n📁 Archivos dentro de '{store}':")
    store_path = f"abfss://{container_name}@{storage_account_name}.dfs.core.windows.net/{root_source_folder}/{store}"

    try:
        # List only the files that sit directly inside <root>/<store>; a file
        # in a subfolder must not be recorded as "<store_path>/<file>" because
        # it does not exist at that location.
        files_in_store = file_system_client.get_paths(
            path=f"{root_source_folder}/{store}", recursive=False
        )

        csv_files = [
            entry.name.split("/")[-1]
            for entry in files_in_store
            if not entry.is_directory and entry.name.endswith(".csv")
        ]

        if not csv_files:
            print("  (No se encontraron archivos CSV)")

        for csv_name in csv_files:
            if csv_name == "customers.csv":
                csv_customers_paths.append(f"{store_path}/{csv_name}")
                print(f" Customer - {store_path}/{csv_name}")
            elif csv_name == "sales.csv":
                csv_sales_paths.append(f"{store_path}/{csv_name}")
                print(f" Sales - {store_path}/{csv_name}")

    except Exception as e:
        # Best effort: a failure on one store must not stop the other stores
        print(f"⚠️ Error al listar archivos de '{store}': {str(e)}")

print("\n✅ Listado completo.")

StatementMeta(sparkpoolnew, 52, 6, Finished, Available, Finished)

🔍 Tiendas encontradas: ['store-a', 'store-b', 'store-c']

📁 Archivos dentro de 'store-a':
 Customer - abfss://datalake@adlsstoresproject.dfs.core.windows.net/landing/store-a/customers.csv
 Sales - abfss://datalake@adlsstoresproject.dfs.core.windows.net/landing/store-a/sales.csv

📁 Archivos dentro de 'store-b':
 Customer - abfss://datalake@adlsstoresproject.dfs.core.windows.net/landing/store-b/customers.csv
 Sales - abfss://datalake@adlsstoresproject.dfs.core.windows.net/landing/store-b/sales.csv

📁 Archivos dentro de 'store-c':
 Customer - abfss://datalake@adlsstoresproject.dfs.core.windows.net/landing/store-c/customers.csv
 Sales - abfss://datalake@adlsstoresproject.dfs.core.windows.net/landing/store-c/sales.csv

✅ Listado completo.


## Unir todos los archivos **sales.csv** y **customers.csv**

In [39]:
from pyspark.sql.functions import current_timestamp
from pyspark.sql.functions import lit
from datetime import datetime

# One timezone-aware "now" (Colombia) feeds both created_at and the
# year/month/day partition values, so they always describe the same calendar
# day regardless of the cluster's local timezone.
colombia_tz = pytz.timezone("America/Bogota")
today = datetime.now(colombia_tz)
timestamp_now = today.strftime("%Y-%m-%d %H:%M:%S")
year = today.strftime("%Y")
month = today.strftime("%m")
day = today.strftime("%d")


def _union_store_csvs(csv_paths):
    """Read each per-store CSV, tag it with audit columns and union the results.

    Args:
        csv_paths: iterable of full file paths whose second-to-last segment is
            the store folder name (e.g. ".../landing/store-a/sales.csv").

    Returns:
        A single DataFrame with the rows of every file plus the columns
        store, created_at, year, month, day and is_validated, or None when
        csv_paths is empty.
    """
    unified = None
    for csv_path in csv_paths:
        print(f" - {csv_path}")

        # The store name is the parent folder of the file
        store_name = csv_path.split("/")[-2]

        df_store = (
            spark.read.option("header", "true").csv(csv_path)
                .withColumn("store", lit(store_name))
                .withColumn("created_at", lit(timestamp_now))
                .withColumn("year", lit(year))
                .withColumn("month", lit(month))
                .withColumn("day", lit(day))
                .withColumn("is_validated", lit(False))
        )

        # unionByName tolerates files whose column sets differ
        if unified is None:
            unified = df_store
        else:
            unified = unified.unionByName(df_store, allowMissingColumns=True)
    return unified


print("📂 Archivos a procesar:")

df_sales = _union_store_csvs(csv_sales_paths)

if df_sales is not None:
    print(f"✅ Archivos 'sales' unificados correctamente. Total de registros: {df_sales.count()}")
    df_sales.show(5)
else:
    # Fixed: this branch used to report 'customer' files
    print("⚠️ No se encontraron archivos 'sales' para unificar.")

######################

df_customers = _union_store_csvs(csv_customers_paths)

if df_customers is not None:
    print(f"✅ Archivos 'customer' unificados correctamente. Total de registros: {df_customers.count()}")
    df_customers.show(5)
else:
    print("⚠️ No se encontraron archivos 'customer' para unificar.")

StatementMeta(sparkpoolnew, 52, 7, Finished, Available, Finished)

📂 Archivos a procesar:
 - abfss://datalake@adlsstoresproject.dfs.core.windows.net/landing/store-a/sales.csv
 - abfss://datalake@adlsstoresproject.dfs.core.windows.net/landing/store-b/sales.csv
 - abfss://datalake@adlsstoresproject.dfs.core.windows.net/landing/store-c/sales.csv
✅ Archivos 'sales' unificados correctamente. Total de registros: 47000
+-------+----------+-----------+--------+----------+------------+----------+---------+-------+-------------------+----+-----+---+------------+
|sale_id|product_id|customer_id|quantity|unit_price|total_amount| sale_date|sale_time|  store|         created_at|year|month|day|is_validated|
+-------+----------+-----------+--------+----------+------------+----------+---------+-------+-------------------+----+-----+---+------------+
|  98001|        64|         10|       3|  85766.43|   257299.29|2025-10-17| 12:28:28|store-a|2025-10-17 12:43:02|2025|   10| 17|       false|
|  98002|        28|         95|       9|  40029.52|   360265.68|2025-10-17| 07

## Guardar **df_sales** como **sales.parquet** y Crear Delta de **sales** particionado por AAAA/MM/DD

In [40]:
from pyspark.sql.functions import concat_ws
from delta.tables import DeltaTable

StatementMeta(sparkpoolnew, 52, 8, Finished, Available, Finished)

In [41]:
if df_sales is not None:
    # Paths: Parquet staging folder and Delta table folder
    input_path = "abfss://datalake@adlsstoresproject.dfs.core.windows.net/temporal_files/sales/"
    output_path = "abfss://datalake@adlsstoresproject.dfs.core.windows.net/processed/sales/"

    # Stage the unified DataFrame as Parquet (a folder, not a single file)
    df_sales.write.mode("overwrite").parquet(input_path)
    print(f"✅ Carpeta Parquet guardada en: {input_path}")

    # Re-read from the staging folder after clearing Spark's cache
    spark.catalog.clearCache()
    df_sales = spark.read.parquet(input_path)

    # Composite key: sale_id plus store, joined with "_"
    df_sales = df_sales.withColumn("unique_id", concat_ws("_", "sale_id", "store"))

    # Keep a single row per composite key
    df_sales = df_sales.dropDuplicates(["unique_id"])

    # Make sure the destination folders exist.
    # Fixed: this used the undefined name `fs_client` inside a bare `except`,
    # so the NameError was swallowed and nothing was ever created. The exists()
    # guard avoids re-creating a folder that is already there, and any failure
    # is reported instead of silently ignored (the step stays best effort).
    dir_path = ""
    for folder in ["processed", "sales"]:
        dir_path = f"{dir_path}/{folder}" if dir_path else folder
        try:
            directory_client = file_system_client.get_directory_client(dir_path)
            if not directory_client.exists():
                directory_client.create_directory()
        except Exception as e:
            print(f"⚠️ No se pudo verificar/crear la carpeta '{dir_path}': {e}")

    # MERGE into Delta so re-ingested keys do not create duplicates
    if DeltaTable.isDeltaTable(spark, output_path):
        delta_table = DeltaTable.forPath(spark, output_path)

        # Matched keys are overwritten with every incoming column value;
        # unmatched keys are inserted.
        delta_table.alias("target").merge(
            df_sales.alias("source"),
            "target.unique_id = source.unique_id"
        ).whenMatchedUpdateAll().whenNotMatchedInsertAll().execute()

        print(f"✅ Merge realizado en Delta: {output_path}")
    else:
        # First load: create the Delta table partitioned by year/month/day
        df_sales.write.format("delta") \
            .mode("overwrite") \
            .partitionBy("year", "month", "day") \
            .save(output_path)

        print(f"✅ Tabla Delta creada en: {output_path}")

    print(f"✅ Archivo Delta de 'sales' guardado en: {output_path}")
else:
    print("⚠️ No se encontraron parquet 'sales' para convertir a Delta.")

StatementMeta(sparkpoolnew, 52, 9, Finished, Available, Finished)

✅ Carpeta Parquet guardada en: abfss://datalake@adlsstoresproject.dfs.core.windows.net/temporal_files/sales/
✅ Merge realizado en Delta: abfss://datalake@adlsstoresproject.dfs.core.windows.net/processed/sales/
✅ Archivo Delta de 'sales' guardado en: abfss://datalake@adlsstoresproject.dfs.core.windows.net/processed/sales/


## Guardar **df_customers** como **customers.parquet** y Crear Delta de **customers** particionado por AAAA/MM/DD

In [42]:
if df_customers is not None:
    # Paths: Parquet staging folder and Delta table folder
    input_path = "abfss://datalake@adlsstoresproject.dfs.core.windows.net/temporal_files/customers/"
    output_path = "abfss://datalake@adlsstoresproject.dfs.core.windows.net/processed/customers/"

    # Stage the unified DataFrame as Parquet (a folder, not a single file)
    df_customers.write.mode("overwrite").parquet(input_path)
    print(f"✅ Carpeta Parquet guardada en: {input_path}")

    # Re-read from the staging folder after clearing Spark's cache
    spark.catalog.clearCache()
    df_customers = spark.read.parquet(input_path)

    # Composite key: customer_id plus store, joined with "_"
    df_customers = df_customers.withColumn("unique_id", concat_ws("_", "customer_id", "store"))

    # Keep a single row per composite key
    df_customers = df_customers.dropDuplicates(["unique_id"])

    # Make sure the destination folders exist.
    # Fixed: this used the undefined name `fs_client` inside a bare `except`,
    # so the NameError was swallowed and nothing was ever created. The exists()
    # guard avoids re-creating a folder that is already there, and any failure
    # is reported instead of silently ignored (the step stays best effort).
    dir_path = ""
    for folder in ["processed", "customers"]:
        dir_path = f"{dir_path}/{folder}" if dir_path else folder
        try:
            directory_client = file_system_client.get_directory_client(dir_path)
            if not directory_client.exists():
                directory_client.create_directory()
        except Exception as e:
            print(f"⚠️ No se pudo verificar/crear la carpeta '{dir_path}': {e}")

    # MERGE into Delta so re-ingested keys do not create duplicates
    if DeltaTable.isDeltaTable(spark, output_path):
        delta_table = DeltaTable.forPath(spark, output_path)

        # Matched keys are overwritten with every incoming column value;
        # unmatched keys are inserted.
        delta_table.alias("target").merge(
            df_customers.alias("source"),
            "target.unique_id = source.unique_id"
        ).whenMatchedUpdateAll().whenNotMatchedInsertAll().execute()

        print(f"✅ Merge realizado en Delta: {output_path}")
    else:
        # First load: create the Delta table partitioned by year/month/day
        df_customers.write.format("delta") \
            .mode("overwrite") \
            .partitionBy("year", "month", "day") \
            .save(output_path)

        print(f"✅ Tabla Delta creada en: {output_path}")

    print(f"✅ Archivo Delta de 'customers' guardado en: {output_path}")
else:
    print("⚠️ No se encontraron parquet 'customers' para convertir a Delta.")

StatementMeta(sparkpoolnew, 52, 10, Finished, Available, Finished)

✅ Carpeta Parquet guardada en: abfss://datalake@adlsstoresproject.dfs.core.windows.net/temporal_files/customers/
✅ Merge realizado en Delta: abfss://datalake@adlsstoresproject.dfs.core.windows.net/processed/customers/
✅ Archivo Delta de 'customers' guardado en: abfss://datalake@adlsstoresproject.dfs.core.windows.net/processed/customers/


## Identificar archivos **products.csv** y **suppliers.csv**

In [43]:
# --- Folder paths (relative to the container) and full abfss:// file paths ---
# Both full paths share the same container/account prefix, built once here.
_abfss_prefix = f"abfss://{container_name}@{storage_account_name}.dfs.core.windows.net"

# Products
products_folder_path = "landing/products"
products_file_name = "products.csv"
products_file_path = f"{_abfss_prefix}/{products_folder_path}/{products_file_name}"

# Suppliers
suppliers_folder_path = "landing/suppliers"
suppliers_file_name = "suppliers.csv"
suppliers_file_path = f"{_abfss_prefix}/{suppliers_folder_path}/{suppliers_file_name}"

# --- Función corregida ---
def read_and_enrich_csv(folder_path, file_name, full_path):
    """Read one CSV from the Data Lake and add the audit/partition columns.

    The folder is listed first so that a missing file produces a warning
    instead of a Spark read error.

    Args:
        folder_path: folder to list, relative to the container
            (e.g. "landing/products").
        file_name: name of the file expected directly inside folder_path.
        full_path: full path handed to spark.read for that file.

    Returns:
        The DataFrame with created_at, year, month and day columns added,
        or None when the file is not in the folder or any step raises.
    """
    try:
        # recursive=False: only files directly inside folder_path count. With
        # the default recursive listing, a same-named file in a subfolder would
        # be reported as found even though nothing exists at full_path.
        entries = file_system_client.get_paths(path=folder_path, recursive=False)
        found_files = {entry.name.split('/')[-1] for entry in entries if not entry.is_directory}

        if file_name not in found_files:
            print(f"⚠️ El archivo '{file_name}' no se encontró en '{folder_path}'.")
            return None

        print(f"📄 Archivo encontrado: {full_path}")
        df = spark.read.option("header", True).csv(full_path)

        # timestamp_now / year / month / day are notebook-level values
        # computed in an earlier cell.
        df = (
            df.withColumn("created_at", lit(timestamp_now))
              .withColumn("year", lit(year))
              .withColumn("month", lit(month))
              .withColumn("day", lit(day))
        )

        df.show(5)
        return df

    except Exception as e:
        # Covers both the listing and the read; the caller only sees None
        print(f"❌ Error al acceder a '{folder_path}': {str(e)}")
        return None

# Load both reference datasets; each variable is None when its file is
# missing or could not be read (read_and_enrich_csv prints the reason).
df_products = read_and_enrich_csv(products_folder_path, products_file_name, products_file_path)
df_suppliers = read_and_enrich_csv(suppliers_folder_path, suppliers_file_name, suppliers_file_path)

StatementMeta(sparkpoolnew, 52, 11, Finished, Available, Finished)

📄 Archivo encontrado: abfss://datalake@adlsstoresproject.dfs.core.windows.net/landing/products/products.csv
+----------+--------------------+-------------+---------+-----------+-------------------+----+-----+---+
|product_id|        product_name|product_price|available|supplier_id|         created_at|year|month|day|
+----------+--------------------+-------------+---------+-----------+-------------------+----+-----+---+
|         1|Filete de Tilapia...|      30319.3|     True|         38|2025-10-17 12:43:02|2025|   10| 17|
|         2|Mantequilla Alpin...|     58827.81|    False|         25|2025-10-17 12:43:02|2025|   10| 17|
|         3|            Yuca 1kg|     57341.62|    False|          3|2025-10-17 12:43:02|2025|   10| 17|
|         4|Tomate de Árbol 500g|     15978.83|    False|         36|2025-10-17 12:43:02|2025|   10| 17|
|         5|       Jet Chocolate|     30762.35|    False|         15|2025-10-17 12:43:02|2025|   10| 17|
+----------+--------------------+-------------+-----

## Guardar **df** de products y suppliers como **Parquet** y generar archivos **Delta**

In [44]:
def _ensure_directories(folders):
    """Create the nested folder chain in the container when it is missing.

    Best effort: a failure is printed and does not stop the load.

    Args:
        folders: folder names from outermost to innermost,
            e.g. ["processed", "products"].
    """
    dir_path = ""
    for folder in folders:
        dir_path = f"{dir_path}/{folder}" if dir_path else folder
        try:
            # Fixed: the original used the undefined name `fs_client` inside a
            # bare `except`, so the NameError was swallowed and nothing ran.
            directory_client = file_system_client.get_directory_client(dir_path)
            if not directory_client.exists():
                directory_client.create_directory()
        except Exception as e:
            print(f"⚠️ No se pudo verificar/crear la carpeta '{dir_path}': {e}")


def _full_load_to_delta(df, parquet_path, delta_path, entity):
    """Stage a DataFrame as Parquet, re-read it and overwrite the Delta table.

    Args:
        df: DataFrame to persist.
        parquet_path: staging folder, overwritten on every run.
        delta_path: Delta table folder, overwritten on every run (no merge).
        entity: dataset name used in the log message and as the folder name
            under "processed".

    Returns:
        The DataFrame re-read from the Parquet staging folder.
    """
    # Stage as Parquet
    df.write.mode("overwrite").parquet(parquet_path)
    print(f"✅ Carpeta Parquet guardada en: {parquet_path}")

    # Re-read from the staging folder after clearing Spark's cache
    spark.catalog.clearCache()
    staged_df = spark.read.parquet(parquet_path)

    _ensure_directories(["processed", entity])

    # Full load into Delta (no merge)
    staged_df.write.format("delta") \
        .mode("overwrite") \
        .save(delta_path)

    print(f"✅ Tabla Delta de '{entity}' creada en: {delta_path}")
    return staged_df


# --- Save df_products as Delta ---
if df_products is not None:
    products_parquet_path = "abfss://datalake@adlsstoresproject.dfs.core.windows.net/temporal_files/products/"
    products_delta_path = "abfss://datalake@adlsstoresproject.dfs.core.windows.net/processed/products/"
    df_products = _full_load_to_delta(df_products, products_parquet_path, products_delta_path, "products")

# --- Save df_suppliers as Delta ---
if df_suppliers is not None:
    suppliers_parquet_path = "abfss://datalake@adlsstoresproject.dfs.core.windows.net/temporal_files/suppliers/"
    suppliers_delta_path = "abfss://datalake@adlsstoresproject.dfs.core.windows.net/processed/suppliers/"
    df_suppliers = _full_load_to_delta(df_suppliers, suppliers_parquet_path, suppliers_delta_path, "suppliers")

StatementMeta(sparkpoolnew, 52, 12, Finished, Available, Finished)

✅ Carpeta Parquet guardada en: abfss://datalake@adlsstoresproject.dfs.core.windows.net/temporal_files/products/
✅ Tabla Delta de 'products' creada en: abfss://datalake@adlsstoresproject.dfs.core.windows.net/processed/products/
✅ Carpeta Parquet guardada en: abfss://datalake@adlsstoresproject.dfs.core.windows.net/temporal_files/suppliers/
✅ Tabla Delta de 'suppliers' creada en: abfss://datalake@adlsstoresproject.dfs.core.windows.net/processed/suppliers/


# Create Lake Database

In [45]:
# Lake Database name
database_name = "bronze_db"

# Folder requested as the database location (the processed root folder)
database_location = f"abfss://{container_name}@{storage_account_name}.dfs.core.windows.net/{root_processed_folder}"

# Create the database only when it is missing. With IF NOT EXISTS an existing
# database is left untouched, so its stored location can differ from
# database_location; the DESCRIBE below shows the location actually in effect.
spark.sql(f"""
CREATE DATABASE IF NOT EXISTS {database_name}
LOCATION '{database_location}'
""")

print(f"✅ Lake Database '{database_name}' creada (o ya existente)")
print(f"📁 Ubicación: {database_location}")

# Fixed: describe the database named by database_name instead of a
# hard-coded "bronze_db", so renaming the variable keeps the cell consistent.
spark.sql(f"DESCRIBE DATABASE EXTENDED {database_name}").show(truncate=False)
spark.sql("SHOW DATABASES").show()

StatementMeta(sparkpoolnew, 52, 13, Finished, Available, Finished)

✅ Lake Database 'bronze_db' creada (o ya existente)
📁 Ubicación: abfss://datalake@adlsstoresproject.dfs.core.windows.net/processed
+--------------+-----------------------------------------------------------------+
|info_name     |info_value                                                       |
+--------------+-----------------------------------------------------------------+
|Catalog Name  |spark_catalog                                                    |
|Namespace Name|bronze_db                                                        |
|Comment       |                                                                 |
|Location      |abfss://datalake@adlsstoresproject.dfs.core.windows.net/bronze_db|
|Owner         |                                                                 |
|Properties    |((IsSyMSCDMDatabase,true))                                       |
+--------------+-----------------------------------------------------------------+

+---------+
|namespace|
+---------+
| 

# Create Delta tables

In [46]:
# Eliminar tablas:
#spark.sql("DROP TABLE IF EXISTS bronze_db.customers")
#spark.sql("DROP TABLE IF EXISTS bronze_db.sales")
#spark.sql("DROP TABLE IF EXISTS bronze_db.products")
#spark.sql("DROP TABLE IF EXISTS bronze_db.suppliers")

StatementMeta(sparkpoolnew, 52, 14, Finished, Available, Finished)

## Table **bronze_db.sales**

In [47]:
# Register the Delta folder processed/sales as table bronze_db.sales.
# IF NOT EXISTS makes the cell safe to re-run.
sales_table_ddl = """
    CREATE TABLE IF NOT EXISTS bronze_db.sales
    USING DELTA
    LOCATION 'abfss://datalake@adlsstoresproject.dfs.core.windows.net/processed/sales'
"""
spark.sql(sales_table_ddl)

print("✅ Tabla 'bronze_db.sales' registrada correctamente en el Lake Database")

StatementMeta(sparkpoolnew, 52, 15, Finished, Available, Finished)

✅ Tabla 'bronze_db.sales' registrada correctamente en el Lake Database


## Table **bronze_db.customers**

In [48]:
# Register the Delta folder processed/customers as table bronze_db.customers.
# IF NOT EXISTS makes the cell safe to re-run.
customers_table_ddl = """
    CREATE TABLE IF NOT EXISTS bronze_db.customers
    USING DELTA
    LOCATION 'abfss://datalake@adlsstoresproject.dfs.core.windows.net/processed/customers'
"""
spark.sql(customers_table_ddl)

print("✅ Tabla 'bronze_db.customers' registrada correctamente en el Lake Database")

StatementMeta(sparkpoolnew, 52, 16, Finished, Available, Finished)

✅ Tabla 'bronze_db.customers' registrada correctamente en el Lake Database


## Table **bronze_db.products**

In [49]:
# Register the Delta folder processed/products as table bronze_db.products.
# IF NOT EXISTS makes the cell safe to re-run.
products_table_ddl = """
    CREATE TABLE IF NOT EXISTS bronze_db.products
    USING DELTA
    LOCATION 'abfss://datalake@adlsstoresproject.dfs.core.windows.net/processed/products'
"""
spark.sql(products_table_ddl)

print("✅ Tabla 'bronze_db.products' registrada correctamente en el Lake Database")

StatementMeta(sparkpoolnew, 52, 17, Finished, Available, Finished)

✅ Tabla 'bronze_db.products' registrada correctamente en el Lake Database


## Table **bronze_db.suppliers**

In [50]:
# Register the Delta folder processed/suppliers as table bronze_db.suppliers.
# IF NOT EXISTS makes the cell safe to re-run.
suppliers_table_ddl = """
    CREATE TABLE IF NOT EXISTS bronze_db.suppliers
    USING DELTA
    LOCATION 'abfss://datalake@adlsstoresproject.dfs.core.windows.net/processed/suppliers'
"""
spark.sql(suppliers_table_ddl)

print("✅ Tabla 'bronze_db.suppliers' registrada correctamente en el Lake Database")

StatementMeta(sparkpoolnew, 52, 18, Finished, Available, Finished)

✅ Tabla 'bronze_db.suppliers' registrada correctamente en el Lake Database
