In [1]:
import os
from dotenv import load_dotenv
from pyspark.sql import SparkSession
from pyspark.sql.functions import current_timestamp, to_date, input_file_name

# Load the variables defined in the project's .env file into the process environment.
load_dotenv("/home/jovyan/work/.env")

# Obtain the Spark session for this job (getOrCreate reuses an active one if present).
spark = (
    SparkSession.builder
    .appName("csv_to_parquet_bronze_customer")
    .getOrCreate()
)

In [2]:
# Resolve the data-lake locations from the environment.
# os.getenv returns None when a variable is absent, which would otherwise only
# surface later as an opaque TypeError inside os.path.join; fail fast here and
# name the missing variable(s) instead.
raw_path = os.getenv("RAW_PATH")
bronze_path = os.getenv("BRONZE_PATH")

missing_env_vars = [
    var_name
    for var_name, var_value in (("RAW_PATH", raw_path), ("BRONZE_PATH", bronze_path))
    if not var_value  # treats both unset and empty-string values as missing
]
if missing_env_vars:
    raise RuntimeError(
        f"[ERROR] Variables de entorno no definidas: {', '.join(missing_env_vars)}"
    )

In [None]:
# Location of the customers CSV inside the raw zone.
csv_path = os.path.join(raw_path, "customers.csv")

print("Ruta final:", csv_path)

# Validate that the file exists before attempting the read.
# Note: os.path.exists only inspects the filesystem visible to this Python process.
if not os.path.exists(csv_path):
    # The accented character was previously mis-encoded ("encontr√≥"); restored to "ó".
    raise FileNotFoundError(f"[ERROR] No se encontró el archivo CSV en la ruta: {csv_path}")

# Read the CSV into a DataFrame: the first row is the header and column types are inferred.
df_raw = spark.read.csv(csv_path, header=True, inferSchema=True)

Ruta final: /home/jovyan/work/data/raw/customers.csv


In [4]:
# Control (audit) columns appended to every bronze row, in this order.
audit_columns = {
    "ingestion_timestamp": current_timestamp(),  # timestamp taken by Spark at evaluation time
    "source_file": input_file_name(),            # path of the file each row was read from
}

# Start from the raw frame and attach each control column in turn.
df_bronze = df_raw
for column_name, column_expr in audit_columns.items():
    df_bronze = df_bronze.withColumn(column_name, column_expr)

In [5]:
# Persist the bronze DataFrame as parquet, replacing any previous output at the target.
parquet_path = os.path.join(bronze_path, "customer")

bronze_writer = df_bronze.write.mode("overwrite")
bronze_writer.parquet(parquet_path)

print(f"Bronze escrito en: {parquet_path}")

Bronze escrito en: /home/jovyan/work/data/bronze/customer
