In [0]:
import re

import pyspark.sql.functions as F

In [0]:
# Declare the notebook's text widgets; each one starts out as an empty string.
for widget_name in ("dataset_prefix", "month"):
    dbutils.widgets.text(widget_name, "")

In [0]:
# Base location of the raw parquet files.
s3_base = "s3a://ifood-case-open/raw/tlc"

# Job parameters come from the notebook widgets; surrounding whitespace is dropped.
dataset_prefix = dbutils.widgets.get("dataset_prefix").strip()  # e.g. "yellow_tripdata"
month = dbutils.widgets.get("month").strip()  # "YYYY-MM"

# Both widgets default to an empty string. Fail fast with a clear message instead of
# letting an empty or malformed value produce an invalid table name / file path that
# only surfaces later as an obscure Spark error.
if not dataset_prefix:
    raise ValueError("Widget 'dataset_prefix' is required (e.g. 'yellow_tripdata')")
if not re.fullmatch(r"\d{4}-(0[1-9]|1[0-2])", month):
    raise ValueError(f"Widget 'month' must be in YYYY-MM format, got {month!r}")

# Bronze table that receives the data for this dataset prefix.
target_table = f"workspace.ifood_bronze.{dataset_prefix}"

In [0]:
# Idempotency guard: refuse to load a month that is already present in the bronze table.
#
# The table is only created by the append at the end of this notebook, so on the very
# first load for a dataset it does not exist yet. Reading it unconditionally would raise
# and make that first load impossible, hence the existence check.
if spark.catalog.tableExists(target_table):
    current_bronze = spark.read.table(target_table)
    # limit(1): a single matching row is enough to know the month was loaded already.
    if current_bronze.where(F.col("source_month") == month).limit(1).count() > 0:
        raise Exception(f"Data already loaded for {month}")

In [0]:
# One parquet file per dataset and month: <s3_base>/<dataset_prefix>_<month>.parquet
file_name = f"{dataset_prefix}_{month}.parquet"
path = f"{s3_base}/{file_name}"

# Raw, untouched contents of that file.
df_raw = spark.read.parquet(path)

In [0]:
# Columns whose type is normalised before the data is written, mapped to the Spark
# type each one is cast to. The casts are applied in this order.
cast_types = {
    "VendorID": "int",
    "passenger_count": "int",
    "RatecodeID": "int",
    "PULocationID": "int",
    "DOLocationID": "int",
    "payment_type": "int",
    "tpep_pickup_datetime": "timestamp",
    "tpep_dropoff_datetime": "timestamp",
}

df_cast = df_raw
for column_name, spark_type in cast_types.items():
    # Replaces the column in place with its cast version.
    df_cast = df_cast.withColumn(column_name, F.col(column_name).cast(spark_type))

# When the raw file spells the column in lower case, rename it to "Airport_fee".
# NOTE(review): presumably this aligns the casing with the bronze table schema — confirm.
has_lowercase_airport_fee = "airport_fee" in df_raw.columns
if has_lowercase_airport_fee:
    df_cast = df_cast.withColumnRenamed("airport_fee", "Airport_fee")


In [0]:
# Lineage columns: tag every row with the dataset prefix and the month it was loaded from.
# "source_month" is the column the already-loaded check filters on.
lineage_columns = (
    ("source_prefix", dataset_prefix),
    ("source_month", month),
)

df_bronze = df_cast
for lineage_name, lineage_value in lineage_columns:
    df_bronze = df_bronze.withColumn(lineage_name, F.lit(lineage_value))

In [0]:
# Append this month's rows to the bronze table in Delta format.
bronze_writer = df_bronze.write.format("delta").mode("append")
bronze_writer.saveAsTable(target_table)