In [0]:
import sys
import os
import importlib
from pyspark.sql.functions import col, current_timestamp

# Make the directory two levels above the current working directory
# importable, so that the `modules` package used below can be resolved.
root = os.path.abspath(os.path.join(os.getcwd(), "../.."))
if root not in sys.path:
    sys.path.append(root)

# Reload the helper modules when they are already loaded in this session so
# that edits made to them are picked up; modules not yet imported are left
# alone and get loaded fresh by the `from ... import` statements that follow.
for module_name in ('modules.utils.date_utils', 'modules.utils.table_utils'):
    if module_name in sys.modules:
        importlib.reload(sys.modules[module_name])

from modules.utils.date_utils import get_month_start_n_months_ago
from modules.utils.table_utils import get_filtered_dataframe, upsert_delta_table

##### FULL LOAD AND INCREMENTAL LOAD

In [0]:
# --- Configuration: table names and storage location -----------------------
source_cleansed = "nyctaxi.02_silver.green_trips_cleansed"
target_enriched = "nyctaxi.02_silver.green_trips_enriched"
storage_path_enriched = "abfss://silver@stnyctaxigreen.dfs.core.windows.net/green_trips_enriched"

# --- Incremental window ------------------------------------------------------
# Bounds passed to the incremental read: the values returned for 3 and for
# 2 months ago.
# NOTE(review): presumably these are month-start dates and the window covers
# exactly one calendar month -- confirm in modules.utils.date_utils.
three_months_start = get_month_start_n_months_ago(3)
two_months_start = get_month_start_n_months_ago(2)

# --- Source reads ------------------------------------------------------------
# Cleansed trips for the window above.
# NOTE(review): the section heading suggests the helper falls back to a full
# load in some cases (e.g. when the target table does not exist yet) -- confirm
# in modules.utils.table_utils.
df_cleansed_trips = get_filtered_dataframe(
    spark,
    source_table=source_cleansed,
    target_table=target_enriched,
    start_date=three_months_start,
    end_date=two_months_start,
)

# Taxi zone records without an end_date, i.e. the currently active zones.
df_zone_lookup = (
    spark.read
    .table("nyctaxi.02_silver.taxi_zone_lookup")
    .filter("end_date IS NULL")
)

##### ENRICHMENT JOINS

In [0]:
# The zone lookup is joined twice (pick-up and drop-off), so each use gets its
# own alias to keep the column references unambiguous.
pickup_zones = df_zone_lookup.alias("pu_lookup")
dropoff_zones = df_zone_lookup.alias("do_lookup")

# Step 1 - pick-up enrichment. The left join keeps every trip, including
# those whose pick-up location id has no match in the lookup.
pickup_match = df_cleansed_trips.pu_location_id == col("pu_lookup.location_id")
df_join1 = (
    df_cleansed_trips
    .join(pickup_zones, on=pickup_match, how="left")
    .select(
        df_cleansed_trips["*"],
        col("pu_lookup.borough").alias("pu_borough"),
        col("pu_lookup.zone").alias("pu_zone"),
    )
)

# Step 2 - drop-off enrichment, followed by the final column layout.
# Trip columns that precede / follow the borough and zone columns in the output.
leading_trip_columns = (
    "vendor",
    "lpep_pickup_datetime",
    "lpep_dropoff_datetime",
    "pu_location_id",
    "do_location_id",
    "trip_duration",
    "passenger_count",
    "trip_distance_km",
    "rate_type",
)
trailing_trip_columns = (
    "payment_type",
    "fare_amount",
    "extra",
    "mta_tax",
    "tip_amount",
    "tolls_amount",
    "improvement_surcharge",
    "total_amount",
    "congestion_surcharge",
    "trip_type",
    "cbd_congestion_fee",
)

dropoff_match = df_join1.do_location_id == col("do_lookup.location_id")
df_join_final = (
    df_join1
    .join(dropoff_zones, on=dropoff_match, how="left")
    .select(
        *(getattr(df_join1, name) for name in leading_trip_columns),
        # Boroughs first, then zones, each as a pick-up / drop-off pair.
        df_join1.pu_borough,
        col("do_lookup.borough").alias("do_borough"),
        df_join1.pu_zone,
        col("do_lookup.zone").alias("do_zone"),
        *(getattr(df_join1, name) for name in trailing_trip_columns),
        # Audit column recording when this enrichment was evaluated.
        current_timestamp().alias("enriched_timestamp"),
    )
)

##### WRITING DATA EFFICIENTLY TO AVOID DUPLICATES

In [0]:
# Match condition handed to the upsert helper.
# NOTE(review): presumably `t` is the target table and `s` the incoming rows --
# confirm in modules.utils.table_utils.
# NOTE(review): this treats (pick-up time, pick-up location) as the identity of
# a trip. If two distinct trips can share both values, a merge on this key
# could collapse them or fail on multiple matches -- confirm the key is unique
# in the cleansed data.
merge_condition = (
    "t.lpep_pickup_datetime = s.lpep_pickup_datetime "
    "AND t.pu_location_id = s.pu_location_id"
)

upsert_delta_table(
    spark,
    df_join_final,
    target_enriched,
    merge_condition,
    storage_path_enriched,
)

# Report the size of the target table after the write.
total_records = spark.read.table(target_enriched).count()
print(f"Total records in {target_enriched}: {total_records}")