In [0]:
import sys
import os
import importlib
from pyspark.sql.functions import count, max, min, avg, sum, round, col, current_timestamp

# Put the directory two levels above the current working directory on
# sys.path (once) so the project-local `modules` package can be imported.
root = os.path.abspath(os.path.join(os.getcwd(), "../.."))
if root not in sys.path:
    sys.path.append(root)

# If a helper module is already cached in sys.modules (e.g. this cell was run
# before in the same session), reload it so edits on disk are picked up.
for _helper_name in ('modules.utils.date_utils', 'modules.utils.table_utils'):
    if _helper_name in sys.modules:
        importlib.reload(sys.modules[_helper_name])

from modules.utils.date_utils import get_month_start_n_months_ago
from modules.utils.table_utils import get_filtered_dataframe, upsert_delta_table

##### FULL LOAD AND INCREMENTAL LOAD

In [0]:
# Date bounds for the incremental load, computed once and reused in the read
# below so both bounds come from a single pair of helper calls.
# NOTE(review): presumably these are the first day of the month three and two
# months back (i.e. one calendar month of data) - confirm in date_utils.
three_months_start = get_month_start_n_months_ago(3)
two_months_start = get_month_start_n_months_ago(2)

# Source (silver) and target (gold) tables - the table is managed, so it is
# addressed by name rather than by storage path.
source_enriched = "nyctaxi.02_silver.green_trips_enriched"
target_gold = "nyctaxi.03_gold.daily_trip_summary"

# Read the source rows to process, filtered on the pickup timestamp column.
# NOTE(review): target_table is passed to the helper, presumably so it can
# choose between a full load and the date-bounded incremental load - confirm
# in table_utils; also confirm whether end_date is inclusive or exclusive.
df_enriched = get_filtered_dataframe(
    spark,
    source_table=source_enriched,
    target_table=target_gold,
    start_date=three_months_start,
    end_date=two_months_start,
    date_col="lpep_pickup_datetime",
)

##### AGGREGATION

In [0]:
# Grouping key: the pickup timestamp truncated to its calendar date.
pickup_day = col("lpep_pickup_datetime").cast("date").alias("pickup_date")

# Per-day metrics. count/avg/max/min/sum/round here are the
# pyspark.sql.functions versions imported at the top of the notebook,
# not the Python builtins of the same name.
daily_metrics = [
    count("*").alias("total_trips"),
    round(avg("trip_distance_km"), 1).alias("average_distance"),
    round(avg("passenger_count"), 1).alias("average_passengers"),
    round(avg("fare_amount"), 2).alias("average_fare_per_trip"),
    max("fare_amount").alias("max_fare"),
    min("fare_amount").alias("min_fare"),
    round(sum("total_amount"), 2).alias("total_revenue"),
]

# One row per pickup date, stamped with the time this aggregation was computed.
df_daily = (
    df_enriched
    .groupBy(pickup_day)
    .agg(*daily_metrics)
    .withColumn("processed_timestamp", current_timestamp())
)

##### WRITING DATA EFFICIENTLY TO AVOID DUPLICATES

In [0]:
# Condition used to match existing gold rows against the new daily aggregates.
# NOTE(review): presumably `t` and `s` are the target/source aliases that
# upsert_delta_table sets up for its merge - confirm in table_utils.
merge_condition = "t.pickup_date = s.pickup_date"

# storage_path=None: no explicit storage location is supplied for the target.
upsert_delta_table(spark, df_daily, target_gold, merge_condition, storage_path=None)

# Report the row count of the gold table after the write.
print(f"Total records in {target_gold}: {spark.read.table(target_gold).count()}")