In [0]:
from pyspark.sql.functions import count, max, min, avg, sum, round
from dateutil.relativedelta import relativedelta
from datetime import date

In [0]:
# Lower bound of the reporting window: the 1st of the month, two months back.
# relativedelta shifts the month by -2 and then pins the day-of-month to 1,
# so the result is a plain `date` regardless of which day the notebook runs.
two_months_ago_start = date.today() + relativedelta(months=-2, day=1)

In [0]:
# Load the enriched trip dataset and keep only trips picked up on or after the
# first day of the month two months ago.
# The bound is inclusive (>=): the date-only literal is compared against the
# pickup timestamp as midnight of that day, so a strict '>' would drop trips
# stamped exactly 00:00:00 on the start date and undercount that day's summary.
df = (
    spark.read.table("nyctaxi.02_silver.yellow_trips_enriched")
    .filter(f"tpep_pickup_datetime >= '{two_months_ago_start}'")
)

In [0]:
from pyspark.sql.functions import (
    col, count, avg, round, max, min, sum
)

# Grouping key: the calendar date of each trip's pickup timestamp.
pickup_date = col("tpep_pickup_datetime").cast("date").alias("pickup_date")

# One aggregate expression per output column of the daily summary,
# listed in the order the columns appear in the result.
daily_metrics = [
    count("*").alias("total_trips"),
    round(avg("passenger_count"), 1).alias("average_passengers"),
    round(avg("trip_distance"), 1).alias("average_distance"),
    round(avg("fare_amount"), 2).alias("average_fare_per_trip"),
    max("fare_amount").alias("max_fare"),
    min("fare_amount").alias("min_fare"),
    round(sum("total_amount"), 2).alias("total_revenue"),
]

# Collapse the filtered trips to one row per pickup date.
# NOTE(review): this rebinds `df` from trip-level rows to the daily summary,
# so re-running this cell without re-running the load cell will fail.
df = df.groupBy(pickup_date).agg(*daily_metrics)

In [0]:
# Persist the daily summary by appending its rows to the gold-schema table
# (described by the pipeline as a Unity Catalog managed Delta table).
# NOTE(review): append mode adds rows on every run; re-running the notebook
# over an overlapping date window would presumably duplicate dates — confirm.
GOLD_DAILY_SUMMARY_TABLE = "nyctaxi.03_gold.daily_trip_summary"
df.write.saveAsTable(GOLD_DAILY_SUMMARY_TABLE, mode="append")