### Creating Data Marts - Gold Layer
1. Creating data mart focusing on borough-zone analysis
2. Creating data mart focusing on payment pattern analysis
3. Creating data mart focusing on daily trends

In [0]:
from pyspark.sql.types import *
from pyspark.sql.functions import *

In [0]:
%run "../include/common_functions"

In [0]:
# Notebook parameter "p_file_date": the file date to process.
# The widget is created with the default value "2025-07-01" and read back as a string.
dbutils.widgets.text("p_file_date","2025-07-01")
v_file_date = dbutils.widgets.get("p_file_date")

In [0]:
# Echo the resolved widget value as the cell output for a quick visual check.
v_file_date

In [0]:
import re

# The first seven characters of the file date ("YYYY-MM") select which
# file_year_month slice of the gold table the data-mart queries aggregate.
year_month = v_file_date[:7]

# year_month is interpolated verbatim into the SQL text of the queries in this
# notebook. Fail fast on a malformed date instead of silently aggregating zero
# rows and still exiting with "Success", and to keep arbitrary text out of the
# SQL statements.
if not re.fullmatch(r"[0-9]{4}-(0[1-9]|1[0-2])", year_month):
    raise ValueError(
        f"p_file_date must start with YYYY-MM (e.g. 2025-07-01), got {v_file_date!r}"
    )

print(year_month)

In [0]:
# Root location of the gold layer: container "gold" on storage account
# "nyctaxitrips2025" (abfss URI). Passed to merge_delta_data for every data mart below.
gold_folder_path = "abfss://gold@nyctaxitrips2025.dfs.core.windows.net/"

##### 1. Borough zone analysis

In [0]:
# Borough/zone data mart.
# Aggregates taxi_trips_2025.gold.all_taxi_trips for the selected file_year_month
# into one row per pickup/dropoff borough, pickup/dropoff zone, pickup/dropoff
# service zone, service type, pickup month, pickup year and file_year_month.
# Metrics per group: trip count, average distance, total/average total_amount,
# total/average fare_amount, average passenger count and average trip duration.
# The duration is the difference of the two timestamps cast to DOUBLE, divided by
# 60 -- assumes pickup_datetime/dropoff_datetime are TIMESTAMP columns so the
# cast yields seconds (TODO confirm against the gold table schema).
# The GROUP BY refers to pickup_month/pickup_year through their SELECT aliases.
# year_month is interpolated into the SQL text as a double-quoted string literal.
borough_zone_analysis_df = spark.sql(f"""
SELECT
    pickup_borough AS pickup_borough_name,
    dropoff_borough AS dropoff_borough_name,
    MONTH(pickup_datetime) AS pickup_month,
    YEAR(pickup_datetime) AS pickup_year,
    file_year_month,
    pickup_zone AS pickup_zone_name,
    dropoff_zone AS dropoff_zone_name,
    pickup_service_zone AS pickup_service_zone_name,
    dropoff_service_zone AS dropoff_service_zone_name,
    service_type AS service,

    COUNT(*) AS total_trips,

    ROUND(AVG(trip_distance), 2) AS avg_trip_distance,

    -- revenue metrics
    ROUND(SUM(total_amount), 2) AS total_revenue,
    ROUND(AVG(total_amount), 2) AS avg_total_amount,

    ROUND(SUM(fare_amount), 2) AS total_fare,
    ROUND(AVG(fare_amount), 2) AS avg_fare,

    ROUND(AVG(passenger_count), 2) AS avg_passenger_count,

    -- average trip duration in minutes
    ROUND(AVG((CAST(dropoff_datetime AS DOUBLE) - CAST(pickup_datetime AS DOUBLE)) / 60), 2) AS avg_trip_duration_mins

FROM taxi_trips_2025.gold.all_taxi_trips
WHERE file_year_month = "{year_month}"
GROUP BY
    pickup_borough,
    dropoff_borough,
    pickup_zone,
    dropoff_zone,
    pickup_service_zone,
    dropoff_service_zone,
    service_type,
    pickup_month,
    pickup_year,
    file_year_month
ORDER BY
    pickup_borough,
    dropoff_borough,
    pickup_zone,
    dropoff_zone,
    pickup_service_zone,
    dropoff_service_zone,
    service_type
""")

In [0]:
# Surrogate key used by the MERGE below: SHA-256 over the grouping columns
# joined with "_".
#
# concat_ws() drops NULL inputs instead of emitting an empty slot, so two
# different groups (e.g. NULL pickup zone + dropoff zone X vs. pickup zone X +
# NULL dropoff zone) would produce the same string and therefore the same key.
# Each column is cast to string and NULLs are replaced by an explicit marker
# before concatenation so every column always occupies its own position.
#
# Rows without NULLs hash to exactly the same key as before. Rows that contain
# NULLs get a new key, so rows with NULL dimensions that were merged earlier
# will not match if an already-loaded month is re-run.
borough_zone_analysis_df = borough_zone_analysis_df.withColumn(
    "borough_zone_key",
    sha2(
        concat_ws(
            "_",
            *[
                coalesce(col(key_col).cast("string"), lit("<NULL>"))
                for key_col in (
                    "pickup_borough_name",
                    "dropoff_borough_name",
                    "pickup_zone_name",
                    "dropoff_zone_name",
                    "pickup_service_zone_name",
                    "dropoff_service_zone_name",
                    "service",
                    "pickup_month",
                    "pickup_year",
                    "file_year_month",
                )
            ],
        ),
        256,
    ),
)

In [0]:
# Preview the aggregated rows (including the new key column) as a sanity check.
borough_zone_analysis_df.show()

In [0]:
# Number of aggregated groups for this file_year_month.
# count() is a Spark action, so the aggregation query is executed again here.
borough_zone_analysis_df.count()

In [0]:
# Persist the borough/zone aggregates to taxi_trips_2025.gold.borough_zone_analysis.
# Source and target rows are matched on the hashed key within the same file_year_month.
# merge_delta_data comes from the %run include; presumably it performs a Delta
# MERGE (upsert) -- confirm in common_functions.
merge_condition = "tgt.borough_zone_key = src.borough_zone_key AND tgt.file_year_month = src.file_year_month"
merge_delta_data(
    borough_zone_analysis_df,
    "taxi_trips_2025",
    "gold",
    "borough_zone_analysis",
    gold_folder_path,
    merge_condition,
    "file_year_month",
)

In [0]:
%sql
-- Inspect the borough/zone data mart after the merge.
select * from taxi_trips_2025.gold.borough_zone_analysis;

In [0]:
%sql
-- Total number of rows in the borough/zone data mart (all loaded months).
select count(1) from taxi_trips_2025.gold.borough_zone_analysis

##### 2. Payment pattern analysis

In [0]:
# Payment-pattern data mart.
# Aggregates taxi_trips_2025.gold.all_taxi_trips for the selected file_year_month
# into one row per payment type/description, service type, pickup/dropoff
# borough, pickup month and file_year_month.
# Metrics per group: trip count, average/total fare_amount, average/total
# total_amount, average/total tip_amount, average distance and average passenger count.
# NOTE(review): pickup_month is MONTH(pickup_datetime) only; unlike the
# borough/zone query no pickup year is part of the grouping, so pickups from the
# same calendar month of different years within one file share a row.
# year_month is interpolated into the SQL text as a double-quoted string literal.
payment_patterns_df = spark.sql(f"""
SELECT
    payment_type,
    payment_desc,
    service_type,
    pickup_borough,
    dropoff_borough,
    MONTH(pickup_datetime) AS pickup_month,
    file_year_month,
    COUNT(*) AS total_trips,

    ROUND(AVG(fare_amount), 2) AS avg_fare,
    ROUND(SUM(fare_amount), 2) AS total_fare,

    ROUND(AVG(total_amount), 2) AS avg_total_amount,
    ROUND(SUM(total_amount), 2) AS total_revenue,

    ROUND(AVG(tip_amount), 2) AS avg_tips,
    ROUND(SUM(tip_amount), 2) AS total_tips,

    ROUND(AVG(trip_distance), 2) AS avg_distance,
    ROUND(AVG(passenger_count), 2) AS avg_passenger_count

FROM taxi_trips_2025.gold.all_taxi_trips
WHERE file_year_month = "{year_month}"
GROUP BY
    payment_type,
    payment_desc,
    service_type,
    pickup_borough,
    dropoff_borough,
    file_year_month,
    pickup_month
ORDER BY
    payment_type,
    payment_desc,
    service_type,
    pickup_borough,
    dropoff_borough
""")


In [0]:
# Surrogate key used by the MERGE below: SHA-256 over the grouping columns
# joined with "_". file_year_month is not part of the hash; the merge condition
# compares it separately.
#
# concat_ws() drops NULL inputs instead of emitting an empty slot, so two
# different groups (e.g. NULL pickup borough + dropoff borough X vs. pickup
# borough X + NULL dropoff borough) would produce the same string and therefore
# the same key. Each column is cast to string and NULLs are replaced by an
# explicit marker before concatenation so every column keeps its own position.
#
# Rows without NULLs hash to exactly the same key as before. Rows that contain
# NULLs get a new key, so rows with NULL dimensions that were merged earlier
# will not match if an already-loaded month is re-run.
payment_patterns_df = payment_patterns_df.withColumn(
    "payment_pattern_key",
    sha2(
        concat_ws(
            "_",
            *[
                coalesce(col(key_col).cast("string"), lit("<NULL>"))
                for key_col in (
                    "payment_type",
                    "payment_desc",
                    "service_type",
                    "pickup_borough",
                    "dropoff_borough",
                    "pickup_month",
                )
            ],
        ),
        256,
    ),
)

In [0]:
# Preview the aggregated payment-pattern rows as a sanity check.
payment_patterns_df.show()

In [0]:
# Number of aggregated groups for this file_year_month.
# count() is a Spark action, so the aggregation query is executed again here.
payment_patterns_df.count()

In [0]:
# Persist the payment-pattern aggregates to taxi_trips_2025.gold.payment_patterns.
# Source and target rows are matched on the hashed key within the same file_year_month.
# merge_delta_data comes from the %run include; presumably it performs a Delta
# MERGE (upsert) -- confirm in common_functions.
merge_condition = "tgt.payment_pattern_key = src.payment_pattern_key AND tgt.file_year_month = src.file_year_month"
merge_delta_data(
    payment_patterns_df,
    "taxi_trips_2025",
    "gold",
    "payment_patterns",
    gold_folder_path,
    merge_condition,
    "file_year_month",
)

In [0]:
%sql
-- Inspect the payment-pattern data mart after the merge.
select * from taxi_trips_2025.gold.payment_patterns;

In [0]:
%sql
-- List which file_year_month values are present in the payment-pattern data mart.
select distinct file_year_month from taxi_trips_2025.gold.payment_patterns;

##### 3. Daily Trends Analysis

In [0]:
# Daily-trends data mart.
# Aggregates taxi_trips_2025.gold.all_taxi_trips for the selected file_year_month
# into one row per pickup date, pickup month, service type and file_year_month.
# Metrics per group: trip count, average journey time (seconds, minutes, hours),
# average distance and average passenger count.
# NOTE(review): the three total_*_journey columns are computed with AVG, i.e.
# they hold the average journey time per trip, not a sum. The in-query comments
# previously said "total"; they now describe what is computed. The column names
# are left unchanged because they are part of the gold.daily_trends schema --
# confirm with the consumers whether an average or a sum is intended.
# year_month is interpolated into the SQL text as a double-quoted string literal.
daily_trends_df = spark.sql(f"""
SELECT
    DATE(pickup_datetime) AS pickup_date,
    MONTH(pickup_datetime) AS pickup_month,
    file_year_month,
    service_type,

    COUNT(*) AS total_trips,

    -- average journey time per trip in seconds (column name kept as-is)
    ROUND(AVG(CAST(dropoff_datetime AS DOUBLE) - CAST(pickup_datetime AS DOUBLE)), 2) AS total_seconds_journey,

    -- average journey time per trip in minutes (column name kept as-is)
    ROUND(AVG((CAST(dropoff_datetime AS DOUBLE) - CAST(pickup_datetime AS DOUBLE)) / 60.0), 2) AS total_minutes_journey,

    -- average journey time per trip in hours (column name kept as-is)
    ROUND(AVG((CAST(dropoff_datetime AS DOUBLE) - CAST(pickup_datetime AS DOUBLE)) / 3600.0), 2) AS total_hours_journey,

    ROUND(AVG(trip_distance), 2) AS avg_distance,
    ROUND(AVG(passenger_count), 2) AS avg_passengers

FROM taxi_trips_2025.gold.all_taxi_trips
WHERE file_year_month = "{year_month}"
GROUP BY
    DATE(pickup_datetime),
    MONTH(pickup_datetime),
    service_type,
    file_year_month

ORDER BY pickup_date, service_type
""")


In [0]:
# Preview the aggregated daily-trend rows as a sanity check.
daily_trends_df.show()

In [0]:
# Number of (pickup date, service type) groups for this file_year_month.
# count() is a Spark action, so the aggregation query is executed again here.
daily_trends_df.count()

In [0]:
# Persist the daily-trend aggregates to taxi_trips_2025.gold.daily_trends.
# No hashed key here: rows are matched directly on service type, file_year_month
# and pickup date.
# merge_delta_data comes from the %run include; presumably it performs a Delta
# MERGE (upsert) -- confirm in common_functions.
merge_condition = "tgt.service_type = src.service_type AND tgt.file_year_month = src.file_year_month AND tgt.pickup_date = src.pickup_date"
merge_delta_data(
    daily_trends_df,
    "taxi_trips_2025",
    "gold",
    "daily_trends",
    gold_folder_path,
    merge_condition,
    "file_year_month",
)

In [0]:
%sql
-- Total number of rows in the daily-trends data mart (all loaded months).
select count(1) from taxi_trips_2025.gold.daily_trends;

In [0]:
# End the notebook run and hand the string "Success" back as its exit value.
dbutils.notebook.exit("Success")