In [1]:
# Fallback codes written into trips whose categorical fields are null.
# NOTE(review): presumably 5 = "Unknown" payment type and 1 = "Standard rate"
# in the TLC data dictionary - confirm against the dataset documentation.
default_vals_dict = {
    "payment_type": 5,
    "RatecodeID": 1,
}

# Raw green-taxi trips, read from the Delta table on the mounted data lake.
green_taxi_trips_df = spark.read.format("delta").load("/mnt/datalake/green_taxi_trips")

green_taxi_trips_df = (
    green_taxi_trips_df
    # Keep only trips that carried at least one passenger over a positive distance.
    .filter("passenger_count > 0 AND trip_distance > 0")
    # A trip without a pickup or drop-off location id is discarded.
    .dropna(subset=["PULocationID", "DOLocationID"])
    # Remaining nulls in the coded columns get their fallback code.
    .fillna(default_vals_dict)
    # Remove rows that are identical in every column.
    .drop_duplicates()
    # Restrict to December 2018. The upper bound is the exclusive midnight of
    # 1 January 2019: a bare date literal compared with a timestamp means
    # midnight at the START of that day, so the previous `<= '2018-12-31'`
    # dropped nearly every trip that ended on 31 December.
    .filter("lpep_pickup_datetime >= '2018-12-01' AND lpep_dropoff_datetime < '2019-01-01'")
)

In [2]:
from pyspark.sql.functions import col, year, month, dayofmonth, unix_timestamp, round, when

# Reshape the cleaned trips into the fact layout: rename the source columns,
# derive calendar parts and duration, classify the trip, then drop the rate code.

print("Starting transformation on green Taxi data")

green_taxi_trips_df = (
    green_taxi_trips_df
    # Project only the columns the fact needs, under snake_case names.
    .select(
        col('VendorID').alias('vendor_id'),
        col('lpep_pickup_datetime').alias('pickup_time'),
        col('lpep_dropoff_datetime').alias('dropoff_time'),
        col('trip_distance'),
        col('PULocationID').alias('pickup_loc'),
        col('DOLocationID').alias('dropoff_loc'),
        col('RatecodeID').alias('rate_code_id'),
        col('total_amount'),
        col('payment_type'),
    )
    # Calendar parts are taken from the pickup timestamp.
    .withColumn("trip_year", year('pickup_time'))
    .withColumn("trip_month", month('pickup_time'))
    .withColumn("trip_day", dayofmonth('pickup_time'))
    # Duration: difference of the two epoch-second values, divided by 60 and rounded.
    .withColumn(
        "trip_duration",
        round((unix_timestamp('dropoff_time') - unix_timestamp('pickup_time')) / 60),
    )
    # Rate code 6 (or 1000) marks a shared trip; anything else is a solo trip.
    .withColumn(
        'trip_type',
        when(col('rate_code_id') == 6, 'shared_trip')
        # There is no rate_code_id value of 1000. Have included for "switch case" example
        .when(col('rate_code_id') == 1000, 'shared_trip')
        .otherwise('solo_trip'),
    )
    # The rate code is only needed to derive trip_type.
    .drop('rate_code_id')
)

print("Applied transformations on green Taxi data")

In [3]:
# Register the transformed trips as the global temporary view
# `fact_green_taxi_trip_data`, replacing any existing view of that name.
# Spark exposes global temp views through the `global_temp` database.
green_taxi_trips_df.createOrReplaceGlobalTempView('fact_green_taxi_trip_data')

print("Saved green Taxi fact as a global temp view")

In [4]:
# One-time bootstrap, intentionally disabled.
# It creates the Delta fact table at the same path that the incremental merge
# cell later opens with DeltaTable.forPath. Uncomment and run it once when that
# table does not exist yet; left enabled, mode('overwrite') would replace the
# whole fact table on every run.
# Kept as `#` comments rather than a bare triple-quoted string so the cell
# evaluates to nothing instead of echoing the text as cell output.
#
# print("Starting to save green Taxi dataframe as a fact and unmanaged table")
#
# green_taxi_trips_df.write \
#                   .mode('overwrite') \
#                   .format('delta') \
#                   .option('path', '/mnt/datalake/dimensional_model/facts/green_taxi_fact.delta') \
#                   .saveAsTable('taxi_service_warehouse.fact_green_taxi_trips_data')
#
# print("Saved green Taxi dataframe as a fact and unmanaged table")

In [5]:
print("Starting to save ONLY NEW ENTRIES into Green Taxi dataframe as a fact and unmanaged table")

# Import only the class that is used; a wildcard import would pull every
# public name of delta.tables into the notebook namespace.
from delta.tables import DeltaTable

# Handle on the existing fact table. A Delta table must already be present at
# this path (see the disabled initial-load cell), otherwise forPath fails.
existing_green_taxi_trips_delta_table = DeltaTable.forPath(spark, '/mnt/datalake/dimensional_model/facts/green_taxi_fact.delta')

# Insert-only merge: a new record is appended only when no existing record has
# the same vendor and pickup time; matched records are left untouched.
# - Column names use the lowercase `vendor_id` produced by the transformation
#   cell, so the condition also resolves under case-sensitive analysis.
# - vendor_id is compared with the null-safe `<=>` because nothing upstream
#   guarantees it is non-null; with plain `=` a NULL vendor never matches and
#   the same row would be inserted again on every run.
# NOTE(review): (vendor_id, pickup_time) is assumed to identify a trip - two
# distinct trips sharing both values would be treated as one. Confirm.
existing_green_taxi_trips_delta_table.alias("existing_records").merge(
    green_taxi_trips_df.alias("new_records"),
    "existing_records.vendor_id <=> new_records.vendor_id "
    "AND existing_records.pickup_time = new_records.pickup_time") \
  .whenNotMatchedInsertAll() \
  .execute()

print("Saved green Taxi dataframe as a fact and unmanaged table")

In [6]:
# End the notebook run and hand the string "Success" back as its exit value.
dbutils.notebook.exit("Success")

Success