In [1]:
# Read the raw FHV trips Delta dataset from mounted storage.
fhv_trips_source_path = '/mnt/storage/fhv_trips'
delta_reader = spark.read.format('delta')
fhv_trips_df = delta_reader.load(fhv_trips_source_path)

In [2]:
from pyspark.sql.functions import col, year, month, dayofmonth, unix_timestamp, round, when

# Shape the raw FHV trips into the December 2018 trip fact:
#   1. keep only trips inside the month window,
#   2. drop rows without a pickup/drop-off location and exact duplicate rows,
#   3. project and rename the columns used by the fact,
#   4. derive date parts, trip duration and trip type.
# NOTE: `round` here is pyspark.sql.functions.round (imported above), not the
# Python builtin.
fhv_trips_df = (
    fhv_trips_df
    # Half-open window [2018-12-01, 2019-01-01). The previous upper bound
    # "DropOff_datetime <= '2018-12-31'" compared against the very start of
    # Dec 31 and therefore excluded every trip that ended during that day.
    .where("Pickup_DateTime >= '2018-12-01' AND DropOff_datetime < '2019-01-01'")
    .dropna(subset=["PULocationID", "DOLocationID"])
    # De-duplicate on all source columns (before projection), as before.
    .drop_duplicates()
    .select(
        col("Pickup_DateTime").alias("PickupTime"),
        col("DropOff_DateTime").alias("DropTime"),
        col("PUlocationID").alias("PickupLocationId"),
        col("DOlocationID").alias("DropLocationId"),
        # SR_Flag is only needed to derive TripType; it is dropped at the end.
        col("SR_Flag"),
        col("Dispatching_base_number").alias("BaseLicenseNumber"),
    )
    .withColumn("TripYear", year(col("PickupTime")))
    .withColumn("TripMonth", month(col("PickupTime")))
    .withColumn("TripDay", dayofmonth(col("PickupTime")))
    # Duration in whole minutes: difference of epoch seconds / 60, rounded.
    .withColumn(
        "TripTimeInMinutes",
        round((unix_timestamp("DropTime") - unix_timestamp("PickupTime")) / 60),
    )
    # SR_Flag == 1 marks a shared trip; anything else (including null) is solo.
    .withColumn(
        "TripType",
        when(col("SR_Flag") == 1, "SharedTrip").otherwise("SoloTrip"),
    )
    .drop("SR_Flag")
)



In [3]:
# Register the transformed trips as a global temp view, replacing any
# existing view with the same name.
fhv_fact_view_name = 'fact_fhv_taxi_trip_data'
fhv_trips_df.createOrReplaceGlobalTempView(fhv_fact_view_name)

print("Saved fhv Taxi fact as a global temp view")

In [4]:
# Persist the fact as a Delta table at an explicit storage path (unmanaged
# table), overwriting whatever a previous run wrote.
fhv_fact_path = '/mnt/datalake/dimensional_model/facts/fhv_taxi_fact.delta'
fhv_fact_table = 'taxi_service_warehouse.fact_fhv_taxi_trips_data'

print("Starting to save fhv Taxi dataframe as a fact and unmanaged table")

(
    fhv_trips_df.write
    .mode('overwrite')
    .format('delta')
    .option('path', fhv_fact_path)
    .saveAsTable(fhv_fact_table)
)

print("Saved fhv Taxi dataframe as a fact and unmanaged table")

In [5]:
# End the notebook run here, handing the string "Success" back as its exit
# value (presumably read by a calling notebook or job — confirm with the
# orchestrating pipeline).
dbutils.notebook.exit("Success")