## FHV Taxi Data

Extract, clean, transform and load FHV taxi trip data for one month.

Create the fact, register it as a global temp view, and load it as an unmanaged table.

In [3]:
# Month to process, supplied by the caller as the "ProcessMonth" notebook argument.
# The value is substituted verbatim into the input CSV file name in the extraction cell.
# NOTE(review): dbutils.widgets.get("ProcessMonth") was noted here as an alternative
# accessor — confirm whether getArgument is still supported on the target runtime.
process_month = getArgument("ProcessMonth")

In [4]:
from pyspark.sql.functions import *

In [5]:
# Extract and clean data
print("Starting to extract Fhv Taxi data")

# Load the raw monthly CSV export; process_month selects which file is read.
# The first row is treated as a header and column types are inferred from the data.
fhv_trips_df = (
    spark.read
    .option('header', 'true')
    .option('inferSchema', 'true')
    .csv('/mnt/storage/fhv_tripdata_{}.csv'.format(process_month))
)

# Cleaning steps:
#   1. drop rows missing either the pickup or the drop-off location id
#   2. drop exact duplicate rows
#   3. keep trips picked up on/after 2018-12-01 and dropped off before 2019-01-01
#
# The lower bound is written as an inclusive '2018-12-01'. Do not express it as
# "> '2018-11-31'": that is not a calendar date, and when Spark casts the literal
# to a timestamp for the comparison it becomes NULL, which makes the predicate
# reject every row.
#
# NOTE(review): the date window is fixed to December 2018 even though the input
# file is chosen by process_month — confirm whether the bounds should be derived
# from process_month instead.
fhv_trips_df = (
    fhv_trips_df
    .na.drop(how='any', subset=('PUlocationID', 'DOlocationID'))
    .drop_duplicates()
    .where(
        (col("Pickup_DateTime") >= '2018-12-01')
        & (col('DropOff_datetime') < '2019-01-01')
    )
)

print("Extracted and cleaned Fhv Taxi data")

In [6]:
# Applying transformations

print("Starting transformation on Fhv Taxi data")

# Raw CSV column name -> fact column name, listed in output order.
fhv_column_aliases = [
    ('Pickup_DateTime', 'pickup_date_time'),
    ('DropOff_datetime', 'dropoff_date_time'),
    ('PUlocationID', 'pickup_location'),
    ('DOlocationID', 'dropoff_location'),
    ('SR_Flag', 'sr_flag'),
    ('Dispatching_base_number', 'dispatching_base_number'),
]

# Minutes between pickup and drop-off, rounded.
# `round` is the column function brought in by the pyspark.sql.functions wildcard import.
trip_minutes = round(
    (unix_timestamp('dropoff_date_time') - unix_timestamp('pickup_date_time')) / 60
)

# sr_flag == 1 marks a shared trip; 0 and any other value are labelled solo.
trip_kind = (
    when(col('sr_flag') == 1, 'shared_trip')
    .when(col('sr_flag') == 0, 'solo_trip')
    .otherwise('solo_trip')
)

# Rename the source columns, append the derived pickup date parts, duration and
# trip type, then drop the raw flag that trip_type was derived from.
fhv_trips_df = (
    fhv_trips_df
    .select([col(raw_name).alias(fact_name) for raw_name, fact_name in fhv_column_aliases])
    .withColumn('pickup_year', year('pickup_date_time'))
    .withColumn('pickup_month', month('pickup_date_time'))
    .withColumn('pickup_day', dayofmonth('pickup_date_time'))
    .withColumn('trip_duration', trip_minutes)
    .withColumn('trip_type', trip_kind)
    .drop('sr_flag')
)

print("Applied transformations on Fhv Taxi data")

In [7]:
# Register the transformed DataFrame as the global temp view
# 'fact_fhv_taxi_trip_data' (created if absent, replaced if it already exists).
fhv_trips_df.createOrReplaceGlobalTempView('fact_fhv_taxi_trip_data')

print("Saved Fhv Taxi fact as a global temp view")

In [8]:
print("Starting to save Fhv Taxi dataframe as a fact and unmanaged table")

# Build the writer first: rows are appended to whatever the table already holds,
# and the data files live at the explicit datalake path given below.
# NOTE(review): because the mode is 'append', running this notebook again for the
# same month would add the same rows a second time — confirm the caller prevents that.
fact_writer = fhv_trips_df.write.mode('append').option(
    'path', '/mnt/datalake/dimensional_model/facts/fhv_taxi_fact.parquet'
)

# Persist the rows and register them under the warehouse table name.
fact_writer.saveAsTable('taxi_service_warehouse.fact_fhv_taxi_trips_data')

print("Saved Fhv Taxi dataframe as a fact and unmanaged table")

In [9]:
# End the notebook run, passing the string "Success" as its exit value.
dbutils.notebook.exit("Success")