## Yellow Taxi Data

Extract, clean, transform and load Yellow Taxi trip data for a month.

Create the fact, save it as an unmanaged table, and register it as a global temp view.

In [2]:
# Declare a text widget named "ProcessMonth" (default "201812") so the month to
# process can be supplied as a notebook parameter in yyyymm form.
dbutils.widgets.text("ProcessMonth", "201812", "Process Month (yyyymm)")

In [3]:
# Read the month to process (yyyymm) from the "ProcessMonth" widget.
# dbutils.widgets.get is used instead of the deprecated getArgument helper.
process_month = dbutils.widgets.get("ProcessMonth")

In [4]:
from datetime import datetime

from pyspark.sql.functions import year, month, dayofmonth, unix_timestamp, round, when, col

In [5]:
# List the contents of the data lake mount; the listing is the cell's output.
dbutils.fs.ls('/mnt/datalake/')

In [6]:
# Extract and clean data
#
# Reads the monthly Yellow Taxi CSV selected by process_month (yyyymm) and keeps
# only trips that:
#   * have a positive trip distance and passenger count,
#   * have both pickup and dropoff location ids,
#   * start on/after the first day of the processed month and end before the
#     first day of the following month.
# Nulls in payment_type / RatecodeID are replaced with defaults and exact
# duplicate rows are removed.
print("Starting to extract Yellow Taxi data")

yellow_taxi_trip_df = spark \
    .read \
    .option("header", "true") \
    .option("inferSchema", "true") \
    .csv("/mnt/datalake/yellow_tripdata_{}.csv".format(process_month))

# Replacement values for nulls. The key is spelled exactly like the RatecodeID
# column selected later in the notebook, so the fill does not depend on
# case-insensitive column resolution.
default_values_dict = {
  "payment_type": 5,
  "RatecodeID": 1
}

# Derive the month window from process_month instead of hard-coding December 2018,
# so the date filter always matches the file that was loaded.
# strptime raises ValueError if process_month is not a valid yyyymm value.
month_start = datetime.strptime(process_month, "%Y%m")
next_month_start = datetime(
    month_start.year + month_start.month // 12,  # roll the year over after December
    month_start.month % 12 + 1,
    1
)
trip_window_filter = "tpep_pickup_datetime >= '{}' AND tpep_dropoff_datetime < '{}'".format(
    month_start.strftime("%Y-%m-%d"),
    next_month_start.strftime("%Y-%m-%d")
)

filtered_yellow_taxi_trips = yellow_taxi_trip_df.where((yellow_taxi_trip_df['trip_distance'] > 0) & (yellow_taxi_trip_df['passenger_count'] > 0)) \
                                                .dropna(subset=("PULocationID", "DOLocationID")) \
                                                .na.fill(default_values_dict) \
                                                .drop_duplicates() \
                                                .where(trip_window_filter)

print("Extracted and cleaned Yellow Taxi data")

In [7]:
# Applying transformations
#
# Projects the cleaned trips onto the fact columns (renaming them), derives the
# trip date parts, duration and trip type, then drops the rate code.
# NOTE: this cell overwrites filtered_yellow_taxi_trips, so re-running it without
# re-running the extract cell fails: the source column names no longer exist.

print("Starting transformation on Yellow Taxi data")

# Trip length in minutes, from the pickup/dropoff timestamps, rounded by pyspark's round().
trip_duration_minutes = round(
    (unix_timestamp('dropoff_time') - unix_timestamp('pickup_time')) / 60
)

# Rate code 6 is labelled a shared trip; everything else is a solo trip.
trip_type_label = (
    when(col('rate_code_id') == 6, 'shared_trip')
    # There is no rate_code_id value of 1000. Have included for "switch case" example
    .when(col('rate_code_id') == 1000, 'shared_trip')
    .otherwise('solo_trip')
)

filtered_yellow_taxi_trips = (
    filtered_yellow_taxi_trips
    .select(
        col('VendorID').alias('vendor_id'),
        col('tpep_pickup_datetime').alias('pickup_time'),
        col('tpep_dropoff_datetime').alias('dropoff_time'),
        col('trip_distance'),
        col('PULocationID').alias('pickup_loc'),
        col('DOLocationID').alias('dropoff_loc'),
        col('RatecodeID').alias('rate_code_id'),
        col('total_amount'),
        col('payment_type'),
    )
    .withColumn("trip_year", year('pickup_time'))
    .withColumn("trip_month", month('pickup_time'))
    .withColumn("trip_day", dayofmonth('pickup_time'))
    .withColumn("trip_duration", trip_duration_minutes)
    .withColumn('trip_type', trip_type_label)
    .drop('rate_code_id')
)

print("Applied transformations on Yellow Taxi data")

In [8]:
# Register filtered_yellow_taxi_trips as the global temp view
# 'fact_yellow_taxi_trip_data'; a view already registered under that name is replaced.
filtered_yellow_taxi_trips.createOrReplaceGlobalTempView('fact_yellow_taxi_trip_data')

print("Saved Yellow Taxi fact as a global temp view")

In [9]:
print("Starting to save Yellow Taxi dataframe as a fact and unmanaged table")

# Persist the cleaned and transformed fact (filtered_yellow_taxi_trips), not the
# raw CSV DataFrame, so the table holds the same rows and columns as the
# 'fact_yellow_taxi_trip_data' view.
# The explicit 'path' option stores the data at that location rather than in the
# default warehouse directory.
# NOTE(review): mode('append') adds rows on every run, so re-processing the same
# month duplicates its trips in the table — confirm this is intended.
filtered_yellow_taxi_trips.write \
                  .mode('append') \
                  .option('path', '/mnt/datalake/dimensional_model/facts/yellow_taxi_fact.parquet') \
                  .saveAsTable('taxi_service_warehouse.fact_yellow_taxi_trips_data')

print("Saved Yellow Taxi dataframe as a fact and unmanaged table")

In [10]:
# End the notebook run, handing "Success" back as its exit value.
dbutils.notebook.exit("Success")

Success