## Green Taxi Data

Extract, clean, transform and load Green Taxi trip data for a month.

Create the fact, save it as an unmanaged table, and register it as a global temp view.

In [2]:
# Declare the "ProcessMonth" text widget: default value "201901", label "Process Month (yyyymm)".
# The value is read back in a later cell to pick the input file.
dbutils.widgets.text("ProcessMonth", "201901", "Process Month (yyyymm)")

In [3]:
# Prints a single empty line to the cell output.
# NOTE(review): looks like a leftover placeholder cell — confirm it can be removed.
print()

In [4]:
# Read the month to process (yyyymm string) from the "ProcessMonth" widget.
# dbutils.widgets.get is used instead of the legacy getArgument helper; it returns
# the widget's current value whether the notebook runs interactively or as a job.
process_month = dbutils.widgets.get("ProcessMonth")

In [5]:
# Echo the resolved month so the run's parameter is visible in the cell output.
print(process_month)

In [6]:
from pyspark.sql.functions import year, month, dayofmonth, unix_timestamp, round, when, col

In [7]:
print("Starting to extract Green Taxi data")

# ProcessMonth names the input file and drives the date window below, so fail
# fast on anything that is not a six-digit yyyymm value with a real month.
process_month_key = str(process_month)
if (len(process_month_key) != 6
        or not process_month_key.isdigit()
        or not 1 <= int(process_month_key[4:]) <= 12):
    raise ValueError(
        "ProcessMonth must be in yyyymm format, got {!r}".format(process_month)
    )

# Half-open window [first day of the month, first day of the next month).
# December rolls over to January of the following year.
window_year = int(process_month_key[:4])
window_month = int(process_month_key[4:])
if window_month == 12:
    next_year, next_month = window_year + 1, 1
else:
    next_year, next_month = window_year, window_month + 1
month_start = "{:04d}-{:02d}-01".format(window_year, window_month)
next_month_start = "{:04d}-{:02d}-01".format(next_year, next_month)

# Load the month's raw trip file; the schema is inferred from the data.
# NOTE(review): the file has a .csv extension but is read as tab-delimited — confirm against the source files.
green_taxi_trip_df = spark \
    .read \
    .option("header", "true") \
    .option("inferSchema", "true") \
    .option("delimiter", "\t") \
    .csv("/mnt/datalake/green_tripdata_{}.csv".format(process_month))

# Replacement values for nulls in these two columns.
default_values_dict = {
  "payment_type": 5,
  "RateCodeId": 1
}

# Cleaning steps, in order:
#   1. keep trips with a positive distance and at least one passenger
#   2. drop rows missing either location id
#   3. fill the remaining nulls using default_values_dict
#   4. remove exact duplicate rows
#   5. keep only trips that start and end inside the processed month
#      (previously hard-coded to December 2018, which did not follow ProcessMonth)
filtered_green_taxi_trips = green_taxi_trip_df.where((green_taxi_trip_df['trip_distance'] > 0) & (green_taxi_trip_df['passenger_count'] > 0)) \
                                                .dropna(subset=("PULocationID", "DOLocationID")) \
                                                .na.fill(default_values_dict) \
                                                .drop_duplicates() \
                                                .where("lpep_pickup_datetime >= '{}' AND lpep_dropoff_datetime < '{}'".format(month_start, next_month_start))

print("Extracted and cleaned Green Taxi data")

In [8]:
# Reshape the cleaned trips into the fact layout: rename the source columns,
# derive calendar/duration/type columns, then drop the helper rate code column.

print("Starting transformation on Green Taxi data")

filtered_green_taxi_trips = (
    filtered_green_taxi_trips
    # Keep only the columns the fact needs, under snake_case names.
    .select(
        col('VendorID').alias('vendor_id'),
        col('lpep_pickup_datetime').alias('pickup_time'),
        col('lpep_dropoff_datetime').alias('dropoff_time'),
        col('trip_distance'),
        col('PULocationID').alias('pickup_loc'),
        col('DOLocationID').alias('dropoff_loc'),
        col('RatecodeID').alias('rate_code_id'),
        col('total_amount'),
        col('payment_type'),
    )
    # Calendar parts taken from the pickup timestamp.
    .withColumn("trip_year", year('pickup_time'))
    .withColumn("trip_month", month('pickup_time'))
    .withColumn("trip_day", dayofmonth('pickup_time'))
    # Difference of the two unix timestamps divided by 60, then rounded.
    .withColumn(
        "trip_duration",
        round((unix_timestamp('dropoff_time') - unix_timestamp('pickup_time')) / 60),
    )
    # Rate codes 6 and 1000 are labelled 'shared_trip'; everything else is 'solo_trip'.
    .withColumn(
        'trip_type',
        when(col('rate_code_id') == 6, 'shared_trip')
        # There is no rate_code_id value of 1000. Have included for "switch case" example
        .when(col('rate_code_id') == 1000, 'shared_trip')
        .otherwise('solo_trip'),
    )
    # rate_code_id was only needed to derive trip_type.
    .drop('rate_code_id')
)

print("Applied transformations on Green Taxi data")

In [9]:
# Register the transformed trips as a global temp view, replacing any existing
# view of the same name.
filtered_green_taxi_trips.createOrReplaceGlobalTempView(
    name='fact_green_taxi_trip_data'
)

print("Saved Green Taxi fact as a global temp view")

In [10]:
print("Starting to save Green Taxi dataframe as a fact and unmanaged table")

# Persist the cleaned and transformed fact (not the raw CSV frame, which the
# previous version wrote by mistake). Supplying an explicit 'path' keeps the
# data files at that location, i.e. the table is unmanaged (external).
# NOTE(review): mode 'append' adds rows on every run, so re-processing the same
# month duplicates its trips — confirm whether that is acceptable.
# NOTE(review): if the table already exists with the raw column layout from
# earlier runs, this append may fail on schema mismatch — confirm and recreate if needed.
(
    filtered_green_taxi_trips.write
    .mode('append')
    .option('path', '/mnt/datalake/dimensional_model/facts/green_taxi_fact.parquet')
    .saveAsTable('taxi_service_warehouse.fact_green_taxi_trips_data')
)

print("Saved Green Taxi dataframe as a fact and unmanaged table")

In [11]:
# End the notebook run, handing the string "Success" to dbutils.notebook.exit as its exit value.
dbutils.notebook.exit("Success")