### Yellow Taxi Data
Extracting, Cleaning and Loading Yellow Taxi Data

Creating the fact table, registering it as a global temporary view, and saving it as an unmanaged table

In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.functions import col

In [0]:
# Declare the "ProcessMonth" text widget with default value "201812";
# the label asks the user for a month in yyyymm form.
dbutils.widgets.text("ProcessMonth", "201812", "Process Month (yyyymm)")

In [0]:
# Read the current value of the "ProcessMonth" widget into processMonth.
# NOTE(review): widget values are presumably returned as strings — confirm before doing arithmetic on it.
processMonth = dbutils.widgets.get("ProcessMonth")

In [0]:
print("Starting to extract Yellow Taxi data")

# The source file is selected by the ProcessMonth widget (yyyymm) instead of
# being hard-coded, so the notebook actually processes the requested month.
# With the widget default ("201812") the path is the same file as before.
if len(processMonth) != 6 or not processMonth.isdigit():
    # Fail early with a clear message rather than a confusing "path not found".
    raise ValueError(
        "ProcessMonth must be in yyyymm format, got: {!r}".format(processMonth)
    )

yellowTaxiSourcePath = "dbfs:/mnt/dataadls/YellowTaxiTripData_{}.csv".format(processMonth)

# Read the CSV using its header row for column names and letting Spark infer
# column types from the data.
yellowTaxiTripDataDF = (
    spark.read
        .option("header", "true")
        .option("inferSchema", "true")
        .csv(yellowTaxiSourcePath)
)

# Cleaning rules, applied in order:
#   1. keep only trips with at least one passenger
#   2. keep only trips with a positive distance
#   3. drop rows where either the pickup or drop-off location id is null
#   4. replace null payment_type with 5 and null RatecodeID with 1
#      (presumably the "unknown" / "standard rate" codes — confirm against
#      the data dictionary)
#   5. remove fully duplicated rows
yellowTaxiTripDataDF = (
    yellowTaxiTripDataDF
        .where("passenger_count > 0")
        .filter("trip_distance > 0.0")
        .na.drop(subset=["PULocationID", "DOLocationID"], how="any")
        .na.fill(value=5, subset=["payment_type"])
        .na.fill(value=1, subset=["RatecodeID"])
        .dropDuplicates()
)

print("Extracted and cleaned Yellow Taxi data")

In [0]:
# yellowTaxiTripDataDF.printSchema

In [0]:
# display(yellowTaxiTripDataDF)

VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
2,2018-12-01T00:11:18.000+0000,2018-12-01T00:33:28.000+0000,1,3.72,1,N,141,90,1,16.0,0.5,0.5,2.6,0.0,0.3,19.9
1,2018-12-01T00:00:12.000+0000,2018-12-01T00:13:49.000+0000,1,2.4,1,N,162,158,1,11.0,0.5,0.5,3.05,0.0,0.3,15.35
2,2018-12-01T00:34:47.000+0000,2018-12-01T01:08:51.000+0000,2,6.21,1,N,113,263,2,25.0,0.5,0.5,0.0,0.0,0.3,26.3
2,2018-12-01T00:37:45.000+0000,2018-12-01T00:49:17.000+0000,1,2.66,1,N,230,238,1,11.5,0.5,0.5,3.2,0.0,0.3,16.0
1,2018-12-01T00:08:23.000+0000,2018-12-01T00:11:07.000+0000,1,0.6,1,Y,262,263,2,4.5,0.5,0.5,0.0,0.0,0.3,5.8
2,2018-12-01T00:13:39.000+0000,2018-12-01T00:25:36.000+0000,2,2.85,1,N,144,229,2,11.0,0.5,0.5,0.0,0.0,0.3,12.3
2,2018-12-01T00:51:30.000+0000,2018-12-01T00:57:38.000+0000,1,1.15,1,N,229,140,1,6.5,0.5,0.5,1.0,0.0,0.3,8.8
2,2018-12-01T00:27:43.000+0000,2018-12-01T00:38:56.000+0000,2,2.76,1,N,148,229,1,10.5,0.5,0.5,2.0,0.0,0.3,13.8
2,2018-12-01T00:30:07.000+0000,2018-12-01T00:43:14.000+0000,1,2.63,1,N,142,107,1,11.0,0.5,0.5,1.0,0.0,0.3,13.3
1,2018-12-01T00:10:36.000+0000,2018-12-01T00:19:30.000+0000,1,1.8,1,N,181,33,1,8.5,0.5,0.5,1.95,0.0,0.3,11.75


In [0]:
print("Starting transformation on Yellow Taxi data")

# Source columns kept for the fact table, renamed to PascalCase where needed.
# RatecodeID is carried through only so TripType can be derived from it.
factColumns = [
    col("VendorID"),
    col("passenger_count").alias("PassengerCount"),
    col("trip_distance").alias("TripDistance"),
    col("tpep_pickup_datetime").alias("PickupTime"),
    col("tpep_dropoff_datetime").alias("DropTime"),
    col("PUlocationID").alias("PickupLocationId"),
    col("DOlocationID").alias("DropLocationId"),
    col("RatecodeID"),
    col("total_amount").alias("TotalAmount"),
    col("payment_type").alias("PaymentType"),
]

# Trip duration: difference between drop-off and pickup in epoch seconds,
# converted to minutes and rounded to a whole number.
# `round` here is the Spark column function brought in by the star import.
tripDurationSeconds = unix_timestamp("DropTime") - unix_timestamp("PickupTime")
tripTimeInMinutes = round(tripDurationSeconds / 60)

# Rate code 6 is labelled a shared trip; every other code is a solo trip.
tripType = when(col("RatecodeID") == 6, "SharedTrip").otherwise("SoloTrip")

# Project the columns, add the date parts and derived measures, then drop the
# helper RatecodeID column.
yellowTaxiTripDataDF = (
    yellowTaxiTripDataDF
        .select(*factColumns)
        .withColumn("TripYear", year("PickupTime"))
        .withColumn("TripMonth", month("PickupTime"))
        .withColumn("TripDay", dayofmonth("PickupTime"))
        .withColumn("TripTimeInMinutes", tripTimeInMinutes)
        .withColumn("TripType", tripType)
        .drop("RatecodeID")
)

print("Applied transformations on Yellow Taxi data")

In [0]:
# Register (or replace) the transformed DataFrame as a global temporary view
# under the name below.
globalViewName = "FactYellowTaxiTripData"
yellowTaxiTripDataDF.createOrReplaceGlobalTempView(globalViewName)

print("Saved Yellow Taxi fact as a global temp view")

In [0]:
print("Starting to save Yellow Taxi dataframe as a fact and unmanaged table")

# Persist the fact as an unmanaged table: the data files are written to the
# explicit path below and the table is registered under the given name.
# "overwrite" mode replaces any existing contents on each run.
factStoragePath = "/mnt/datalake/DimensionalModel/Facts/YellowTaxiFact.parquet"
factTableName = "TaxiServiceWarehouse.FactYellowTaxiTripData"

(
    yellowTaxiTripDataDF.write
        .mode("overwrite")
        .option("path", factStoragePath)
        .saveAsTable(factTableName)
)

print("Saved Yellow Taxi dataframe as a fact and unmanaged table")

In [0]:
# End the notebook run with the exit value "Success".
# NOTE(review): presumably consumed by a parent notebook or job that runs this one — confirm against the caller.
dbutils.notebook.exit("Success")