### Green Taxi Data
Extract, transform, and load Green Taxi trip data for one month.

Create the Green Taxi fact and register it as a global temp view.

In [0]:
# List the entries under /mnt/dataadls/, the mount the trip data CSV is read from
# later in this notebook.
dbutils.fs.ls("/mnt/dataadls/")

In [0]:
import re

from pyspark.sql.functions import *
from pyspark.sql.types import *

In [0]:
# Declare a text widget named "ProcessMonth" with default value "201812"
# and the label "Process Month (yyyymm)".
dbutils.widgets.text("ProcessMonth","201812","Process Month (yyyymm)")

In [0]:
# Read the current value of the "ProcessMonth" widget (widget values are strings).
processMonth = dbutils.widgets.get("ProcessMonth")

In [0]:
print("Starting to extract Green Taxi data")

# Build the input path from the ProcessMonth widget value rather than a fixed month,
# so the notebook actually processes the month it was asked for.
# Fail fast with a clear message when the value is not a yyyymm month.
processMonthValue = processMonth.strip()
if re.fullmatch(r"\d{4}(0[1-9]|1[0-2])", processMonthValue) is None:
    raise ValueError(
        "ProcessMonth must be in yyyymm format (e.g. 201812), got: " + repr(processMonth)
    )

greenTaxiTripDataPath = "dbfs:/mnt/dataadls/GreenTaxiTripData_" + processMonthValue + ".csv"

# NOTE(review): the file has a .csv extension but is parsed as tab-delimited -
# confirm this matches the actual file layout.
greenTaxiTripDataDF = spark \
                    .read \
                    .option("header", "true") \
                    .option("inferSchema", "true") \
                    .option("delimiter", "\t") \
                    .csv(greenTaxiTripDataPath)

# Cleaning rules:
#   - keep only trips with at least one passenger and a positive distance
#   - drop rows missing either the pickup or the drop-off location id
#   - default a missing payment_type to 5 and a missing RatecodeID to 1
#   - remove exact duplicate rows
greenTaxiTripDataDF = greenTaxiTripDataDF \
                            .where("passenger_count > 0") \
                            .filter("trip_distance > 0.0") \
                            .na.drop(subset=["PULocationID", "DOLocationID"],how="any") \
                            .na.fill(value=5,subset=["payment_type"]) \
                            .na.fill(value=1,subset=["RatecodeID"]) \
                            .dropDuplicates()

print("Extracted and cleaned Green Taxi data")

In [0]:
print("Starting transformation on Green Taxi data")

# Columns carried into the fact, renamed to the warehouse naming style.
# RatecodeID is kept only long enough to derive TripType and is dropped afterwards.
factColumns = [
    col("VendorID"),
    col("passenger_count").alias("PassengerCount"),
    col("trip_distance").alias("TripDistance"),
    col("lpep_pickup_datetime").alias("PickupTime"),
    col("lpep_dropoff_datetime").alias("DropTime"),
    col("PUlocationID").alias("PickupLocationId"),
    col("DOlocationID").alias("DropLocationId"),
    col("RatecodeID"),
    col("total_amount").alias("TotalAmount"),
    col("payment_type").alias("PaymentType"),
]

# Trip duration: drop-off minus pickup in seconds, converted to minutes and rounded.
tripDurationSeconds = unix_timestamp("DropTime") - unix_timestamp("PickupTime")
tripTimeInMinutes = round(tripDurationSeconds / 60)

# Rows with RatecodeID 6 are labelled "SharedTrip"; every other row is "SoloTrip".
tripType = when(col("RatecodeID") == 6, "SharedTrip").otherwise("SoloTrip")

greenTaxiTripDataDF = (
    greenTaxiTripDataDF
    .select(*factColumns)
    .withColumn("TripYear", year("PickupTime"))
    .withColumn("TripMonth", month("PickupTime"))
    .withColumn("TripDay", dayofmonth("PickupTime"))
    .withColumn("TripTimeInMinutes", tripTimeInMinutes)
    .withColumn("TripType", tripType)
    .drop("RatecodeID")
)

print("Applied transformations on Green Taxi data")

In [0]:
# Register the transformed DataFrame as the global temporary view "FactGreenTaxiTripData",
# replacing any existing view of that name. Global temp views are resolved through the
# `global_temp` database (i.e. global_temp.FactGreenTaxiTripData).
greenTaxiTripDataDF.createOrReplaceGlobalTempView("FactGreenTaxiTripData")

print("Saved Green Taxi fact as a global temp view")

In [0]:
print("Starting to save Green Taxi dataframe as a fact and unmanaged table")

# Persist the fact as an unmanaged table: the explicit "path" option sets where the
# data files are written, and saveAsTable registers the table name over that location.
# "overwrite" mode replaces whatever the table held before.
factWriter = greenTaxiTripDataDF.write.mode("overwrite")
factWriter = factWriter.option(
    "path", "/mnt/datalake/DimensionalModel/Facts/GreenTaxiFact.parquet"
)
factWriter.saveAsTable("TaxiServiceWarehouse.FactGreenTaxiTripData")

print("Saved Green Taxi dataframe as a fact and unmanaged table")

In [0]:
# End the notebook run and hand the string "Success" back as its exit value.
dbutils.notebook.exit("Success")