In [1]:
%fs ls /mnt/storage

path,name,size
dbfs:/mnt/storage/FhvBases.json,FhvBases.json,464836
dbfs:/mnt/storage/fhv_tripdata_2018-12.csv,fhv_tripdata_2018-12.csv,1694806758


### Reading and transforming FHV trips data

In [3]:
# Letting Spark infer the schema (no explicit schema supplied) makes this read slow.

fhv_trips_df = (
    spark.read
    .option('header', 'true')
    .option('inferSchema', 'true')
    .csv('/mnt/storage/fhv_tripdata_2018*.csv')  # wildcard path: reads every matching FHV file
)

In [4]:
# Create schema for FHV taxi data

from pyspark.sql.types import *

# Define schema for columns of Fhv trips csv file

# (column name, Spark type) pairs, listed in the order the schema declares them.
_fhv_trip_columns = [
    ("Pickup_DateTime", TimestampType()),
    ("DropOff_datetime", TimestampType()),
    ("PUlocationID", IntegerType()),
    ("DOlocationID", IntegerType()),
    ("SR_Flag", IntegerType()),
    ("Dispatching_base_number", StringType()),
    ("Dispatching_base_num", StringType()),
]

# Every field is declared nullable.
fhv_taxi_trips_schema = StructType(
    [StructField(column_name, column_type, True) for column_name, column_type in _fhv_trip_columns]
)

In [5]:
# Apply the explicit schema to the FHV taxi data.
# The 'header' option must be enabled here as well: without it the CSV header
# line is parsed as a data row (its text does not match the typed columns) and
# that bogus row would be carried into everything derived from this DataFrame.

fhv_trips_df = spark \
              .read \
              .option('header', 'true') \
              .schema(fhv_taxi_trips_schema) \
              .csv('/mnt/storage/fhv_tripdata_2018-12.csv')

### Creating a new unmanaged Delta table and writing the DataFrame to it to optimize processing time

In [7]:
# Save the trips in Delta format at an explicit storage path,
# overwriting whatever was previously stored there.
(
    fhv_trips_df.write
    .format('delta')
    .mode('overwrite')
    .save('/mnt/storage/fhv_trips')
)

In [8]:
# Rebind fhv_trips_df to the Delta copy so later steps read from it.
fhv_trips_df = (
    spark.read
    .format('delta')
    .load('/mnt/storage/fhv_trips')
)

In [9]:
# Clean the trips: drop rows without a pickup or drop-off location, remove
# exact duplicate rows, and keep only trips that fall inside December 2018.
# The upper bound is exclusive on 2019-01-01: a bare date literal compares as
# midnight, so "<= '2018-12-31'" would discard every trip ending during 31 Dec.
fhv_trips_df = fhv_trips_df \
                .dropna(subset=["PULocationID", "DOLocationID"]) \
                .drop_duplicates() \
                .where("Pickup_DateTime >= '2018-12-01' AND DropOff_datetime < '2019-01-01'")

In [10]:
# fhv_trips_df.count()

In [11]:
# Print the column names and types of the filtered trips DataFrame.
fhv_trips_df.printSchema()

In [12]:
# Keep only the columns that are still needed; "Dispatching_base_num" is
# redundant and is therefore left out of the selection.

columns_to_keep = [
    "Pickup_DateTime",
    "DropOff_datetime",
    "PULocationID",
    "DOLocationID",
    "SR_Flag",
    "Dispatching_base_number",
]

fhv_trips_df = fhv_trips_df.select(*columns_to_keep)

fhv_trips_df.printSchema()

Alternatively, the cell above could have removed the redundant column directly with `fhv_trips_df.drop("Dispatching_base_num")` instead of selecting the remaining columns.

In [14]:
from pyspark.sql.functions import col

# Rename the pickup timestamp while selecting; the other columns are passed
# through by name.
selected_columns = [
    col("Pickup_DateTime").alias("PickupTime"),
    "DropOff_DateTime",
    "PUlocationID",
    "DOlocationID",
    "SR_Flag",
    "Dispatching_base_number",
]

fhv_trips_df = fhv_trips_df.select(*selected_columns)

fhv_trips_df.printSchema()

In [15]:
# (current name, new name) pairs, applied one after another in this order.
column_renames = [
    ("DropOff_DateTime", "DropTime"),
    ("PUlocationID", "PickupLocationId"),
    ("DOlocationID", "DropLocationId"),
    ("Dispatching_base_number", "BaseLicenseNumber"),
]

for current_name, new_name in column_renames:
    fhv_trips_df = fhv_trips_df.withColumnRenamed(current_name, new_name)

In [16]:
# Print the schema again to check the renamed columns.
fhv_trips_df.printSchema()

In [18]:
from pyspark.sql.functions import year, month, dayofmonth

# All three date parts are derived from the pickup timestamp.
pickup_time = col("PickupTime")

# TripYear and TripMonth are added with withColumn; TripDay is appended through
# a select that keeps every existing column ('*') plus the new aliased column.
fhv_trips_df = (
    fhv_trips_df
    .withColumn('TripYear', year(pickup_time))
    .withColumn('TripMonth', month(pickup_time))
    .select('*', dayofmonth(pickup_time).alias('TripDay'))
)

In [19]:
from pyspark.sql.functions import unix_timestamp, round

# NOTE: `round` is pyspark's column function here; this import shadows Python's
# built-in round() for every cell that runs after this one.

# Trip duration in seconds: drop-off time minus pickup time.
trip_duration_seconds = unix_timestamp("DropTime") - unix_timestamp("PickupTime")

# Convert to minutes and round.
fhv_trips_df = fhv_trips_df.withColumn("TripTimeInMinutes", round(trip_duration_seconds / 60))


In [20]:
from pyspark.sql.functions import when

# SR_Flag equal to 1 marks a shared trip; every other value (including null)
# falls through to "SoloTrip".
trip_type = when(col("SR_Flag") == 1, "SharedTrip").otherwise("SoloTrip")

# Add the readable label, then drop the flag column it was derived from.
fhv_trips_df = (
    fhv_trips_df
    .withColumn('TripType', trip_type)
    .drop("SR_Flag")
)

### Reading and transforming FHV bases data

In [22]:
%fs head /mnt/storage/FhvBases.json

In [23]:
from pyspark.sql.types import *

# Defining a complex schema: "Address" and "GeoLocation" are nested structures,
# so each gets its own StructType that is then embedded in the top-level schema.

address_schema = StructType([
    StructField("Building", StringType(), True),
    StructField("Street", StringType(), True),
    StructField("City", StringType(), True),
    StructField("State", StringType(), True),
    StructField("Postcode", StringType(), True),
])

geolocation_schema = StructType([
    StructField("Latitude", StringType(), True),
    StructField("Longitude", StringType(), True),
    StructField("Location", StringType(), True),
])

# Top-level fields; all are declared nullable.
fhv_bases_schema = StructType([
    StructField("License Number", StringType(), True),
    StructField("Entity Name", StringType(), True),
    StructField("Telephone Number", LongType(), True),
    StructField("SHL Endorsed", StringType(), True),
    StructField("Type of Base", StringType(), True),
    StructField("Address", address_schema, True),
    StructField("GeoLocation", geolocation_schema, True),
])

In [24]:
# Read the FHV bases JSON using the schema defined for it.
# Applying a schema does not raise an error when the file and schema differ:
#   - a field declared in the schema but absent from the JSON is set to null;
#   - a field present in the JSON but absent from the schema is ignored.

fhv_bases_df = (
    spark.read
    .schema(fhv_bases_schema)
    .option('multiline', 'true')
    .json('/mnt/storage/FhvBases.json')
)

# display(fhv_bases_df)

In [25]:
# (source column, output name) pairs. Dotted sources such as "Address.City"
# reach into the nested Address struct, so the result is a flat set of columns.
base_column_aliases = [
    ("License Number", "BaseLicenseNumber"),
    ("Type of Base", "BaseType"),
    ("Address.Building", "AddressBuilding"),
    ("Address.Street", "AddressStreet"),
    ("Address.City", "AddressCity"),
    ("Address.State", "AddressState"),
    ("Address.Postcode", "AddressPostCode"),
]

fhv_bases_df = fhv_bases_df.select(
    *[col(source_column).alias(output_name) for source_column, output_name in base_column_aliases]
)

### Merging the two DataFrames

In [27]:
# Inner join on the shared "BaseLicenseNumber" column: only trips whose base
# license number also appears in the bases data are kept.
fhv_trips_data_with_bases_df = fhv_trips_df.join(
    fhv_bases_df,
    on="BaseLicenseNumber",
    how="inner",
)

In [28]:
# Show the joined trips-with-bases DataFrame in the notebook output.
display(fhv_trips_data_with_bases_df)

BaseLicenseNumber,PickupTime,DropTime,PickupLocationId,DropLocationId,TripYear,TripMonth,TripDay,TripTimeInMinutes,TripType,BaseType,AddressBuilding,AddressStreet,AddressCity,AddressState,AddressPostCode
B02510,2018-12-20T11:09:58.000+0000,2018-12-20T12:11:16.000+0000,255,186,2018,12,20,61.0,SharedTrip,BLACK CAR BASE,31-00,47 AVENUE SUITE 4123A,LIC,NY,11101
B02875,2018-12-21T23:51:44.000+0000,2018-12-22T00:18:17.000+0000,114,166,2018,12,21,27.0,SoloTrip,BLACK CAR BASE,636,WEST 28 STREET,NEW YORK,NY,10001
B02875,2018-12-14T20:15:22.000+0000,2018-12-14T20:23:40.000+0000,42,41,2018,12,14,8.0,SoloTrip,BLACK CAR BASE,636,WEST 28 STREET,NEW YORK,NY,10001
B02875,2018-12-10T22:33:06.000+0000,2018-12-10T22:45:54.000+0000,237,7,2018,12,10,13.0,SharedTrip,BLACK CAR BASE,636,WEST 28 STREET,NEW YORK,NY,10001
B02875,2018-12-12T23:08:57.000+0000,2018-12-12T23:12:59.000+0000,33,33,2018,12,12,4.0,SoloTrip,BLACK CAR BASE,636,WEST 28 STREET,NEW YORK,NY,10001
B02875,2018-12-17T09:41:54.000+0000,2018-12-17T10:07:23.000+0000,178,76,2018,12,17,25.0,SoloTrip,BLACK CAR BASE,636,WEST 28 STREET,NEW YORK,NY,10001
B02875,2018-12-17T12:15:00.000+0000,2018-12-17T12:49:33.000+0000,61,39,2018,12,17,35.0,SharedTrip,BLACK CAR BASE,636,WEST 28 STREET,NEW YORK,NY,10001
B02875,2018-12-16T11:50:00.000+0000,2018-12-16T12:31:28.000+0000,72,132,2018,12,16,41.0,SharedTrip,BLACK CAR BASE,636,WEST 28 STREET,NEW YORK,NY,10001
B02875,2018-12-22T18:17:13.000+0000,2018-12-22T18:32:06.000+0000,17,188,2018,12,22,15.0,SoloTrip,BLACK CAR BASE,636,WEST 28 STREET,NEW YORK,NY,10001
B02875,2018-12-11T12:40:49.000+0000,2018-12-11T13:12:11.000+0000,142,114,2018,12,11,31.0,SoloTrip,BLACK CAR BASE,636,WEST 28 STREET,NEW YORK,NY,10001


In [29]:
# Print the column names and types of the joined DataFrame.
fhv_trips_data_with_bases_df.printSchema()

### Generating the report

In [31]:
from pyspark.sql.functions import sum

# Python's built-in sum function won't work as expected in the aggregation below.
# The pyspark sum function is needed: it takes a column as its argument and sums
# the values in it. NOTE: this import shadows the built-in sum() for every cell
# that runs after this one.

# Total trip time per (city, base type), sorted by the same keys.
# The aggregate is named with .alias() rather than by renaming the
# auto-generated "sum(TripTimeInMinutes)" column afterwards: withColumnRenamed
# silently does nothing when the generated name does not match exactly.
fhv_trips_report = fhv_trips_data_with_bases_df \
                    .groupBy(["AddressCity", "BaseType"]) \
                    .agg(sum("TripTimeInMinutes").alias("TotalTripTime")) \
                    .orderBy(["AddressCity", "BaseType"])