In [1]:
# Read the yellow_tripdata_2018-12 CSV from the data lake. The first row is
# used as the header and column types are inferred from the data.
yellow_taxi_trips_df = (
    spark.read
    .option('inferSchema', 'true')
    .option('header', 'true')
    .csv('/mnt/datalake/yellow_tripdata_2018-12.csv')
)

In [2]:
# Row count of the DataFrame read from the CSV file.
yellow_taxi_trips_df.count()

### Writing the DataFrame to Delta Lake to optimize processing time

In [4]:
# Persist the DataFrame in Delta format, overwriting whatever is already
# stored at the target path.
(
    yellow_taxi_trips_df.write
    .format("delta")
    .mode("overwrite")
    .save("/mnt/datalake/yellow_taxi_trips")
)

In [5]:
# Re-point the DataFrame at the Delta copy written above, so subsequent cells
# read from Delta rather than from the CSV.
yellow_taxi_trips_df = (
    spark.read
    .format("delta")
    .load("/mnt/datalake/yellow_taxi_trips")
)

In [6]:
# Row count after reloading the data from the Delta location.
yellow_taxi_trips_df.count()

In [7]:
# Summary statistics (count, mean, stddev, min, max) for the two columns that
# the next cell filters on.
display(yellow_taxi_trips_df.describe("passenger_count", "trip_distance"))

summary,passenger_count,trip_distance
count,8173231.0,8173231.0
mean,1.5964102813195908,2.8926264215460558
stddev,1.233920232393633,3.764338945224816
min,0.0,0.0
max,9.0,602.3


In [8]:
print('Before filter ', yellow_taxi_trips_df.count())

# Three equivalent ways to keep only trips with at least one passenger and a
# positive trip distance. Each is applied in turn purely for demonstration;
# applying the same predicate again does not remove any further rows.

# 1. Chained filters, one SQL condition string per call.
#    Do NOT join two condition strings with Python's `and`:
#    ("passenger_count > 0") and ("trip_distance > 0") evaluates to just
#    "trip_distance > 0", so the passenger_count condition is silently dropped.
yellow_taxi_trips_df = (
    yellow_taxi_trips_df
    .filter("passenger_count > 0")
    .filter("trip_distance > 0")
)

# 2. Column expressions combined with the `&` operator (each side parenthesized
#    because `&` binds tighter than `>`).
yellow_taxi_trips_df = yellow_taxi_trips_df.filter(
    (yellow_taxi_trips_df["passenger_count"] > 0)
    & (yellow_taxi_trips_df["trip_distance"] > 0)
)

# 3. A single SQL expression string using SQL's AND.
yellow_taxi_trips_df = yellow_taxi_trips_df.filter(
    "passenger_count > 0 AND trip_distance > 0"
)

print('after filter ', yellow_taxi_trips_df.count())

### The keywords 'where' and 'filter' can be used interchangeably, so all of the following statements are also valid

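- yellow_taxi_trips_df = yellow_taxi_trips_df \
                        .where("passenger_count > 0") \
                        .where("trip_distance > 0")

  (Do not join two condition strings with Python's `and`: `("passenger_count > 0") and ("trip_distance > 0")` evaluates to just `"trip_distance > 0"`, so the first condition is silently dropped.)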

- yellow_taxi_trips_df = yellow_taxi_trips_df \
                      .where( \
                        (yellow_taxi_trips_df["passenger_count"] > 0) & (yellow_taxi_trips_df["trip_distance"] > 0) \
                        )

- yellow_taxi_trips_df = yellow_taxi_trips_df \
                        .where(
                               ("passenger_count > 0 AND trip_distance > 0")
                          )

- (Scala API only; `$"column"` is not valid Python) yellowTaxiTripDataDF = yellowTaxiTripDataDF
                          .where("passenger_count > 0")
                          .filter($"trip_distance" > 0.0)
                          
You can refer to columns in any of the following ways:

- "passenger_count > 0"

- $"passenger_count" > 0 (Scala only)

- col("passenger_count") > 0

- yellow_taxi_trips_df["passenger_count"] > 0

In [10]:
# Report the row count on either side of dropping trips that are missing a
# pickup or drop-off location id.
print('Before filter ', yellow_taxi_trips_df.count())

yellow_taxi_trips_df = yellow_taxi_trips_df.dropna(subset=["PULocationID", "DOLocationID"])

print('After filter ', yellow_taxi_trips_df.count())

In [11]:
# RatecodeID in the yellow and green taxi data tells whether a trip was solo,
# shared, or a trip to a specific airport, whereas FHV trip data only carries a
# 0/1 flag for solo vs. shared. Because the goal is to merge yellow, green and
# FHV trips, missing RatecodeID values are converted to 1.
# Background on the data:
# https://app.pluralsight.com/course-player?clipId=911371e0-0d10-4470-a688-9852e9440c94

display(yellow_taxi_trips_df.describe("payment_type", "RatecodeID"))

summary,payment_type,RatecodeID
count,7997713.0,7997713.0
mean,1.3206525415453143,1.046411392856933
stddev,0.4843547885442655,0.4750658059927383
min,1.0,1.0
max,4.0,99.0


In [12]:
# Replacement values for nulls, keyed by column name.
default_vals_dict = dict(payment_type=5, RatecodeID=1)

In [13]:
# Replace nulls column by column using the defaults in default_vals_dict.
yellow_taxi_trips_df = yellow_taxi_trips_df.fillna(default_vals_dict)

In [14]:
print('Before filter ', yellow_taxi_trips_df.count())

# Drop duplicate rows. (The original statement ended with a dangling `\` line
# continuation that only parsed because the following line was blank.)
yellow_taxi_trips_df = yellow_taxi_trips_df.drop_duplicates()

print('After filter ', yellow_taxi_trips_df.count())

In [15]:
# Intentionally disabled: the code below is wrapped in a string literal so it
# does not run. It is kept only as an example of a date filter that does not
# work, because the dates inside the SQL expression are not quoted.
'''

print('Before filter ', yellow_taxi_trips_df.count())

yellow_taxi_trips_df = yellow_taxi_trips_df \
                        .filter(
                                "tpep_pickup_datetime >= 2018-12-01 AND tpep_dropoff_datetime <= 2018-12-31" 
                            )

print('After filter ', yellow_taxi_trips_df.count())
'''

## Important thing to note in the above command about date range

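- The above command will NOT filter the date range properly: the 'After filter ' count was 0. Without quotes, Spark SQL parses `2018-12-01` as the arithmetic expression 2018 - 12 - 1 rather than as a date.
- Instead, enclose each date in single quotes, as in the command below.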

In [17]:
# Replacement values for nulls, keyed by column name.
default_vals_dict = {
  "payment_type": 5,
  "RatecodeID": 1
}

print('Before filter ', yellow_taxi_trips_df.count())

# All cleaning steps from the previous cells, chained into one expression.
yellow_taxi_trips_df = (
    yellow_taxi_trips_df
    # Keep trips with at least one passenger and a positive distance.
    .filter("passenger_count > 0 AND trip_distance > 0")
    # Drop trips that are missing a pickup or drop-off location id.
    .dropna(subset=["PULocationID", "DOLocationID"])
    # Fill remaining nulls with the per-column defaults.
    .fillna(default_vals_dict)
    .drop_duplicates()
    # Dates must be single-quoted inside the SQL expression. The upper bound is
    # the half-open `< '2019-01-01'` rather than `<= '2018-12-31'`: the latter
    # compares against midnight at the START of Dec 31 and therefore excludes
    # trips dropped off during that day.
    .filter("tpep_pickup_datetime >= '2018-12-01' AND tpep_dropoff_datetime < '2019-01-01'")
)

print('After filter ', yellow_taxi_trips_df.count())

### All operations chained together (Yellow Taxi)

In [19]:
# Replacement values for nulls, keyed by column name.
default_vals_dict = {
  "payment_type": 5,
  "RatecodeID": 1
}

# Start again from the Delta copy so this cell does not depend on the
# incremental filtering done in earlier cells.
yellow_taxi_trips_df = spark.read.format("delta").load("/mnt/datalake/yellow_taxi_trips")

yellow_taxi_trips_df = (
    yellow_taxi_trips_df
    # Keep trips with at least one passenger and a positive distance.
    .filter("passenger_count > 0 AND trip_distance > 0")
    # Drop trips that are missing a pickup or drop-off location id.
    .dropna(subset=["PULocationID", "DOLocationID"])
    # Fill remaining nulls with the per-column defaults.
    .fillna(default_vals_dict)
    .drop_duplicates()
    # Dates must be single-quoted inside the SQL expression. The upper bound is
    # the half-open `< '2019-01-01'` rather than `<= '2018-12-31'`: the latter
    # compares against midnight at the START of Dec 31 and therefore excludes
    # trips dropped off during that day.
    .filter("tpep_pickup_datetime >= '2018-12-01' AND tpep_dropoff_datetime < '2019-01-01'")
)

In [20]:
from pyspark.sql.functions import col
from pyspark.sql.functions import year, month, dayofmonth
from pyspark.sql.functions import unix_timestamp, round
from pyspark.sql.functions import when

# Applying transformations

print("Starting transformation on Yellow Taxi data")

# Trip length in minutes: difference of the two Unix timestamps (seconds)
# divided by 60, then rounded. `round` here is the pyspark.sql.functions one
# imported above, not the Python builtin.
trip_duration_minutes = round(
    (unix_timestamp('dropoff_time') - unix_timestamp('pickup_time')) / 60
)

# "Switch case" on the rate code: 6 maps to a shared trip, everything else to a
# solo trip.
trip_type_label = (
    when(col('rate_code_id') == 6, 'shared_trip')
    # There is no rate_code_id value of 1000. Have included for "switch case" example
    .when(col('rate_code_id') == 1000, 'shared_trip')
    .otherwise('solo_trip')
)

yellow_taxi_trips_df = (
    yellow_taxi_trips_df
    # Keep only the columns needed for the fact table, renamed to snake_case.
    .select(
        col('VendorID').alias('vendor_id'),
        col('tpep_pickup_datetime').alias('pickup_time'),
        col('tpep_dropoff_datetime').alias('dropoff_time'),
        col('trip_distance'),
        col('PULocationID').alias('pickup_loc'),
        col('DOLocationID').alias('dropoff_loc'),
        col('RatecodeID').alias('rate_code_id'),
        col('total_amount'),
        col('payment_type'),
    )
    # Date parts derived from the pickup time.
    .withColumn("trip_year", year('pickup_time'))
    .withColumn("trip_month", month('pickup_time'))
    .withColumn("trip_day", dayofmonth('pickup_time'))
    .withColumn("trip_duration", trip_duration_minutes)
    .withColumn('trip_type', trip_type_label)
    # The rate code is only needed to derive trip_type.
    .drop('rate_code_id')
)

print("Applied transformations on Yellow Taxi data")

### Creating a global temp view so the DataFrame is accessible from other notebooks (currently this data is not used in other notebooks, but it will be in the future)

In [22]:
# Register the transformed DataFrame as a global temp view named
# 'fact_yellow_taxi_trip_data', replacing any existing view with that name.
# NOTE(review): presumably queried from other notebooks as
# global_temp.fact_yellow_taxi_trip_data; confirm against the consumers.
yellow_taxi_trips_df.createOrReplaceGlobalTempView('fact_yellow_taxi_trip_data')

print("Saved Yellow Taxi fact as a global temp view")

In [23]:
print("Starting to save Yellow Taxi dataframe as a fact and unmanaged table")

# Write the fact data in Delta format to an explicit 'path' and register it as
# taxi_service_warehouse.fact_yellow_taxi_trips_data, overwriting existing data.
# NOTE(review): assumes the taxi_service_warehouse database already exists.
(
    yellow_taxi_trips_df.write
    .mode('overwrite')
    .format('delta')
    .option('path', '/mnt/datalake/dimensional_model/facts/yellow_taxi_fact.delta')
    .saveAsTable('taxi_service_warehouse.fact_yellow_taxi_trips_data')
)

print("Saved Yellow Taxi dataframe as a fact and unmanaged table")