In [1]:
from pyspark import SparkConf
from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import isnan, when, count, col, isnull, avg, min
import pyspark.sql.functions as F
from operator import add

In [2]:
spark = SparkSession.builder \
    .master('spark://spark-master:7077') \
    .appName('W5M1') \
    .config('spark.executor.memory', '8gb') \
    .config("spark.executor.cores", "5") \
    .getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


24/08/04 07:58:51 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
input_file_path = 'hdfs://spark-master:9000/user/hduser/hdfs_data/fhvhv_tripdata_2023-01.parquet'
output_dir_path = 'hdfs://spark-master:9000/user/spark_user/W5M1_output/'
ext='parquet'
name="TLC-2023-01"

# Data Loading
- Your application should be able to read the dataset from a specified path.
- It should handle different formats (e.g., CSV, Parquet) as specified by the user.

In [4]:
def load_data_rdd(spark_session, file_path, extension, name):
    if extension=="csv":
        data_rdd = spark_session.read.csv(file_path).rdd
    elif extension=="parquet":
        data_rdd = spark_session.read.parquet(file_path).rdd
    else:
        raise NotImpelentedError
    data_rdd.setName(name)
    return data_rdd

In [5]:
data_rdd = load_data_rdd(spark, input_file_path, ext, name)
data_rdd.take(1)

                                                                                

[Row(hvfhs_license_num='HV0003', dispatching_base_num='B03404', originating_base_num='B03404', request_datetime=datetime.datetime(2023, 1, 1, 0, 18, 6), on_scene_datetime=datetime.datetime(2023, 1, 1, 0, 19, 24), pickup_datetime=datetime.datetime(2023, 1, 1, 0, 19, 38), dropoff_datetime=datetime.datetime(2023, 1, 1, 0, 48, 7), PULocationID=48, DOLocationID=68, trip_miles=0.94, trip_time=1709, base_passenger_fare=25.95, tolls=0.0, bcf=0.78, sales_tax=2.3, congestion_surcharge=2.75, airport_fee=0.0, tips=5.22, driver_pay=27.83, shared_request_flag='N', shared_match_flag='N', access_a_ride_flag=' ', wav_request_flag='N', wav_match_flag='N')]

# Data Cleaning
- The application should identify and remove any rows with missing or invalid data.

In [6]:
def remove_row_w_none_val(row):
    for val in row:
        if val is None:
            return
    return row

In [7]:
print("Before data cleaning: ", data_rdd.count())
data_rdd = data_rdd.filter(lambda row: remove_row_w_none_val(row))
print("After data cleaning: ", data_rdd.count())

                                                                                

Before data cleaning:  18479031




After data cleaning:  13587039


                                                                                

# Transformation Logic
- Implement filtering to exclude trips with zero or negative fare amounts.

In [8]:
data_rdd.take(1)

                                                                                

[Row(hvfhs_license_num='HV0003', dispatching_base_num='B03404', originating_base_num='B03404', request_datetime=datetime.datetime(2023, 1, 1, 0, 18, 6), on_scene_datetime=datetime.datetime(2023, 1, 1, 0, 19, 24), pickup_datetime=datetime.datetime(2023, 1, 1, 0, 19, 38), dropoff_datetime=datetime.datetime(2023, 1, 1, 0, 48, 7), PULocationID=48, DOLocationID=68, trip_miles=0.94, trip_time=1709, base_passenger_fare=25.95, tolls=0.0, bcf=0.78, sales_tax=2.3, congestion_surcharge=2.75, airport_fee=0.0, tips=5.22, driver_pay=27.83, shared_request_flag='N', shared_match_flag='N', access_a_ride_flag=' ', wav_request_flag='N', wav_match_flag='N')]

In [9]:
def remove_non_positive_fare(row):
    if row.base_passenger_fare > 0:
        return row
    else:
        return

In [10]:
print("Before removing zero or negative fare: ", data_rdd.count())
data_rdd = data_rdd.filter(lambda row: remove_non_positive_fare(row))
print("After removing zero or negative fare: ", data_rdd.count())

                                                                                

Before removing zero or negative fare:  13587039




After removing zero or negative fare:  13573585


                                                                                

- Map the data to extract relevant columns and convert them to appropriate data types.

In [11]:
data_rdd.take(1)

                                                                                

[Row(hvfhs_license_num='HV0003', dispatching_base_num='B03404', originating_base_num='B03404', request_datetime=datetime.datetime(2023, 1, 1, 0, 18, 6), on_scene_datetime=datetime.datetime(2023, 1, 1, 0, 19, 24), pickup_datetime=datetime.datetime(2023, 1, 1, 0, 19, 38), dropoff_datetime=datetime.datetime(2023, 1, 1, 0, 48, 7), PULocationID=48, DOLocationID=68, trip_miles=0.94, trip_time=1709, base_passenger_fare=25.95, tolls=0.0, bcf=0.78, sales_tax=2.3, congestion_surcharge=2.75, airport_fee=0.0, tips=5.22, driver_pay=27.83, shared_request_flag='N', shared_match_flag='N', access_a_ride_flag=' ', wav_request_flag='N', wav_match_flag='N')]

In [12]:
def extract_and_convert_relevant_columns(row):
    return Row(pickup_datetime = row.pickup_datetime.date(), trip_miles = row.trip_miles, base_passenger_fare = row.base_passenger_fare)

In [13]:
data_rdd = data_rdd.map(lambda row: extract_and_convert_relevant_columns(row))

In [14]:
data_rdd.take(1)

[Row(pickup_datetime=datetime.date(2023, 1, 1), trip_miles=0.94, base_passenger_fare=25.95)]

# Aggregation Logic
- Calculate and display the total number of trips.

In [15]:
total_number_of_trips = data_rdd.count()

                                                                                

In [16]:
print(f"total_number_of_trips: {total_number_of_trips} miles")

total_number_of_trips: 13573585 miles


- Calculate and display the total revenue generated from the trips.

In [17]:
total_revenue = data_rdd.map(lambda row: row.base_passenger_fare).reduce(add) 

                                                                                

In [18]:
print(f"total_revenue: {round(total_revenue, 2)}$")

total_revenue: 298184911.98$


- Calculate and display the average trip distance.

In [19]:
average_trip_distance = data_rdd.map(lambda row: row.trip_miles).mean()

                                                                                

In [20]:
print(f"average_trip_distance: round(average_trip_distance, 2) miles")

average_trip_distance: round(average_trip_distance, 2) miles


- Calculate and display the number of trips per day.

In [21]:
number_of_trips_per_day = data_rdd.map(lambda row: (row.pickup_datetime, 1)).reduceByKey(add).sortByKey(lambda row: row.pickup_datetime)

                                                                                

In [22]:
number_of_trips_per_day.take(20)

[(datetime.date(2023, 1, 1), 452841),
 (datetime.date(2023, 1, 2), 288847),
 (datetime.date(2023, 1, 3), 340913),
 (datetime.date(2023, 1, 4), 351779),
 (datetime.date(2023, 1, 5), 368936),
 (datetime.date(2023, 1, 6), 436676),
 (datetime.date(2023, 1, 7), 474931),
 (datetime.date(2023, 1, 8), 399608),
 (datetime.date(2023, 1, 9), 344836),
 (datetime.date(2023, 1, 10), 367237),
 (datetime.date(2023, 1, 11), 394843),
 (datetime.date(2023, 1, 12), 444287),
 (datetime.date(2023, 1, 13), 478722),
 (datetime.date(2023, 1, 14), 545906),
 (datetime.date(2023, 1, 15), 487679),
 (datetime.date(2023, 1, 16), 379179),
 (datetime.date(2023, 1, 17), 398678),
 (datetime.date(2023, 1, 18), 414715),
 (datetime.date(2023, 1, 19), 502311),
 (datetime.date(2023, 1, 20), 506785)]

- Calculate and display the total revenue per day.

In [23]:
total_revenue_per_day = data_rdd.map(lambda row: (row.pickup_datetime, row.base_passenger_fare)).reduceByKey(add).sortByKey(lambda row: row.pickup_datetime)

                                                                                

In [24]:
total_revenue_per_day.take(20)

[(datetime.date(2023, 1, 1), 13180869.880001789),
 (datetime.date(2023, 1, 2), 6566691.43000102),
 (datetime.date(2023, 1, 3), 7637673.190001559),
 (datetime.date(2023, 1, 4), 7892231.450001481),
 (datetime.date(2023, 1, 5), 8216885.190001347),
 (datetime.date(2023, 1, 6), 9354316.220002314),
 (datetime.date(2023, 1, 7), 9932890.770002726),
 (datetime.date(2023, 1, 8), 8647900.30000238),
 (datetime.date(2023, 1, 9), 7649913.430001789),
 (datetime.date(2023, 1, 10), 8134791.630001667),
 (datetime.date(2023, 1, 11), 8822176.180002036),
 (datetime.date(2023, 1, 12), 10586005.440002393),
 (datetime.date(2023, 1, 13), 10793353.590002596),
 (datetime.date(2023, 1, 14), 11416505.760003677),
 (datetime.date(2023, 1, 15), 10324326.790002713),
 (datetime.date(2023, 1, 16), 7964399.760000432),
 (datetime.date(2023, 1, 17), 8907297.920000806),
 (datetime.date(2023, 1, 18), 9323672.600000776),
 (datetime.date(2023, 1, 19), 11575510.32000106),
 (datetime.date(2023, 1, 20), 10750285.800000424)]

# Data Output
- Save the result to a persistent storage (e.g., HDFS, S3, or local file system).

In [26]:
# Save the output as text
result = spark.sparkContext.parallelize([
    f"total_number_of_trips, {total_number_of_trips}",
    f"total_revenue, {total_revenue}",
    f"average_trip_distance, {average_trip_distance}",
])
result.coalesce(1).saveAsTextFile(output_dir_path + "result.txt")

In [27]:
# Save the output as pickle object
number_of_trips_per_day.coalesce(1).saveAsPickleFile(output_dir_path + "number_of_trips_per_day")
total_revenue_per_day.coalesce(1).saveAsPickleFile(output_dir_path + "total_revenue_per_day")

                                                                                