## Read File

In [43]:
from pyspark.sql import SparkSession
import os
from operator import add


def read_file_as_rdd(file_path):
    file_extension = os.path.splitext(file_path)[1]
    if file_extension == ".csv":
        df = spark.read.csv(file_path, header=True, inferSchema=True)
    elif file_extension == ".parquet":
        df = spark.read.parquet(file_path)
    else:
        raise ValueError(f"Unsupported file format: {file_extension}")
    return df.rdd


spark = SparkSession.builder.master("local[*]").getOrCreate()
file_path = "./yellow_tripdata.parquet"
rdd = read_file_as_rdd(file_path)

rdd.take(1)

[Row(VendorID=1, tpep_pickup_datetime=datetime.datetime(2024, 5, 1, 0, 59, 15), tpep_dropoff_datetime=datetime.datetime(2024, 5, 1, 1, 23, 50), passenger_count=1, trip_distance=6.1, RatecodeID=1, store_and_fwd_flag='N', PULocationID=138, DOLocationID=145, payment_type=1, fare_amount=28.2, extra=7.75, mta_tax=0.5, tip_amount=5.0, tolls_amount=0.0, improvement_surcharge=1.0, total_amount=42.45, congestion_surcharge=0.0, Airport_fee=1.75)]

## Data Cleaning

In [44]:
# # Apply the cleaning function
rdd.filter(lambda row: row.passenger_count is not None).filter(lambda row: row.passenger_count > 0).filter(
    lambda row: row.tpep_pickup_datetime > row.tpep_dropoff_datetime).filter(lambda row: row.trip_distance >= 0)

trip_num = rdd.count()

## Calculate and display the total revenue generated from the trips.

In [45]:
total_revenue = rdd.map(lambda row: row.total_amount).reduce(add)
total_revenue

105662803.20010114

## Calculate and display the average trip distance.

In [46]:
trip_distance_rdd = rdd.map(lambda row: row.trip_distance)

count = trip_distance_rdd.count()
total = trip_distance_rdd.sum()
average_trip_distance = total / count

## Calculate and display the number of trips per day.

In [47]:
trips_per_day = rdd.map(lambda row: (row.tpep_pickup_datetime.date(), 1)).reduceByKey(
    lambda a, b: a + b).sortByKey().collect()
for date, count in trips_per_day:
    print(f"{date} : {count}")

2002-12-31 : 3
2008-12-31 : 1
2009-01-01 : 3
2024-04-30 : 13
2024-05-01 : 121928
2024-05-02 : 133110
2024-05-03 : 128046
2024-05-04 : 133764
2024-05-05 : 120116
2024-05-06 : 102937
2024-05-07 : 119368
2024-05-08 : 127738
2024-05-09 : 136618
2024-05-10 : 138324
2024-05-11 : 131001
2024-05-12 : 112607
2024-05-13 : 111384
2024-05-14 : 131795
2024-05-15 : 138562
2024-05-16 : 143801
2024-05-17 : 133268
2024-05-18 : 138069
2024-05-19 : 114559
2024-05-20 : 106889
2024-05-21 : 121481
2024-05-22 : 130889
2024-05-23 : 122594
2024-05-24 : 109756
2024-05-25 : 90098
2024-05-26 : 85651
2024-05-27 : 72579
2024-05-28 : 104696
2024-05-29 : 121234
2024-05-30 : 123711
2024-05-31 : 117227
2024-06-01 : 13


## Calculate and display the total revenue per day.

In [48]:
total_revenue_per_day = rdd.map(lambda row: (row.tpep_pickup_datetime.date(), row.total_amount)).reduceByKey(
    lambda a, b: a + b).sortByKey().collect()
for date, total_revenue in total_revenue_per_day:
    print(f"{date} : {total_revenue}")

2002-12-31 : 202.95
2008-12-31 : 83.0
2009-01-01 : 155.22
2024-04-30 : 428.68
2024-05-01 : 3481907.0999998385
2024-05-02 : 3805209.8999997946
2024-05-03 : 3573393.26999984
2024-05-04 : 3412692.2799998536
2024-05-05 : 3365959.059999837
2024-05-06 : 3105151.3999997997
2024-05-07 : 3391454.839999858
2024-05-08 : 3661203.939999871
2024-05-09 : 3945094.7999998135
2024-05-10 : 3866097.7699999185
2024-05-11 : 3394738.189999857
2024-05-12 : 3136695.029999801
2024-05-13 : 3307487.6999997674
2024-05-14 : 3835249.379999875
2024-05-15 : 4067753.279999866
2024-05-16 : 4184016.1399997533
2024-05-17 : 3767827.5599998073
2024-05-18 : 3654415.0899998425
2024-05-19 : 3640467.9599998943
2024-05-20 : 3110160.2999998177
2024-05-21 : 3425682.749999858
2024-05-22 : 3724549.1799998228
2024-05-23 : 3521902.939999827
2024-05-24 : 3133515.0099998526
2024-05-25 : 2359094.25999991
2024-05-26 : 2317680.139999904
2024-05-27 : 2116519.459999892
2024-05-28 : 3097452.5099998014
2024-05-29 : 3425639.439999823
2024-05-30

## Save as File

In [49]:
result_rdd = spark.sparkContext.parallelize([
    f"trip_num= {trip_num}",
    f"total_revenue= {total_revenue})",
    f"trip_distance_rdd= {trip_distance_rdd}",
    f"trips_per_day= {trips_per_day}",
    f"total_revenue_per_day= {total_revenue_per_day}"
])

# 텍스트 파일로 저장
result_rdd.repartition(1).saveAsTextFile("./result.txt")

#TODO :Use appropriate Spark configurations to optimize the job performance.
#TODO :Utilize Spark's built-in functions and capabilities to reduce the job execution time.