In [41]:
from pyspark.sql import SparkSession
from datetime import datetime
import pandas as pd

In [42]:
spark = (
    SparkSession.builder.appName("TLC-Analyze")
    .master("local[*]")
    .config("spark.driver.host", "localhost")
    .config("spark.driver.port", "7077")
    .config("spark.ui.port", "4050")
    .config("spark.driver.memory", "15g")
    .config("spark.executor.memory", "15g")
    .getOrCreate()
)

In [43]:
def read_data(file_name, date):
    file_path = f"your path"
    print(f"Attempting to read from: {file_path}")

    try:
        read_df = spark.read.parquet(file_path)
        return read_df
    except Exception:
        print(f"File not found: {file_path}")
        return None


start_date = "2024-01-01"
start_date = datetime.strptime(start_date, "%Y-%m-%d")

end_date = pd.Timestamp(start_date) + pd.offsets.MonthEnd(1)
end_date_str = end_date.strftime("%Y-%m-%d")
end_date = datetime.strptime(end_date_str, "%Y-%m-%d")

In [44]:
def clean_data(row):
    try:
        (
            tpep_pickup_datetime,
            tpep_dropoff_datetime,
            trip_distance,
            total_amount,
            congestion_surcharge,
            airport_fee,
        ) = row

        if any(
            [
                tpep_pickup_datetime is None,
                tpep_dropoff_datetime is None,
                trip_distance is None,
                total_amount is None,
                congestion_surcharge is None,
                airport_fee is None,
            ]
        ):
            return False

        if isinstance(tpep_pickup_datetime, str):
            pickup_datetime = datetime.strptime(
                tpep_pickup_datetime, "%Y-%m-%d %H:%M:%S"
            )
        else:
            pickup_datetime = tpep_pickup_datetime

        if isinstance(tpep_dropoff_datetime, str):
            dropoff_datetime = datetime.strptime(
                tpep_dropoff_datetime, "%Y-%m-%d %H:%M:%S"
            )
        else:
            dropoff_datetime = tpep_dropoff_datetime

        if (
            pickup_datetime < start_date
            or pickup_datetime > end_date
            or dropoff_datetime < start_date
            or dropoff_datetime > end_date
        ):
            return False

        if (
            trip_distance < 0
            or total_amount < 0
            or congestion_surcharge < 0
            or airport_fee < 0
        ):
            return False

        return True

    except Exception as e:
        print(f"Error processing row {row}: {e}")
        return False

In [45]:
categories = ["yellow"]

months = (
    pd.date_range(start="2024-01-01", end="2024-02-01", freq="MS")
    .strftime("%Y-%m")
    .tolist()
)

df = None
for category in categories:
    for month in months:
        print(">>>>>", category, month)
        read_df = read_data(category, month)
        if read_df is not None:
            if df is None:
                df = read_df
            else:
                df = df.union(read_df)

>>>>> yellow 2024-01
Attempting to read from: /Users/munsoyun/Desktop/Docker/W5M1/yellow_tripdata_2024-01.parquet
>>>>> yellow 2024-02
Attempting to read from: /Users/munsoyun/Desktop/Docker/W5M1/yellow_tripdata_2024-01.parquet


In [46]:
df.printSchema()

root
 |-- VendorID: integer (nullable = true)
 |-- tpep_pickup_datetime: timestamp_ntz (nullable = true)
 |-- tpep_dropoff_datetime: timestamp_ntz (nullable = true)
 |-- passenger_count: long (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: long (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- payment_type: long (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- Airport_fee: double (nullable = true)



In [47]:
rdd = df.rdd.map(
    lambda row: (
        row.tpep_pickup_datetime,
        row.tpep_dropoff_datetime,
        row.trip_distance,
        row.total_amount,
        row.congestion_surcharge,
        row.Airport_fee,
    )
)

In [48]:
filtered_rdd = rdd.filter(clean_data)

In [49]:
filtered_rdd.cache()

PythonRDD[13] at RDD at PythonRDD.scala:53

### 총 여행 횟수

In [50]:
print(f"Total trip count sum: {filtered_rdd.count()}")



Total trip count sum: 5381996


                                                                                

### 총 요금 합계 

In [51]:
def calculate_totals(row):
    _, _, _, total_amount, congestion_surcharge, airport_fee = row
    return total_amount + congestion_surcharge + airport_fee


total_fee_sum = filtered_rdd.map(calculate_totals).reduce(lambda x, y: x + y)
print(f"Total fee sum: {total_fee_sum}")



Total fee sum: 161401645.26020595


                                                                                

### 평균 여행 거리

In [52]:
trip_distances = filtered_rdd.map(lambda row: row[2])

total_distance = trip_distances.sum()
count = trip_distances.count()

average_trip_distance = total_distance / count if count > 0 else 0

print(f"Average Trip Distance: {average_trip_distance}")



Average Trip Distance: 3.2671553750690645


                                                                                

### 일별 여행 횟수 및 수익

In [53]:
date_trip_cost_pairs = filtered_rdd.map(
    lambda row: (
        row[0].strftime("%Y-%m-%d"),
        (1, row[3]),
    )
)

date_grouped = date_trip_cost_pairs.reduceByKey(
    lambda a, b: (a[0] + b[0], a[1] + b[1])
)

date_sorted = date_grouped.map(
    lambda x: (datetime.strptime(x[0], "%Y-%m-%d"), x[1])
).sortByKey()

grouped_rdd = date_sorted.map(lambda x: (x[0].strftime("%Y-%m-%d"), x[1]))

for date, (total_trips, total_cost) in grouped_rdd.collect():
    print(f"Date: {date}, Total Trips: {total_trips}, Total Cost: {total_cost}")



Date: 2024-01-01, Total Trips: 138580, Total Cost: 4351241.419999732
Date: 2024-01-02, Total Trips: 144550, Total Cost: 4496662.559999745
Date: 2024-01-03, Total Trips: 158684, Total Cost: 4660465.999999845
Date: 2024-01-04, Total Trips: 198036, Total Cost: 5520606.139999841
Date: 2024-01-05, Total Trips: 197592, Total Cost: 5352975.639999876
Date: 2024-01-06, Total Trips: 181404, Total Cost: 4686288.339999873
Date: 2024-01-07, Total Trips: 128000, Total Cost: 3723312.079999747
Date: 2024-01-08, Total Trips: 153778, Total Cost: 4369534.879999866
Date: 2024-01-09, Total Trips: 171184, Total Cost: 4436348.71999997
Date: 2024-01-10, Total Trips: 182018, Total Cost: 4987834.799999964
Date: 2024-01-11, Total Trips: 201012, Total Cost: 5680665.499999807
Date: 2024-01-12, Total Trips: 196144, Total Cost: 5545994.519999802
Date: 2024-01-13, Total Trips: 197626, Total Cost: 5118524.299999842
Date: 2024-01-14, Total Trips: 175590, Total Cost: 4721627.499999809
Date: 2024-01-15, Total Trips: 1465

                                                                                

In [54]:
spark.stop()