In [2]:
from pyspark.sql import SparkSession
from datetime import datetime
import pandas as pd

In [3]:
spark = (
    SparkSession.builder.appName("TLC DataSet Analyze")
    .master("local[*]")
    .config("spark.driver.host", "localhost")
    .config("spark.driver.port", "7077")
    .config("spark.ui.port", "4050")
    .config("spark.driver.memory", "15g")
    .config("spark.executor.memory", "15g")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/08/04 16:05:14 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [6]:
def read_data(file_name, date):
    file_path = f"/Users/admin/Desktop/docker/W5M1/parquet_files/{file_name}_tripdata_{date}.parquet"  # 로컬
    print(f"Attempting to read from: {file_path}")

    try:
        read_df = spark.read.parquet(file_path)
        return read_df
    except Exception:
        print(f"File not found: {file_path}")
        return None


start_date = "2022-05-01"
start_date = datetime.strptime(start_date, "%Y-%m-%d")

end_date = pd.Timestamp(start_date) + pd.offsets.MonthEnd(1)
end_date_str = end_date.strftime("%Y-%m-%d")
end_date = datetime.strptime(end_date_str, "%Y-%m-%d")


def clean_data(row):
    try:
        (
            tpep_pickup_datetime,
            tpep_dropoff_datetime,
            trip_distance,
            total_amount,
            congestion_surcharge,
            airport_fee,
        ) = row

        if any(
            [
                tpep_pickup_datetime is None,
                tpep_dropoff_datetime is None,
                trip_distance is None,
                total_amount is None,
                congestion_surcharge is None,
                airport_fee is None,
            ]
        ):
            return False

        if isinstance(tpep_pickup_datetime, str):
            pickup_datetime = datetime.strptime(
                tpep_pickup_datetime, "%Y-%m-%d %H:%M:%S"
            )
        else:
            pickup_datetime = tpep_pickup_datetime

        if isinstance(tpep_dropoff_datetime, str):
            dropoff_datetime = datetime.strptime(
                tpep_dropoff_datetime, "%Y-%m-%d %H:%M:%S"
            )
        else:
            dropoff_datetime = tpep_dropoff_datetime

        if (
            pickup_datetime < start_date
            or pickup_datetime > end_date
            or dropoff_datetime < start_date
            or dropoff_datetime > end_date
        ):
            return False

        if (
            trip_distance < 0
            or total_amount < 0
            or congestion_surcharge < 0
            or airport_fee < 0
        ):
            return False

        return True

    except Exception as e:
        print(f"Error processing row {row}: {e}")
        return False

In [7]:
categories = ["yellow"]

months = (
    pd.date_range(start="2022-05-01", end="2022-06-01", freq="MS")
    .strftime("%Y-%m")
    .tolist()
)

df = None
for category in categories:
    for month in months:
        print(">>>>>", category, month)
        read_df = read_data(category, month)
        if read_df is not None:
            if df is None:
                df = read_df
            else:
                df = df.union(read_df)

>>>>> yellow 2022-05
Attempting to read from: /Users/admin/Desktop/docker/W5M1/parquet_files/yellow_tripdata_2022-05.parquet
>>>>> yellow 2022-06
Attempting to read from: /Users/admin/Desktop/docker/W5M1/parquet_files/yellow_tripdata_2022-06.parquet


In [8]:
rdd = df.rdd.map(
    lambda row: (
        row.tpep_pickup_datetime,
        row.tpep_dropoff_datetime,
        row.trip_distance,
        row.total_amount,
        row.congestion_surcharge,
        row.airport_fee,
    )
)

In [9]:
filtered_rdd = rdd.filter(clean_data)

In [10]:
filtered_rdd.cache()

PythonRDD[13] at RDD at PythonRDD.scala:53

# 총 여행 수 계산

In [11]:
print(f"Total trip count sum: {filtered_rdd.count()}")



Total trip count sum: 3327695


                                                                                

# 총 수익(요금의 합계) 계산

In [12]:
def calculate_totals(row):
    _, _, _, total_amount, congestion_surcharge, airport_fee = row
    return total_amount + congestion_surcharge + airport_fee


total_fee_sum = filtered_rdd.map(calculate_totals).reduce(lambda x, y: x + y)
print(f"Total fee sum: {total_fee_sum}")



Total fee sum: 81259348.9878806


                                                                                

# 평균 여행 거리 계산

In [13]:
trip_distances = filtered_rdd.map(lambda row: row[2])

total_distance = trip_distances.sum()
count = trip_distances.count()

average_trip_distance = total_distance / count if count > 0 else 0

print(f"Average Trip Distance: {average_trip_distance}")



Average Trip Distance: 3.6133158026799284


                                                                                

# 하루의 여행 수, 하루의 총 수익 계산

In [14]:
date_trip_cost_pairs = filtered_rdd.map(
    lambda row: (
        row[0].strftime("%Y-%m-%d"),
        (1, row[3]),
    )
)

date_grouped = date_trip_cost_pairs.reduceByKey(
    lambda a, b: (a[0] + b[0], a[1] + b[1])
)

date_sorted = date_grouped.map(
    lambda x: (datetime.strptime(x[0], "%Y-%m-%d"), x[1])
).sortByKey()

grouped_rdd = date_sorted.map(lambda x: (x[0].strftime("%Y-%m-%d"), x[1]))

for date, (total_trips, total_cost) in grouped_rdd.collect():
    print(f"Date: {date}, Total Trips: {total_trips}, Total Cost: {total_cost}")



Date: 2022-05-01, Total Trips: 94569, Total Cost: 2178003.570001049
Date: 2022-05-02, Total Trips: 90764, Total Cost: 1992913.0900013102
Date: 2022-05-03, Total Trips: 115554, Total Cost: 2457363.950000222
Date: 2022-05-04, Total Trips: 120469, Total Cost: 2530621.289999993
Date: 2022-05-05, Total Trips: 126359, Total Cost: 2721021.469999344
Date: 2022-05-06, Total Trips: 120289, Total Cost: 2579383.0299997
Date: 2022-05-07, Total Trips: 111684, Total Cost: 2218870.5300010415
Date: 2022-05-08, Total Trips: 98581, Total Cost: 2160394.910001212
Date: 2022-05-09, Total Trips: 103954, Total Cost: 2282400.10000089
Date: 2022-05-10, Total Trips: 116799, Total Cost: 2487436.930000178
Date: 2022-05-11, Total Trips: 122745, Total Cost: 2610444.0899997544
Date: 2022-05-12, Total Trips: 127715, Total Cost: 2775632.239999156
Date: 2022-05-13, Total Trips: 124026, Total Cost: 2733966.319999157
Date: 2022-05-14, Total Trips: 119582, Total Cost: 2528301.309999959
Date: 2022-05-15, Total Trips: 101297

                                                                                

In [None]:
spark.stop()