1. 필요한 라이브러리 및 SparkSession 초기화

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    avg,
    col,
    count,
    max as _max,
    min as _min,
    sum as _sum,
    to_date,
)

# Create (or reuse) the SparkSession for this analysis, running in local mode.
spark = (
    SparkSession.builder
    .appName("NYC_TLC_Data_Analysis")
    .master("local[*]")
    .getOrCreate()
)

24/08/26 07:35:48 WARN Utils: Your hostname, admins-MacBook-Pro-5.local resolves to a loopback address: 127.0.0.1; using 192.168.219.100 instead (on interface en0)
24/08/26 07:35:48 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/08/26 07:35:49 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/08/26 07:35:49 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


2. 데이터 로딩

In [2]:
# Load the TLC trip-record Parquet file into a DataFrame.
file_path = '/Users/admin/softeer/week5/yellow_tripdata_2024-05.parquet'
df = spark.read.parquet(file_path)

# Print the schema and preview the first five rows to confirm that the data
# was loaded correctly.
df.printSchema()
df.show(n=5)


                                                                                

root
 |-- VendorID: integer (nullable = true)
 |-- tpep_pickup_datetime: timestamp_ntz (nullable = true)
 |-- tpep_dropoff_datetime: timestamp_ntz (nullable = true)
 |-- passenger_count: long (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: long (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- payment_type: long (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- Airport_fee: double (nullable = true)

+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------

3. 데이터 클리닝

In [4]:
# Numeric columns whose value ranges are of interest for data cleaning.
numerical_columns = ['passenger_count', 'trip_distance', 'fare_amount',
                     'extra', 'mta_tax', 'tip_amount', 'tolls_amount',
                     'improvement_surcharge', 'total_amount', 'congestion_surcharge',
                     'Airport_fee']

# Build one Spark MIN and one Spark MAX aggregate expression per column, in
# (min, max) order for each column.
# `_min`/`_max` are pyspark.sql.functions.min/max; the Python builtins applied
# to a column *name* would only return the smallest/largest character of that
# string instead of an aggregate Column expression.
# The resulting list can be passed to an aggregation, e.g.
# `df.select(*minmax_columns)`.
minmax_columns = []

for column_name in numerical_columns:
    minmax_columns.extend((_min(column_name), _max(column_name)))


In [11]:
# Remove rows with null or invalid values in the required columns.
# Stage 1 keeps rows whose pickup/drop-off timestamps are present and whose
# passenger count, trip distance and fare amount are strictly positive.
# Stage 2 keeps rows with trip_distance < 100, fare_amount < 500 and at least
# two passengers.
df_filtered = (
    df
    .filter(
        col('tpep_pickup_datetime').isNotNull()
        & col('tpep_dropoff_datetime').isNotNull()
        & (col('passenger_count') > 0)
        & (col('trip_distance') > 0)
        & (col('fare_amount') > 0)
    )
    .filter(
        (col('trip_distance') < 100)
        & (col('fare_amount') < 500)
        & (col('passenger_count') >= 2)
    )
    .cache()  # mark the cleaned DataFrame for caching; later cells reuse it
)
df_filtered.show(n=5)




+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|VendorID|tpep_pickup_datetime|tpep_dropoff_datetime|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|Airport_fee|
+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|       1| 2024-05-01 00:08:30|  2024-05-01 00:21:42|              2|          2.7|         1|                 N|         231|          68|           1|       15.6|  3.5|    0.5|       4.

                                                                                

In [12]:
# Count the rows that remain after cleaning; as the last expression of the
# cell, the result is displayed by the notebook.
df_filtered.count()

688796

4. 데이터 변환

In [13]:
# 1) Filter: keep only trips with two or more passengers.
df_filtered = df_filtered.filter(col('passenger_count') >= 2)

# 2) Aggregate: per pickup date, compute the number of trips, the average
#    trip distance and the total revenue (sum of fare_amount).
df_aggregated = (
    df_filtered
    .groupBy(to_date(col("tpep_pickup_datetime")).alias("date"))
    .agg(
        count("*").alias("total_trips"),
        avg("trip_distance").alias("avg_trip_distance"),
        _sum("fare_amount").alias("total_revenue"),
    )
)

# 3) Join (optional): if another dataset is available, a join could be added
#    here -- for example, a join with weather data.

df_aggregated.show(n=3)

+----------+-----------+-----------------+------------------+
|      date|total_trips|avg_trip_distance|     total_revenue|
+----------+-----------+-----------------+------------------+
|2024-05-05|      24881|4.081323499859337| 542075.4700000007|
|2024-05-08|      19933|3.983657753474103|456786.17000000156|
|2024-05-09|      22364|4.025778036129481|  515127.359999999|
+----------+-----------+-----------------+------------------+
only showing top 3 rows



5. 액션

In [14]:
# 1) Collect: fetch a five-row sample of the aggregated data and print it.
sample_data = df_aggregated.take(5)
print(sample_data)

# 2) Write: save the final aggregated DataFrame as CSV with a header row,
#    overwriting any previous output at the same location.
output_path = "/Users/admin/softeer/week5/output"
(
    df_aggregated.write
    .mode("overwrite")
    .csv(f"{output_path}/daily_metrics_df", header=True)
)



[Row(date=datetime.date(2024, 5, 5), total_trips=24881, avg_trip_distance=4.081323499859337, total_revenue=542075.4700000007), Row(date=datetime.date(2024, 5, 8), total_trips=19933, avg_trip_distance=3.983657753474103, total_revenue=456786.17000000156), Row(date=datetime.date(2024, 5, 9), total_trips=22364, avg_trip_distance=4.025778036129481, total_revenue=515127.359999999), Row(date=datetime.date(2024, 5, 10), total_trips=24976, avg_trip_distance=3.7462780269058205, total_revenue=544226.6399999999), Row(date=datetime.date(2024, 5, 1), total_trips=19755, avg_trip_distance=4.100026828650983, total_revenue=454812.7599999967)]


6. DAG 최적화

In [15]:
# Use cache() to optimize work that is repeated on the same DataFrame.
# (Previously this was a no-op self-assignment, so no cache() was applied here.)
df_filtered = df_filtered.cache()

# cache() is lazy, so trigger an action to materialize it. count() returns a
# single number to the driver, whereas collect() would transfer every row of
# the DataFrame into driver memory only to discard the result.
df_filtered.count()

# Print the parsed, analyzed, optimized and physical plans to inspect how the
# transformations are executed, then preview two rows.
df_filtered.explain(True)
df_filtered.show(2)

== Parsed Logical Plan ==
'Filter ('passenger_count >= 2)
+- Filter (((trip_distance#4 < cast(100 as double)) AND (fare_amount#10 < cast(500 as double))) AND (passenger_count#3L >= cast(2 as bigint)))
   +- Filter ((((isnotnull(tpep_pickup_datetime#1) AND isnotnull(tpep_dropoff_datetime#2)) AND (passenger_count#3L > cast(0 as bigint))) AND (trip_distance#4 > cast(0 as double))) AND (fare_amount#10 > cast(0 as double)))
      +- Relation [VendorID#0,tpep_pickup_datetime#1,tpep_dropoff_datetime#2,passenger_count#3L,trip_distance#4,RatecodeID#5L,store_and_fwd_flag#6,PULocationID#7,DOLocationID#8,payment_type#9L,fare_amount#10,extra#11,mta_tax#12,tip_amount#13,tolls_amount#14,improvement_surcharge#15,total_amount#16,congestion_surcharge#17,Airport_fee#18] parquet

== Analyzed Logical Plan ==
VendorID: int, tpep_pickup_datetime: timestamp_ntz, tpep_dropoff_datetime: timestamp_ntz, passenger_count: bigint, trip_distance: double, RatecodeID: bigint, store_and_fwd_flag: string, PULocationID: i

7. 지연 평가

In [16]:
# Lazy-evaluation example:
# withColumn is a transformation, so this statement does not run anything;
# nothing is computed until an action is called.
df_lazy = df_filtered.withColumn(
    "new_column",
    col("trip_distance") * 2,
)

# show() is an action: calling it is what executes the transformation above.
df_lazy.show(n=5)


+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+----------+
|VendorID|tpep_pickup_datetime|tpep_dropoff_datetime|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|Airport_fee|new_column|
+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+----------+
|       1| 2024-05-01 00:08:30|  2024-05-01 00:21:42|              2|          2.7|         1|                 N|         231|          68|           1|  

8. 결과 저장

In [17]:
# Save the aggregated daily metrics as CSV with a header row, overwriting any
# previous output at the same location.
output_path = "/Users/admin/softeer/week5/output"
(
    df_aggregated.write
    .mode("overwrite")
    .csv(f"{output_path}/daily_metrics_df", header=True)
)
