In [6]:
pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.4.tar.gz (317.3 MB)
[K     |████████████████████████████████| 317.3 MB 187 kB/s eta 0:00:011
[?25hCollecting py4j==0.10.9.7
  Downloading py4j-0.10.9.7-py2.py3-none-any.whl (200 kB)
[K     |████████████████████████████████| 200 kB 32.6 MB/s eta 0:00:01
[?25hBuilding wheels for collected packages: pyspark
  Building wheel fodone
[?25h  Created wheel for pyspark: filename=pyspark-3.5.4-py2.py3-none-any.whl size=317849789 sha256=de04f077a97a37dcbee8ba1c19a4813cd5e0aa4e3d9fc11a2fadcc8deac85d93
  Stored in directory: /root/.cache/pip/wheels/b9/0f/0a/1bf9096f5b49f278182d7fe905a82209f2090edb24a7352b72
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.7 pyspark-3.5.4
Note: you may need to restart the kernel to use updated packages.


In [8]:
from pyspark.sql import SparkSession

# Create the Spark session (or reuse one that is already running).
spark = (
    SparkSession.builder
    .appName("NYC_Taxi_Analysis")
    .getOrCreate()
)

# Load the trip records from the Parquet file.
trips_path = "/opt/spark/yellow_tripdata_2024-01.parquet"
df = spark.read.parquet(trips_path)

# Preview the first five rows.
df.show(n=5)


+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|VendorID|tpep_pickup_datetime|tpep_dropoff_datetime|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|Airport_fee|
+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|       2| 2024-01-01 00:57:55|  2024-01-01 01:17:43|              1|         1.72|         1|                 N|         186|          79|           2|       17.7|  1.0|    0.5|       0.

In [12]:
# Narrow the frame to the pickup/dropoff timestamps and the trip distance.
kept_columns = ["tpep_pickup_datetime", "tpep_dropoff_datetime", "trip_distance"]
df = df.select(*kept_columns)

# Preview the first five rows.
df.show(n=5)

+--------------------+---------------------+-------------+
|tpep_pickup_datetime|tpep_dropoff_datetime|trip_distance|
+--------------------+---------------------+-------------+
| 2024-01-01 00:57:55|  2024-01-01 01:17:43|         1.72|
| 2024-01-01 00:03:00|  2024-01-01 00:09:36|          1.8|
| 2024-01-01 00:17:06|  2024-01-01 00:35:01|          4.7|
| 2024-01-01 00:36:38|  2024-01-01 00:44:56|          1.4|
| 2024-01-01 00:46:51|  2024-01-01 00:52:57|          0.8|
+--------------------+---------------------+-------------+
only showing top 5 rows



In [14]:
from pyspark.sql.functions import col, to_timestamp

# Convert the pickup/dropoff columns to timestamps under shorter names,
# then discard the original columns.
df = (
    df
    .withColumn("pickup_time", to_timestamp("tpep_pickup_datetime"))
    .withColumn("dropoff_time", to_timestamp("tpep_dropoff_datetime"))
    .drop("tpep_pickup_datetime", "tpep_dropoff_datetime")
)
df.show(n=5)

+-------------+-------------------+-------------------+
|trip_distance|        pickup_time|       dropoff_time|
+-------------+-------------------+-------------------+
|         1.72|2024-01-01 00:57:55|2024-01-01 01:17:43|
|          1.8|2024-01-01 00:03:00|2024-01-01 00:09:36|
|          4.7|2024-01-01 00:17:06|2024-01-01 00:35:01|
|          1.4|2024-01-01 00:36:38|2024-01-01 00:44:56|
|          0.8|2024-01-01 00:46:51|2024-01-01 00:52:57|
+-------------+-------------------+-------------------+
only showing top 5 rows



In [15]:
# Remove abnormal records: keep only trips whose distance and passenger
# count are both greater than zero.
# NOTE(review): passenger_count is not among the columns kept by the earlier
# select cell, yet the recorded output shows this filter ran — confirm that
# relying on Spark resolving the column from the source data is intended.
has_positive_distance = col("trip_distance") > 0
has_passengers = col("passenger_count") > 0
df = df.where(has_positive_distance & has_passengers)

# Check the result.
df.show(n=5)

+-------------+-------------------+-------------------+
|trip_distance|        pickup_time|       dropoff_time|
+-------------+-------------------+-------------------+
|         1.72|2024-01-01 00:57:55|2024-01-01 01:17:43|
|          1.8|2024-01-01 00:03:00|2024-01-01 00:09:36|
|          4.7|2024-01-01 00:17:06|2024-01-01 00:35:01|
|          1.4|2024-01-01 00:36:38|2024-01-01 00:44:56|
|          0.8|2024-01-01 00:46:51|2024-01-01 00:52:57|
+-------------+-------------------+-------------------+
only showing top 5 rows



In [16]:
# Compute the average trip duration.

from pyspark.sql.functions import unix_timestamp, avg

# Per-trip duration: dropoff Unix timestamp minus pickup Unix timestamp.
dropoff_epoch = unix_timestamp(col("dropoff_time"))
pickup_epoch = unix_timestamp(col("pickup_time"))
df = df.withColumn("trip_duration", dropoff_epoch - pickup_epoch)

# Aggregate to a single value and pull it out of the one-row result.
avg_duration = df.agg(avg("trip_duration")).first()[0]
print(f"평균 이동 시간: {avg_duration} 초")


평균 이동 시간: 944.0489715314447 초


In [18]:
# Compute the average trip distance.

# Aggregate to a single value and pull it out of the one-row result.
avg_distance = df.agg(avg("trip_distance")).first()[0]
print(f"평균 이동 거리: {avg_distance} 마일")


평균 이동 거리: 3.3032988599662114 마일


In [19]:
# Persist the processed frame as Parquet, replacing any existing output.
df.write.parquet("output/nyc_taxi_analysis.parquet", mode="overwrite")

                                                                                