In [34]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum, mean, round, unix_timestamp, date_format, to_timestamp, hour, when, avg
from datetime import datetime
import pandas as pd
import numpy as np

In [36]:
spark = (
    SparkSession.builder.appName("TLC DataSet Analyze")
    .master("local[*]")
    .config("spark.driver.host", "localhost")
    .config("spark.driver.port", "7077")
    .config("spark.ui.port", "4050")
    .config("spark.driver.memory", "15g")
    .config("spark.executor.memory", "15g")
    .getOrCreate()
)

In [37]:
def read_data(file_name, date):
    file_path = f"/Users/admin/Desktop/docker/W5M2/parquet_files/{file_name}_tripdata_{date}.parquet"  # 로컬
    print(f"Attempting to read from: {file_path}")

    try:
        read_df = spark.read.parquet(file_path)
        return read_df
    except Exception:
        print(f"File not found: {file_path}")
        return None

In [38]:
categories = ["yellow"]

months = (
    pd.date_range(start="2022-05-01", end="2022-08-01", freq="MS")
    .strftime("%Y-%m")
    .tolist()
)

df = None
for category in categories:
    for month in months:
        print(">>>>>", category, month)
        read_df = read_data(category, month)
        if read_df is not None:
            if df is None:
                df = read_df
            else:
                df = df.union(read_df)
df = df.select(
    "tpep_pickup_datetime",
    "tpep_dropoff_datetime",
    "passenger_count",
    "trip_distance",
    "total_amount",
    "congestion_surcharge",
    "airport_fee",
)

>>>>> yellow 2022-05
Attempting to read from: /Users/admin/Desktop/docker/W5M2/parquet_files/yellow_tripdata_2022-05.parquet
>>>>> yellow 2022-06
Attempting to read from: /Users/admin/Desktop/docker/W5M2/parquet_files/yellow_tripdata_2022-06.parquet
>>>>> yellow 2022-07
Attempting to read from: /Users/admin/Desktop/docker/W5M2/parquet_files/yellow_tripdata_2022-07.parquet
>>>>> yellow 2022-08
Attempting to read from: /Users/admin/Desktop/docker/W5M2/parquet_files/yellow_tripdata_2022-08.parquet


In [39]:
weather = spark.read.csv("/Users/admin/Desktop/docker/W5M2/weather.csv", header=True)
weather = weather.select("time", "temperature_2m (°C)", "rain (mm)")
weather = weather.withColumn("weather_hour", date_format(to_timestamp("time"), "yyyy-MM-dd HH:00:00"))

# 데이터 정제 및 전처리

In [40]:
df = df.dropna()

start_date = "2022-05-01"
end_date = "2022-08-31"

filterd_df = df.filter(
    (col("passenger_count") >= 0)
    & (col("total_amount") >= 0)
    & (col("congestion_surcharge") >= 0)
    & (col("airport_fee") >= 0)
    & (col("tpep_pickup_datetime") >= start_date)
    & (col("tpep_pickup_datetime") <= end_date)
    & (col("tpep_dropoff_datetime") >= start_date)
    & (col("tpep_dropoff_datetime") <= end_date)
)

filterd_df = filterd_df.withColumn(
    "total_fee",
    (col("total_amount") + col("congestion_surcharge") + col("airport_fee")),
)

filterd_df = filterd_df.withColumn(
        "trip_duration",
        (unix_timestamp("tpep_dropoff_datetime") - unix_timestamp("tpep_pickup_datetime")) / 60,
    )

# total_fee를 double 타입으로 변환하고 변환 실패 시 null로 설정
filterd_df = filterd_df.withColumn("total_fee", col("total_fee").cast("double"))

# 유효한 숫자만 남기기 (null 값은 제거)
filterd_df = filterd_df.filter(col("total_fee").isNotNull())


filterd_df = filterd_df.select(
    "tpep_pickup_datetime", "trip_duration", "passenger_count", "total_fee"
)

### data schema

In [41]:
filterd_df.printSchema()

root
 |-- tpep_pickup_datetime: timestamp_ntz (nullable = true)
 |-- trip_duration: double (nullable = true)
 |-- passenger_count: double (nullable = true)
 |-- total_fee: double (nullable = true)



In [42]:
filterd_df.cache()

24/08/11 22:33:57 WARN CacheManager: Asked to cache already cached data.


DataFrame[tpep_pickup_datetime: timestamp_ntz, trip_duration: double, passenger_count: double, total_fee: double]

# 전체 여행 수, 승객이 2명 이상인 여행 수

In [43]:
all_passenger_count = filterd_df.count()
over_two_passenger_count = filterd_df.filter(col("passenger_count") >= 2).count()

print(f"All passenger count: {all_passenger_count}")
print(f"Over two passenger count: {over_two_passenger_count}")

All passenger count: 12821748
Over two passenger count: 3169436


# 전체 기간 총 수익

In [44]:
total_revenue = filterd_df.agg(sum("total_fee")).collect()[0][0]
print(f"Total Revenue: {total_revenue}")

Total Revenue: 310560499.022171


# 전체 기간 평균 여행 시간

In [45]:
mean_trip_duration = filterd_df.agg(mean("trip_duration")).collect()[0][0]
print(f"Mean Trip Duration: {mean_trip_duration}")

Mean Trip Duration: 17.487566865558218


### 일별 평균 여행시간, 여행요금, 승객수와 날씨 데이터 결합

In [46]:
taxi_df = filterd_df.withColumn("pickup_hour", date_format(to_timestamp("tpep_pickup_datetime"), "yyyy-MM-dd HH:00:00"))

hourly_taxi_df = taxi_df.groupBy("pickup_hour").agg(
    round(mean("total_fee"), 3).alias("mean_hourly_fee"),
    round(mean("trip_duration"), 2).alias("mean_trip_duration"),
    round(mean("passenger_count"), 2).alias("mean_passenger_count")
)
final_df = hourly_taxi_df.join(weather, hourly_taxi_df.pickup_hour == weather.weather_hour).drop("time", "weather_hour")
final_df = final_df.orderBy("pickup_hour")
final_df.cache()

24/08/11 22:33:59 WARN CacheManager: Asked to cache already cached data.


DataFrame[pickup_hour: string, mean_hourly_fee: double, mean_trip_duration: double, mean_passenger_count: double, temperature_2m (°C): string, rain (mm): string]

In [47]:
final_df.show()

+-------------------+---------------+------------------+--------------------+-------------------+---------+
|        pickup_hour|mean_hourly_fee|mean_trip_duration|mean_passenger_count|temperature_2m (°C)|rain (mm)|
+-------------------+---------------+------------------+--------------------+-------------------+---------+
|2022-05-01 00:00:00|         23.076|             17.44|                1.46|               15.3|     0.00|
|2022-05-01 01:00:00|         21.549|              14.9|                1.48|               13.9|     0.00|
|2022-05-01 02:00:00|         21.174|             16.09|                1.47|               12.7|     0.00|
|2022-05-01 03:00:00|         22.811|             12.75|                1.44|               11.3|     0.00|
|2022-05-01 04:00:00|         26.346|             15.52|                1.44|               10.8|     0.00|
|2022-05-01 05:00:00|         34.028|             23.64|                1.42|               10.1|     0.00|
|2022-05-01 06:00:00|       

In [50]:
final_df = final_df.withColumn(
    "rain_category",
    when(col("rain (mm)") == 0.00, "No Rain")
    .when((col("rain (mm)") > 0.00) & (col("rain (mm)") <= 5.00), "Little Rain")
    .otherwise("Heavy Rain"),
)

result1_df = final_df.groupBy("rain_category").agg(
    avg("mean_trip_duration").alias("avg_mean_trip_duration"),
    avg("mean_hourly_fee").alias("avg_mean_hourly_fee"),
)

result1_df.show()

+-------------+----------------------+-------------------+
|rain_category|avg_mean_trip_duration|avg_mean_hourly_fee|
+-------------+----------------------+-------------------+
|      No Rain|     17.11070719110051|  25.19904886769963|
|  Little Rain|     17.09536585365854|  24.99237804878049|
|   Heavy Rain|                 19.25|             24.379|
+-------------+----------------------+-------------------+



In [57]:
final_df = final_df.withColumn(
    "temperature_category",
    when(col("temperature_2m (°C)") <= 24.1, "Not Hot")
    .when(
        (col("temperature_2m (°C)") > 24.1) & (col("temperature_2m (°C)") <= 28.4),
        "Moderate",
    )
    .otherwise("Hot"),
)

result2_df = final_df.groupBy("temperature_category").agg(
    avg("mean_trip_duration").alias("avg_mean_trip_duration"),
    avg("mean_hourly_fee").alias("avg_mean_hourly_fee"),
)

result2_df.show()

+--------------------+----------------------+-------------------+
|temperature_category|avg_mean_trip_duration|avg_mean_hourly_fee|
+--------------------+----------------------+-------------------+
|             Not Hot|    17.321037394451153|  25.42251387213509|
|            Moderate|    16.930277136258663|   25.1263533487298|
|                 Hot|    16.624009900990103| 24.226014851485143|
+--------------------+----------------------+-------------------+



In [58]:
result2_df.write.csv(
    path="/Users/admin/Desktop/docker/W5M2/csv_files", mode="overwrite", header=True
)

In [None]:
spark.stop()