In [21]:
from pyspark.sql.functions import col, sum, mean, round, unix_timestamp, date_format, to_timestamp, hour, when, avg
from pyspark.sql import SparkSession
from datetime import datetime
import pandas as pd
import numpy as np

In [22]:
spark = (
    SparkSession.builder.appName("TLC_Analysis")
    .master("local[*]")
    .config("spark.driver.host", "localhost")
    .config("spark.driver.port", "7077")
    .config("spark.ui.port", "4050")
    .config("spark.driver.memory", "4g")
    .config("spark.executor.memory", "4g")
    .getOrCreate()
)

In [23]:
def read_data(file_name, date):
    file_path = f"your file path"  
    print(f"Attempting to read from: {file_path}")

    try:
        read_df = spark.read.parquet(file_path)
        return read_df
    except Exception:
        print(f"File not found: {file_path}")
        return None

In [24]:
categories = ["yellow"]

months = (
    pd.date_range(start="2021-01-01", end="2021-03-01", freq="MS")
    .strftime("%Y-%m")
    .tolist()
)

df = None
for category in categories:
    for month in months:
        print(">>>>>", category, month)
        read_df = read_data(category, month)
        if read_df is not None:
            if df is None:
                df = read_df
            else:
                df = df.union(read_df)
df = df.select(
    "tpep_pickup_datetime",
    "tpep_dropoff_datetime",
    "passenger_count",
    "trip_distance",
    "total_amount",
    "congestion_surcharge",
    "airport_fee",
)

>>>>> yellow 2021-01
Attempting to read from: /Users/munsoyun/Desktop/Docker/W5M2/parquet_files/yellow_tripdata_2021-01.parquet
>>>>> yellow 2021-02
Attempting to read from: /Users/munsoyun/Desktop/Docker/W5M2/parquet_files/yellow_tripdata_2021-02.parquet
>>>>> yellow 2021-03
Attempting to read from: /Users/munsoyun/Desktop/Docker/W5M2/parquet_files/yellow_tripdata_2021-03.parquet


In [25]:
df.printSchema()

root
 |-- tpep_pickup_datetime: timestamp_ntz (nullable = true)
 |-- tpep_dropoff_datetime: timestamp_ntz (nullable = true)
 |-- passenger_count: double (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- airport_fee: double (nullable = true)



In [26]:
weather = spark.read.csv("your file path", header=True)
weather = weather.select("time", "temperature_2m (°C)", "rain (mm)")
weather = weather.withColumn("weather_hour", date_format(to_timestamp("time"), "yyyy-MM-dd HH:00:00"))

In [27]:
df = df.dropna()

start_date = "2021-01-01"
end_date = "2021-03-31"

filterd_df = df.filter(
    (col("passenger_count") >= 0)
    & (col("total_amount") >= 0)
    & (col("congestion_surcharge") >= 0)
    & (col("airport_fee") >= 0)
    & (col("tpep_pickup_datetime") >= start_date)
    & (col("tpep_pickup_datetime") <= end_date)
    & (col("tpep_dropoff_datetime") >= start_date)
    & (col("tpep_dropoff_datetime") <= end_date)
)

filterd_df = filterd_df.withColumn(
    "total_fee",
    (col("total_amount") + col("congestion_surcharge") + col("airport_fee")),
)

filterd_df = filterd_df.withColumn(
        "trip_duration",
        (unix_timestamp("tpep_dropoff_datetime") - unix_timestamp("tpep_pickup_datetime")) / 60,
    )
filterd_df = filterd_df.withColumn("total_fee", col("total_fee").cast("double"))
filterd_df = filterd_df.filter(col("total_fee").isNotNull())
filterd_df = filterd_df.select(
    "tpep_pickup_datetime", "trip_duration", "passenger_count", "total_fee"
)

In [28]:
filterd_df.printSchema()
filterd_df.cache()

root
 |-- tpep_pickup_datetime: timestamp_ntz (nullable = true)
 |-- trip_duration: double (nullable = true)
 |-- passenger_count: double (nullable = true)
 |-- total_fee: double (nullable = true)



DataFrame[tpep_pickup_datetime: timestamp_ntz, trip_duration: double, passenger_count: double, total_fee: double]

In [29]:
filterd_df.show()

                                                                                

+--------------------+-------------------+---------------+---------+
|tpep_pickup_datetime|      trip_duration|passenger_count|total_fee|
+--------------------+-------------------+---------------+---------+
| 2021-01-05 18:55:22|  9.583333333333334|            2.0|    25.65|
| 2021-01-05 19:58:59|  5.983333333333333|            1.0|     13.8|
| 2021-01-05 20:12:38|              10.25|            1.0|    17.85|
| 2021-01-05 20:26:52| 24.583333333333332|            1.0|    42.35|
| 2021-01-12 18:34:41| 2.6166666666666667|            1.0|      6.3|
| 2021-02-21 05:36:21|  57.93333333333333|            3.0|     60.8|
| 2021-03-09 16:30:06|                0.1|            1.0|      3.3|
| 2021-03-13 15:16:15| 10.533333333333333|            1.0|      9.3|
| 2021-03-20 13:47:19| 12.466666666666667|            1.0|     14.8|
| 2021-03-20 14:14:30| 16.766666666666666|            2.0|     22.3|
| 2021-03-20 15:01:29|              13.45|            2.0|     21.6|
| 2021-03-20 15:32:42|            

In [30]:
all_passenger_count = filterd_df.count()
over_two_passenger_count = filterd_df.filter(col("passenger_count") >= 2).count()

print(f"All passenger count: {all_passenger_count}")
print(f"Over two passenger count: {over_two_passenger_count}")

All passenger count: 115571
Over two passenger count: 25135


In [31]:
total_revenue = filterd_df.agg(sum("total_fee")).collect()[0][0]
print(f"Total Revenue: {total_revenue}")

Total Revenue: 2247081.3100010254


In [32]:
mean_trip_duration = filterd_df.agg(mean("trip_duration")).collect()[0][0]
print(f"Mean Trip Duration: {mean_trip_duration}")

Mean Trip Duration: 12.663630726277693


In [33]:
taxi_df = filterd_df.withColumn("pickup_hour", date_format(to_timestamp("tpep_pickup_datetime"), "yyyy-MM-dd HH:00:00"))

hourly_taxi_df = taxi_df.groupBy("pickup_hour").agg(
    round(mean("total_fee"), 3).alias("mean_hourly_fee"),
    round(mean("trip_duration"), 2).alias("mean_trip_duration"),
    round(mean("passenger_count"), 2).alias("mean_passenger_count")
)
final_df = hourly_taxi_df.join(weather, hourly_taxi_df.pickup_hour == weather.weather_hour).drop("time", "weather_hour")
final_df = final_df.orderBy("pickup_hour")
final_df.cache()

DataFrame[pickup_hour: string, mean_hourly_fee: double, mean_trip_duration: double, mean_passenger_count: double, temperature_2m (°C): string, rain (mm): string]

In [34]:
final_df.show()

                                                                                

+-------------------+---------------+------------------+--------------------+-------------------+---------+
|        pickup_hour|mean_hourly_fee|mean_trip_duration|mean_passenger_count|temperature_2m (°C)|rain (mm)|
+-------------------+---------------+------------------+--------------------+-------------------+---------+
|2021-01-05 18:00:00|          25.65|              9.58|                 2.0|                4.3|     0.00|
|2021-01-05 19:00:00|           13.8|              5.98|                 1.0|                4.2|     0.00|
|2021-01-05 20:00:00|           30.1|             17.42|                 1.0|                4.4|     0.00|
|2021-01-12 18:00:00|            6.3|              2.62|                 1.0|                4.6|     0.00|
|2021-02-21 05:00:00|           60.8|             57.93|                 3.0|               -3.7|     0.00|
|2021-03-09 16:00:00|            3.3|               0.1|                 1.0|               14.3|     0.00|
|2021-03-13 15:00:00|       

In [35]:
final_df = final_df.withColumn(
    "rain_category",
    when(col("rain (mm)") == 0.00, "No Rain")
    .when((col("rain (mm)") > 0.00) & (col("rain (mm)") <= 5.00), "Little Rain")
    .otherwise("Heavy Rain"),
)

result1_df = final_df.groupBy("rain_category").agg(
    avg("mean_trip_duration").alias("avg_mean_trip_duration"),
    avg("mean_hourly_fee").alias("avg_mean_hourly_fee"),
)

result1_df.show()



+-------------+----------------------+-------------------+
|rain_category|avg_mean_trip_duration|avg_mean_hourly_fee|
+-------------+----------------------+-------------------+
|      No Rain|    13.624059405940598|  21.89006930693069|
|  Little Rain|    11.825714285714286| 21.384642857142858|
+-------------+----------------------+-------------------+



                                                                                

In [36]:
final_df = final_df.withColumn(
    "temperature_category",
    when(col("temperature_2m (°C)") <= 24.1, "Not Hot")
    .when(
        (col("temperature_2m (°C)") > 24.1) & (col("temperature_2m (°C)") <= 28.4),
        "Moderate",
    )
    .otherwise("Hot"),
)

result2_df = final_df.groupBy("temperature_category").agg(
    avg("mean_trip_duration").alias("avg_mean_trip_duration"),
    avg("mean_hourly_fee").alias("avg_mean_hourly_fee"),
)

result2_df.show()



+--------------------+----------------------+-------------------+
|temperature_category|avg_mean_trip_duration|avg_mean_hourly_fee|
+--------------------+----------------------+-------------------+
|             Not Hot|    13.394385964912283|  21.84852631578948|
|            Moderate|                 14.63|              19.55|
+--------------------+----------------------+-------------------+



                                                                                

In [37]:
spark.stop()