### Library imports

In [21]:
import pyspark
import requests
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, unix_timestamp, col, max as spark_max

### Spark Session

In [2]:
spark = SparkSession.builder.master("local[*]").appName("test").getOrCreate()

print(f"Spark version: {spark.version}")

26/02/26 14:00:13 WARN Utils: Your hostname, LAR-SRODRIGUEZ resolves to a loopback address: 127.0.1.1; using 10.255.255.254 instead (on interface lo)
26/02/26 14:00:13 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


Spark version: 3.5.4-amzn-0


### Partitions

In [7]:
url = "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2025-11.parquet"
output_file = "yellow_tripdata_2025-11.parquet"

with requests.get(url, stream=True) as r:
    r.raise_for_status()
    with open(output_file, "wb") as f:
        for chunk in r.iter_content(chunk_size=8192):
            f.write(chunk)
print("Descarga completada:", output_file)

Descarga completada: yellow_tripdata_2025-11.parquet


In [14]:
# Read November 2025 Yellow CSV into a Spark DataFrame
input_path = output_file  # Update path if needed
df = spark.read.option('header', True).parquet(input_path)

# Repartition to 4 partitions
df_repart = df.repartition(4)

# Save to Parquet
output_path = '/home/hadoop/workspace/notebooks/yellow_november_2025_parquet'  # Update path if needed
df_repart.write.mode('overwrite').parquet(output_path)

df_repart.show(5)



+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+------------------+
|VendorID|tpep_pickup_datetime|tpep_dropoff_datetime|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|Airport_fee|cbd_congestion_fee|
+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+------------------+
|       2| 2025-11-02 08:11:08|  2025-11-02 08:15:21|              1|         1.24|         1|                 N|         186|    

                                                                                

### Count records

In [20]:
df.filter(
    (col("tpep_pickup_datetime") >= "2025-11-15") &
    (col("tpep_pickup_datetime") < "2025-11-16")
).count()

                                                                                

162604

### Longest trip

In [23]:
longest_trip_hours = df.withColumn(
    "trip_hours",
    (unix_timestamp("tpep_dropoff_datetime") - unix_timestamp("tpep_pickup_datetime")) / 3600
).agg(
    spark_max("trip_hours").alias("longest_trip_hours")
)

In [25]:
longest_trip_hours.show()

+------------------+
|longest_trip_hours|
+------------------+
| 90.64666666666666|
+------------------+



###  Least frequent pickup location zone 

In [None]:
least_frequent_pickup_zone = df.groupBy("PULocationID") \
  .count() \
  .orderBy("count", ascending=True) \
  .show(1) # --> PULocationID 5 has the least pickups.

+------------+-----+
|PULocationID|count|
+------------+-----+
|           5|    1|
+------------+-----+
only showing top 1 row



#### Download CSV zones

In [29]:
!wget https://d37ci6vzurychx.cloudfront.net/misc/taxi_zone_lookup.csv -O taxi_zone_lookup.csv

--2026-02-26 14:29:30--  https://d37ci6vzurychx.cloudfront.net/misc/taxi_zone_lookup.csv
Resolving d37ci6vzurychx.cloudfront.net (d37ci6vzurychx.cloudfront.net)... 3.163.140.127, 3.163.140.18, 3.163.140.37, ...
Connecting to d37ci6vzurychx.cloudfront.net (d37ci6vzurychx.cloudfront.net)|3.163.140.127|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 12331 (12K) [text/csv]
Saving to: ‘taxi_zone_lookup.csv’


2026-02-26 14:29:31 (10.8 MB/s) - ‘taxi_zone_lookup.csv’ saved [12331/12331]



In [30]:
# Read the lookup CSV
zone_df = spark.read.option("header", True).csv("taxi_zone_lookup.csv")

# Find least frequent pickup zone
least_frequent_pickup_zone = df.groupBy("PULocationID") \
    .count() \
    .orderBy("count", ascending=True) \
    .limit(1)

# Join with zone lookup to get the zone name
result = least_frequent_pickup_zone.join(
    zone_df,
    least_frequent_pickup_zone.PULocationID == zone_df.LocationID,
    how="left"
).select(
    least_frequent_pickup_zone.PULocationID,
    "Zone",
    "count"
)

result.show()

+------------+-------------+-----+
|PULocationID|         Zone|count|
+------------+-------------+-----+
|           5|Arden Heights|    1|
+------------+-------------+-----+

