# Homework

In [59]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import types
from pyspark.sql.functions import col, date_trunc

In [None]:
# Build (or reuse) a Spark session named 'homework' with master 'local[*]'.
spark = (
    SparkSession.builder
    .master('local[*]')
    .appName('homework')
    .getOrCreate()
)

## Q1

In [61]:
# Q1: display the version string of the running Spark session.
spark.version

'3.5.3'

## Q2

In [62]:
# Local directory that receives the downloaded file.
raw_path = 'data/raw'
# Base URL the trip-data file is downloaded from.
url = 'https://d37ci6vzurychx.cloudfront.net/trip-data'
# Name of the parquet file to download and read.
file = 'yellow_tripdata_2024-10.parquet'

In [None]:
# Create the raw-data directory if it does not exist yet (-p), then download
# the trip-data file from `url` into it under the same file name (-O).
!mkdir -p $raw_path
!wget $url/$file -O $raw_path/$file

In [64]:
# Load the downloaded parquet file from the raw-data directory into a DataFrame.
raw_file_path = f"{raw_path}/{file}"
df_yellow = spark.read.parquet(raw_file_path)

In [18]:
# Output directory for the repartitioned parquet files.
pq_path = 'data/pq'

In [65]:
# Q2: split the data into 4 partitions and write them as parquet,
# replacing whatever a previous run left in `pq_path`.
(
    df_yellow
    .repartition(4)
    .write
    .parquet(pq_path, mode='overwrite')
)

                                                                                

In [66]:
!ls -hs $pq_path | grep .parquet

23M part-00000-c4a40f06-2ade-4153-a384-2f1fb5fb3fd8-c000.snappy.parquet
23M part-00001-c4a40f06-2ade-4153-a384-2f1fb5fb3fd8-c000.snappy.parquet
23M part-00002-c4a40f06-2ade-4153-a384-2f1fb5fb3fd8-c000.snappy.parquet
23M part-00003-c4a40f06-2ade-4153-a384-2f1fb5fb3fd8-c000.snappy.parquet


## Q3

In [8]:
# Directory holding the repartitioned parquet files (set again in this cell).
pq_path = 'data/pq'

# Rebind df_yellow to the repartitioned parquet output. The path is already a
# string, so it is passed directly instead of through an f-string.
df_yellow = spark.read.parquet(pq_path)

                                                                                

In [50]:
# Print the column names, types and nullability of the trips DataFrame.
df_yellow.printSchema()

root
 |-- VendorID: integer (nullable = true)
 |-- tpep_pickup_datetime: timestamp_ntz (nullable = true)
 |-- tpep_dropoff_datetime: timestamp_ntz (nullable = true)
 |-- passenger_count: long (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: long (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- payment_type: long (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- Airport_fee: double (nullable = true)



In [55]:
# Q3: count the trips whose pickup timestamp, truncated to the day,
# equals 2024-10-15.
pickup_day = date_trunc('day', col('tpep_pickup_datetime'))
df_yellow.filter(pickup_day == '2024-10-15').count()

                                                                                

128893

## Q4

In [60]:
# Q4: find the longest trip, measured in hours.
# The duration is the difference between dropoff and pickup in seconds,
# divided by 3600 and rounded to 2 decimals.
# F.round / F.unix_timestamp are used explicitly: neither name was imported,
# and Python's builtin round() cannot round a Spark Column.
duration_hour = F.round(
    (
        F.unix_timestamp('tpep_dropoff_datetime')
        - F.unix_timestamp('tpep_pickup_datetime')
    ) / 3600,
    2,
)

(
    df_yellow
    .select(
        col('tpep_pickup_datetime'),
        col('tpep_dropoff_datetime'),
        col('PULocationID'),
        col('DOLocationID'),
    )
    .withColumn('duration_hour', duration_hour)
    # Longest trip first; only the top row is needed.
    .orderBy(col('duration_hour').desc())
    .limit(1)
    .show()
)

[Stage 68:>                                                         (0 + 4) / 4]

+--------------------+---------------------+------------+------------+-------------+
|tpep_pickup_datetime|tpep_dropoff_datetime|PULocationID|DOLocationID|duration_hour|
+--------------------+---------------------+------------+------------+-------------+
| 2024-10-16 13:03:49|  2024-10-23 07:40:53|          48|         265|       162.62|
+--------------------+---------------------+------------+------------+-------------+



                                                                                

## Q5

Spark UI port: 4040

In [None]:
# Q5: show the URL of the current session's Spark UI; the port is the number
# after the last colon.
spark.sparkContext.uiWebUrl

## Q6

In [18]:
# Read the zone lookup table from its parquet directory.
zones_path = '../code/zones/'
df_zones = spark.read.parquet(zones_path)

In [19]:
# Print the column names, types and nullability of the zones DataFrame.
df_zones.printSchema()

root
 |-- LocationID: string (nullable = true)
 |-- Borough: string (nullable = true)
 |-- Zone: string (nullable = true)
 |-- service_zone: string (nullable = true)



In [20]:
# Register both DataFrames as temporary views so they can be queried by name
# through spark.sql.
for view_name, frame in (('yellow_trips', df_yellow), ('zones', df_zones)):
    frame.createOrReplaceTempView(view_name)

In [58]:
# Q6: the three pickup zones with the fewest trips.
# - LEFT JOIN keeps trips whose PULocationID has no match in `zones`; those
#   are grouped under a NULL zone.
# - NOTE: per the printed schemas PULocationID is an integer and LocationID a
#   string, so the join relies on Spark's implicit cast.
# - Zone is a secondary sort key so ties on the trip count come back in a
#   deterministic order and LIMIT 3 always picks the same rows.
query = """
    SELECT
        z.Zone,
        count(*) AS Trips
    FROM yellow_trips t
    LEFT JOIN zones z
        ON t.PULocationID = z.LocationID
    GROUP BY z.Zone
    ORDER BY Trips ASC, Zone ASC
    LIMIT 3
"""
# truncate=False prints the full zone names instead of cutting them to 20 chars.
spark.sql(query).show(truncate=False)

+--------------------+-----+
|                Zone|Trips|
+--------------------+-----+
|Governor's Island...|    1|
|       Arden Heights|    2|
|       Rikers Island|    2|
+--------------------+-----+

