In [113]:
from pyspark.sql import SparkSession, types
import pyspark
import pandas as pd
import pyspark.pandas as pspd

In [114]:
!wget -P ../../dataset/raw/fhv/2019/10 https://github.com/DataTalksClub/nyc-tlc-data/releases/download/fhv/fhv_tripdata_2019-10.csv.gz

--2024-03-07 15:40:14--  https://github.com/DataTalksClub/nyc-tlc-data/releases/download/fhv/fhv_tripdata_2019-10.csv.gz
Resolving github.com (github.com)... 20.207.73.82
Connecting to github.com (github.com)|20.207.73.82|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://objects.githubusercontent.com/github-production-release-asset-2e65be/513814948/efdfcf82-6d5c-44d1-a138-4e8ea3c3a3b6?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAVCODYLSA53PQK4ZA%2F20240307%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20240307T154015Z&X-Amz-Expires=300&X-Amz-Signature=0ad036fedda3a22e339a8a23cd1ff70e88e1a8b8b75fba8b2c150c6bd8fbca76&X-Amz-SignedHeaders=host&actor_id=0&key_id=0&repo_id=513814948&response-content-disposition=attachment%3B%20filename%3Dfhv_tripdata_2019-10.csv.gz&response-content-type=application%2Foctet-stream [following]
--2024-03-07 15:40:15--  https://objects.githubusercontent.com/github-production-release-asset-2e65be/513814948/efdfcf82-6d5c-

In [115]:
# Create (or reuse) the Spark session for this notebook, running locally on all cores.
# NOTE(review): with a local[*] master the work presumably runs inside the driver JVM,
# so 'spark.executor.memory' may have no effect here — confirm whether
# 'spark.driver.memory' was the intended setting.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("homework")
    .config('spark.executor.memory', '10g')
    .getOrCreate()
)

In [116]:
# Echo the session object so the notebook renders it as this cell's output.
spark

In [117]:
# Show which Spark version the session is running (the recorded output is '3.5.0').
spark.version

'3.5.0'

In [118]:
# First pass: read the gzipped CSV with only the header option set, to inspect the
# schema Spark produces by default. The recorded output shows every column as a string.
df = (
    spark.read
    .option('header', 'true')
    .csv('../../dataset/raw/fhv/2019/10/fhv_tripdata_2019-10.csv.gz')
)
df.schema

StructType([StructField('dispatching_base_num', StringType(), True), StructField('pickup_datetime', StringType(), True), StructField('dropOff_datetime', StringType(), True), StructField('PUlocationID', StringType(), True), StructField('DOlocationID', StringType(), True), StructField('SR_Flag', StringType(), True), StructField('Affiliated_base_number', StringType(), True)])

In [119]:
# Explicit schema for the FHV trip CSV, one field per column in file order.
# Timestamps and numeric IDs get real types instead of strings; every field is nullable.
schema = (
    types.StructType()
    .add('dispatching_base_num', types.StringType(), True)
    .add('pickup_datetime', types.TimestampType(), True)
    .add('dropOff_datetime', types.TimestampType(), True)
    .add('PUlocationID', types.ShortType(), True)
    .add('DOlocationID', types.ShortType(), True)
    .add('SR_Flag', types.IntegerType(), True)
    .add('Affiliated_base_number', types.StringType(), True)
)

In [120]:
# Second pass: read the same file again, this time applying the explicit schema,
# then print the schema to confirm the column types.
df = (
    spark.read
    .schema(schema)
    .option('header', 'true')
    .csv('../../dataset/raw/fhv/2019/10/fhv_tripdata_2019-10.csv.gz')
)
df.printSchema()

root
 |-- dispatching_base_num: string (nullable = true)
 |-- pickup_datetime: timestamp (nullable = true)
 |-- dropOff_datetime: timestamp (nullable = true)
 |-- PUlocationID: short (nullable = true)
 |-- DOlocationID: short (nullable = true)
 |-- SR_Flag: integer (nullable = true)
 |-- Affiliated_base_number: string (nullable = true)



In [121]:
# Save the typed trip data as Parquet, split into 6 partitions.
# mode='overwrite' replaces any output already at that path, so the cell can be re-run.
(
    df
    .repartition(6)
    .write
    .parquet(path='../../dataset/pq/fhv/2019/10', mode='overwrite')
)

                                                                                

In [122]:
# Expose the typed trip DataFrame to Spark SQL as the temporary view 'fhv'.
# createOrReplaceTempView is used instead of registerTempTable, which has been
# deprecated since Spark 2.0; it also matches how the 'zones' and 'df_joined' views
# are registered in this notebook, and re-running it simply replaces the view.
df.createOrReplaceTempView('fhv')



In [123]:

# Question 3: count the trips whose pickup falls on 15 October 2019.
# The pickup timestamp is truncated to the day and compared with midnight of that date.
q3_query = """
SELECT
    COUNT(1)
FROM
    fhv
WHERE
    date_trunc('day', pickup_datetime) = CAST('2019-10-15 00:00:00' AS Timestamp)
"""
question_3 = spark.sql(q3_query)
question_3.show()

[Stage 76:>                                                         (0 + 1) / 1]

+--------+
|count(1)|
+--------+
|   62610|
+--------+



                                                                                

In [124]:
# Question 4 (unfiltered): trip duration in hours for every trip, longest first.
# Both timestamps are cast to LONG and their difference is divided by 3600.
# No date filter is applied, so rows with far-future drop-off dates appear in the result.
q4_all_trips_query = """
SELECT
    (CAST(dropOff_datetime AS Long) - CAST(pickup_datetime AS LONG))/3600 AS trip_time,
    pickup_datetime,
    dropOff_datetime
FROM
    fhv
ORDER BY
    trip_time DESC
"""
question_4 = spark.sql(q4_all_trips_query)
question_4.show(10)

[Stage 79:>                                                         (0 + 1) / 1]

+------------------+-------------------+-------------------+
|         trip_time|    pickup_datetime|   dropOff_datetime|
+------------------+-------------------+-------------------+
|          631152.5|2019-10-11 18:00:00|2091-10-11 18:30:00|
|          631152.5|2019-10-28 09:00:00|2091-10-28 09:30:00|
| 87672.44083333333|2019-10-31 23:46:33|2029-11-01 00:13:00|
| 70128.02805555555|2019-10-01 21:43:42|2027-10-01 21:45:23|
|            8794.0|2019-10-17 14:00:00|2020-10-18 00:00:00|
| 8784.166666666666|2019-10-26 21:26:00|2020-10-26 21:36:00|
|1464.5344444444445|2019-10-30 12:30:04|2019-12-30 13:02:08|
|1056.8266666666666|2019-10-25 07:04:57|2019-12-08 07:54:33|
|1056.2705555555556|2019-10-25 07:04:57|2019-12-08 07:21:11|
| 793.5530555555556|2019-10-01 13:47:17|2019-11-03 15:20:28|
+------------------+-------------------+-------------------+
only showing top 10 rows



                                                                                

In [125]:
# Question 4 (October only): trip duration in hours, longest first, restricted to
# trips that both start and end within October 2019.
# Half-open bounds (>= 1 Oct, < 1 Nov) are used so that the whole of 31 October is
# included. An upper bound of '2019-10-31' casts to midnight at the START of that day
# and would silently drop nearly every trip on the last day of the month.
question_4 = spark.sql("""
SELECT
    (CAST(dropOff_datetime AS Long) - CAST(pickup_datetime AS LONG))/3600 AS trip_time,
    pickup_datetime,
    dropOff_datetime
FROM
    fhv
WHERE
    pickup_datetime >= CAST('2019-10-01' AS Timestamp)
    AND pickup_datetime < CAST('2019-11-01' AS Timestamp)
    AND dropOff_datetime >= CAST('2019-10-01' AS Timestamp)
    AND dropOff_datetime < CAST('2019-11-01' AS Timestamp)
ORDER BY
    trip_time DESC
""")
question_4.show(10)

[Stage 80:>                                                         (0 + 1) / 1]

+------------------+-------------------+-------------------+
|         trip_time|    pickup_datetime|   dropOff_datetime|
+------------------+-------------------+-------------------+
| 469.6666666666667|2019-10-04 23:00:00|2019-10-24 12:40:00|
| 432.6688888888889|2019-10-02 09:00:01|2019-10-20 09:40:09|
|398.96666666666664|2019-10-07 20:47:00|2019-10-24 11:45:00|
|396.06666666666666|2019-10-07 23:41:00|2019-10-24 11:45:00|
|            395.05|2019-10-08 00:42:00|2019-10-24 11:45:00|
|384.68333333333334|2019-10-01 08:45:00|2019-10-17 09:26:00|
| 355.1322222222222|2019-10-05 01:38:29|2019-10-19 20:46:25|
|            338.55|2019-10-03 06:45:00|2019-10-17 09:18:00|
| 336.4166666666667|2019-10-10 06:45:00|2019-10-24 07:10:00|
|            326.95|2019-10-10 20:49:00|2019-10-24 11:46:00|
+------------------+-------------------+-------------------+
only showing top 10 rows



                                                                                

In [126]:
!wget -P ../../dataset/zones https://github.com/DataTalksClub/nyc-tlc-data/releases/download/misc/taxi_zone_lookup.csv

--2024-03-07 15:40:44--  https://github.com/DataTalksClub/nyc-tlc-data/releases/download/misc/taxi_zone_lookup.csv
Resolving github.com (github.com)... 20.207.73.82
Connecting to github.com (github.com)|20.207.73.82|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://objects.githubusercontent.com/github-production-release-asset-2e65be/513814948/5a2cc2f5-b4cd-4584-9c62-a6ea97ed0e6a?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAVCODYLSA53PQK4ZA%2F20240307%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20240307T154044Z&X-Amz-Expires=300&X-Amz-Signature=d97d7247e0fa9079f9a70533491a9abb7c374d40c0edc6a7a355eb0e23df4971&X-Amz-SignedHeaders=host&actor_id=0&key_id=0&repo_id=513814948&response-content-disposition=attachment%3B%20filename%3Dtaxi_zone_lookup.csv&response-content-type=application%2Foctet-stream [following]
--2024-03-07 15:40:44--  https://objects.githubusercontent.com/github-production-release-asset-2e65be/513814948/5a2cc2f5-b4cd-4584-9c62-a6e

In [127]:
# Load the taxi zone lookup with pandas-on-Spark and convert it to a plain Spark DataFrame.
# Each stage gets its own name so that df_zones only ever refers to the final Spark
# DataFrame, rather than being rebound from one frame type to another.
# The path is written with a single separator (the earlier 'zones//' was a typo).
zones_psdf = pspd.read_csv('../../dataset/zones/taxi_zone_lookup.csv')
zones_sdf = zones_psdf.to_spark()

# Cast LocationID to short so it has the same type as PUlocationID / DOlocationID
# in the trip schema it is joined against.
df_zones = zones_sdf.withColumn('LocationID', zones_sdf['LocationID'].cast(types.ShortType()))
df_zones.printSchema()

root
 |-- LocationID: short (nullable = true)
 |-- Borough: string (nullable = true)
 |-- Zone: string (nullable = true)
 |-- service_zone: string (nullable = true)





In [128]:
# Print the trip DataFrame's schema again — presumably to compare the location ID
# column types with the zones schema before joining.
df.printSchema()

root
 |-- dispatching_base_num: string (nullable = true)
 |-- pickup_datetime: timestamp (nullable = true)
 |-- dropOff_datetime: timestamp (nullable = true)
 |-- PUlocationID: short (nullable = true)
 |-- DOlocationID: short (nullable = true)
 |-- SR_Flag: integer (nullable = true)
 |-- Affiliated_base_number: string (nullable = true)



In [129]:
# Register the zone lookup as the temporary view 'zones' so it can be queried with Spark SQL.
df_zones.createOrReplaceTempView('zones')

In [130]:
# Attach zone details to each trip by matching the pickup location ID against the
# zone lookup. This is an inner join, so trips without a matching LocationID are
# not kept. Preview the first 10 rows.
joined = df.join(
    df_zones,
    on=df['PUlocationID'] == df_zones['LocationID'],
    how='inner',
)
joined.show(10)

+--------------------+-------------------+-------------------+------------+------------+-------+----------------------+----------+-------+---------------+------------+
|dispatching_base_num|    pickup_datetime|   dropOff_datetime|PUlocationID|DOlocationID|SR_Flag|Affiliated_base_number|LocationID|Borough|           Zone|service_zone|
+--------------------+-------------------+-------------------+------------+------------+-------+----------------------+----------+-------+---------------+------------+
|              B00009|2019-10-01 00:23:00|2019-10-01 00:35:00|         264|         264|   NULL|                B00009|       264|Unknown|             NV|         N/A|
|              B00013|2019-10-01 00:11:29|2019-10-01 00:13:22|         264|         264|   NULL|                B00013|       264|Unknown|             NV|         N/A|
|              B00014|2019-10-01 00:11:43|2019-10-01 00:37:20|         264|         264|   NULL|                B00014|       264|Unknown|             NV|      

In [131]:
# Register the trips-with-zones DataFrame as the temporary view 'df_joined' for Spark SQL.
joined.createOrReplaceTempView('df_joined')

In [141]:
# Count pickups per zone and list the 10 zones with the fewest, in ascending order.
# `GROUP BY 1` groups on the first selected column, i.e. Zone.
least_frequent_zones_query = """
SELECT
    Zone,
    COUNT(PUlocationID) AS frequency
FROM
    df_joined
GROUP BY
    1
ORDER BY
    COUNT(PUlocationID)
LIMIT 10
"""
spark.sql(least_frequent_zones_query).show()

[Stage 102:>                                                        (0 + 1) / 1]

+--------------------+---------+
|                Zone|frequency|
+--------------------+---------+
|         Jamaica Bay|        1|
|Governor's Island...|        2|
| Green-Wood Cemetery|        5|
|       Broad Channel|        8|
|     Highbridge Park|       14|
|        Battery Park|       15|
|Saint Michaels Ce...|       23|
|Breezy Point/Fort...|       25|
|Marine Park/Floyd...|       26|
|        Astoria Park|       29|
+--------------------+---------+



                                                                                