In [1]:
import os

from pyspark.sql import SparkSession

In [2]:
# Build (or reuse) the Spark session that every later cell runs against.
# The application name is what identifies this notebook's job in Spark.
spark = (
    SparkSession.builder
    .appName("trip_count_by_zone_sql")
    .getOrCreate()
)

In [3]:
# Input locations for the notebook.
#
# `directory` is the folder holding both CSV files. It can be overridden with
# the TRIP_DATA_DIR environment variable so the notebook is not tied to one
# machine's home directory; when the variable is unset the original path is
# used unchanged. The value is interpolated into a `file://` URI by the load
# cell, so it should be an absolute path.
directory = os.environ.get(
    "TRIP_DATA_DIR",
    "/Users/keon/fastcampus/data-engineering/01-spark/data",
)

# File names, relative to `directory`.
trip_file = "fhvhv_tripdata_2020-03.csv"   # trip records
zone_file = "taxi+_zone_lookup.csv"        # LocationID -> Borough / Zone lookup

In [4]:
# Read both CSV files from the local filesystem. The first row of each file is
# treated as the header, and column types are inferred from the data.
trip_data = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv(f"file:///{directory}/{trip_file}")
)
zone_data = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv(f"file:///{directory}/{zone_file}")
)

In [5]:
# Peek at the first five trip records to confirm the columns loaded as expected.
trip_data.show(n=5)

+-----------------+--------------------+-------------------+-------------------+------------+------------+-------+
|hvfhs_license_num|dispatching_base_num|    pickup_datetime|   dropoff_datetime|PULocationID|DOLocationID|SR_Flag|
+-----------------+--------------------+-------------------+-------------------+------------+------------+-------+
|           HV0005|              B02510|2020-03-01 00:03:40|2020-03-01 00:23:39|          81|         159|   null|
|           HV0005|              B02510|2020-03-01 00:28:05|2020-03-01 00:38:57|         168|         119|   null|
|           HV0003|              B02764|2020-03-01 00:03:07|2020-03-01 00:15:04|         137|         209|      1|
|           HV0003|              B02764|2020-03-01 00:18:42|2020-03-01 00:38:42|         209|          80|   null|
|           HV0003|              B02764|2020-03-01 00:44:24|2020-03-01 00:58:44|         256|         226|   null|
+-----------------+--------------------+-------------------+-------------------+------------+------------+-------+
only showing top 5 rows

In [6]:
# Peek at the first five rows of the zone lookup table.
zone_data.show(n=5)

+----------+-------------+--------------------+------------+
|LocationID|      Borough|                Zone|service_zone|
+----------+-------------+--------------------+------------+
|         1|          EWR|      Newark Airport|         EWR|
|         2|       Queens|         Jamaica Bay|   Boro Zone|
|         3|        Bronx|Allerton/Pelham G...|   Boro Zone|
|         4|    Manhattan|       Alphabet City| Yellow Zone|
|         5|Staten Island|       Arden Heights|   Boro Zone|
+----------+-------------+--------------------+------------+
only showing top 5 rows



In [7]:
# Expose both DataFrames to Spark SQL under the names the queries below use.
# createOrReplaceTempView overwrites any existing view of the same name, so
# this cell is safe to re-run.
zone_data.createOrReplaceTempView(name="zone_data")
trip_data.createOrReplaceTempView(name="trip_data")

In [8]:
# Sanity check: the trip_data view is queryable through SQL.
(
    spark
    .sql("select * from trip_data limit 5")
    .show()
)

+-----------------+--------------------+-------------------+-------------------+------------+------------+-------+
|hvfhs_license_num|dispatching_base_num|    pickup_datetime|   dropoff_datetime|PULocationID|DOLocationID|SR_Flag|
+-----------------+--------------------+-------------------+-------------------+------------+------------+-------+
|           HV0005|              B02510|2020-03-01 00:03:40|2020-03-01 00:23:39|          81|         159|   null|
|           HV0005|              B02510|2020-03-01 00:28:05|2020-03-01 00:38:57|         168|         119|   null|
|           HV0003|              B02764|2020-03-01 00:03:07|2020-03-01 00:15:04|         137|         209|      1|
|           HV0003|              B02764|2020-03-01 00:18:42|2020-03-01 00:38:42|         209|          80|   null|
|           HV0003|              B02764|2020-03-01 00:44:24|2020-03-01 00:58:44|         256|         226|   null|
+-----------------+--------------------+-------------------+-------------------+------------+------------+-------+

In [9]:
# Sanity check: the zone_data view is queryable through SQL.
(
    spark
    .sql("select * from zone_data limit 5")
    .show()
)

+----------+-------------+--------------------+------------+
|LocationID|      Borough|                Zone|service_zone|
+----------+-------------+--------------------+------------+
|         1|          EWR|      Newark Airport|         EWR|
|         2|       Queens|         Jamaica Bay|   Boro Zone|
|         3|        Bronx|Allerton/Pelham G...|   Boro Zone|
|         4|    Manhattan|       Alphabet City| Yellow Zone|
|         5|Staten Island|       Arden Heights|   Boro Zone|
+----------+-------------+--------------------+------------+



In [10]:
# Trip count per pickup borough.
# The inner query maps each trip's PULocationID to its Borough through the
# zone lookup; the outer query counts rows per borough. It is an inner join,
# so trips whose pickup ID has no match in zone_data are not counted.
spark.sql(
    "select borough, count(*) as trips from "
    "(select zone_data.Borough as borough "
    "from trip_data join zone_data on trip_data.PULocationID = zone_data.LocationID) "
    "group by borough"
).show()

+-------------+-------+
|      borough|  trips|
+-------------+-------+
|       Queens|2437383|
|          EWR|    362|
|      Unknown|    845|
|     Brooklyn|3735764|
|Staten Island| 178818|
|    Manhattan|4953140|
|        Bronx|2086592|
+-------------+-------+



In [11]:
# Trip count per drop-off borough.
# Same shape as the pickup query, but the join key is DOLocationID, so each
# trip is attributed to the borough where it ended. It is an inner join, so
# trips whose drop-off ID has no match in zone_data are not counted.
spark.sql(
    "select borough, count(*) as trips from "
    "(select zone_data.Borough as borough "
    "from trip_data join zone_data on trip_data.DOLocationID = zone_data.LocationID) "
    "group by borough"
).show()

+-------------+-------+
|      borough|  trips|
+-------------+-------+
|       Queens|2468408|
|          EWR|  65066|
|      Unknown| 387759|
|     Brooklyn|3696682|
|Staten Island| 177727|
|    Manhattan|4553776|
|        Bronx|2043486|
+-------------+-------+



In [12]:
# Pickup zones ranked by trip count, restricted to trips whose
# hvfhs_license_num is 'HV0003'. show() prints only the first 20 rows.
spark.sql(
    "SELECT zone_data.Zone, count(*) AS trips"
    " FROM trip_data JOIN zone_data ON trip_data.PULocationID = zone_data.LocationID "
    " WHERE trip_data.hvfhs_license_num = 'HV0003' "
    " GROUP BY zone_data.Zone order by trips desc"
).show()

+--------------------+------+
|                Zone| trips|
+--------------------+------+
| Crown Heights North|163091|
|       East New York|134198|
|         JFK Airport|114179|
|        East Village|112017|
|      Bushwick South|110150|
|Central Harlem North|108070|
|   LaGuardia Airport|104119|
|Washington Height...| 97324|
|Flatbush/Ditmas Park| 95724|
|            Canarsie| 94484|
|TriBeCa/Civic Center| 94155|
|             Astoria| 92676|
|             Bedford| 90352|
|      Midtown Center| 90261|
|  Stuyvesant Heights| 88749|
|            Union Sq| 88372|
|Times Sq/Theatre ...| 86870|
|Prospect-Lefferts...| 84347|
|         Brownsville| 82764|
|Mott Haven/Port M...| 82396|
+--------------------+------+
only showing top 20 rows



In [13]:
# Print the query plans for the per-zone HV0003 ranking query instead of
# running it. extended=True includes the parsed, analyzed and optimized
# logical plans as well as the physical plan.
spark.sql(
    "SELECT zone_data.Zone, count(*) AS trips"
    " FROM trip_data JOIN zone_data ON trip_data.PULocationID = zone_data.LocationID "
    " WHERE trip_data.hvfhs_license_num = 'HV0003' "
    " GROUP BY zone_data.Zone order by trips desc"
).explain(extended=True)

== Parsed Logical Plan ==
'Sort ['trips DESC NULLS LAST], true
+- 'Aggregate ['zone_data.Zone], ['zone_data.Zone, 'count(1) AS trips#236]
   +- 'Filter ('trip_data.hvfhs_license_num = HV0003)
      +- 'Join Inner, ('trip_data.PULocationID = 'zone_data.LocationID)
         :- 'UnresolvedRelation [trip_data], [], false
         +- 'UnresolvedRelation [zone_data], [], false

== Analyzed Logical Plan ==
Zone: string, trips: bigint
Sort [trips#236L DESC NULLS LAST], true
+- Aggregate [Zone#48], [Zone#48, count(1) AS trips#236L]
   +- Filter (hvfhs_license_num#16 = HV0003)
      +- Join Inner, (PULocationID#20 = LocationID#46)
         :- SubqueryAlias trip_data
         :  +- View (`trip_data`, [hvfhs_license_num#16,dispatching_base_num#17,pickup_datetime#18,dropoff_datetime#19,PULocationID#20,DOLocationID#21,SR_Flag#22])
         :     +- Relation [hvfhs_license_num#16,dispatching_base_num#17,pickup_datetime#18,dropoff_datetime#19,PULocationID#20,DOLocationID#21,SR_Flag#22] csv
         +-