In [1]:
import pyspark
from pyspark.sql import SparkSession, types

# Start (or reuse) a Spark session running in local mode; 'local[*]' asks
# for as many worker threads as there are logical cores.
spark = (
    SparkSession.builder
    .master('local[*]')
    .appName('test')
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/03/03 19:51:23 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
# Load the green-taxi parquet data; the glob matches everything two directory
# levels below ./data/pq/green (presumably year/month partitions — confirm).
df_green = spark.read.parquet('./data/pq/green/*/*/')

                                                                                

In [3]:
# Expose the DataFrame to Spark SQL under the name `green`.
# createOrReplaceTempView supersedes registerTempTable (deprecated since
# Spark 2.0) and is safe to re-run: an existing view of that name is replaced.
df_green.createOrReplaceTempView('green')



In [4]:
# Hourly revenue and trip count per pickup zone for green-taxi trips picked
# up on or after 2020-01-01, sorted by hour and then zone.
green_revenue_sql = """
    SELECT 
        DATE_TRUNC('hour', lpep_pickup_datetime) AS hour, 
        PULocationID AS revenue_zone,
        ROUND(SUM(total_amount), 2) AS amount,
        COUNT(*) AS number_records
    FROM green
    WHERE lpep_pickup_datetime >= '2020-01-01'
    GROUP BY
        hour,
        revenue_zone
    ORDER BY
        hour,
        revenue_zone
"""
df_result = spark.sql(green_revenue_sql)

In [5]:
# Preview the aggregated green-taxi result; this triggers the actual Spark job.
df_result.show()

[Stage 3:>                                                          (0 + 4) / 4]

+-------------------+------------+-------+--------------+
|               hour|revenue_zone| amount|number_records|
+-------------------+------------+-------+--------------+
|2020-01-01 00:00:00|           7| 769.73|            45|
|2020-01-01 00:00:00|          17| 195.03|             9|
|2020-01-01 00:00:00|          18|    7.8|             1|
|2020-01-01 00:00:00|          22|   15.8|             1|
|2020-01-01 00:00:00|          24|   87.6|             3|
|2020-01-01 00:00:00|          25|  531.0|            26|
|2020-01-01 00:00:00|          29|   61.3|             1|
|2020-01-01 00:00:00|          32|  68.95|             2|
|2020-01-01 00:00:00|          33| 317.27|            11|
|2020-01-01 00:00:00|          35| 129.96|             5|
|2020-01-01 00:00:00|          36| 295.34|            11|
|2020-01-01 00:00:00|          37| 175.67|             6|
|2020-01-01 00:00:00|          38|  98.79|             2|
|2020-01-01 00:00:00|          40| 168.98|             8|
|2020-01-01 00

                                                                                

In [19]:
# Persist the green revenue report. mode='overwrite' replaces any earlier
# output so the cell can be re-run without a "path already exists" error.
df_result.write.parquet('./data/report/revenue/green/', mode='overwrite')

                                                                                

What's happening above:
    - The input partitions are sent to the executors.
    - Each executor first applies the filters we specified, then runs the transformations on its own partitions to produce intermediate results.
    - The intermediate results are *reshuffled* between executors, in this case by hour and zone: all records with the same hour and zone must end up in the same output partition so they can be combined.
    - If there's an `ORDER BY` clause, another stage sorts the combined results.

In [21]:
# Load the yellow-taxi trips and expose them to Spark SQL as the view `yellow`.
df_yellow = spark.read.parquet('./data/pq/yellow/*/*/')
# createOrReplaceTempView supersedes registerTempTable (deprecated since
# Spark 2.0) and simply replaces the view when this cell is re-run.
df_yellow.createOrReplaceTempView('yellow')

# Hourly revenue and trip count per pickup zone for yellow-taxi trips picked
# up on or after 2020-01-01, sorted by hour and then zone.
df_result = spark.sql("""
    SELECT 
        DATE_TRUNC('hour', tpep_pickup_datetime) AS hour, 
        PULocationID AS revenue_zone,
        ROUND(SUM(total_amount), 2) AS amount,
        COUNT(*) AS number_records
    FROM yellow
    WHERE tpep_pickup_datetime >= '2020-01-01'
    GROUP BY
        hour,
        revenue_zone
    ORDER BY
        hour,
        revenue_zone
""")

# mode='overwrite' replaces any earlier output so the cell can be re-run
# without a "path already exists" error.
df_result.write.parquet('./data/report/revenue/yellow/', mode='overwrite')

                                                                                

In [22]:
# Preview the aggregated yellow-taxi result (df_result now holds the yellow query).
df_result.show()



+-------------------+------------+-------+--------------+
|               hour|revenue_zone| amount|number_records|
+-------------------+------------+-------+--------------+
|2020-01-01 00:00:00|           3|   25.0|             1|
|2020-01-01 00:00:00|           4| 1004.3|            57|
|2020-01-01 00:00:00|           7| 455.17|            38|
|2020-01-01 00:00:00|          10|  42.41|             2|
|2020-01-01 00:00:00|          12|  107.0|             6|
|2020-01-01 00:00:00|          13| 1214.8|            56|
|2020-01-01 00:00:00|          14|    8.8|             1|
|2020-01-01 00:00:00|          15|  34.09|             1|
|2020-01-01 00:00:00|          17| 220.21|             8|
|2020-01-01 00:00:00|          18|    5.8|             1|
|2020-01-01 00:00:00|          24| 754.95|            45|
|2020-01-01 00:00:00|          25| 324.35|            16|
|2020-01-01 00:00:00|          32|   18.0|             1|
|2020-01-01 00:00:00|          33| 255.56|             8|
|2020-01-01 00

                                                                                

# 5.4.3 - Joins in Spark

We want to join the two tables above into one, with a schema like:
- Hour
- Zone
- Revenue yellow
- Number trips yellow
- Revenue green
- Number trips green

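Essentially, we're joining the two tables on hour and zone. Both tables use the same names for their value columns (`amount`, `number_records`), so we need to rename them in at least one of the tables to show which taxi type they belong to, as below.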

In [25]:
# Read back the hourly revenue reports written to ./data/report/revenue/ so the
# join below starts from the saved aggregates.
df_green_revenue = spark.read.parquet('./data/report/revenue/green/*')
df_yellow_revenue = spark.read.parquet('./data/report/revenue/yellow/*')

In [26]:
# Tag each report's value columns with its taxi colour so they remain
# distinguishable once the two reports are joined on the shared keys.
df_green_revenue = (
    df_green_revenue
    .withColumnRenamed('amount', 'amount_green')
    .withColumnRenamed('number_records', 'records_green')
)

df_yellow_revenue = (
    df_yellow_revenue
    .withColumnRenamed('amount', 'amount_yellow')
    .withColumnRenamed('number_records', 'records_yellow')
)

In [27]:
# Full outer join: an (hour, zone) pair that appears in only one report is
# kept, with nulls in the other report's columns.
df_join = df_green_revenue.join(
    df_yellow_revenue,
    on=['hour', 'revenue_zone'],
    how='outer',
)

In [28]:
# Preview the first 10 joined rows.
df_join.show(10)



+-------------------+------------+------------+-------------+-------------+--------------+
|               hour|revenue_zone|amount_green|records_green|amount_yellow|records_yellow|
+-------------------+------------+------------+-------------+-------------+--------------+
|2020-01-01 00:00:00|           3|        null|         null|         25.0|             1|
|2020-01-01 00:00:00|           4|        null|         null|       1004.3|            57|
|2020-01-01 00:00:00|           7|      769.73|           45|       455.17|            38|
|2020-01-01 00:00:00|          12|        null|         null|        107.0|             6|
|2020-01-01 00:00:00|          37|      175.67|            6|       161.61|             7|
|2020-01-01 00:00:00|          40|      168.98|            8|        89.97|             5|
|2020-01-01 00:00:00|          45|        null|         null|       732.48|            42|
|2020-01-01 00:00:00|          47|        13.3|            1|          8.3|             1|

                                                                                

In [29]:
# Persist the joined report. mode='overwrite' replaces any earlier output so
# the cell can be re-run without a "path already exists" error.
df_join.write.parquet('./data/report/revenue/total', mode='overwrite')

                                                                                

In [30]:
# List the files Spark wrote for the joined report, with human-readable sizes.
!ls -lh ./data/report/revenue/total

total 13M
-rw-r--r-- 1 sam sam    0 Mar  3 20:10 _SUCCESS
-rw-r--r-- 1 sam sam 3.1M Mar  3 20:10 part-00000-701b7abe-0120-4a41-af57-66e2c79f8fd0-c000.snappy.parquet
-rw-r--r-- 1 sam sam 3.1M Mar  3 20:10 part-00001-701b7abe-0120-4a41-af57-66e2c79f8fd0-c000.snappy.parquet
-rw-r--r-- 1 sam sam 3.2M Mar  3 20:10 part-00002-701b7abe-0120-4a41-af57-66e2c79f8fd0-c000.snappy.parquet
-rw-r--r-- 1 sam sam 3.3M Mar  3 20:10 part-00003-701b7abe-0120-4a41-af57-66e2c79f8fd0-c000.snappy.parquet


Let's say we want to include the names of the zones:

In [36]:
# Load the taxi zone lookup table; header=True takes column names from the first CSV row.
# NOTE(review): no schema or inferSchema is passed, so every column (including
# LocationID) is presumably read as a string — confirm before comparing it with numeric ids.
df_zones = spark.read.csv('./data/raw/taxi_zone_lookup.csv', header=True)
df_zones.show(10)

+----------+-------------+--------------------+------------+
|LocationID|      Borough|                Zone|service_zone|
+----------+-------------+--------------------+------------+
|         1|          EWR|      Newark Airport|         EWR|
|         2|       Queens|         Jamaica Bay|   Boro Zone|
|         3|        Bronx|Allerton/Pelham G...|   Boro Zone|
|         4|    Manhattan|       Alphabet City| Yellow Zone|
|         5|Staten Island|       Arden Heights|   Boro Zone|
|         6|Staten Island|Arrochar/Fort Wad...|   Boro Zone|
|         7|       Queens|             Astoria|   Boro Zone|
|         8|       Queens|        Astoria Park|   Boro Zone|
|         9|       Queens|          Auburndale|   Boro Zone|
|        10|       Queens|        Baisley Park|   Boro Zone|
+----------+-------------+--------------------+------------+
only showing top 10 rows



In [47]:
# Re-inspect the joined revenue table to see which column holds the zone id.
df_join.show(10)

[Stage 63:>                                                         (0 + 4) / 4]

+-------------------+------------+------------+-------------+-------------+--------------+
|               hour|revenue_zone|amount_green|records_green|amount_yellow|records_yellow|
+-------------------+------------+------------+-------------+-------------+--------------+
|2020-01-01 00:00:00|           3|        null|         null|         25.0|             1|
|2020-01-01 00:00:00|           4|        null|         null|       1004.3|            57|
|2020-01-01 00:00:00|           7|      769.73|           45|       455.17|            38|
|2020-01-01 00:00:00|          12|        null|         null|        107.0|             6|
|2020-01-01 00:00:00|          37|      175.67|            6|       161.61|             7|
|2020-01-01 00:00:00|          40|      168.98|            8|        89.97|             5|
|2020-01-01 00:00:00|          45|        null|         null|       732.48|            42|
|2020-01-01 00:00:00|          47|        13.3|            1|          8.3|             1|

                                                                                

In [50]:
# Show a single lookup row to recall the zone table's column names.
df_zones.show(1)

+----------+-------+--------------+------------+
|LocationID|Borough|          Zone|service_zone|
+----------+-------+--------------+------------+
|         1|    EWR|Newark Airport|         EWR|
+----------+-------+--------------+------------+
only showing top 1 row



In [51]:
# Attach zone details; the inner join keeps only rows whose revenue_zone
# matches a LocationID in the lookup table.
zone_match = df_join.revenue_zone == df_zones.LocationID
df_result = df_join.join(df_zones, zone_match, how='inner')
df_result.show(10)

[Stage 90:>                                                         (0 + 4) / 4]

+-------------------+------------+------------+-------------+-------------+--------------+----------+---------+--------------------+------------+
|               hour|revenue_zone|amount_green|records_green|amount_yellow|records_yellow|LocationID|  Borough|                Zone|service_zone|
+-------------------+------------+------------+-------------+-------------+--------------+----------+---------+--------------------+------------+
|2020-01-01 00:00:00|           3|        null|         null|         25.0|             1|         3|    Bronx|Allerton/Pelham G...|   Boro Zone|
|2020-01-01 00:00:00|           4|        null|         null|       1004.3|            57|         4|Manhattan|       Alphabet City| Yellow Zone|
|2020-01-01 00:00:00|           7|      769.73|           45|       455.17|            38|         7|   Queens|             Astoria|   Boro Zone|
|2020-01-01 00:00:00|          12|        null|         null|        107.0|             6|        12|Manhattan|        Batte

                                                                                

In [54]:
# Final view: show the zone name in place of the numeric ids, next to the
# per-colour revenue and trip counts.
(
    df_result
    .select(
        'hour', 'Zone',
        'amount_green', 'records_green',
        'amount_yellow', 'records_yellow',
    )
    .show(10)
)

+-------------------+--------------------+------------+-------------+-------------+--------------+
|               hour|                Zone|amount_green|records_green|amount_yellow|records_yellow|
+-------------------+--------------------+------------+-------------+-------------+--------------+
|2020-01-01 00:00:00|Allerton/Pelham G...|        null|         null|         25.0|             1|
|2020-01-01 00:00:00|       Alphabet City|        null|         null|       1004.3|            57|
|2020-01-01 00:00:00|             Astoria|      769.73|           45|       455.17|            38|
|2020-01-01 00:00:00|        Battery Park|        null|         null|        107.0|             6|
|2020-01-01 00:00:00|      Bushwick South|      175.67|            6|       161.61|             7|
|2020-01-01 00:00:00|     Carroll Gardens|      168.98|            8|        89.97|             5|
|2020-01-01 00:00:00|           Chinatown|        null|         null|       732.48|            42|
|2020-01-0