In [1]:
import pyspark
from pyspark.sql import SparkSession, types

# Create (or reuse) a Spark session running in local mode; `local[*]` asks
# Spark for as many worker threads as there are logical cores.
spark = (
    SparkSession.builder
    .master('local[*]')
    .appName('test')
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/03/03 10:30:11 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Read all green-taxi parquet files found two directory levels below
# ./data/pq/green (presumably year/month partitions -- confirm against the data layout).
green_parquet_glob = './data/pq/green/*/*/'
df_green = spark.read.parquet(green_parquet_glob)

                                                                                

In [7]:
# Expose the DataFrame to Spark SQL under the name `green`.
# `registerTempTable` has been deprecated since Spark 2.0;
# `createOrReplaceTempView` is its documented replacement and is safe to
# re-run because it replaces any existing view with the same name.
df_green.createOrReplaceTempView('green')

In [17]:
# Hourly revenue report for green taxis: trips picked up on or after
# 2020-01-01 are bucketed by (pickup hour, pickup zone); each bucket gets the
# rounded sum of `total_amount` and its row count, sorted by hour then zone.
green_revenue_sql = """
    SELECT 
        DATE_TRUNC('hour', lpep_pickup_datetime) AS hour, 
        PULocationID AS revenue_zone,
        ROUND(SUM(total_amount), 2) AS amount,
        COUNT(*) AS number_records
    FROM green
    WHERE lpep_pickup_datetime >= '2020-01-01'
    GROUP BY
        hour,
        revenue_zone
    ORDER BY
        hour,
        revenue_zone
"""

# Building the DataFrame is lazy; nothing runs until an action is called on it.
df_result = spark.sql(green_revenue_sql)

In [18]:
# Action: runs the query and prints the first 20 rows (the default row count).
df_result.show(n=20)

[Stage 12:>                                                         (0 + 4) / 4]

+-------------------+------------+-------+--------------+
|               hour|revenue_zone| amount|number_records|
+-------------------+------------+-------+--------------+
|2020-01-01 00:00:00|           7| 769.73|            45|
|2020-01-01 00:00:00|          17| 195.03|             9|
|2020-01-01 00:00:00|          18|    7.8|             1|
|2020-01-01 00:00:00|          22|   15.8|             1|
|2020-01-01 00:00:00|          24|   87.6|             3|
|2020-01-01 00:00:00|          25|  531.0|            26|
|2020-01-01 00:00:00|          29|   61.3|             1|
|2020-01-01 00:00:00|          32|  68.95|             2|
|2020-01-01 00:00:00|          33| 317.27|            11|
|2020-01-01 00:00:00|          35| 129.96|             5|
|2020-01-01 00:00:00|          36| 295.34|            11|
|2020-01-01 00:00:00|          37| 175.67|             6|
|2020-01-01 00:00:00|          38|  98.79|             2|
|2020-01-01 00:00:00|          40| 168.98|             8|
|2020-01-01 00

                                                                                

In [19]:
# Persist the green revenue report as parquet.
# The default save mode raises if the target directory already exists, which
# makes this cell fail on every re-run of the notebook; `overwrite` replaces
# the previous export so Restart & Run All stays reproducible.
df_result.write.parquet('./data/report/revenue/green/', mode='overwrite')

                                                                                

What's happening above:
    - Partitions are sent to executors
    - executors first apply the filters we specify, then run transformations to provide intermediate results
    - intermediate results are *reshuffled* between executors by the `GROUP BY` key (here hour and zone): all rows that share the same key must end up in the same partition so they can be combined into a single output row
    - if there's an `ORDER BY` clause, an additional stage sorts the aggregated results

In [21]:
# Read all yellow-taxi parquet files found two directory levels below
# ./data/pq/yellow and expose them to Spark SQL under the name `yellow`.
df_yellow = spark.read.parquet('./data/pq/yellow/*/*/')
# `registerTempTable` has been deprecated since Spark 2.0;
# `createOrReplaceTempView` is its documented replacement and is safe to
# re-run because it replaces any existing view with the same name.
df_yellow.createOrReplaceTempView('yellow')

# Hourly revenue report for yellow taxis: trips picked up on or after
# 2020-01-01 are bucketed by (pickup hour, pickup zone); each bucket gets the
# rounded sum of `total_amount` and its row count, sorted by hour then zone.
yellow_revenue_sql = """
    SELECT 
        DATE_TRUNC('hour', tpep_pickup_datetime) AS hour, 
        PULocationID AS revenue_zone,
        ROUND(SUM(total_amount), 2) AS amount,
        COUNT(*) AS number_records
    FROM yellow
    WHERE tpep_pickup_datetime >= '2020-01-01'
    GROUP BY
        hour,
        revenue_zone
    ORDER BY
        hour,
        revenue_zone
"""

# Building the DataFrame is lazy; nothing runs until an action is called on it.
df_result = spark.sql(yellow_revenue_sql)

# Persist the yellow revenue report as parquet.
# The default save mode raises if the target directory already exists, which
# makes this cell fail on every re-run of the notebook; `overwrite` replaces
# the previous export so Restart & Run All stays reproducible.
df_result.write.parquet('./data/report/revenue/yellow/', mode='overwrite')

                                                                                

In [22]:
# Action: runs the query and prints the first 20 rows (the default row count).
df_result.show(n=20)



+-------------------+------------+-------+--------------+
|               hour|revenue_zone| amount|number_records|
+-------------------+------------+-------+--------------+
|2020-01-01 00:00:00|           3|   25.0|             1|
|2020-01-01 00:00:00|           4| 1004.3|            57|
|2020-01-01 00:00:00|           7| 455.17|            38|
|2020-01-01 00:00:00|          10|  42.41|             2|
|2020-01-01 00:00:00|          12|  107.0|             6|
|2020-01-01 00:00:00|          13| 1214.8|            56|
|2020-01-01 00:00:00|          14|    8.8|             1|
|2020-01-01 00:00:00|          15|  34.09|             1|
|2020-01-01 00:00:00|          17| 220.21|             8|
|2020-01-01 00:00:00|          18|    5.8|             1|
|2020-01-01 00:00:00|          24| 754.95|            45|
|2020-01-01 00:00:00|          25| 324.35|            16|
|2020-01-01 00:00:00|          32|   18.0|             1|
|2020-01-01 00:00:00|          33| 255.56|             8|
|2020-01-01 00

                                                                                