In [1]:
import pyspark
from pyspark.sql import SparkSession

In [2]:
# Create (or reuse, if one already exists) a Spark session that runs locally
# with the "local[*]" master and is registered under the app name "test".
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("test")
    .getOrCreate()
)

23/02/15 09:16:14 WARN Utils: Your hostname, GRAD0365UBUNTU resolves to a loopback address: 127.0.1.1; using 192.168.1.151 instead (on interface wlp0s20f3)
23/02/15 09:16:14 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/02/15 09:16:14 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## GroupBy
### Green taxi dataset

In [3]:
# Load every green-taxi parquet file matched by the two-level glob into a
# single DataFrame.
df_green = spark.read.format("parquet").load("../data/pq/green/*/*")

# Print the column names, types and nullability.
df_green.printSchema()

root
 |-- VendorID: integer (nullable = true)
 |-- lpep_pickup_datetime: timestamp (nullable = true)
 |-- lpep_dropoff_datetime: timestamp (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- RatecodeID: integer (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- passenger_count: integer (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- ehail_fee: string (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- payment_type: integer (nullable = true)
 |-- trip_type: integer (nullable = true)
 |-- congestion_surcharge: double (nullable = true)



In [4]:
# Expose the green-taxi DataFrame to Spark SQL under the view name "green_data".
df_green.createOrReplaceTempView(name="green_data")

In [5]:
# Aggregate green-taxi trips picked up on or after 2020-01-01 into one row per
# (pickup hour, pickup zone): the summed total_amount and the number of trips.
# "GROUP BY 1, 2" refers to the first two SELECT columns (hour, revenue_zone).
df_green_revenue = spark.sql("""
SELECT 
    date_trunc('hour', lpep_pickup_datetime) AS hour,
    PULocationID AS revenue_zone,
    SUM(total_amount) AS amount,
    COUNT(1) AS number_records
FROM
    green_data
WHERE lpep_pickup_datetime >= '2020-01-01 00:00:00'
GROUP BY
    1, 2
""")

In [6]:
# Preview the first five aggregated rows.
df_green_revenue.show(n=5)



+-------------------+------------+-----------------+--------------+
|               hour|revenue_zone|           amount|number_records|
+-------------------+------------+-----------------+--------------+
|2020-01-24 09:00:00|          81|            59.49|             2|
|2020-01-04 21:00:00|          25|           513.83|            32|
|2020-01-10 19:00:00|          66|545.6800000000001|            27|
|2020-01-30 07:00:00|          75|556.6600000000001|            40|
|2020-01-18 01:00:00|         260|           144.56|            12|
+-------------------+------------+-----------------+--------------+
only showing top 5 rows



                                                                                

In [7]:
# Redistribute the hourly green revenue into 20 partitions, then write it as
# parquet, replacing whatever is already at the target path.
(
    df_green_revenue
    .repartition(20)
    .write
    .mode("overwrite")
    .parquet("../data/report/revenue/green")
)

[Stage 6:>                                                          (0 + 8) / 8]

23/02/15 09:16:22 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 95,00% for 8 writers
23/02/15 09:16:22 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 95,00% for 8 writers




### Yellow taxi dataset

In [8]:
# Load every yellow-taxi parquet file matched by the two-level glob into a
# single DataFrame.
df_yellow = spark.read.format("parquet").load("../data/pq/yellow/*/*")

# Print the column names, types and nullability.
df_yellow.printSchema()

root
 |-- VendorID: integer (nullable = true)
 |-- tpep_pickup_datetime: timestamp (nullable = true)
 |-- tpep_dropoff_datetime: timestamp (nullable = true)
 |-- passenger_count: integer (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: integer (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- payment_type: integer (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)



In [9]:
# Expose the yellow-taxi DataFrame to Spark SQL under the view name "yellow_data".
df_yellow.createOrReplaceTempView(name="yellow_data")

In [10]:
# Aggregate yellow-taxi trips picked up on or after 2020-01-01 into one row per
# (pickup hour, pickup zone): the summed total_amount and the number of trips.
# "GROUP BY 1, 2" refers to the first two SELECT columns (hour, revenue_zone).
df_yellow_revenue = spark.sql("""
SELECT 
    date_trunc('hour', tpep_pickup_datetime) AS hour,
    PULocationID AS revenue_zone,
    SUM(total_amount) AS amount,
    COUNT(1) AS number_records
FROM
    yellow_data
WHERE tpep_pickup_datetime >= '2020-01-01 00:00:00'
GROUP BY
    1, 2
""")

In [11]:
# Redistribute the hourly yellow revenue into 20 partitions, then write it as
# parquet, replacing whatever is already at the target path.
(
    df_yellow_revenue
    .repartition(20)
    .write
    .mode("overwrite")
    .parquet("../data/report/revenue/yellow")
)



23/02/15 09:16:28 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 95,00% for 8 writers
23/02/15 09:16:28 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 95,00% for 8 writers


                                                                                

## Joins

### Joining two large tables

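We now want to join the two revenue tables we've just created on hour and zone. The result will be a table with 6 columns: `hour`, `revenue_zone`, and the amount and number of records for each of the green and yellow datasets.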

In [12]:
# Prefix the metric columns with the taxi colour so they stay distinguishable
# after the two tables are joined; the join keys (hour, revenue_zone) keep
# their names.
df_green_revenue_tmp = (
    df_green_revenue
    .withColumnRenamed("amount", "green_amount")
    .withColumnRenamed("number_records", "green_number_records")
)

df_yellow_revenue_tmp = (
    df_yellow_revenue
    .withColumnRenamed("amount", "yellow_amount")
    .withColumnRenamed("number_records", "yellow_number_records")
)

In [13]:
# Full outer join on (hour, revenue_zone): a row is kept even when only one of
# the two datasets has data for that hour and zone, with nulls on the other side.
df_join = df_green_revenue_tmp.join(
    other=df_yellow_revenue_tmp,
    on=["hour", "revenue_zone"],
    how="outer",
)

In [14]:
# Echo the DataFrame object itself to see its column names and types.
df_join

DataFrame[hour: timestamp, revenue_zone: int, green_amount: double, green_number_records: bigint, yellow_amount: double, yellow_number_records: bigint]

In [15]:
# Preview the first five joined rows.
df_join.show(n=5)



+-------------------+------------+------------------+--------------------+------------------+---------------------+
|               hour|revenue_zone|      green_amount|green_number_records|     yellow_amount|yellow_number_records|
+-------------------+------------+------------------+--------------------+------------------+---------------------+
|2020-01-01 00:00:00|          22|              15.8|                   1|              null|                 null|
|2020-01-01 00:00:00|          25|             531.0|                  26|            324.35|                   16|
|2020-01-01 00:00:00|          55|129.29000000000002|                   4|              null|                 null|
|2020-01-01 00:00:00|          56|             99.69|                   3|              18.1|                    2|
|2020-01-01 00:00:00|          60|            160.04|                   6|57.620000000000005|                    2|
+-------------------+------------+------------------+-------------------

                                                                                

In [16]:
# Persist the joined revenue table as parquet, replacing any previous output.
(
    df_join
    .write
    .mode("overwrite")
    .parquet("../data/report/revenue/total")
)



23/02/15 09:16:38 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 95,00% for 8 writers


                                                                                

In [17]:
# Re-read the joined revenue table from its parquet output (rebinding df_join
# to the stored copy) and preview five rows.
df_join = spark.read.format("parquet").load("../data/report/revenue/total")
df_join.show(n=5)

+-------------------+------------+------------------+--------------------+------------------+---------------------+
|               hour|revenue_zone|      green_amount|green_number_records|     yellow_amount|yellow_number_records|
+-------------------+------------+------------------+--------------------+------------------+---------------------+
|2020-01-01 00:00:00|          17|            195.03|                   9|220.20999999999998|                    8|
|2020-01-01 00:00:00|          18|               7.8|                   1|               5.8|                    1|
|2020-01-01 00:00:00|          24|              87.6|                   3|            754.95|                   45|
|2020-01-01 00:00:00|          32| 68.94999999999999|                   2|              18.0|                    1|
|2020-01-01 00:00:00|          49|266.76000000000005|                  14|            185.65|                   10|
+-------------------+------------+------------------+-------------------

### Joining a large and a small table

We are going to use the `zones` lookup table to look up the borough and zone name for each `revenue_zone` in the `df_join` table.

In [18]:
!wget https://github.com/DataTalksClub/nyc-tlc-data/releases/download/misc/taxi_zone_lookup.csv -O "../data/taxi_zone_lookup.csv"

df_zones = spark.read \
    .option("header", True) \
    .csv("../data/taxi_zone_lookup.csv")
df_zones.show(5)

--2023-02-15 09:16:39--  https://github.com/DataTalksClub/nyc-tlc-data/releases/download/misc/taxi_zone_lookup.csv
Resolviendo github.com (github.com)... 140.82.121.4
Conectando con github.com (github.com)[140.82.121.4]:443... conectado.
Petición HTTP enviada, esperando respuesta... 302 Found
Ubicación: https://objects.githubusercontent.com/github-production-release-asset-2e65be/513814948/5a2cc2f5-b4cd-4584-9c62-a6ea97ed0e6a?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIWNJYAX4CSVEH53A%2F20230215%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20230215T081639Z&X-Amz-Expires=300&X-Amz-Signature=74348becc5239208e3ec3acb898a69f0e0b1fd9a9a989f702be0cce93b59dfc3&X-Amz-SignedHeaders=host&actor_id=0&key_id=0&repo_id=513814948&response-content-disposition=attachment%3B%20filename%3Dtaxi_zone_lookup.csv&response-content-type=application%2Foctet-stream [siguiente]
--2023-02-15 09:16:39--  https://objects.githubusercontent.com/github-production-release-asset-2e65be/513814948/5a2cc2f5-b4cd-458

In [19]:
# Attach the zone lookup columns to each revenue row by matching revenue_zone
# against the lookup table's LocationID (no `how` given, so Spark's default
# join type applies).
df_result = df_join.join(
    df_zones,
    on=df_join["revenue_zone"] == df_zones["LocationID"],
)

# LocationID repeats revenue_zone after the join, so leave it out of the preview.
df_result.drop("LocationID").show(n=10)

+-------------------+------------+------------------+--------------------+------------------+---------------------+---------+--------------------+------------+
|               hour|revenue_zone|      green_amount|green_number_records|     yellow_amount|yellow_number_records|  Borough|                Zone|service_zone|
+-------------------+------------+------------------+--------------------+------------------+---------------------+---------+--------------------+------------+
|2020-01-01 00:00:00|          17|            195.03|                   9|220.20999999999998|                    8| Brooklyn|             Bedford|   Boro Zone|
|2020-01-01 00:00:00|          18|               7.8|                   1|               5.8|                    1|    Bronx|        Bedford Park|   Boro Zone|
|2020-01-01 00:00:00|          24|              87.6|                   3|            754.95|                   45|Manhattan|        Bloomingdale| Yellow Zone|
|2020-01-01 00:00:00|          32| 68.94

In [21]:
# Write the zone-enriched revenue table (without the redundant LocationID
# column) as parquet, replacing any previous output.
(
    df_result
    .drop("LocationID")
    .write
    .mode("overwrite")
    .parquet("../data/tmp/revenue-zones")
)

23/02/15 09:17:20 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 95,00% for 8 writers


                                                                                

In [22]:
# Re-read the zone-enriched revenue table from its parquet output (rebinding
# df_result to the stored copy) and echo its column names and types.
df_result = spark.read.format("parquet").load("../data/tmp/revenue-zones")
df_result

DataFrame[hour: timestamp, revenue_zone: int, green_amount: double, green_number_records: bigint, yellow_amount: double, yellow_number_records: bigint, Borough: string, Zone: string, service_zone: string]