## Group by

In [6]:
import pyspark
from pyspark.sql import SparkSession

In [None]:
# Create the notebook's Spark session (or reuse one that already exists),
# pointed at the local master and registered under the app name 'test'.
spark = (
    SparkSession.builder
    .master('local[*]')
    .appName('test')
    .getOrCreate()
)

In [4]:
# Leave the session object as the cell's last expression so the notebook displays it.
spark

In [26]:
# Green taxi trips: read every parquet file matching the glob below, then
# register the DataFrame as the temp view `green` for use from Spark SQL.
local_path = 'data/pq/green/*/*'
df_green = spark.read.parquet(
    local_path,
)
df_green.createOrReplaceTempView(
    'green',
)

In [27]:
# Hourly revenue per pickup zone for green trips picked up on or after
# 2020-01-01: summed total_amount plus the number of records in each group.
# `GROUP BY 1, 2` refers to the first two SELECT expressions (hour, zone).
green_revenue_sql = """
    SELECT 
        date_trunc('hour', lpep_pickup_datetime) AS hour, 
        PULocationID AS zone,
    
        SUM(total_amount) AS amount,
        COUNT(1) AS number_records
    FROM
        green
    WHERE
        lpep_pickup_datetime >= '2020-01-01 00:00:00'
    GROUP BY
        1, 2
"""
df_green_revenue = spark.sql(green_revenue_sql)

In [28]:
# Persist the green revenue aggregate as parquet, repartitioned into 20
# partitions first; any existing output at this path is overwritten.
output_path = 'data/report/revenue/green'
(
    df_green_revenue
    .repartition(20)
    .write.parquet(output_path, mode='overwrite')
)

                                                                                

In [30]:
# Yellow taxi trips: read every parquet file matching the glob below, then
# register the DataFrame as the temp view `yellow` for use from Spark SQL.
local_path = 'data/pq/yellow/*/*'
df_yellow = spark.read.parquet(
    local_path,
)
df_yellow.createOrReplaceTempView(
    'yellow',
)

In [31]:
# Hourly revenue per pickup zone for yellow trips picked up on or after
# 2020-01-01: summed total_amount plus the number of records in each group.
# `GROUP BY 1, 2` refers to the first two SELECT expressions (hour, zone).
yellow_revenue_sql = """
    SELECT 
        date_trunc('hour', tpep_pickup_datetime) AS hour, 
        PULocationID AS zone,
    
        SUM(total_amount) AS amount,
        COUNT(1) AS number_records
    FROM
        yellow
    WHERE
        tpep_pickup_datetime >= '2020-01-01 00:00:00'
    GROUP BY
        1, 2
"""
df_yellow_revenue = spark.sql(yellow_revenue_sql)

In [33]:
# Persist the yellow revenue aggregate as parquet, repartitioned into 20
# partitions first; any existing output at this path is overwritten.
output_path = 'data/report/revenue/yellow'
(
    df_yellow_revenue
    .repartition(20)
    .write.parquet(output_path, mode='overwrite')
)

                                                                                

## Joins

In [41]:
# Read the saved revenue reports back from parquet. From here on
# `df_green_revenue` / `df_yellow_revenue` refer to the stored results
# rather than to the SQL queries defined earlier in the notebook.
g_file_path = 'data/report/revenue/green'
y_file_path = 'data/report/revenue/yellow'
df_green_revenue, df_yellow_revenue = (
    spark.read.parquet(g_file_path),
    spark.read.parquet(y_file_path),
)

In [42]:
# Prefix the metric columns with the service colour so the two reports can be
# joined without name clashes; `hour` and `zone` keep their names as join keys.
df_green_revenue_tmp = (
    df_green_revenue
    .withColumnRenamed('amount', 'green_amount')
    .withColumnRenamed('number_records', 'green_number_records')
)

df_yellow_revenue_tmp = (
    df_yellow_revenue
    .withColumnRenamed('amount', 'yellow_amount')
    .withColumnRenamed('number_records', 'yellow_number_records')
)

In [43]:
# Full outer join on (hour, zone): a row is kept when either service has data
# for that hour/zone, with the other service's columns left null.
df_join = df_green_revenue_tmp.join(
    other=df_yellow_revenue_tmp,
    on=['hour', 'zone'],
    how='outer',
)

In [44]:
# Persist the combined green/yellow report as parquet, replacing any previous output.
output_path = 'data/report/revenue/total'
df_join.write.parquet(
    output_path,
    mode='overwrite',
)

                                                                                

In [47]:
# Zone lookup table, read from the relative 'zones' parquet path; it is joined
# to the revenue data on its LocationID column in a later cell.
zones = spark.read.parquet('zones')

In [50]:
# Enrich each revenue row with its zone metadata. The left join keeps every
# revenue row even when no zones entry has a matching LocationID.
df_result = df_join.join(
    zones,
    on=[df_join.zone == zones.LocationID],
    how='left',
)

In [52]:
# List the joined column names; note that both `zone` (the revenue join key)
# and `Zone` (from the zones table) are present at this point.
df_result.columns

['hour',
 'zone',
 'green_amount',
 'green_number_records',
 'yellow_amount',
 'yellow_number_records',
 'LocationID',
 'Borough',
 'Zone',
 'service_zone']

In [None]:
# Save the zone-enriched result to parquet without the redundant join keys.
#
# The keys are dropped by column reference rather than by name: Spark resolves
# column names case-insensitively by default, so drop('zone') would also
# remove the lookup table's 'Zone' column along with the numeric key.
# mode='overwrite' keeps the cell re-runnable, matching the other writes in
# this notebook (the default mode raises if the output path already exists).
df_result \
    .drop(df_join.zone) \
    .drop(zones.LocationID) \
    .write.parquet('tmp/revenue-zones', mode='overwrite')