In [12]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
from pyspark.context import SparkContext

In [13]:
credentials_location = "/home/salacjamesrhode23/creds/my-creds.json"

conf = SparkConf() \
    .setMaster("local[*]") \
    .setAppName('test') \
    .set("spark.jars", "./lib/gcs-connector-hadoop3-2.2.5.jar") \
    .set("google.cloud.auth.service.account.enable", "true") \
    .set("google.cloud.auth.service.account.json.keyfile", credentials_location)

In [14]:
sc = SparkContext(conf=conf)

25/10/14 10:47:55 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [15]:


hadoop_conf = sc._jsc.hadoopConfiguration()

hadoop_conf.set("fs.AbstractFileSystem.gs.impl",  "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS")
hadoop_conf.set("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem")
hadoop_conf.set("fs.gs.auth.service.account.json.keyfile", credentials_location)
hadoop_conf.set("fs.gs.auth.service.account.enable", "true")

In [16]:
spark = SparkSession.builder \
    .config(conf=sc.getConf()) \
    .getOrCreate()

In [17]:
df_green = spark.read.parquet('gs://de-zoomcamp-ny-taxi-data-lake/pq/green/*/*')
df_yellow = spark.read.parquet('gs://de-zoomcamp-ny-taxi-data-lake/pq/yellow/*/*')

                                                                                

In [18]:
df_green.show()

                                                                                

+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|VendorID|tpep_pickup_datetime|tpep_dropoff_datetime|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|airport_fee|
+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|       2| 2022-12-26 15:29:01|  2022-12-26 15:44:43|            2.0|         1.36|       1.0|                 N|          48|         234|           1|       14.9|  0.0|    0.5|      3.7

25/10/14 11:19:20 ERROR StandaloneSchedulerBackend: Application has been killed. Reason: Master removed our application: KILLED
25/10/14 11:19:21 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exiting due to error from cluster scheduler: Master removed our application: KILLED
	at org.apache.spark.errors.SparkCoreErrors$.clusterSchedulerError(SparkCoreErrors.scala:291)
	at org.apache.spark.scheduler.TaskSchedulerImpl.error(TaskSchedulerImpl.scala:981)
	at org.apache.spark.scheduler.cluster.StandaloneSchedulerBackend.dead(StandaloneSchedulerBackend.scala:165)
	at org.apache.spark.deploy.client.StandaloneAppClient$ClientEndpoint.markDead(StandaloneAppClient.scala:263)
	at org.apache.spark.deploy.client.StandaloneAppClient$ClientEndpoint$$anonfun$receive$1.applyOrElse(StandaloneAppClient.scala:170)
	at org.apache.spark.rpc.netty.Inbox.$anonfun$process$1(Inbox.scala:115)
	at org.apache.spark.rpc.netty.Inbox.safelyCall(Inbox.scala:213)
	at org.apache.spark.rpc.netty.Inbox.proce

In [6]:
break

SyntaxError: 'break' outside loop (668683560.py, line 1)

In [None]:
df_green.printSchema()

In [None]:
df_yellow.printSchema()

In [None]:
common_columns = []

yellow_columns = set(df_yellow.columns)

for col in df_green.columns:
    if col in yellow_columns:
        common_columns.append(col)

In [None]:
from pyspark.sql import functions as F

In [None]:
df_green_sel = df_green \
    .select(common_columns) \
    .withColumn('service_type', F.lit('green'))

In [None]:
df_yellow_sel = df_yellow \
    .select(common_columns) \
    .withColumn('service_type', F.lit('yellow'))

In [None]:
df_trips_data = df_yellow_sel.unionAll(df_green_sel)

In [None]:
df_trips_data = df_trips_data.withColumnRenamed("tpep_pickup_datetime", "pickup_datetime") \
                            .withColumnRenamed("tpep_dropoff_datetime", "dropoff_datetime")

In [None]:
df_trips_data \
    .groupBy('service_type') \
    .count().show()

### SparkSQL

In [None]:
df_trips_data.columns

In [None]:
df_trips_data.createOrReplaceTempView("trips_data")

In [None]:
df_results = spark.sql("""
SELECT 
    -- Revenue grouping 
    PULocationID AS revenue_zone,
    date_trunc('month', pickup_datetime) AS revenue_month, 
    service_type, 

    -- Revenue calculation 
    SUM(fare_amount) AS revenue_monthly_fare,
    SUM(extra) AS revenue_monthly_extra,
    SUM(mta_tax) AS revenue_monthly_mta_tax,
    SUM(tip_amount) AS revenue_monthly_tip_amount,
    SUM(tolls_amount) AS revenue_monthly_tolls_amount,
    SUM(improvement_surcharge) AS revenue_monthly_improvement_surcharge,
    SUM(total_amount) AS revenue_monthly_total_amount,
    SUM(congestion_surcharge) AS revenue_monthly_congestion_surcharge,

    -- Additional calculations
    AVG(passenger_count) AS avg_monthly_passenger_count,
    AVG(trip_distance) AS avg_monthly_trip_distance
FROM
    trips_data
GROUP BY
    1, 2, 3
""")

In [None]:
df_results.coalesce(1).write.parquet('data/report/revenue', mode='overwrite')