In [10]:
from pyspark.sql import SparkSession
from pyspark import SparkConf
from pyspark.sql.types import StructType, StructField, LongType, StringType, DoubleType, TimestampType, IntegerType, FloatType, BooleanType
from time import sleep
import pyspark.sql.functions as F
import datetime as dt


# Cluster/application settings, built as one fluent chain.
sparkConf = (
    SparkConf()
    .setMaster("spark://spark-master:7077")
    .setAppName("SparkStreaming pipeline 2")
    .set("spark.driver.memory", "2g")
    .set("spark.executor.cores", "1")
    .set("spark.driver.cores", "1")
)

# The SparkSession is the entry point to the Spark SQL engine.
spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()

# Needed whenever GCS is used: register the filesystem implementations
# for the gs:// scheme in the Hadoop configuration.
conf = spark.sparkContext._jsc.hadoopConfiguration()
for _fs_key, _fs_impl in (
    ("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem"),
    ("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS"),
):
    conf.set(_fs_key, _fs_impl)

# Cloud Storage bucket the BigQuery connector uses for its temporary export data.
bucket = "de_jads_temp_snellejassie"
spark.conf.set('temporaryGcsBucket', bucket)

# Explicit schema for the streamed CSV files: (column name, Spark type) in
# file column order. Every field is declared nullable.
dataSchema = StructType([
    StructField(field_name, field_type(), True)
    for field_name, field_type in [
        ("a", IntegerType),
        ("b", IntegerType),
        ("day", IntegerType),
        ("month", IntegerType),
        ("year", IntegerType),
        ("demand", FloatType),
        ("RRP", FloatType),
        ("demand_pos_RRP", FloatType),
        ("RRP_positive", FloatType),
        ("demand_neg_RRP", FloatType),
        ("RRP_negative", FloatType),
        ("frac_at_neg_RRP", FloatType),
        ("min_temperature", FloatType),
        ("max_temperature", FloatType),
        ("solar_exposure", FloatType),
        ("rainfall", FloatType),
        ("school_day", BooleanType),
        ("holiday", BooleanType),
    ]
])

# Source: stream CSV files from the input directory using the explicit schema,
# taking at most one new file per trigger.
sdf = (
    spark.readStream
    .schema(dataSchema)
    .option("maxFilesPerTrigger", 1)
    .csv("/home/jovyan/data/activity-data/activity-data-p2")
)
print(sdf)
# .option("maxBytesPerTrigger", "65536b")

# Calculation: count the rows for each distinct combination of all columns.
_group_columns = [
    "a", "b", "day", "month", "year",
    "demand", "RRP", "demand_pos_RRP", "RRP_positive",
    "demand_neg_RRP", "RRP_negative", "frac_at_neg_RRP",
    "min_temperature", "max_temperature", "solar_exposure",
    "rainfall", "school_day", "holiday",
]
activityCounts = sdf.groupBy(*[F.col(name) for name in _group_columns]).count()


def my_foreach_batch_function(df, batch_id):
    """Write one micro-batch to BigQuery using the batch writer API.

    Args:
        df: The DataFrame holding the current micro-batch result.
        batch_id: Identifier of the micro-batch (not used here).

    The target table is overwritten on every call.
    """
    # Build the batch writer step by step, then trigger the save.
    bq_writer = df.write.format('bigquery')
    bq_writer = bq_writer.option('table', 'de2022-assignment2-fresh.pipelines.tablepipeline2')
    bq_writer = bq_writer.mode("overwrite")
    bq_writer.save()

# Write to a sink - each micro-batch is handed to my_foreach_batch_function,
# which writes it to a BigQuery table.
# "complete" output mode: the whole aggregated result is emitted every batch.
# ProcessingTime trigger with two-seconds micro-batch interval
activityQuery = activityCounts.writeStream.outputMode("complete") \
                    .trigger(processingTime = '2 seconds').foreachBatch(my_foreach_batch_function).start()
try:
    # Blocks until the query terminates or the cell is interrupted.
    activityQuery.awaitTermination()
except KeyboardInterrupt:
    # A manual interrupt is the expected way to end the stream; cleanup is below.
    pass
finally:
    # Clean up on every exit path. Interrupting the kernel can surface as a
    # Py4JError from awaitTermination() rather than KeyboardInterrupt, which
    # previously skipped the cleanup and left the query and session running.
    try:
        activityQuery.stop()
    finally:
        # Stop the spark context even if stopping the query itself failed.
        spark.stop()
    print("Stopped the streaming query and the spark context")

DataFrame[a: int, b: int, day: int, month: int, year: int, demand: float, RRP: float, demand_pos_RRP: float, RRP_positive: float, demand_neg_RRP: float, RRP_negative: float, frac_at_neg_RRP: float, min_temperature: float, max_temperature: float, solar_exposure: float, rainfall: float, school_day: boolean, holiday: boolean]


ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.9.5-src.zip/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
RuntimeError: reentrant call inside <_io.BufferedReader name=56>

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.9.5-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/usr/local/spark/python/lib/py4j-0.10.9.5-src.zip/py4j/clientserver.py", line 539, in send_command
    raise Py4JNetworkError(
py4j.protocol.Py4JNetworkError: Error while sending or receiving
ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.9.5-src.zip/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline(

Py4JError: An error occurred while calling o559.awaitTermination

In [11]:
# Stop the Spark session created above; run this cell to shut Spark down manually.
spark.stop()
