# Structured Streaming Exercise

To run the exercises in this notebook, several CSV files must first be generated iteratively in the data folder (`data/streaming`). To do so, open a terminal from the Jupyter console, change the working directory to `ex3-structured-streaming`, and run `python generate_data.py`.

In [None]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pyspark.sql.types as T

## 1. Quick Example

In [None]:
# Build (or reuse) the SparkSession for this notebook.
# "local[*]" runs Spark locally, with one worker thread per logical core.

spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("structured-streaming")
    .getOrCreate()
)

In [None]:
# Explicit schema for the incoming CSV rows: a nullable string column "word"
# followed by a nullable timestamp column "timestamp".

schema = (
    T.StructType()
    .add("word", T.StringType(), True)
    .add("timestamp", T.TimestampType(), True)
)

In [None]:
# Streaming source: read the CSV files under ../data/streaming using `schema`.
# maxFilesPerTrigger=1 limits each micro-batch to a single new file.

csvDF = (
    spark.readStream
    .schema(schema)
    .options(sep=",", maxFilesPerTrigger="1")
    .csv("../data/streaming")
)

In [None]:
# Running word count over sliding windows, sorted for display.
# Each window is 3 minutes long and a new one starts every minute; the
# watermark on "timestamp" is set to 10 minutes.
# The result is ordered by window (newest first), then by word (A -> Z).
# NOTE(review): per the Structured Streaming guide, sorting a streaming
# aggregation is only supported when it is written in "complete" output mode.

wordCounts = (
    csvDF
    .withWatermark("timestamp", "10 minutes")
    .groupBy(
        F.window("timestamp", "3 minutes", "1 minutes"),
        F.col("word"),
    )
    .count()
    .sort(F.col("window").desc(), F.col("word").asc())
)

In [None]:
# Start the streaming query: on every trigger, print the whole result table
# ("complete" output mode) to the console without truncating cell values.

query = (wordCounts
         .writeStream
         .outputMode("complete")
         .format("console")
         .option("truncate", "false")
         .start())

# Block this cell while the stream runs. The stream never ends by itself, so
# the usual way out is to interrupt the kernel; the `finally` clause then
# stops the query so it does not keep running (and printing) in the
# background while later cells start their own queries.
try:
    query.awaitTermination()
finally:
    query.stop()

In [None]:
# Running word count over sliding windows, without any ordering.
# Each window is 3 minutes long and a new one starts every minute; the
# watermark on "timestamp" is set to 10 minutes.

wordCounts = (
    csvDF
    .withWatermark("timestamp", "10 minutes")
    .groupBy(
        F.window("timestamp", "3 minutes", "1 minutes"),
        F.col("word"),
    )
    .count()
)

In [None]:
# Start the streaming query in "update" output mode, printing to the console
# without truncating cell values.

query = (wordCounts
         .writeStream
         .outputMode("update")
         .format("console")
         .option("truncate", "false")
         .start())

# Block this cell while the stream runs. The stream never ends by itself, so
# the usual way out is to interrupt the kernel; the `finally` clause then
# stops the query so it does not keep running (and printing) in the
# background while later cells start their own queries.
try:
    query.awaitTermination()
finally:
    query.stop()

In [None]:
# Simple projection: stream only the "word" column to the console in
# "append" output mode, without truncating cell values.

query = (csvDF
         .select("word")
         .writeStream
         .outputMode("append")
         .format("console")
         .option("truncate", "false")
         .start())

# Block this cell while the stream runs. The stream never ends by itself, so
# the usual way out is to interrupt the kernel; the `finally` clause then
# stops the query so it does not keep running (and printing) in the
# background after the cell is interrupted.
try:
    query.awaitTermination()
finally:
    query.stop()