# Structured Streaming

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *

In [2]:
### Obtain the SparkSession for the "streamEX" application (getOrCreate reuses an existing session if there is one).
spark = SparkSession.builder.appName("streamEX").getOrCreate()

#### Step 1: Define Input Sources

In [3]:
### Create `lines`, a streaming DataFrame treated as an unbounded table of
### newline-separated text data read from the socket at localhost:9999.
lines = (
    spark.readStream
    .format("socket")
    .option("host", "localhost")
    .option("port", 9999)
    .load()
)

#### Step 2: Transform Data
Stateless transformations: select(), filter(), map(), etc. They do not require information from previous rows to process the next row.
<br/>
Stateful transformations: count(), etc. They require maintaining state to combine data across multiple rows. Anything involving grouping, joining, aggregating, etc. is stateful.

In [4]:
### Now we can apply some typical DF operations: split each line into individual words, then count them.
### split() returns one array of tokens per line, so explode() is needed to turn every
### token into its own row; without it, groupBy("word") would group whole arrays
### (i.e. identical lines) instead of individual words.
words = lines.select(explode(split(col("value"), "\\s")).alias("word"))
### Running count of rows per distinct word (a stateful aggregation).
counts = words.groupBy("word").count()

#### Step 3: Define Output Sink & Output Mode
The results will be written to the console.
<br/>
Then choose one of three output modes (how the result table is written to the sink on each trigger): <br/>
Append: Default mode. Only the new rows added to the result table/DF since the last trigger are output <br/>
Complete: All rows are output at the end of every trigger <br/>
Update: Only rows that were updated since the last trigger will be output
    

In [5]:
### Configure the sink: write `counts` to the console using the "complete" output mode.
writer = (
    counts.writeStream
    .format("console")
    .outputMode("complete")
)

#### Step 4: Specify Processing Details
Indicate when to trigger the discovery and processing of newly available streaming data. Four options: <br/>
Default: No trigger explicitly specified. The streaming query executes in micro-batches, each one starting as soon as the previous micro-batch has completed. <br/>
Interval: Specify 'processingTime' (below) <br/>
Once: Executes only once <br/>
Continuous: Experimental mode - data processed continuously instead of micro-batches.


In [7]:
### Directory passed to the stream writer as its "checkpointLocation" option.
checkpointDir = (
    "C:/Users/sean.cornillie/Education/LearningSparkV2/Spark_Dev/datasets/stream08/"
)

In [8]:
### Add the processing details: a processing-time trigger of 1 second and
### checkpointDir as the checkpoint location.
writer2 = writer.trigger(processingTime="1 second").option(
    "checkpointLocation", checkpointDir
)


#### Step 5: Start the query

In [9]:
### Start the streaming query and keep the returned handle so it can be stopped later.
streamingQuery = writer2.start()

In [10]:
### Stop the streaming query that was started above.
streamingQuery.stop()

In [11]:
### Stop the SparkSession now that the streaming query has been stopped.
spark.stop()