# Day 10 Part 1 Streaming+basics

In [None]:
from pyspark.sql.types import StructType, StructField, StringType , IntegerType, FloatType

# Explicit schema for the incoming CSV files: a string Country column
# followed by an integer Citizens column.
schema = (
    StructType()
    .add('Country', StringType())
    .add('Citizens', IntegerType())
)

In [None]:
# DBFS folder that the streaming reader loads from; the checkpoint paths in
# this notebook are also built from it.
source_dir = "dbfs:/FileStore/streaming/"

In [None]:
%sql

-- Create the stream schema when it is missing, then make it the current schema.
CREATE SCHEMA IF NOT EXISTS stream;
USE stream

In [None]:
# spark.readStream returns a DataStreamReader; load() turns it into a
# streaming DataFrame over the CSV files found in source_dir.
df = (
    spark.readStream
    .format("csv")
    .schema(schema)            # explicit schema defined earlier in the notebook
    .option('header', 'true')  # first line of each file holds the column names
    .load(source_dir)
)

In [None]:
# Show the streaming DataFrame in the notebook output.
display(df)

Country,Citizens
India,10
USA,5
China,10
India,10
Canada,40
Brazil,10
India,5
USA,10
China,5
India,5


In [None]:
# Complete code Resource: https://spark.apache.org/docs/3.5.3/structured-streaming-programming-guide.html
# Start a streaming query named AppendQuery that writes df to the table
# stream.AppendTable in append mode.
# rstrip('/') keeps the trailing slash of source_dir from producing a '//'
# inside the checkpoint path.
WriteStream = (
    df.writeStream
    .option('checkpointLocation', f"{source_dir.rstrip('/')}/AppendCheckpoint")
    .outputMode("append")
    .queryName('AppendQuery')
    .toTable("stream.AppendTable")
)

In [None]:
%sql
-- Show every row currently stored in stream.AppendTable.
SELECT * FROM stream.AppendTable

Country,Citizens
India,10
USA,5
China,10
India,10
Canada,40
Brazil,10
India,5
USA,10
China,5
India,5


In [None]:
# Stop the streaming query held in WriteStream.
WriteStream.stop()

## Day 10 Part 2 outputModes

In [None]:
# Delete the stream database's warehouse folder and the whole streaming
# folder; the second argument True requests a recursive delete.
# NOTE(review): dbfs:/FileStore/streaming is also the folder the stream reads
# from, so any source files in it are removed too - confirm that is intended.
dbutils.fs.rm('dbfs:/user/hive/warehouse/stream.db',True)
dbutils.fs.rm('dbfs:/FileStore/streaming',True)

In [None]:
# Recursively delete the CompleteCheckpoint folder.
# NOTE(review): this folder sits under dbfs:/FileStore/streaming, so a
# recursive delete of that parent folder already covers it.
dbutils.fs.rm('dbfs:/FileStore/streaming/CompleteCheckpoint',True)

In [None]:
%sql
-- Drop the stream database together with everything in it (CASCADE),
-- then create it again empty.
DROP DATABASE IF EXISTS stream CASCADE;
CREATE DATABASE IF NOT EXISTS stream

In [None]:
from pyspark.sql.types import StructType, StructField, StringType , IntegerType, FloatType

# Schema of the source CSV files: Country as a string, Citizens as an integer.
schema = (
    StructType()
    .add('Country', StringType())
    .add('Citizens', IntegerType())
)

In [None]:
# DBFS folder used as the streaming source and as the parent of the
# checkpoint folders.
source_dir = "dbfs:/FileStore/streaming/"

In [None]:
# Streaming DataFrame over the CSV files in source_dir, read with the
# explicit schema and with the first line of each file treated as a header.
df = (
    spark.readStream
    .format("csv")
    .schema(schema)
    .option('header', 'true')
    .load(source_dir)
)

### Append

In [None]:
 WriteStream = ( df.writeStream
        .option('checkpointLocation',f'{source_dir}/AppendCheckpoint')
        .outputMode("append")
        .queryName('AppendQuery')
        .toTable("stream.AppendTable"))

In [None]:
%sql
-- Show every row currently stored in stream.AppendTable.
SELECT * FROM stream.AppendTable

### Complete

In [None]:
# Import the functions module under an alias instead of importing `sum` by
# name, which would shadow Python's builtin sum() for the rest of the session.
from pyspark.sql import functions as F

# Total Citizens per Country, exposed as the column Total_Population.
df_complete = df.groupBy('Country').agg(F.sum('Citizens').alias('Total_Population'))

In [None]:
 WriteCompleteStream = ( df_complete.writeStream
        .option('checkpointLocation',f'{source_dir}/CompleteCheckpoint')
        .outputMode("complete")
        .queryName('CompleteQuery')
        .toTable("stream.CompleteTable"))

In [None]:
%sql
-- Show every row currently stored in stream.CompleteTable.
SELECT * FROM stream.CompleteTable

## Day 10 Part 3 Triggers

#### Reading the streaming dataframe

In [None]:
# Streaming DataFrame over the CSV files in source_dir; it is the input of
# the trigger examples that follow.
df = (
    spark.readStream
    .format("csv")
    .schema(schema)
    .option('header', 'true')
    .load(source_dir)
)


#### 01. Trigger - default or unspecified trigger

In [None]:
 WriteStream = ( df.writeStream
        .option('checkpointLocation',f'{source_dir}/AppendCheckpoint')
        .outputMode("append")
        .queryName('DefaultTrigger')
        .toTable("stream.AppendTable"))


#### 02. Trigger - processingTime

In [None]:
 WriteStream = ( df.writeStream
        .option('checkpointLocation',f'{source_dir}/AppendCheckpoint')
        .outputMode("append")
        .trigger(processingTime='2 minutes')
        .queryName('ProcessingTime')
        .toTable("stream.AppendTable"))


#### 03. Trigger - availableNow

In [None]:
 WriteStream = ( df.writeStream
        .option('checkpointLocation',f'{source_dir}/AppendCheckpoint')
        .outputMode("append")
        .trigger(availableNow=True)
        .queryName('AvailableNow')
        .toTable("stream.AppendTable"))

In [None]:
%sql
-- Show every row currently stored in stream.AppendTable.
SELECT * FROM stream.AppendTable

In [None]:
%sql
-- Remove stream.AppendTable; IF EXISTS lets the cell be re-run without
-- failing once the table is already gone.
DROP TABLE IF EXISTS stream.AppendTable