In [0]:
%run ./Streaming-Batch

In [0]:
import time
from pyspark.sql.functions import expr

In [0]:
class invoiceStreamBatchTestSuite:
  """End-to-end tests for the invoice ingestion job in streaming and batch trigger modes.

  Each test resets the environment, copies numbered invoice files into the
  landing directory, waits for the job to pick them up and then checks the
  cumulative row count of the ``invoice_line_items`` table.

  The suite relies on names supplied by the notebook environment rather than
  by this cell: ``spark``, ``dbutils`` and ``invoiceStreamBatch`` (presumably
  defined by the notebook loaded through ``%run ./Streaming-Batch`` -- verify).
  """

  def __init__(self):
    # Root folder that holds the invoices_<n>.json inputs as well as the
    # data/ and checkpoint/ sub-directories used during a test run.
    self.base_data_dir = "/FileStore/test"

  def cleanTest(self):
    """Remove artefacts of previous runs and recreate an empty landing directory."""
    print("Starting cleanup ...")
    # Drop the target table and delete its files from the warehouse location.
    spark.sql("drop table if exists invoice_line_items")
    dbutils.fs.rm("/user/hive/warehouse/invoice_line_items", True)
    # Delete the checkpoint directory and the raw/staging data.
    dbutils.fs.rm(f"{self.base_data_dir}/checkpoint/invoices", True)
    dbutils.fs.rm(f"{self.base_data_dir}/data/invoices", True)
    # Recreate the (now empty) raw/staging directory.
    dbutils.fs.mkdirs(f"{self.base_data_dir}/data/invoices")
    print("Done.")

  def ingestData(self, itr):
    """Copy input file ``invoices_<itr>.json`` into the landing directory.

    Args:
      itr: Number identifying which input file to copy.
    """
    print(f"Starting ingestion of file #{itr}...")
    dbutils.fs.cp(f"{self.base_data_dir}/invoices_{itr}.json", f"{self.base_data_dir}/data/invoices/")
    print("Done.\n")

  def assertResult(self, expected_count):
    """Check the total number of rows currently in ``invoice_line_items``.

    Args:
      expected_count: Row count the table must contain.

    Raises:
      AssertionError: If the actual row count differs from ``expected_count``.
    """
    print("Starting validation ...")
    actual_count = spark.sql("select count(*) from invoice_line_items").collect()[0][0]
    assert actual_count == expected_count, f"Test Failed! Actual count is {actual_count}"
    print("Done.")

  def waitForMicroBatch(self, sleep_time=40):
    """Block for ``sleep_time`` seconds so that pending data can be processed.

    Args:
      sleep_time: Number of seconds to sleep (default 40).
    """
    print(f"Sleeping for {sleep_time} seconds ...")
    time.sleep(sleep_time)
    print(f"Woke up from {sleep_time} seconds sleep.")

  def runStreamTest(self):
    """Run the job with a "30 seconds" trigger and validate three ingestions.

    The query handle is stopped in a ``finally`` clause so that a failed
    assertion does not leave the streaming query running.

    Raises:
      AssertionError: If the table row count is wrong after any iteration.
    """
    self.cleanTest()

    iStream = invoiceStreamBatch()
    streamQuery = iStream.process("30 seconds")

    # (label, input file number, cumulative row count expected afterwards)
    iterations = (
      ("first", 1, 1253),
      ("second", 2, 2510),
      ("third", 3, 3994),
    )

    try:
      for label, file_number, expected_count in iterations:
        print(f"Testing {label} iteration of invoice stream ...")
        self.ingestData(file_number)
        self.waitForMicroBatch()
        self.assertResult(expected_count)
        print(f"{label.capitalize()} iteration of invoice stream completed.\n")
    finally:
      # Always release the query, even when a validation step fails.
      streamQuery.stop()

  def runBatchTest(self):
    """Run the job twice with the "batch" trigger and validate both runs.

    Data is ingested *before* each run is started: according to the original
    author's notes, in batch mode the query stops immediately when it finds
    no data and stops automatically once it is done, so the returned handle
    is neither kept nor stopped here.

    Raises:
      AssertionError: If the table row count is wrong after either batch.
    """
    self.cleanTest()

    iStream = invoiceStreamBatch()

    print("Testing first batch of invoice stream ...")
    self.ingestData(1)
    self.ingestData(2)
    iStream.process("batch")
    self.waitForMicroBatch()
    self.assertResult(2510)
    print("First batch of invoice stream completed.\n")

    print("Testing second batch of invoice stream ...")
    self.ingestData(3)
    iStream.process("batch")
    self.waitForMicroBatch()
    self.assertResult(3994)
    print("Second batch of invoice stream completed.\n")



In [0]:
# Create the test suite and run the scenario that uses the "30 seconds" trigger.
isTS = invoiceStreamBatchTestSuite()
isTS.runStreamTest()

Starting cleanup ...
Done.
Starting Invoice processing stream ...
Done

Testing first iteration of invoice stream ...
Starting ingestion of file #1...
Done.

Sleeping for 40 seconds ...
Woke up from 40 seconds sleep.
Starting validation ...
Done.
First iteration of invoice stream completed.

Testing second iteration of invoice stream ...
Starting ingestion of file #2...
Done.

Sleeping for 40 seconds ...
Woke up from 40 seconds sleep.
Starting validation ...
Done.
Third iteration of invoice stream completed.

Testing third iteration of invoice stream ...
Starting ingestion of file #3...
Done.

Sleeping for 40 seconds ...
Woke up from 40 seconds sleep.
Starting validation ...
Done.
Third iteration of invoice stream completed.



In [0]:
# Run the "batch" trigger scenario, reusing the suite instance (isTS) created in the earlier cell.
isTS.runBatchTest()

Starting cleanup ...
Done.
Testing first batch of invoice stream ...
Starting ingestion of file #1...
Done.

Starting ingestion of file #2...
Done.

Starting Invoice processing stream ...
Done

Sleeping for 40 seconds ...
Woke up from 40 seconds sleep.
Starting validation ...
Done.
First batch of invoice stream completed.

Testing second batch of invoice stream ...
Starting ingestion of file #3...
Done.

Starting Invoice processing stream ...
Done

Sleeping for 40 seconds ...
Woke up from 40 seconds sleep.
Starting validation ...
Done.
Second batch of invoice stream completed.

