In [0]:
%run ./Medallion-Approach

In [0]:
import time
import random
from pyspark.sql.functions import expr

In [0]:
class medallionApproachTestSuite():
  """End-to-end test for the invoice Bronze and Silver streams.

  The suite resets the test environment, starts both streams, drops three
  invoice files into the landing directory one at a time, and after each file
  checks the total row count of the `invoice_line_items` table.

  `spark`, `dbutils`, `Bronze` and `Silver` are not defined in this cell; they
  are expected to be in scope already (presumably `Bronze`/`Silver` come from
  the `%run ./Medallion-Approach` cell -- confirm).
  """

  def __init__(self):
    # Root folder holding the input files and everything the test creates.
    self.base_data_dir = "/FileStore/test"

  def cleanTest(self):
    """Remove tables, checkpoints and data left by a previous run.

    Drops the `invoice_bz` and `invoice_line_items` tables, deletes their
    warehouse folders, the stream checkpoints and the landing/archive
    folders, then recreates an empty landing folder.
    """
    print("Starting cleanup ...")
    spark.sql("drop table if exists invoice_bz")
    spark.sql("drop table if exists invoice_line_items")

    dbutils.fs.rm("/user/hive/warehouse/invoice_bz", True)
    dbutils.fs.rm("/user/hive/warehouse/invoice_line_items", True)

    dbutils.fs.rm(f"{self.base_data_dir}/checkpoint/invoices_bz", True)
    dbutils.fs.rm(f"{self.base_data_dir}/checkpoint/invoices", True)

    dbutils.fs.rm(f"{self.base_data_dir}/data/invoices", True)
    dbutils.fs.rm(f"{self.base_data_dir}/data/invoices_archive", True)

    dbutils.fs.mkdirs(f"{self.base_data_dir}/data/invoices")
    print("Done.")

  def ingestData(self, itr):
    """Copy `invoices_<itr>.json` from the base folder into the landing folder.

    Args:
      itr: Number of the input file to copy (1, 2, 3, ...).
    """
    print(f"Starting ingestion of file # {itr}...", end='')
    dbutils.fs.cp(f"{self.base_data_dir}/invoices_{itr}.json", f"{self.base_data_dir}/data/invoices/")
    print("Done.\n")

  def assertResult(self, expected_count):
    """Assert that `invoice_line_items` holds exactly `expected_count` rows.

    Args:
      expected_count: Expected total number of rows in the table.

    Raises:
      AssertionError: If the actual row count differs.
    """
    actual_count = spark.sql("select count(*) from invoice_line_items").collect()[0][0]
    assert expected_count == actual_count, f"Test Failed! Actual count is {actual_count}"

  def waitForMicroBatch(self, sleep=30):
    """Block for `sleep` seconds so the running streams can pick up new data.

    Args:
      sleep: Number of seconds to wait. Defaults to 30.
    """
    print(f"Waiting for {sleep} seconds ...")
    time.sleep(sleep)
    print("Done.")

  def runTests(self):
    """Run the full scenario: cleanup, three ingest/verify rounds, archive check.

    Both streaming queries are stopped even when a check fails, so a failed
    run does not leave streams running in the background.

    Raises:
      AssertionError: If a row-count check or the archive check fails.
    """
    self.cleanTest()

    bzStream = Bronze()
    bzQuery = bzStream.process()
    slQuery = None

    try:
      slStream = Silver()
      slQuery = slStream.process()

      # Expected counts are running totals: assertResult counts the whole table.
      print("Testing first iteration of invoice stream ...")
      self.ingestData(1)
      self.waitForMicroBatch()
      self.assertResult(1253)
      print("First iteration of invoice stream completed.\n")

      print("Testing second iteration of invoice stream ...")
      self.ingestData(2)
      self.waitForMicroBatch()
      self.assertResult(2510)
      print("Second iteration of invoice stream completed.\n")

      print("Testing third iteration of invoice stream ...")
      self.ingestData(3)
      self.waitForMicroBatch()
      self.assertResult(3994)
      print("Third iteration of invoice stream completed.\n")
    finally:
      # Always stop whatever was started, in the original order (Bronze first).
      bzQuery.stop()
      if slQuery is not None:
        slQuery.stop()

    print("Validating Archive...", end="")
    # Only files 1 and 2 are accepted in the archive.
    # NOTE(review): presumably the stream archives a file only once a later
    # micro-batch has run, and nests the original source path under the
    # archive folder -- confirm against the Bronze stream's source options.
    archives_expected = ["invoices_1.json", "invoices_2.json"]
    for f in dbutils.fs.ls(f"{self.base_data_dir}/data/invoices_archive/{self.base_data_dir}/data/invoices"):
      assert f.name in archives_expected, f"Archive Validation failed for {f.name}"
    print("Done")

In [0]:
# Build the suite and run the whole scenario; a failed check surfaces as an
# AssertionError raised from runTests().
maTS = medallionApproachTestSuite()
maTS.runTests()









######################################################################################
Going to sleep for iteration # 0. Will be sleeping for 5 seconds.
######################################################################################


Starting cleanup ...
Done.
Starting Bronze Stream ...
Done

Starting Silver Stream ...
Done

Testing first iteration of invoice stream ...
Starting ingestion of file # 1...Done.

Waiting for 30 seconds ...
Done.
First iteration of invoice stream completed.

Testing second iteration of invoice stream ...
Starting ingestion of file # 2...Done.

Waiting for 30 seconds ...
Done.
Third iteration of invoice stream completed.

Testing third iteration of invoice stream ...
Starting ingestion of file # 3...Done.

Waiting for 30 seconds ...
Done.
Third iteration of invoice stream completed.

Validating Archive...Done








######################################################################################
Going to sleep for iteration # 1. Will b