In [0]:
from pyspark.sql.functions import explode, expr, split, trim, lower, col

In [0]:
class invoiceStreamBatch:
  """Stream JSON invoices into a flattened, one-row-per-line-item Delta table.

  The pipeline is: read JSON files as a stream -> explode the
  ``InvoiceLineItems`` array -> promote the line-item fields to top-level
  columns -> append to the ``invoice_line_items`` table.

  The methods reference a global ``spark`` session that is not defined in
  this class; it must already be in scope (e.g. supplied by the notebook
  runtime) before ``readInvoices``/``process`` is called.
  """

  def __init__(self):
    # Root directory that holds both the input files (data/invoices) and the
    # streaming checkpoint (checkpoint/invoices).
    self.base_data_dir = "/FileStore/test"

  def getSchema(self):
    """Return the invoice schema as a Spark DDL string."""
    return '''InvoiceNumber string, CreatedTime bigint, StoreID string, PosID string, CashierID string,
            CustomerType string, CustomerCardNo string, TotalAmount double, NumberOfItems bigint,
            PaymentMethod string, TaxableAmount double, CGST double, SGST double, CESS double,
            DeliveryType string,
            DeliveryAddress struct<AddressLine string, City string, ContactNumber string, PinCode string, State string>,
            InvoiceLineItems array<struct<ItemCode string, ItemDescription string, ItemPrice double, ItemQty bigint, TotalValue double>>'''

  def readInvoices(self):
    """Return a streaming DataFrame over the JSON files in ``data/invoices``.

    The schema is supplied explicitly via ``getSchema()``, so no schema
    inference option is set. ``maxFilesPerTrigger`` is a rate limit of the
    file *source*, so it is configured here on the reader; setting it on the
    writer has no effect on how many files each micro-batch picks up.
    """
    return (
      spark.readStream
      .format("json")
      .schema(self.getSchema())
      # Pick up at most one new input file per micro-batch.
      .option("maxFilesPerTrigger", 1)
      .load(f"{self.base_data_dir}/data/invoices")
    )

  def explodeInvoices(self, invoiceDF):
    """Return one row per invoice line item.

    Keeps the invoice-level columns listed below, pulls City/State/PinCode
    out of ``DeliveryAddress``, and explodes ``InvoiceLineItems`` into a
    struct column named ``LineItem``. Note that ``explode`` emits no row for
    an invoice whose ``InvoiceLineItems`` is null or empty.
    """
    return invoiceDF.selectExpr(
      "InvoiceNumber",
      "CreatedTime",
      "StoreID",
      "PosID",
      "CustomerType",
      "PaymentMethod",
      "DeliveryType",
      "DeliveryAddress.City",
      "DeliveryAddress.State",
      "DeliveryAddress.PinCode",
      "explode(InvoiceLineItems) as LineItem",
    )

  def flattenInvoices(self, explodedDF):
    """Promote the ``LineItem`` struct fields to top-level columns.

    The new columns are appended in the order listed and the ``LineItem``
    struct column is dropped afterwards.
    """
    flattened = explodedDF
    for field in ("ItemCode", "ItemDescription", "ItemPrice", "ItemQty", "TotalValue"):
      flattened = flattened.withColumn(field, expr(f"LineItem.{field}"))
    return flattened.drop("LineItem")

  def appendInvoices(self, flattenedInvoices, trigger="batch"):
    """Start a streaming append into the ``invoice_line_items`` Delta table.

    Args:
      flattenedInvoices: streaming DataFrame produced by ``flattenInvoices``.
      trigger: ``"batch"`` runs with ``availableNow=True``; any other value
        is passed through as the ``processingTime`` interval string
        (e.g. ``"30 seconds"``).

    Returns:
      The object returned by ``toTable`` for the started query.
    """
    writer = (
      flattenedInvoices.writeStream
      .format("delta")
      .option("checkpointLocation", f"{self.base_data_dir}/checkpoint/invoices")
      .outputMode("append")
    )
    if trigger == "batch":
      writer = writer.trigger(availableNow=True)
    else:
      writer = writer.trigger(processingTime=trigger)
    return writer.toTable("invoice_line_items")

  def process(self, trigger='batch'):
    """Wire the read -> explode -> flatten -> append steps and start the query.

    Args:
      trigger: forwarded unchanged to ``appendInvoices``.

    Returns:
      Whatever ``appendInvoices`` returns for the started query. This method
      does not wait on it; the "Done" message is printed as soon as
      ``appendInvoices`` returns.
    """
    print("Starting Invoice processing stream ...")
    invoicesDF = self.readInvoices()
    explodedDF = self.explodeInvoices(invoicesDF)
    resultDF = self.flattenInvoices(explodedDF)
    sQuery = self.appendInvoices(resultDF, trigger)
    print("Done\n")
    return sQuery
