In [0]:
from pyspark.sql.functions import explode, split, trim, lower, col

In [0]:
class batchWC():
  """Batch word count over text files, written to a Delta table.

  The pipeline is: read text -> one row per word -> clean/filter words ->
  count per word -> overwrite the ``word_count_table`` table.

  The methods use a ``spark`` session that is not defined in this file; it is
  expected to be supplied by the surrounding environment (e.g. the notebook).
  """

  def __init__(self, base_data_dir="/FileStore/test"):
    """Create the job.

    Args:
      base_data_dir: Root directory of the job. Input text is read from
        ``<base_data_dir>/data/text``. Defaults to the location that was
        previously hard-coded, so ``batchWC()`` behaves as before.
    """
    self.base_data_dir = base_data_dir

  def getRawData(self):
    """Read the input text and return a DataFrame with one ``word`` per row.

    The text source is read with "." as the line separator, so each input
    record is the text between two periods rather than a physical line.
    Records are then split on single spaces and exploded into rows.
    """
    source_path = f"{self.base_data_dir}/data/text"
    lines = (
      spark.read
      .format("text")
      .option("linesep", ".")
      .load(source_path)
    )
    words = explode(split(lines.value, " "))
    return lines.select(words.alias("word"))

  def getQualityData(self, rawDF):
    """Normalise and filter the raw words.

    Words are trimmed and lower-cased; null values are dropped, and only
    words containing at least one character in ``a-z`` are kept (this also
    removes the empty strings produced by consecutive spaces).

    Args:
      rawDF: DataFrame with a ``word`` column, as returned by getRawData.

    Returns:
      DataFrame with a single cleaned ``word`` column.
    """
    normalised = lower(trim(col("word"))).alias("word")
    return (
      rawDF
      .select(normalised)
      .where("word is not null")
      .where("word rlike '[a-z]'")
    )

  def getWordCount(self, qualityDF):
    """Return one row per distinct ``word`` with its ``count``."""
    return qualityDF.groupby("word").count()

  def overwriteWordCount(self, wordCountDF):
    """Replace the contents of the ``word_count_table`` Delta table.

    Args:
      wordCountDF: DataFrame to persist; any existing table data is
        overwritten.
    """
    writer = wordCountDF.write.format("delta").mode("overwrite")
    writer.saveAsTable("word_count_table")

  def wordCount(self):
    """Run the whole batch pipeline end to end and print progress markers."""
    print("Executing Word Count...")
    rawDF = self.getRawData()
    qualityDF = self.getQualityData(rawDF)
    resultDF = self.getWordCount(qualityDF)
    self.overwriteWordCount(resultDF)
    print("Done")

In [0]:
class streamWC():
  """Streaming word count over text files, written to a Delta table.

  The pipeline is: stream text -> one row per word -> clean/filter words ->
  running count per word -> ``word_count_table`` in "complete" output mode.

  The methods use a ``spark`` session that is not defined in this file; it is
  expected to be supplied by the surrounding environment (e.g. the notebook).
  """

  def __init__(self, base_data_dir="/FileStore/test"):
    """Create the job.

    Args:
      base_data_dir: Root directory of the job. Input text is read from
        ``<base_data_dir>/data/text`` and the stream checkpoint is kept in
        ``<base_data_dir>/checkpoint/word_count``. Defaults to the location
        that was previously hard-coded, so ``streamWC()`` behaves as before.
    """
    self.base_data_dir = base_data_dir

  def getRawData(self):
    """Open the input text as a stream and return one ``word`` per row.

    The text source is read with "." as the line separator, so each input
    record is the text between two periods rather than a physical line.
    Records are then split on single spaces and exploded into rows.
    """
    source_path = f"{self.base_data_dir}/data/text"
    lines = (
      spark.readStream
      .format("text")
      .option("linesep", ".")
      .load(source_path)
    )
    words = explode(split(lines.value, " "))
    return lines.select(words.alias("word"))

  def getQualityData(self, rawDF):
    """Normalise and filter the raw words.

    Words are trimmed and lower-cased; null values are dropped, and only
    words containing at least one character in ``a-z`` are kept (this also
    removes the empty strings produced by consecutive spaces).

    Args:
      rawDF: DataFrame with a ``word`` column, as returned by getRawData.

    Returns:
      DataFrame with a single cleaned ``word`` column.
    """
    normalised = lower(trim(col("word"))).alias("word")
    return (
      rawDF
      .select(normalised)
      .where("word is not null")
      .where("word rlike '[a-z]'")
    )

  def getWordCount(self, qualityDF):
    """Return one row per distinct ``word`` with its ``count``."""
    return qualityDF.groupby("word").count()

  def overwriteWordCount(self, wordCountDF):
    """Start the streaming write to ``word_count_table`` and return its handle.

    The aggregated result is written in "complete" output mode, with the
    checkpoint stored under this job's base directory.

    Args:
      wordCountDF: Streaming DataFrame of word counts.

    Returns:
      The object returned by ``toTable`` (the started streaming query), which
      the caller can use to monitor or stop the stream.
    """
    checkpoint_path = f"{self.base_data_dir}/checkpoint/word_count"
    writer = (
      wordCountDF.writeStream
      .format("delta")
      .option("checkpointLocation", checkpoint_path)
      .outputMode("complete")
    )
    return writer.toTable("word_count_table")

  def wordCount(self):
    """Wire up the streaming pipeline, start it, and return the query handle.

    Returns:
      The value returned by overwriteWordCount, so the caller can manage the
      running stream.
    """
    print("Starting Word Count Stream ...")
    rawDF = self.getRawData()
    qualityDF = self.getQualityData(rawDF)
    resultDF = self.getWordCount(qualityDF)
    sQuery = self.overwriteWordCount(resultDF)
    # "Done" marks that the query has been started, not that the stream
    # has finished processing; the query handle is handed back to the caller.
    print("Done")
    return sQuery