In [None]:
from pyspark.sql import SparkSession
# Build (or reuse) the Spark session for this consumer notebook.
# master("local[2]") runs Spark locally with two worker threads.
spark = SparkSession.builder.master("local[2]").appName(
    'twitter-read-event-consumer'
).config(
    # Pull in the Kafka connector package when the session starts
    "spark.jars.packages",
    "org.apache.spark:spark-sql-kafka-0-10_2.12:3.0.2",
).getOrCreate()

In [None]:
# Define the schema of the processed tweet JSON records
from pyspark.sql.types import StructType, StructField, BooleanType, LongType, IntegerType, StringType
import json
import os

# Directory holding the processed tweet JSON files (relative to the notebook's
# working directory) and the sample file inside it.
data_path = os.path.join(os.path.pardir, 'data', 'processed', 'json')
schema_path = data_path + '/tweets.txt'

# Explicit schema for the streaming JSON source (file streams need one up front).
# StructType expects a flat list of StructField objects; wrapping the fields in
# a tuple inside the list makes the constructor reject the argument.
# All three columns are declared as nullable strings.
tweets_schema = StructType([
    StructField("word", StringType(), True),
    StructField("polarity", StringType(), True),
    StructField("subjectivity", StringType(), True),
])

In [None]:
# Open the processed-tweets directory as a streaming JSON source.
# The explicit schema is applied to every file picked up from data_path.
processed_tweet_df = spark.readStream.load(
    path=data_path,
    format="json",
    schema=tweets_schema,
)

In [None]:
# Start a streaming query that writes into an in-memory table named
# "processed_tweets", so it can be queried with spark.sql().
# NOTE(review): the Structured Streaming guide lists append/complete for the
# memory sink -- confirm "update" is the intended output mode here.
queryStreamMem = processed_tweet_df.writeStream.start(
    format="memory",
    outputMode="update",
    queryName="processed_tweets",
)

In [None]:
from time import sleep
from IPython.display import clear_output

# Every 5 seconds, while at least one streaming query is active, refresh the
# cell output and show the rows collected in the in-memory table.
# Interrupt the kernel (KeyboardInterrupt) to stop the memory-sink query.
try:
    i = 1
    while len(spark.streams.active) > 0:

        # Replace the previous iteration's output instead of appending to it
        clear_output(wait=True)
        print("Run:{}".format(i))

        # Names of all currently running streaming queries
        active_query_names = [query.name for query in spark.streams.active]

        # The in-memory table only exists while the query of the same name is
        # running, so check for the query name (not a column name) first.
        if "processed_tweets" in active_query_names:
            # Show the sentiment columns collected so far
            spark.sql("select polarity, subjectivity from processed_tweets").show()
        else:
            print("'processed_tweets' query not found.")

        sleep(5)
        i = i + 1

except KeyboardInterrupt:
    # Stop the memory-sink query when the user interrupts the loop
    queryStreamMem.stop()

    print("stream process interrupted")

In [None]:
# List the id and name of every streaming query that is still running
active_streams = spark.streams.active
for stream in active_streams:
    print("ID:{} | NAME:{}".format(stream.id, stream.name))

In [None]:
# Stop the memory-sink streaming query.
# The only query handle this notebook defines is `queryStreamMem`;
# `queryStream` is never assigned here and would raise NameError.
queryStreamMem.stop()