# Tweets Visualization

We will see how to use Spark Structured Streaming with Kafka to visualize hashtag counts.

First we need to get a SparkSession with Kafka support:

In [None]:
from pyspark.sql import SparkSession

# Settings applied to the session builder:
#   - spark.sql.streaming.schemaInference: streaming DataFrames infer their schema
#   - spark.jars.packages: pulls in the Kafka connector for Spark SQL
#   - spark.sql.shuffle.partitions / spark.sql.adaptive.enabled: a single shuffle
#     partition with adaptive query execution switched off
session_settings = (
    ("spark.sql.streaming.schemaInference", True),
    ('spark.jars.packages', 'org.apache.spark:spark-sql-kafka-0-10_2.12:3.0.1'),
    ('spark.sql.shuffle.partitions', '1'),
    ('spark.sql.adaptive.enabled', 'false'),
)

# Local-mode builder for this notebook's application
builder = SparkSession.builder.master('local').appName('tweets-dataviz')
for setting, setting_value in session_settings:
    builder = builder.config(setting, setting_value)

# Spark session & context
spark = builder.getOrCreate()
sc = spark.sparkContext

Then we need to start our streaming application:

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode
from pyspark.sql.functions import split
from pyspark.sql.functions import regexp_replace

topic = "savas"
# Checkpoint directory on the local filesystem; the topic name is part of the path
checkpoint = f"file:///tmp/checkpoint_{topic}_1"

# Create DataFrame representing the stream of input lines from Kafka topic
df = spark \
    .readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "broker:29092") \
    .option("subscribe", topic) \
    .load()

# Cast the Kafka message payload to a string column named "value".
# Line breaks are replaced by a space (not removed) so that two words separated
# only by a newline are not glued together into one bogus token ("day#sunny").
lines = df \
    .selectExpr("CAST(value AS STRING) AS value") \
    .withColumn('value', regexp_replace('value', r'[\r\n]+', ' '))

# Split the lines into words: one output row per space-separated token.
# Consecutive spaces yield empty tokens; they contain no '#', so the hashtag
# filter below discards them.
words = lines.select(
   explode(
       split(lines.value, " ")
   ).alias("word")
)

# Generate running word count for hashtags (any token containing a '#'),
# most frequent first
wordCounts = words \
    .filter("word like '%#%'") \
    .groupBy("word") \
    .count() \
    .orderBy('count', ascending=False)

# Start running the query.
# The result is written to the in-memory table "wiki_changes", which the
# visualization cell reads with spark.sql; the name must stay in sync with it.
queryStream = wordCounts \
    .writeStream \
    .outputMode("complete") \
    .option("checkpointLocation", checkpoint) \
    .queryName("wiki_changes") \
    .format("memory") \
    .start()


Finally we can start our visualization. Once it is started, the user can send tweets from their account and we can see the hashtag counts here.

In [None]:
from time import sleep
from IPython.display import clear_output
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt

matplotlib.rc('font', family='DejaVu Sans')
sns.set(style="whitegrid")
# Loop-invariant setting: apply the colour codes once instead of on every refresh
sns.set_color_codes("muted")


# Dashboard loop: every 10 seconds clear the cell output, print the latest
# streaming progress, then show the top 10 hashtags as a bar chart and a table.
# Runs until the kernel is interrupted.
try:
    i = 1
    while True:
        # Clear output
        clear_output(wait=True)
        print("**********************")
        print("General Info")
        print("**********************")
        print("Run:{}".format(i))

        # Read the progress report once so that every value printed below
        # comes from the same snapshot
        progress = queryStream.lastProgress
        if progress:
            print("Stream timestamp:{}".format(progress["timestamp"]))
            state_operators = progress["stateOperators"]
            if state_operators:
                state = state_operators[0]
                print("Total Rows:{}".format(state["numRowsTotal"]))
                print("Updated Rows:{}".format(state["numRowsUpdated"]))
                print("Memory used MB:{}".format(state["memoryUsedBytes"] * 0.000001))

        # Top 10 hashtags from the in-memory table written by the streaming query
        df = spark.sql(
                """
                    select word as hashtag, count
                    from wiki_changes
                    order by count desc
                    limit 10
                """
        ).toPandas()

        print("***********************")
        print("Graph - Top 10 hashtags")
        print("***********************")
        # Only create a figure when there is something to draw; otherwise an
        # unused figure would be left open on every iteration of this loop.
        if not df.empty:
            fig = plt.figure(figsize=(8, 6))
            try:
                # Barplot
                sns.barplot(x="count", y="hashtag", data=df)

                # Show barplot
                plt.show()
            except ValueError as err:
                # Keep the dashboard running, but report why nothing was drawn
                print("Could not draw the barplot: {}".format(err))
            finally:
                # Release the figure even if plotting failed
                plt.close(fig)

        print("***********************")
        print("Table - Top 10 hashtags")
        print("***********************")
        display(df)

        sleep(10)
        i += 1

except KeyboardInterrupt:
    print("process interrupted.")

# Test

You can test the visualization by producing directly into the topic from the terminal:


>docker exec -it broker /bin/sh                                          

>sh-4.4$ kafka-console-producer --topic savas --bootstrap-server broker:29092

>this is from #console

>I love #console #console #console #console



# Cleanup

In [None]:
# Print the id and name of every streaming query still active on this session
for active_query in spark.streams.active:
    print(f"ID:{active_query.id} | NAME:{active_query.name}")


In [None]:
# Stop the streaming query (queryStream) that was started in the streaming cell
queryStream.stop()

Everything looks great!