In [1]:
from pyspark import SparkContext,SparkConf
from pyspark.sql import SparkSession 
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
from pyspark.sql import Row,SQLContext
import requests
import sys
import time
import json

In [2]:
ss = SparkSession.Builder() \
     .appName("Realtime") \
     .master("spark://spark-master-1:7077") \
     .config("spark.jars", "./spark-streaming-kafka-0-8-assembly_2.11-2.0.0-preview.jar") \
     .getOrCreate()
sc = ss.sparkContext
ssc = StreamingContext(sc, 2)
ssc.checkpoint("checkpoint_TwitterApp")
ss.sparkContext.setLogLevel('WARN')

In [3]:
brokers = 'kafka-1:9092,kafka-2:9092'
topic='tweet'

In [4]:
ks = KafkaUtils.createDirectStream(ssc, [topic], kafkaParams={"metadata.broker.list":brokers})

In [5]:
def get_sql_context_instance(spark_context):
    if ('sqlContextSingletonInstance' not in globals()):
        globals()['sqlContextSingletonInstance'] = SQLContext(spark_context)
    return globals()['sqlContextSingletonInstance']
def process_rdd(time, rdd):
    print("----------- %s -----------" % str(time))
    try:
        # Get spark sql singleton context from the current context
        sql_context = get_sql_context_instance(rdd.context)
        # convert the RDD to Row RDD
        row_rdd = rdd.map(lambda w: Row(hashtag=w[0], hashtag_count=w[1]))
        # create a DF from the Row RDD
        hashtags_df = sql_context.createDataFrame(row_rdd)
        # Register the dataframe as table
        hashtags_df.registerTempTable("hashtags")
        # get the top 10 hashtags from the table using SQL and print them
        hashtag_counts_df = sql_context.sql("select hashtag, hashtag_count from hashtags order by hashtag_count desc limit 10")
        hashtag_counts_df.show()
        # call this method to prepare top 10 hashtags DF and send them
        send_df_to_dashboard(hashtag_counts_df)
    except:
        e = sys.exc_info()[0]
        print("Error: %s" % e)

In [6]:
def send_df_to_dashboard(df):
    top_tags = [str(t.hashtag) for t in df.select("hashtag").collect()]
    tags_count = [p.hashtag_count for p in df.select("hashtag_count").collect()]
    url = 'http://172.30.0.13:5001/updateData'
    request_data = {'label': str(top_tags), 'data': str(tags_count)}
    response = requests.post(url, data=request_data)

In [7]:
def aggregate_tags_count(new_values, total_sum):
    return sum(new_values) + (total_sum or 0)

In [8]:
def filter(x):
    if x[0] == '#':
        if len(x) != 1:
            return True
    return False

In [None]:
# lines = ks.window(30,10)
lines = ks.map(lambda x: json.loads(x[1])['text'] )
words = lines.flatMap(lambda line: line.split())
hashtags = words.filter(lambda w: filter(w)).map(lambda x: (x, 1))
tags_totals = hashtags.updateStateByKey(aggregate_tags_count)
tags_totals.foreachRDD(process_rdd)    
ssc.start()
time.sleep(1000)
ssc.awaitTermination()

----------- 2022-01-03 17:05:22 -----------
Error: <class 'ValueError'>
----------- 2022-01-03 17:05:24 -----------
Error: <class 'ValueError'>
----------- 2022-01-03 17:05:26 -----------
+---------------+-------------+
|        hashtag|hashtag_count|
+---------------+-------------+
|       #Massage|            1|
|        #TikTok|            1|
|#الحالات_الحرجه|            1|
+---------------+-------------+

----------- 2022-01-03 17:05:28 -----------
+---------------+-------------+
|        hashtag|hashtag_count|
+---------------+-------------+
|       #Massage|            1|
|        #TikTok|            1|
|#الحالات_الحرجه|            1|
+---------------+-------------+

----------- 2022-01-03 17:05:30 -----------
Error: <class 'py4j.protocol.Py4JJavaError'>
----------- 2022-01-03 17:05:32 -----------
+--------------------+-------------+
|             hashtag|hashtag_count|
+--------------------+-------------+
|            #Massage|            1|
|             #TikTok|            1|


----------- 2022-01-03 17:06:08 -----------
+--------------+-------------+
|       hashtag|hashtag_count|
+--------------+-------------+
|      #traffic|            2|
|         #feed|            2|
|      #Massage|            1|
|       #TikTok|            1|
|      #TeamISP|            1|
|       #Brasil|            1|
|           #UK|            1|
|          #USA|            1|
|       #dorset|            1|
|#lovefordorset|            1|
+--------------+-------------+

----------- 2022-01-03 17:06:10 -----------
Error: <class 'py4j.protocol.Py4JJavaError'>
----------- 2022-01-03 17:06:12 -----------
+--------------+-------------+
|       hashtag|hashtag_count|
+--------------+-------------+
|      #traffic|            2|
|         #feed|            2|
|      #Massage|            1|
|       #TikTok|            1|
|      #TeamISP|            1|
|       #Brasil|            1|
|           #UK|            1|
|          #USA|            1|
|       #dorset|            1|
|#lovefordorset|

----------- 2022-01-03 17:06:50 -----------
Error: <class 'py4j.protocol.Py4JJavaError'>
----------- 2022-01-03 17:06:52 -----------
+--------------+-------------+
|       hashtag|hashtag_count|
+--------------+-------------+
|      #traffic|            2|
|         #feed|            2|
|      #Massage|            1|
|       #TikTok|            1|
|      #TeamISP|            1|
|       #Brasil|            1|
|           #UK|            1|
|          #USA|            1|
|       #dorset|            1|
|#lovefordorset|            1|
+--------------+-------------+

----------- 2022-01-03 17:06:54 -----------
+--------------+-------------+
|       hashtag|hashtag_count|
+--------------+-------------+
|      #traffic|            2|
|         #feed|            2|
|      #Massage|            1|
|       #TikTok|            1|
|      #TeamISP|            1|
|       #Brasil|            1|
|           #UK|            1|
|          #USA|            1|
|       #dorset|            1|
|#lovefordorset|

----------- 2022-01-03 17:07:30 -----------
Error: <class 'py4j.protocol.Py4JJavaError'>
----------- 2022-01-03 17:07:32 -----------
+----------------+-------------+
|         hashtag|hashtag_count|
+----------------+-------------+
|#TejasswiPrakash|            4|
|        #traffic|            2|
|  #MondayMorning|            2|
|    #NewYear2022|            2|
|           #feed|            2|
|         #pdx911|            2|
|          #odaat|            2|
|        #Massage|            1|
|         #TikTok|            1|
|        #TeamISP|            1|
+----------------+-------------+

----------- 2022-01-03 17:07:34 -----------
+----------------+-------------+
|         hashtag|hashtag_count|
+----------------+-------------+
|#TejasswiPrakash|            4|
|        #traffic|            2|
|  #MondayMorning|            2|
|    #NewYear2022|            2|
|           #feed|            2|
|         #pdx911|            2|
|          #odaat|            2|
|        #Massage|            

----------- 2022-01-03 17:08:10 -----------
Error: <class 'py4j.protocol.Py4JJavaError'>
----------- 2022-01-03 17:08:12 -----------
+-----------------+-------------+
|          hashtag|hashtag_count|
+-----------------+-------------+
|          #crypto|            5|
| #TejasswiPrakash|            4|
| #saitamawolfpack|            4|
|       #saitamask|            4|
|#cryptocurrencies|            4|
|     #holdsaitama|            4|
|  #thepeoplescoin|            4|
|         #saitama|            4|
|            #defi|            4|
|         #altcoin|            4|
+-----------------+-------------+

----------- 2022-01-03 17:08:14 -----------
+-----------------+-------------+
|          hashtag|hashtag_count|
+-----------------+-------------+
|          #crypto|            5|
| #TejasswiPrakash|            4|
| #saitamawolfpack|            4|
|       #saitamask|            4|
|#cryptocurrencies|            4|
|     #holdsaitama|            4|
|  #thepeoplescoin|            4|
|     