In [1]:
from pyspark import SparkContext,SparkConf
from pyspark.sql import SparkSession 
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
from pyspark.sql import Row,SQLContext
import requests
import sys
import time
import json

In [2]:
# Build (or reuse) the Spark session for the "Realtime" application on the
# standalone master, attaching the Kafka 0-8 streaming assembly jar.
ss = (
    SparkSession.Builder()
    .appName("Realtime")
    .master("spark://spark-master-1:7077")
    .config(
        "spark.jars",
        "./spark-streaming-kafka-0-8-assembly_2.11-2.0.0-preview.jar",
    )
    .getOrCreate()
)
sc = ss.sparkContext

# Streaming context with a batch duration of 5 (seconds); state is
# checkpointed under "checkpoint_TwitterApp".
ssc = StreamingContext(sc, 5)
ssc.checkpoint("checkpoint_TwitterApp")

# Keep driver log output down to warnings and above.
sc.setLogLevel('WARN')

In [3]:
# Name of the Kafka topic to read from.
topic = 'tweet'
# Comma-separated host:port list of Kafka brokers.
brokers = 'kafka-1:9092,kafka-2:9092'

In [4]:
# Direct Kafka stream over the single configured topic; the broker list is
# handed to the consumer through "metadata.broker.list".
ks = KafkaUtils.createDirectStream(
    ssc,
    [topic],
    kafkaParams={"metadata.broker.list": brokers},
)

In [5]:
def process_rdd(time, rdd, max_rows=11):
    """Forward the first rows of a batch RDD to the dashboard.

    Takes up to ``max_rows`` elements from ``rdd``, splits each element into
    its first item (the tag) and second item (the count), and passes the two
    resulting lists to ``send_df_to_dashboard``.

    Args:
        time: Batch time supplied by the caller; not used here.
        rdd: Object exposing ``take(n)`` that yields indexable 2-item rows.
        max_rows: Maximum number of rows to take. The default of 11 matches
            the previously hard-coded ``10 + 1``.

    Returns:
        None. Failures are reported on stdout instead of being raised, so a
        single bad batch does not propagate out of this callback.
    """
    try:
        rows = rdd.take(max_rows)
        top_tags = [row[0] for row in rows]
        tags_count = [row[1] for row in rows]
        send_df_to_dashboard(top_tags, tags_count)
    except Exception as exc:
        # Catch Exception (not a bare except) so KeyboardInterrupt/SystemExit
        # still propagate, and print the message as well as the type so the
        # failure can actually be diagnosed.
        print("Error: %s: %s" % (type(exc).__name__, exc))

In [6]:
def send_df_to_dashboard(top_tags, tags_count,
                         url='http://172.19.0.5:5001/updateData', timeout=5):
    """POST the tag labels and their counts to the dashboard endpoint.

    Both lists are sent as form fields after conversion with ``str()``:
    ``label`` carries ``top_tags`` and ``data`` carries ``tags_count``.

    Args:
        top_tags: List of tag labels.
        tags_count: List of counts, in the same order as ``top_tags``.
        url: Endpoint to post to; defaults to the previously hard-coded URL.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on an unresponsive
            server.

    Raises:
        requests.exceptions.RequestException: on connection failure, timeout,
            or a 4xx/5xx response.
    """
    request_data = {'label': str(top_tags), 'data': str(tags_count)}
    response = requests.post(url, data=request_data, timeout=timeout)
    # Surface HTTP error statuses instead of silently dropping the update.
    response.raise_for_status()

In [7]:
def aggregate_tags_count(new_values, total_sum):
    """Combine newly observed counts with the running total.

    Args:
        new_values: Iterable of numeric counts to add.
        total_sum: Previously accumulated total; any falsy value (such as
            ``None``) is treated as 0.

    Returns:
        The sum of ``new_values`` plus the previous total.
    """
    batch_total = sum(new_values)
    previous_total = total_sum if total_sum else 0
    return batch_total + previous_total

In [8]:
def filter(x):
    """Return True when ``x`` starts with '#' and has more than one element.

    A lone ``'#'`` is rejected, and an empty value returns False instead of
    raising ``IndexError``.

    NOTE: this name shadows the builtin ``filter``; it is kept unchanged so
    existing callers that refer to it by name continue to work.

    Args:
        x: A string (or other sized, indexable sequence).

    Returns:
        bool: True for hashtag-like tokens such as ``'#tag'``, else False.
    """
    # Check the length first so empty input never reaches x[0].
    return len(x) > 1 and x[0] == '#'

In [None]:
# Each Kafka record is indexed as (key, value); the value is parsed as JSON
# and its 'text' field extracted. Records with a missing or null 'text'
# yield an empty string (and therefore no tokens) instead of failing the task.
lines = ks.map(lambda record: json.loads(record[1]).get('text') or '')

# Split text on whitespace, keep hashtag tokens, maintain a running count per
# tag across batches, and order each batch's state by count, highest first.
hashtag = (
    lines.flatMap(lambda line: line.split())
    .filter(filter)
    .map(lambda tag: (tag, 1))
    .updateStateByKey(aggregate_tags_count)
    .transform(lambda rdd: rdd.sortBy(lambda pair: pair[1], ascending=False))
)

# Push the leading rows of every batch to the dashboard and echo them here.
hashtag.foreachRDD(process_rdd)
hashtag.pprint()

# Start the job and block until it terminates; awaitTermination also
# re-raises a streaming failure in the driver as soon as it happens.
ssc.start()
ssc.awaitTermination()

-------------------------------------------
Time: 2022-01-04 08:37:45
-------------------------------------------
('#Tuesday', 1)
('#GodMorningTuesday', 1)
('#fatbikes', 1)
('#fatbike', 1)
('#itswhatsfordinner', 1)
('#patientcare', 1)
('#Ocala,', 1)
('#Barbarena', 1)
('#TikTok', 1)
('#golobos', 1)
...

