In [1]:
!pip install kafka-python
!pip install redis



In [2]:
import os
import requests
import json
import redis
import pyspark
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
from datetime import datetime

In [3]:
# Launch arguments for the local PySpark shell (master = local[2]).
os.environ['PYSPARK_SUBMIT_ARGS'] = '--master local[2] pyspark-shell'

# Notebook-wide switches read by the streaming cell further down:
#   debug           -> pprint() intermediate streams
#   saveToLocalDisk -> append each batch to the CSV outputs
debug = True
saveToLocalDisk = True

# Spark configuration: ship the Kafka 0.8 streaming assembly jar with the job.
conf = SparkConf()
conf.set("spark.jars",
         "/home/jovyan/work/spark-streaming-kafka-0-8-assembly_2.11-2.4.4.jar")

sc = SparkContext(conf=conf)
sc.setLogLevel("WARN")

# Streaming context with a 60-second batch interval.
ssc = StreamingContext(sc, batchDuration=60)

# SQL session on top of the same SparkContext (needed for rdd.toDF()).
spark = SparkSession(sc)

In [4]:
def SetVal(x):
    """Store `x` in Redis under the 'LatestNews' key.

    The value written is the text `{"tasks":[<str(x)>]}`; `x` is inserted
    verbatim via str(), with no escaping or JSON encoding applied here.
    A new client for host 'redis-store', port 6379 is created on every call.
    Returns None.
    """
    client = redis.StrictRedis(host='redis-store', port=6379)
    payload = "{\"tasks\":[" + str(x) + "]}"
    client.set('LatestNews', payload)
        
def SaveToNewsFile(rdd):
    """Append a non-empty batch RDD to the news CSV output.

    Empty RDDs are skipped. Otherwise the RDD is converted to a DataFrame
    (column names left to Spark) and appended in CSV format, capped at
    1000 records per written file.
    """
    if rdd.isEmpty():
        return

    frame = rdd.toDF()
    frame.write.save(
        "/home/jovyan/work/NewsData.csv",
        format="csv",
        mode="append",
        maxRecordsPerFile=1000,
    )
    
def SaveToClickFile(rdd):
    """Append a non-empty batch RDD to the click CSV output.

    Empty RDDs are skipped. Otherwise the RDD is converted to a DataFrame
    (schema left to Spark) and appended in CSV format, capped at
    1000 records per written file.
    """
    if rdd.isEmpty():
        return

    frame = rdd.toDF()
    frame.write.save(
        "/home/jovyan/work/ClickData.csv",
        format="csv",
        mode="append",
        maxRecordsPerFile=1000,
    )
    
def SaveToFile(rdd):
    """Append a non-empty batch RDD to the finance CSV output.

    Empty RDDs are skipped. Otherwise the RDD is converted to a DataFrame
    with columns Ticker, Timestamp, Price and appended in CSV format,
    capped at 1000 records per written file.
    """
    if rdd.isEmpty():
        return

    columns = [ "Ticker", "Timestamp", "Price" ]
    frame = rdd.toDF(columns)
    frame.write.save(
        "/home/jovyan/work/FinanceData.csv",
        format="csv",
        mode="append",
        maxRecordsPerFile=1000,
    )
        
def ConvertToTuple(a):
    """Return (a['Timestamp'], a['Title']) for a mapping, or None for a list.

    Only an exact `list` instance is rejected; anything else is indexed by
    the 'Timestamp' and 'Title' keys.
    """
    if type(a) is list:
        return None
    return (a['Timestamp'], a['Title'])

In [None]:
def _is_type(kind):
    """Build a filter predicate that accepts decoded messages of one 'Type'.

    The predicate is True only for dict messages whose 'Type' value equals
    `kind`. Messages that are not dicts, or that have no 'Type' key, are
    rejected instead of raising (the previous `('Type' in a) & (a['Type']==...)`
    form evaluated both sides and failed the batch on such messages).
    """
    return lambda msg: isinstance(msg, dict) and msg.get('Type') == kind


# Receiver-based Kafka stream. `topics` maps topic name -> number of
# partitions to consume for that topic.
kafkaStream = KafkaUtils.createStream(
    ssc=ssc,
    zkQuorum='zk-cs:2181',
    groupId='test-consumer-group',  # Kafka consumer group
    topics={'finance_ticker':1,'news_feed':2, 'click':3})

# Each Kafka record is a (key, value) pair; decode the value as JSON.
olines = kafkaStream.map(lambda x: x[1]).map(lambda a: json.loads(a))

# One filtered stream per message type, reused by every consumer below.
finance = olines.filter(_is_type("Finance"))
news = olines.filter(_is_type("News"))
clicks = olines.filter(_is_type("Click"))

if debug:
    olines.pprint()

if saveToLocalDisk:
    finance.map(lambda a: (a['Ticker'],a['Timestamp'],a['Price'])) \
           .foreachRDD(SaveToFile)

    news.map(lambda a: (a['Timestamp'],a['Title'])) \
        .foreachRDD(SaveToNewsFile)

    clicks.foreachRDD(SaveToClickFile)

# Per batch: all finance prices joined into one comma-separated string.
lines = finance.map(lambda a: str(a['Price'])) \
               .reduce(lambda v,agg: agg+", "+v)

if debug:
    lines.pprint()

# Per batch: all news messages re-serialised and joined into one string.
lines = news.map(lambda a: json.dumps(a)) \
            .reduce(lambda v,agg: agg+", "+v)

# Publish to Redis through an output operation. A side effect inside map()
# is lazy and only ran when the debug pprint() below happened to force it,
# so with debug disabled nothing was ever written.
lines.foreachRDD(lambda rdd: rdd.foreach(SetVal))

if debug:
    lines.pprint()

ssc.start()
print("Going to wait termination")
ssc.awaitTermination()

Going to wait termination
-------------------------------------------
Time: 2019-12-10 05:26:00
-------------------------------------------

-------------------------------------------
Time: 2019-12-10 05:26:00
-------------------------------------------

-------------------------------------------
Time: 2019-12-10 05:26:00
-------------------------------------------

-------------------------------------------
Time: 2019-12-10 05:27:00
-------------------------------------------

-------------------------------------------
Time: 2019-12-10 05:27:00
-------------------------------------------

-------------------------------------------
Time: 2019-12-10 05:27:00
-------------------------------------------

-------------------------------------------
Time: 2019-12-10 05:28:00
-------------------------------------------

-------------------------------------------
Time: 2019-12-10 05:28:00
-------------------------------------------

-------------------------------------------
Time: 2019



-------------------------------------------
Time: 2019-12-10 05:31:00
-------------------------------------------

-------------------------------------------
Time: 2019-12-10 05:31:00
-------------------------------------------

-------------------------------------------
Time: 2019-12-10 05:32:00
-------------------------------------------
{'Type': 'Click', 'Title': "China's November coal imports slump 19% month-on-month on port curbs", 'Timestamp': 1575955879017, 'Link': ''}
{'Type': 'Click', 'Title': "China's November coal imports slump 19% month-on-month on port curbs", 'Timestamp': 1575955879017, 'Link': ''}

-------------------------------------------
Time: 2019-12-10 05:32:00
-------------------------------------------

-------------------------------------------
Time: 2019-12-10 05:32:00
-------------------------------------------

-------------------------------------------
Time: 2019-12-10 05:33:00
-------------------------------------------
{'Type': 'Click', 'Title': 'China

-------------------------------------------
Time: 2019-12-10 05:52:00
-------------------------------------------

-------------------------------------------
Time: 2019-12-10 05:53:00
-------------------------------------------

-------------------------------------------
Time: 2019-12-10 05:53:00
-------------------------------------------

-------------------------------------------
Time: 2019-12-10 05:53:00
-------------------------------------------

-------------------------------------------
Time: 2019-12-10 05:54:00
-------------------------------------------

-------------------------------------------
Time: 2019-12-10 05:54:00
-------------------------------------------

-------------------------------------------
Time: 2019-12-10 05:54:00
-------------------------------------------

-------------------------------------------
Time: 2019-12-10 05:55:00
-------------------------------------------

-------------------------------------------
Time: 2019-12-10 05:55:00
----------

-------------------------------------------
Time: 2019-12-10 06:13:00
-------------------------------------------

-------------------------------------------
Time: 2019-12-10 06:13:00
-------------------------------------------

-------------------------------------------
Time: 2019-12-10 06:14:00
-------------------------------------------

-------------------------------------------
Time: 2019-12-10 06:14:00
-------------------------------------------

-------------------------------------------
Time: 2019-12-10 06:14:00
-------------------------------------------

-------------------------------------------
Time: 2019-12-10 06:15:00
-------------------------------------------

-------------------------------------------
Time: 2019-12-10 06:15:00
-------------------------------------------

-------------------------------------------
Time: 2019-12-10 06:15:00
-------------------------------------------

-------------------------------------------
Time: 2019-12-10 06:16:00
----------

In [None]:
# NOTE(review): this cell comes after the cell that calls ssc.start() and
# ssc.awaitTermination(). Spark Streaming does not support adding new
# transformations or output operations once the context has started —
# TODO confirm intended placement (it likely has to run before ssc.start()).
# NOTE(review): `lines` was last rebound in the previous cell to the stream
# derived from "News" messages, not the comma-separated "Finance" prices the
# request body below appears to expect — TODO confirm which stream is meant.

# POST each element, wrapped as {"instances": [<element>]}, to the
# half_plus_two predict URL and keep the response body text.
lines = lines.map(lambda a: requests.post(
        "http://serving:8501/v1/models/half_plus_two:predict",
        "{\"instances\": ["+a+"]}").text)
if(debug):
    lines.pprint()

# Parse each response as JSON and emit every entry of its "predictions"
# value as a separate stream element.
lines = lines.flatMap(lambda x: json.loads(x)["predictions"])
if(debug):
    lines.pprint()

# Hand each prediction to SetVal; the resulting stream carries SetVal's
# return values. NOTE(review): map() is lazy, so SetVal only runs when an
# output operation (here the debug pprint) consumes this stream.
lines = lines.map(lambda x: SetVal(x))
if(debug):
    lines.pprint()