In [1]:
!pip install kafka-python
!pip install redis

Collecting kafka-python
[?25l  Downloading https://files.pythonhosted.org/packages/49/c9/9863483a1353700ba87821b4f39085eb18fd1bcbb1e954c697177d67f03f/kafka_python-1.4.7-py2.py3-none-any.whl (266kB)
[K     |████████████████████████████████| 266kB 1.5MB/s eta 0:00:01
[?25hInstalling collected packages: kafka-python
Successfully installed kafka-python-1.4.7
Collecting redis
[?25l  Downloading https://files.pythonhosted.org/packages/32/ae/28613a62eea0d53d3db3147f8715f90da07667e99baeedf1010eb400f8c0/redis-3.3.11-py2.py3-none-any.whl (66kB)
[K     |████████████████████████████████| 71kB 1.3MB/s eta 0:00:011
[?25hInstalling collected packages: redis
Successfully installed redis-3.3.11


In [2]:
import os
import requests
import json
import redis
import pyspark
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
from datetime import datetime

In [3]:
# --- Switches read by the streaming cells ---------------------------------
debug = True             # when True, intermediate DStreams are pprint()-ed
saveToLocalDisk = True   # when True, Finance/News records are appended to CSV

# --- Spark bootstrap ------------------------------------------------------
# Set the submit arguments before the SparkContext is constructed below.
os.environ['PYSPARK_SUBMIT_ARGS'] = '--master local[2] pyspark-shell'

# Ship the Kafka 0.8 streaming assembly jar with the application.
conf = SparkConf().set(
    "spark.jars",
    "/home/jovyan/work/spark-streaming-kafka-0-8-assembly_2.11-2.4.4.jar",
)
sc = SparkContext(conf=conf)
sc.setLogLevel("WARN")
spark = SparkSession(sc)

# Streaming context with a 60-second batch interval.
ssc = StreamingContext(sc, batchDuration=60)

In [4]:
def SetVal(x):
    """Write ``x`` to Redis, using ``x`` itself as the key and ``int(x)`` as the value.

    A fresh client for ``redis-store:6379`` is created on every call.
    Returns ``None``.
    """
    client = redis.StrictRedis(host='redis-store', port=6379)
    client.set(x, int(x))
    
def SaveToFile(rdd):
    """Append an RDD of finance rows to the FinanceData CSV output.

    The RDD is converted to a DataFrame with columns Ticker, Timestamp and
    Price and written in append mode. Empty RDDs are skipped so no write is
    attempted for them.
    """
    if rdd.isEmpty():
        return
    frame = rdd.toDF(["Ticker", "Timestamp", "Price"])
    frame.write.save(
        "/home/jovyan/work/FinanceData.csv",
        maxRecordsPerFile=1000,
        format="csv",
        mode="append",
    )
        
def SaveToNewsFile(rdd):
    """Append an RDD of news rows to the NewsData CSV output.

    The RDD is converted to a DataFrame with columns Timestamp and Title and
    written in append mode. Empty RDDs are skipped so no write is attempted
    for them.
    """
    if rdd.isEmpty():
        return
    frame = rdd.toDF(["Timestamp", "Title"])
    frame.write.save(
        "/home/jovyan/work/NewsData.csv",
        maxRecordsPerFile=1000,
        format="csv",
        mode="append",
    )
        
def ConvertToTuple(a):
    """Return ``(a['Timestamp'], a['Title'])``, or ``None`` when ``a`` is a list.

    Only exact ``list`` instances are rejected; any other value is indexed
    by the 'Timestamp' and 'Title' keys.
    """
    if type(a) is list:
        return None
    return (a['Timestamp'], a['Title'])

In [None]:
def _has_type(expected):
    """Build a filter predicate that keeps dict records whose 'Type' equals ``expected``."""
    def _predicate(record):
        # Short-circuiting `and` + `.get()` instead of the bitwise `&`:
        # `&` always evaluates both operands, so `record['Type']` raised
        # KeyError for every record that has no 'Type' key.
        return isinstance(record, dict) and record.get('Type') == expected
    return _predicate


# Receiver-based Kafka stream (via ZooKeeper) over both topics.
kafkaStream = KafkaUtils.createStream(
    ssc=ssc,
    zkQuorum='zk-cs:2181',
    groupId='test-consumer-group',  # Kafka consumer group
    topics={'finance_ticker': 1, 'news_feed': 2})

# Each Kafka message is a (key, value) pair; decode the value as JSON.
olines = kafkaStream.map(lambda x: x[1]).map(lambda a: json.loads(a))
if debug:
    olines.pprint()

# Split the decoded records by their 'Type' field.
finance_records = olines.filter(_has_type("Finance"))
news_records = olines.filter(_has_type("News"))

if saveToLocalDisk:
    finance_records \
        .map(lambda a: (a['Ticker'], a['Timestamp'], a['Price'])) \
        .foreachRDD(SaveToFile)

    news_records \
        .map(lambda a: (a['Timestamp'], a['Title'])) \
        .foreachRDD(SaveToNewsFile)

# Collapse each batch's Finance prices into one comma-separated string.
lines = finance_records \
        .map(lambda a: str(a['Price'])) \
        .reduce(lambda v, agg: agg + ", " + v)
if debug:
    lines.pprint()

# NOTE: once the context is started no further DStream operations can be
# registered, and awaitTermination() blocks this cell until the stream stops.
# Every transformation/output operation must therefore be defined before
# this point.
ssc.start()
print("Going to wait termination")
ssc.awaitTermination()

Going to wait termination
-------------------------------------------
Time: 2019-11-30 04:25:00
-------------------------------------------

-------------------------------------------
Time: 2019-11-30 04:25:00
-------------------------------------------

-------------------------------------------
Time: 2019-11-30 04:25:00
-------------------------------------------

-------------------------------------------
Time: 2019-11-30 04:25:00
-------------------------------------------

-------------------------------------------
Time: 2019-11-30 04:25:00
-------------------------------------------

-------------------------------------------
Time: 2019-11-30 04:26:00
-------------------------------------------
{'Ticker': 'MSFT', 'Timestamp': 1575050401, 'Price': 151.38}
{'Ticker': 'AAPL', 'Timestamp': 1575050401, 'Price': 267.25}
{'Ticker': 'MSFT', 'Timestamp': 1575050401, 'Price': 151.38}
{'Ticker': 'AAPL', 'Timestamp': 1575050401, 'Price': 267.25}

----------------------------------------

In [None]:
# NOTE: these stages extend the `lines` DStream and must be registered
# before `ssc.start()` is called; Spark Streaming does not accept new
# operations on a context that has already been started.

# Send each batch's comma-separated prices to the model server and keep the
# raw response body.
responses = lines.map(lambda a: requests.post(
        "http://serving:8501/v1/models/half_plus_two:predict",
        "{\"instances\": ["+a+"]}").text)
if debug:
    responses.pprint()

# One element per entry in the response's "predictions" array.
predictions = responses.flatMap(lambda x: json.loads(x)["predictions"])
if debug:
    predictions.pprint()

# Persist every prediction to Redis. This is an output operation
# (foreachRDD) rather than a lazy map(): a map() whose only consumer is a
# debug pprint() is never run when `debug` is False, and pprint() only
# fetches the elements it displays, so not every value was guaranteed to be
# written.
predictions.foreachRDD(lambda rdd: rdd.foreach(SetVal))