In [1]:
import os
import json
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils

In [2]:
# streaming setup
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages org.apache.spark:spark-streaming-kafka-0-8_2.11:2.4.5 pyspark-shell'
brokers = "kafka-1:19091,kafka-2:29091,kafka-3:39091"
kafkaParams = {"metadata.broker.list": brokers}
topics = ["CommonStocks"]
sc = SparkContext("local[2]","CommonStocks")
ssc = StreamingContext(sc, 300)
ssc.checkpoint('/tmp')

In [3]:
# read & process
kafka_raw_stream = KafkaUtils.createDirectStream(ssc, topics, kafkaParams)
kafka_maped = kafka_raw_stream.map(lambda x: (json.loads(x[1])["SecurityDesc"], 
                                              json.loads(x[1])["TradedVolume"]) ) 

In [4]:
def updateFunction(newValues, runningCount):
    if runningCount is None:
        runningCount = 0
    return sum(newValues, runningCount)
kafka_stream_stateful = kafka_maped.updateStateByKey(updateFunction)

In [5]:
kafka_stream_stateful.repartition(1).saveAsTextFiles("work/outputs/CommonStocks/cumulated-traded-volume")
kafka_stream_stateful.pprint()

In [6]:
ssc.start()             # Start the computation
ssc.awaitTermination()  # Wait for the computation to terminate

-------------------------------------------
Time: 2020-06-05 15:35:00
-------------------------------------------
('MERCK KGAA O.N.', 554)
('AROUNDTOWN EO-,01', 6981)
('LINDE PLC        EO 0,001', 1069)
('AIRBUS', 1176)
('BASF SE NA O.N.', 4470)
('RWE AG   INH O.N.', 27721)
('MPC MUENCH.PET.CAP.', 1043)
('SYMRISE AG INH. O.N.', 1402)
('XPHYTO THERAPEUTICS', 1500)
('QSC AG NA O.N.', 5000)
...

-------------------------------------------
Time: 2020-06-05 15:40:00
-------------------------------------------
('MERCK KGAA O.N.', 3324)
('AROUNDTOWN EO-,01', 41886)
('LINDE PLC        EO 0,001', 6414)
('AIRBUS', 7056)
('BASF SE NA O.N.', 26820)
('RWE AG   INH O.N.', 166326)
('MPC MUENCH.PET.CAP.', 6258)
('SYMRISE AG INH. O.N.', 8412)
('XPHYTO THERAPEUTICS', 9000)
('QSC AG NA O.N.', 30000)
...

-------------------------------------------
Time: 2020-06-05 15:45:00
-------------------------------------------
('MERCK KGAA O.N.', 6094)
('AROUNDTOWN EO-,01', 76791)
('LINDE PLC        EO 0,001', 1175

KeyboardInterrupt: 

In [None]:
ssc.stop(True,True)

-------------------------------------------
Time: 2020-06-05 16:15:00
-------------------------------------------
('MERCK KGAA O.N.', 21606)
('AROUNDTOWN EO-,01', 272259)
('LINDE PLC        EO 0,001', 41691)
('AIRBUS', 45864)
('BASF SE NA O.N.', 174330)
('RWE AG   INH O.N.', 1081119)
('MPC MUENCH.PET.CAP.', 40677)
('SYMRISE AG INH. O.N.', 54678)
('XPHYTO THERAPEUTICS', 58500)
('QSC AG NA O.N.', 195000)
...

