In [1]:
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
# Stop a SparkContext left over from an earlier run of this notebook, if any.
# On a fresh kernel the name `sc` does not exist yet, so a bare `sc.stop()`
# would raise NameError and break "Restart Kernel -> Run All".
try:
    sc.stop()
except NameError:
    pass  # deliberate: nothing to stop on a fresh kernel

# Create a local SparkContext ("local[*]" master) and a StreamingContext with a
# batch interval of 10 seconds.
sc = SparkContext("local[*]", "SparkStreaming Exercise")
ssc = StreamingContext(sc, 10)
# Checkpoint directory for the streaming job; the stateful updateStateByKey
# stream defined later in this notebook relies on checkpointing being enabled.
ssc.checkpoint("/Users/pravinkumar/Documents/Spark/Results/checkPointForStreaming")

In [2]:
# Text DStream fed by a TCP source at hostname:port (here localhost:9999).
socket = ssc.socketTextStream(hostname="localhost", port=9999)

In [None]:
# Join the streamed records against a static file by broadcasting the file's
# contents as a dict and looking each streamed key up in it.
deckDstream = (
    socket
    .map(lambda line: line.split("|"))
    .map(lambda fields: (fields[0], fields[1]))
)
deckRDD = (
    sc.textFile("/Users/pravinkumar/Documents/Spark/testData/cards/deckofcards.txt")
    .map(lambda line: line.split('|'))
    .map(lambda fields: (fields[0], fields[2]))
)

# Preview the first ten (field 0, field 2) pairs read from the file.
for i in deckRDD.take(10):
    print(i)

# Broadcast the file's pairs as a dict. Because it is a dict, rows of the file
# that share the same first field collapse to a single entry.
deckRDDBC = sc.broadcast(deckRDD.collectAsMap())

# NOTE: joining the RDD inside a transform() call raised an error that is still
# to be figured out, so the lookup goes through the broadcast dict instead.
# Keys missing from the dict yield -1.
deckJoin = deckDstream.map(
    lambda pair: (pair[0], [(deckRDDBC.value.get(pair[0], -1), pair[1])])
)

# Print the leading elements of each batch of the joined stream to the console.
deckJoin.pprint()

In [4]:
# Per-batch count of each color (first field of a record) using reduceByKey.
deckDstream = (
    socket
    .map(lambda line: line.split("|"))
    .map(lambda fields: (fields[0], fields[1]))
)
# Pair every record's first field with a 1 so the ones can be summed per key.
deckDstrmColor = deckDstream.map(lambda pair: (pair[0], 1))
wordCounts = deckDstrmColor.reduceByKey(lambda left, right: left + right)

# Print the first ten elements of each RDD generated in this DStream to the console
wordCounts.pprint()

In [5]:
# Count each color in a deck in each batch using countByValue.
# When called on a DStream of elements of type K, countByValue returns a new
# DStream of (K, Long) pairs where the value of each key is its frequency in
# each RDD of the source DStream.
#
# The elements counted must therefore be the bare colors: counting the
# (color, 1) pairs of deckDstrmColor directly would produce ((color, 1), n)
# instead of (color, n), so the helper 1 is stripped off first.
wordCounts = deckDstrmColor.map(lambda pair: pair[0]).countByValue()

# Print the first ten elements of each RDD generated in this DStream to the console
wordCounts.pprint()

In [6]:
# Alternative word count per batch, done with flatMapValues inside an RDD-level
# transform() -- just a different way of getting the same kind of tally.
#   1. tag every record with the constant key 1
#   2. flatMapValues unpacks the record so each field becomes (1, field)
#   3. swap to (field, 1)
#   4. sum the ones per field
wordCounts = (
    deckDstream
    .map(lambda pair: (1, pair))
    .transform(lambda batch: batch.flatMapValues(lambda pair: pair))
    .map(lambda tagged: (tagged[1], tagged[0]))
    .reduceByKey(lambda total, increment: total + increment)
)

# Print the first ten elements of each RDD generated in this DStream to the console
wordCounts.pprint()

In [7]:
# updateStateByKey is used here as a running store: the count of every word seen so far
# is kept across batches, incremented as new words arrive, and displayed with each batch.


# Seed state for the running word count: (word, count) pairs that exist before
# any data has arrived on the stream.
initialStateRDD = sc.parallelize([
    (u'hello', 1),
    (u'world', 1),
])

def updateFunc(newValues, runningCount):
    """Fold the values of one batch into the running total for a key.

    Args:
        newValues: iterable of the values that arrived for the key.
        runningCount: total accumulated so far, or None when there is none yet.

    Returns:
        The running total (0 when it was None) plus every value in newValues.
    """
    starting_total = 0 if runningCount is None else runningCount
    return sum(newValues, starting_total)

# Split every incoming line on "|" into words, pair each word with a 1, and let
# updateStateByKey keep a running total per word, seeded from initialStateRDD.
wordCounts = (
    socket
    .flatMap(lambda line: line.split("|"))
    .map(lambda word: (word, 1))
    .updateStateByKey(updateFunc, initialRDD=initialStateRDD)
)

# Print the first ten elements of each RDD generated in this DStream to the console
wordCounts.pprint()

In [None]:
# Launch the streaming job and keep this cell busy while it runs. The order
# matters: the job must be started before it can be waited on.
# NOTE(review): this cell does not return on its own -- it has to be interrupted
# (or the context stopped) before any later cell can run.
ssc.start()             # Start the computation
ssc.awaitTermination()  # Wait for the computation to terminate

-------------------------------------------
Time: 2017-07-09 21:11:50
-------------------------------------------

-------------------------------------------
Time: 2017-07-09 21:11:50
-------------------------------------------

-------------------------------------------
Time: 2017-07-09 21:11:50
-------------------------------------------

-------------------------------------------
Time: 2017-07-09 21:11:50
-------------------------------------------

-------------------------------------------
Time: 2017-07-09 21:11:50
-------------------------------------------
('hello', 1)
('world', 1)

-------------------------------------------
Time: 2017-07-09 21:12:00
-------------------------------------------
('BLACK', [('A', 'SPADE')])
('BLACK', [('A', 'SPADE')])
('BLACK', [('A', 'SPADE')])
('BLACK', [('A', 'SPADE')])
('BLACK', [('A', 'SPADE')])
('BLACK', [('A', 'SPADE')])
('BLACK', [('A', 'SPADE')])
('BLACK', [('A', 'SPADE')])
('BLACK', [('A', 'SPADE')])
('BLACK', [('A', 'SPADE')])
...



In [19]:
# Shut the streaming job down once the run is finished.
# NOTE(review): with default arguments this presumably also stops the underlying
# SparkContext `sc` -- confirm against the PySpark StreamingContext.stop docs.
ssc.stop()