In [1]:
import json

from pyspark.streaming.kafka import KafkaUtils
from pyspark.streaming import StreamingContext
from pyspark.sql import Row, SQLContext
import logging

In [2]:
def load_msg(msg):
    """Decode one Kafka record and return its text payload.

    Args:
        msg: Indexable record whose element 1 is a JSON document holding a
            ``words`` entry (presumably the ``(key, value)`` pair produced
            by the direct stream -- confirm against the producer).

    Returns:
        The value stored under ``words`` in the decoded JSON object.

    Raises:
        ValueError: If element 1 is not valid JSON.
        KeyError: If the decoded object has no ``words`` entry.
    """
    message = json.loads(msg[1])
    # Per-record trace goes through logging rather than a Python 2-only
    # ``print`` statement: it parses on Python 3 as well and stays silent
    # unless DEBUG logging is enabled.
    logging.debug("%r -> %r", msg, message)
    return message['words']

In [3]:
def getSqlContextInstance(sparkContext):
    """Return the shared SQLContext, building it on first use.

    The instance is cached in this module's globals under the key
    ``sqlContextSingletonInstance``; later calls return that cached object
    and ignore ``sparkContext``.

    Args:
        sparkContext: Context passed to ``SQLContext`` when the cached
            instance does not exist yet.

    Returns:
        The cached ``SQLContext`` instance.
    """
    namespace = globals()
    try:
        return namespace['sqlContextSingletonInstance']
    except KeyError:
        # First call: create the context and remember it for later calls.
        instance = SQLContext(sparkContext)
        namespace['sqlContextSingletonInstance'] = instance
        return instance

In [4]:
def process(time, rdd):
    """Count the words in one micro-batch and display the totals.

    Splits every element of ``rdd`` on whitespace, registers the words as
    the temporary table ``words``, runs a group-by count over it, then
    shows the result and logs the resulting DataFrame object.

    Args:
        time: Batch time passed in by ``foreachRDD``; not used here.
        rdd: RDD of text strings for this batch.

    Returns:
        None. Any exception raised while handling the batch is logged with
        its traceback and swallowed, so one bad batch does not propagate
        out of this callback.
    """
    try:
        # An idle interval produces an empty RDD, and createDataFrame cannot
        # infer a schema from zero rows. Skip such batches instead of
        # logging a traceback for every one of them.
        if rdd.isEmpty():
            return
        sqlContext = getSqlContextInstance(rdd.context)
        rowRdd = rdd.flatMap(lambda words: words.split()).map(lambda w: Row(word=w))
        wordsDataFrame = sqlContext.createDataFrame(rowRdd)
        wordsDataFrame.registerTempTable("words")
        wordCountsDataFrame = sqlContext.sql("select word, count(*) as total from words group by word")
        wordCountsDataFrame.show()
        logging.info(wordCountsDataFrame)
    except Exception as e:
        logging.exception(e)

In [5]:
# `sc` is not defined in this notebook; presumably the SparkContext supplied
# by the PySpark kernel -- confirm. Batch interval is 1 second.
ssc = StreamingContext(sparkContext=sc, batchDuration=1)
ssc.checkpoint('checkpoint')
# Direct stream over the 'lines' topic, using the broker at kafka:9092.
kvs = KafkaUtils.createDirectStream(
    ssc=ssc,
    topics=['lines'],
    kafkaParams={"metadata.broker.list": "kafka:9092"},
)

In [6]:
# Turn each Kafka record into its `words` text via load_msg, then pass every
# micro-batch RDD to process for counting and display.
kvs.map(load_msg).foreachRDD(process)

In [7]:
ssc.start()
try:
    # Blocks this cell until the stream terminates or the kernel is interrupted.
    ssc.awaitTermination()
finally:
    # Interrupting the kernel only unblocks awaitTermination(); the streaming
    # job would otherwise keep running and printing batches. Stop the stream
    # but keep the SparkContext alive so the notebook stays usable. A stopped
    # StreamingContext cannot be restarted: re-run the setup cells to stream
    # again.
    ssc.stop(stopSparkContext=False, stopGraceFully=False)

+-----------+-----+
|       word|total|
+-----------+-----+
|   Bennet's|    2|
|     heart,|    1|
|        she|   14|
| gentleman;|    1|
|    readily|    1|
|     town."|    1|
|  answered.|    1|
|       back|    1|
|       day,|    1|
|      arise|    1|
|    chiefly|    1|
|  requiring|    1|
|        say|    3|
|   resolute|    1|
|       when|    4|
|alterations|    1|
|  unhappily|    1|
|      would|    4|
|       _he_|    1|
|     making|    1|
+-----------+-----+
only showing top 20 rows

+-----------+-----+
|       word|total|
+-----------+-----+
|   Bennet's|    1|
|        she|    5|
|   feelings|    1|
|      speak|    1|
|retirement,|    1|
|    library|    2|
|       when|    1|
|   humility|    1|
|      would|    1|
|collection,|    1|
|   say--she|    1|
|      house|    2|
|  Longbourn|    1|
|       her;|    1|
|discomposed|    1|
|        The|    2|
|      found|    2|
|   morning,|    1|
|      walk;|    1|
|        had|    9|
+-----------+-----+
only showing t

KeyboardInterrupt: 

+---------+-----+
|     word|total|
+---------+-----+
|      she|    8|
|  staying|    1|
|  chiefly|    1|
|  mention|    1|
|     port|    1|
|  finally|    1|
|   prizes|    1|
|     when|    1|
|   anyone|    1|
|   making|    1|
|  life--"|    1|
|attention|    1|
|told--the|    1|
|   added,|    1|
|  person,|    1|
|   stuffy|    1|
|     wait|    1|
|      The|    2|
|    room,|    1|
|      had|    5|
+---------+-----+
only showing top 20 rows



In [None]:
# NOTE(review): unexecuted scratch cell. `sql` is not defined anywhere in this
# notebook (it may be a helper injected by the PySpark shell -- confirm);
# looks like leftover exploration that can be removed.
sql