In [1]:
import findspark
findspark.init()
import pyspark

from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.streaming import StreamingContext
# spark=SparkSession.builder.appName('streaming project').master('local').getOrCreate()

## 1. Create a SparkContext with two execution threads, and StreamingContext with batch interval of 1 second.

In [2]:
# Build the SparkContext through getOrCreate so this cell can be re-run safely:
# constructing SparkContext(...) a second time in the same kernel raises an error,
# whereas getOrCreate returns the context that is already running.
# "local[2]" gives two local execution threads.
spark_conf = pyspark.SparkConf().setMaster("local[2]").setAppName("StreamingProj")
sc = SparkContext.getOrCreate(spark_conf)

# Batch interval, in seconds, for the streaming micro-batches (1 second per the task statement).
ssc = StreamingContext(sc, 1)

## 2. Read and cache previous max price into an RDD. Each record of the RDD is a line of text with fields delimited by comma. We need to split the text line at the delimiter into individual strings of stock symbol and previous max price. Convert the previous max stock price to float type.

In [7]:
# CSV holding the previous max price per stock symbol, one "symbol,price" line per record.
prev_max_price_path = r'C:\Users\Miles\Documents\BigData\Spark\SPARK STREAMING MINI PROJECT\previous_max_price.csv'
prev = sc.textFile(prev_max_price_path)

# Split every text line at the commas into a list of string fields.
prevRDD = prev.map(lambda line: line.split(','))

# Preview the first few parsed records.
prevRDD.take(5)

[['JAH', '60.46'],
 ['JAS', '38.06'],
 ['JBI', '28.7'],
 ['JBI', '28.7'],
 ['JBJ', '26.8']]

In [8]:
# Build [symbol, previous_max_price] pairs with the price converted to float.
# The result is cached because the streaming join below reuses this RDD for every
# micro-batch; without caching, the CSV would be re-read and re-parsed each time.
prevPairedRDD = prevRDD.map(lambda fields: [fields[0], float(fields[1])]).cache()

# Preview the first few pairs.
prevPairedRDD.take(5)

[['JAH', 60.46], ['JAS', 38.06], ['JBI', 28.7], ['JBI', 28.7], ['JBJ', 26.8]]

## 3. Using spark streaming context, we can create a DStream that represents streaming data from a TCP source, specified as hostname (e.g. localhost) and port (e.g. 9999). This DStream represents the stream of data.

In [9]:
# TCP endpoint that supplies the live stock records as lines of text.
stream_host, stream_port = "localhost", 9999
curr_stock = ssc.socketTextStream(stream_host, stream_port)

## 4. Each record in this DStream is a line of text with fields delimited by commas. We will split the text line at the delimiter into individual strings, then convert each record into a key-value pair, with the key being the stock symbol and the value being a tuple of the timestamp string and the current price as a float.

In [11]:
def parse_stock_line(line):
    """Parse one comma-delimited line from the stream into a keyed stock record.

    Args:
        line: Raw text line. Its first three comma-separated fields are read as
            stock symbol, timestamp string and current price.

    Returns:
        A one-element list ``[[symbol, (timestamp, price)]]`` with ``price``
        converted to float, or an empty list when the line has fewer than three
        fields or its price field is not a valid float.
    """
    fields = line.split(",")
    if len(fields) < 3:
        # Blank or truncated line: nothing usable to emit.
        return []
    try:
        price = float(fields[2])
    except ValueError:
        # Non-numeric price: skip the record instead of failing the whole batch.
        return []
    return [[fields[0], (fields[1], price)]]


# flatMap (rather than map) lets malformed lines yield zero records, so a single
# bad line arriving on the socket does not raise inside the streaming job.
curr_stock_paired = curr_stock.flatMap(parse_stock_line)

## 5. We need to join data stream records with those of the previous max RDD on stock symbol field. We can use the join function, but since it is an RDD-to-RDD function we need to use transform method on the stream so that we can apply it.

In [13]:
def join_with_prev_max(batch_rdd):
    """Join one micro-batch RDD with the previous-max-price RDD on its key."""
    return batch_rdd.join(prevPairedRDD)


# join is an RDD-to-RDD operation, so it is applied to each batch through transform.
joined = curr_stock_paired.transform(join_with_prev_max)

## 6. Joined records will be key-value pairs of the form (stock_symbol, ((timestamp, current price), previous max price)). So record[0] will be the stock symbol, and record[1] will have two fields: record[1][0], which holds (timestamp, current price), and record[1][1], which holds the previous max price.

In [16]:
# pprint(joined[0][1][0])     #------------> getting 2nd element in record 
# joined_paired.take(3)
# joined_paired.count()

## 7. We need to filter for those records in which current price is greater than or equal to previous max price.

In [20]:
def reached_prev_max(record):
    """Return True when the record's current price is >= its previous max price."""
    stream_value = record[1][0]      # (timestamp, current price)
    prev_max_price = record[1][1]
    return stream_value[1] >= prev_max_price


# Keep only the joined records whose current price meets or exceeds the previous max.
filter_currPrice = joined.filter(reached_prev_max)

## 8. Then we will use pprint (pretty print) function to print record matching the filter condition.

In [21]:
# Print the first 10 matching records of every batch (10 is pprint's default).
filter_currPrice.pprint(num=10)

In [23]:
# Start the streaming computation defined on `ssc` in the cells above.
# NOTE(review): nothing in this notebook waits for termination or stops the context
# (the stop call in the next cell is commented out) — confirm this is intended.
ssc.start()

In [None]:
# ssc.stop()