In [2]:
# Required libraries
import sys
import datetime
import time 

from pyspark import SparkContext
from pyspark.conf import SparkConf
from pyspark.streaming import StreamingContext

from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql import functions as sFuncs
from pyspark.sql.window import Window

In [3]:
# Show the SparkContext that is already bound to `sc` (presumably injected by
# the PySpark kernel -- nothing in this notebook creates it before this point).
sc

## PA5 Question 1

Moving Averages

Calculating moving averages of stock prices is part of many trading strategies ([reference](https://www.investopedia.com/articles/active-trading/052014/how-use-moving-average-buy-stocks.asp)).

We will be using the two moving averages strategy, with the shorter-term MA being 10-day and the longer average being 40-day. When the shorter-term MA crosses above the longer-term MA, it's a buy signal, as it indicates that the trend is shifting up. This is known as a "golden cross."

Meanwhile, when the shorter-term MA crosses below the longer-term MA, it's a sell signal, as it indicates that the trend is shifting down. This is known as a "dead/death cross."

To simulate a data stream, you are given a python program `stream-feeder.py` which reads in `dj30.csv` file and pipes it, line by line. `dj30.csv` contains a 25-year history of the Dow Jones Industrial Average prices. We will only be concerned with the Close price. The command `stream-feeder.py | nc -lk 9999` can be run on the master machine of your spark cluster to feed the Close data into pyspark.

1. Set up the stream to feed data into a pyspark DStream. Write and submit a summary of the steps you took (in English) and enclose the (cleaned up after editing) output of `history > /tmp/my_session.txt`. This history should include what you typed into the shell outside of the pyspark session. \[2 pts\]
2. Use DStream windowing to separately accumulate the sum and count of prices, thus creating moving average DStreams. Write and submit the (cleaned up after editing) transcript of your session along with your code. \[4 pts\]
3. \[Optional, 4 bonus points\]. Compare the two moving averages to indicate buy and sell signals. Your output should be of the form `[( <date> buy), ( <date> sell), etc]`

#### Load the data
Note that the cell below only needs to be run once!

In [None]:
# One-time setup: install the helper packages and copy the assignment files
# (feeder scripts and datasets) from the course bucket into the working directory.
# NOTE: this cell only needs to run once. To use it, uncomment every line below;
# `%%bash` is a cell magic, so it must then be the very first line of the cell.
# NOTE(review): the `cd` target is an absolute, user-specific path -- adjust it
# to your own working directory before running.
# %%bash
# cd /home/saberbf/BigData/PA5
# sudo apt-get install python3-pip
# pip3 install pandas
# pip3 install feedparser
# gsutil cp gs://datathinks-home/stream-feeder.py .
# gsutil cp gs://datathinks-home/dj30.csv .
# gsutil cp gs://datathinks-home/headline-extractor.py .
# gsutil cp gs://datathinks-home/feed-parser.py .
# gsutil cp gs://datathinks-home/2020-headlines.csv .

#### Create Spark Streaming Context

##### Notes:
- If you are using a single-node cluster with k threads, it is essential to call setMaster('local[k]') (or to request fewer than k threads). Otherwise, SparkContext sets sc.master to 'yarn', which does not treat threads as workers and instead looks for separate worker nodes to do the job. As a result, a call such as collect() never completes.
- master is a Spark, Mesos or YARN cluster URL, or a special “local[*]” string to run in local mode. In practice, when running on a cluster, you will not want to hardcode master in the program, but rather launch the application with spark-submit and receive it there. However, for local testing and unit tests, you can pass “local[*]” to run Spark Streaming in-process (detects the number of cores in the local system).

In [5]:
# sc._conf.getAll()
# Shut down the SparkContext currently bound to `sc` so that a fresh one can be
# created with an explicit local master.
sc.stop()

# Local mode: 'local[*]' runs with as many worker threads as there are cores.
conf = SparkConf()
conf.setMaster('local[*]')
sc = SparkContext(appName='NetworkWordCount', conf=conf)

# Smoke test: round-trip a tiny pair RDD to confirm the new context runs jobs.
sample_pairs = [('a',7),('a',2),('b',2)]
rdd = sc.parallelize(sample_pairs)
rdd.collect()

[('a', 7), ('a', 2), ('b', 2)]

In [None]:
# Load the Dow Jones history into a DataFrame for offline inspection.
# The master string matches the 'local[*]' SparkContext created earlier in this
# notebook; getOrCreate() reuses an active SparkContext, so the master given
# here only matters when no context is running.
spark = SparkSession.builder.master('local[*]').getOrCreate()

# Read the CSV with a header row and inferred column types, then keep only the
# columns of interest.
dj30DF = (
    spark.read
    .load('gs://datathinks-home/dj30.csv',
          format='csv', inferSchema=True, header=True, delimiter=',')
    .select('Close',
            'Date',
            'Long Date',
            'Total Stks above 30MA',
            'Total Stks below 30MA',
            'Total Stks equal 30MA')
)

# Batch equivalent of a 10-row moving average (sketch, intentionally disabled).
# NOTE(review): assumes 'Date' sorts chronologically -- confirm before enabling.
# ma10Window = Window.orderBy('Date').rowsBetween(-9, 0)
# dj30DF.withColumn('movingAverage', sFuncs.avg('Close').over(ma10Window)).show(10)
dj30DF.show(10)

In [None]:
# Streaming context that cuts the incoming stream into 10-second batches
ssc = StreamingContext(sparkContext=sc, batchDuration=10)

# DStream of text lines read from the TCP socket at localhost:9999
lines = ssc.socketTextStream(hostname="localhost", port=9999)


def split_on_spaces(line):
    """Return the space-separated tokens of one input line."""
    return line.split(" ")


# Flatten every line into its individual tokens
words = lines.flatMap(split_on_spaces)

# For each batch (one RDD of the DStream), print how many tokens it contained
token_counts = words.count()
token_counts.pprint()

ssc.start()                 # Start the computation
ssc.awaitTermination(150)   # Block for at most 150 seconds


## it returns 380 and 382 for two RDDs in this cluster

In [None]:
# Stop only the streaming computation. The default (stopSparkContext=True)
# would also shut down `sc`, which the next cell reuses to build a new
# StreamingContext.
ssc.stop(stopSparkContext=False)

In [None]:
# Streaming context with a batch interval of 2.5 seconds.
# NOTE(review): intended to hold roughly 10 samples per RDD; the actual number
# depends on the feeder's rate -- confirm against the feeder.
ssc = StreamingContext(sc, 2.5)

# Create a DStream that will connect to hostname:port, like localhost:9999
lines = ssc.socketTextStream("localhost", 9999)

# Parse every whitespace-separated token as a price and pair it with a count
# of 1. Converting up front means each element is a (float, int) tuple even
# when a window holds a single sample, and split() without an argument skips
# the empty tokens produced by blank or double-spaced lines.
words = lines.flatMap(lambda line: line.split()).map(lambda word: (float(word), 1))

# Accumulate (sum of prices, number of prices) over the last 10 seconds of
# data, recomputed every 2.5 seconds. No inverse reduce function is supplied;
# supplying one would require checkpointing to be enabled on the context.
priceSum = words.reduceByWindow(lambda x, y: (x[0] + y[0], x[1] + y[1]),
                                None,
                                windowDuration=10,
                                slideDuration=2.5)

# Moving average of each window: sum divided by count. Empty windows emit
# nothing, so the count is never zero here.
movingAvg = priceSum.map(lambda sumCount: sumCount[0] / sumCount[1])

# Print the first ten elements of each RDD generated in these DStreams
priceSum.pprint()
movingAvg.pprint()
ssc.start()             # Start the computation
ssc.awaitTermination()  # Blocks until the stream is stopped or the cell is interrupted

In [None]:
# Stop the streaming computation. Spelled out explicitly: this also shuts down
# the underlying SparkContext, which is what a bare stop() does by default.
ssc.stop(stopSparkContext=True)

## PA5 Question 2

Notable News

Most news outlets distribute their news through rss feeds for use by news reader programs. We are writing a news reader that reads the news headlines and only reports those headlines that contain an unfamiliar word. (It's not going to be all that useful but hey...)

The file `2020-headlines.csv` contains headlines from 2020 to be mined for familiar words, and `headline-extractor.py` extracts the words from such headlines. The program is only half-written. Add to it as follows:

1. Create a Bloom Filter string whose size is approximately 8 times the number of understood words and write the buffer into a text file `bloom.txt` in your shell.
2. The file `bloom.txt` will be used in pyspark. You may want to store it in hdfs so it is accessible from pyspark.
To simulate a data stream, you are given a python program `feed-parser.py` which reads rss feeds from several news outlets. It is rate controlled, feeding us 4 titles per second. The command `feed-parser.py | nc -lk 9999` can be run on the master machine of your spark cluster to feed the titles data into pyspark.

1. Set up the stream to feed data into a pyspark DStream. Write and submit a summary of the steps you took (in English) and enclose the (cleaned up after editing) output of `history > /tmp/my_session.txt`. This history should include what you typed into the shell outside of the pyspark session. \[no points for this\]
2. Use DStream windowing to filter incoming headlines. Use a Bloom filter based on `bloom.txt` to emit only the headlines with unfamiliar words in them. Write and submit the (cleaned up after editing) transcript of your session along with your code. \[4 pts\]