In [1]:
# First stoping the SparkContext before initialising the new one
# Note: At a time, only one SparkContext can be active
sc.stop()

In [4]:
from pyspark import SparkConf
from pyspark import SparkContext

conf = SparkConf()
conf.setMaster('local[*]')
conf.setAppName("TopN traded stocks by volume each month from NYSE data within a given year")
conf.set("spark.ui.port", "12345")
conf.set("spark.port.maxRetries", "30")
conf.set("spark.driver.cores", "1")
conf.set("spark.driver.memory", "1g")
conf.set("spark.executor.memory","1g")
# We can also set all this in single line like below code
conf.setAll([("spark.ui.port", "12345"), ("spark.port.maxRetries", "30"), ("spark.driver.cores", "1"), ("spark.driver.memory", "1g"), ("spark.executor.memory","1g")])

sc = SparkContext(conf=conf)

In [4]:
nyseData = sc.textFile("/Users/pravinkumar/Documents/Spark/testData/nyse/nyse_data").map(lambda rec: rec.split(","))
for i in nyseData.take(5): print(i)

['A', '01-Jan-2009', '15.63', '15.63', '15.63', '15.63', '0']
['AA', '01-Jan-2009', '11.26', '11.26', '11.26', '11.26', '0']
['AAP', '01-Jan-2009', '33.65', '33.65', '33.65', '33.65', '0']
['AAV', '01-Jan-2009', '4.21', '4.21', '4.21', '4.21', '0']
['AB', '01-Jan-2009', '20.79', '20.79', '20.79', '20.79', '0']


In [None]:
# Get top N stocks by volume for each day or month
# Use the scala interpreter and preview the data after each step using Spark APIs
# Develop the program using sbt and eclipse or intellij
# Develop topNStocks function - function should take 2 parameters
# First Parameter - Tuple of date or month and then list of stocks for that date or month
# Second Parameter - topN
# Function should sort data in descending order and return top N stocks
# If there are more than N stocks in topN print all stocks (dense rank)
# Compile the jar, ship it and run it on the lab
# Output format - Date or month, Stock Name, Volume (all 3 fields should be delimited by \t)
# Determine number of executors used to run
# Determine number of executor tasks used to run

In [17]:
nyseStocks = sc.textFile("/Users/pravinkumar/Documents/Spark/testData/nyse/nyse_data").map(lambda rec: rec.split(","))

def trimMonth(rec):
    return rec[3:]


def topNStocks(rec, n):
    tops = set()
    topN = set()
    topStocks = []
    for i in rec[1]:
        tops.add(i[0])
    for j in range(0, n):
        maxVolume = max(tops)
        topN.add(maxVolume)
        tops.discard(maxVolume)
    for k in rec[1]:
        if k[0] in topN:
            topStocks.append(k)
    return (rec[0], topStocks)
    
nyseStocksMap = nyseStocks.map(lambda rec: ((trimMonth(rec[1]), rec[0]), float(rec[6]))).reduceByKey(lambda acc, value: acc + value).\
map(lambda rec: (rec[0][0], (rec[1], rec[0][1]))).groupByKey().map(lambda rec: topNStocks(rec, 3))

for i in nyseStocksMap.take(10): print(i)

('Sep-2010', [(2967740200.0, 'BAC'), (1108840100.0, 'F'), (1210677900.0, 'GE')])
('Feb-2013', [(765989400.0, 'GE'), (3261781100.0, 'BAC'), (824464900.0, 'NOK')])
('Jul-2009', [(2107602600.0, 'GE'), (6736111500.0, 'BAC'), (2779726400.0, 'CIT')])
('Nov-2010', [(2621380800.0, 'F'), (3928996400.0, 'BAC'), (1109510800.0, 'GE')])
('Jan-2012', [(1264849600.0, 'F'), (1086516300.0, 'C'), (6063114100.0, 'BAC')])
('May-2009', [(2423500700.0, 'AIG'), (3206448400.0, 'WFC'), (11168887300.0, 'BAC')])
('Jul-2013', [(2461446600.0, 'BAC'), (888221500.0, 'AMD'), (1639313300.0, 'S')])
('Jul-2012', [(1493601200.0, 'S'), (1046139900.0, 'GE'), (2814874000.0, 'BAC')])
('Aug-2013', [(690523500.0, 'JCP'), (738499300.0, 'F'), (1933851400.0, 'BAC')])
('Nov-2011', [(1254006700.0, 'GE'), (5669949800.0, 'BAC'), (1136410100.0, 'C')])
