In [1]:
from common import Common
from music import Music
from pyspark import *
from pyspark.streaming import *
from pyspark import SparkContext, SparkConf

In [2]:
common = Common()
sc = common.get_sc()

<SparkContext master=local appName=myapp>
2.4.5


### Read data from a file

In [3]:
rdd = sc.textFile('/home/ec2-user/data/wordcount.file1')
print(rdd.first())
rdd.take(2)

A wonderful king is Hadoop.


['A wonderful king is Hadoop.', 'The elephant plays well with Sqoop.']

### Read data from a variable

In [4]:
data = [1, 2, 3, 4, 5]
rdd = sc.parallelize(data)
print(rdd.first())
rdd.take(2)

1


[1, 2]

### Map Transformation

In [5]:
print ('Map')
def multiply_by_two(x:int) -> int:
    return x*2
    
data = range(0,100)
rdd = sc.parallelize(data)
print ('input rdd: ', rdd.take(5))

rdd = rdd.map(multiply_by_two)
print ('output rdd: ', rdd.take(5))

Map
input rdd:  [0, 1, 2, 3, 4]
output rdd:  [0, 2, 4, 6, 8]


### Filter Transformation

In [6]:
print ('Filter')
def filter_by_even(x:int) -> int:
    return x%2==0

data = range(0,100)
rdd = sc.parallelize(data)
print ('input rdd: ', rdd.take(10))

rdd = rdd.filter(filter_by_even)
print ('output rdd: ', rdd.take(10))

Filter
input rdd:  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
output rdd:  [0, 2, 4, 6, 8, 10, 12, 14, 16, 18]


### FlatMap Transformation

In [7]:
print ('FlatMap in function')
def flatten(x):
    return x[1].split(',')

data = [('A', '1,2,3'), ('B', '5,6,7'), ('C', '8,9,10')]
rdd = sc.parallelize(data)
print ('input rdd: ', rdd.take(3))

rdd = rdd.flatMap(flatten)
print ('output rdd: ', rdd.take(6))

FlatMap in function
input rdd:  [('A', '1,2,3'), ('B', '5,6,7'), ('C', '8,9,10')]
output rdd:  ['1', '2', '3', '5', '6', '7']


In [8]:
print ('FlatMap in lambda')
data = [('A', '1,2,3'), ('B', '5,6,7'), ('C', '8,9,10')]
rdd = sc.parallelize(data)
print ('input rdd: ', rdd.take(3))

rdd = rdd.flatMap(lambda x: (x[1].split(',')))
print ('output rdd: ', rdd.take(6))

FlatMap in lambda
input rdd:  [('A', '1,2,3'), ('B', '5,6,7'), ('C', '8,9,10')]
output rdd:  ['1', '2', '3', '5', '6', '7']


### FlatMapValues Transformation

In [9]:
print ('FlatMapValues in function')
def flatten(x):
    return x.split(' ')

data = [('A', '1,2,3'), ('B', '5,6,7'), ('C', '8,9,10')]
rdd = sc.parallelize(data)
print ('input rdd: ', rdd.take(3))

rdd = rdd.flatMapValues(flatten)
print ('output rdd: ', rdd.take(6))

FlatMapValues in function
input rdd:  [('A', '1,2,3'), ('B', '5,6,7'), ('C', '8,9,10')]
output rdd:  [('A', '1,2,3'), ('B', '5,6,7'), ('C', '8,9,10')]


In [10]:
print ('FlatMapValues in lambda')
data = [('A', '1,2,3'), ('B', '5,6,7'), ('C', '8,9,10')]
rdd = sc.parallelize(data)
print ('input rdd: ', rdd.take(3))

rdd = rdd.flatMapValues(lambda x: x.split(','))
print ('output rdd: ', rdd.take(6))

FlatMapValues in lambda
input rdd:  [('A', '1,2,3'), ('B', '5,6,7'), ('C', '8,9,10')]
output rdd:  [('A', '1'), ('A', '2'), ('A', '3'), ('B', '5'), ('B', '6'), ('B', '7')]


### Working with files and classes

In [11]:
file = '../data/spotify.csv'
rdd = sc.textFile(file)

header = rdd.first()
print ('header: ', header)

rdd = rdd.filter(lambda x: x != header)
print ('first csv: ', rdd.first())

rdd = rdd.map(lambda x: x.split(',')).map(lambda x: (x[5], Music(x[0], x[1], x[2], x[3], x[4], x[5])))
print ('first class: ', rdd.first())


header:  acousticness,danceability,duration_ms,energy,song_title,artist
first csv:  0.0102,0.833,204600,0.434,Mask Off,Future
first class:  ('Future', <music.Music object at 0x7f8b58377190>)


In [12]:
def print_music(music:Music):
        return ('[acoustic: ' + music.acoustic + '] ' +
               '[dance: ' + music.dance + '] ' +
               '[duration: ' + music.duration + '] ' +
               '[energy: ' + music.energy + '] ' +
               '[title: ' + music.title + '] ' +
               '[artist: ' + music.artist+ ']')
print_music(rdd.first()[1])

'[acoustic: 0.0102] [dance: 0.833] [duration: 204600] [energy: 0.434] [title: Mask Off] [artist: Future]'

### Partitions and re-partitions

In [13]:
print ('Current partitions: ', rdd.getNumPartitions())
print ('Record count: ', rdd.count())

rdd = rdd.repartition(2)
print ('\nNew partitions: ', rdd.getNumPartitions())
print ('Record count: ', rdd.count())

Current partitions:  1
Record count:  2017

New partitions:  2
Record count:  2017


### MapPartition functions

In [14]:
rdd.mapPartitionsWithIndex{
    lambda index, iterator: {
        print("Called in Partition -> " + index)
        my_list = list(iterator)
        my_list.map(lambda x: x + " -> " + index).iterator
    }
}

SyntaxError: invalid syntax (<ipython-input-14-980159759669>, line 1)