# A first example

In [6]:
# Distribute the data set to the workers
xs = sc.parallelize(range(10))
xs.collect()

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [7]:
xs.getNumPartitions()

16

In [8]:
# Return all data within each partition as a list.
xs.glom().collect()

[[], [0], [], [1], [2], [], [3], [4], [], [5], [], [6], [7], [], [8], [9]]

In [12]:
# Only keep even numbers
xs = xs.filter(lambda x: x % 2 == 0)
xs.collect()

[0, 2, 4, 6, 8]

In [13]:
# Square all elements
xs = xs.map(lambda x: x**2)
xs.collect()

[0, 4, 16, 36, 64]

In [14]:
xs.reduce(lambda x, y: x+y)

120

## A common Spark idiom chains mutiple functions together

In [15]:
(
    sc.parallelize(range(10))
    .filter(lambda x: x % 2 == 0)
    .map(lambda x: x**2)
    .collect()
)

[0, 4, 16, 36, 64]

# Actions and transforms

In [16]:
# A transform maps an RDD to another RDD - it is a lazy operation. To actually perform any work, we need to apply an action.
import numpy as np

In [76]:
x = sc.parallelize(np.random.randint(1, 6, 10))

In [77]:
x.collect()

[1, 5, 2, 3, 5, 2, 3, 2, 1, 2]

In [78]:
x.take(5)

[1, 5, 2, 3, 5]

In [79]:
x.first()

1

In [80]:
x.top(5)

[5, 5, 3, 3, 2]

In [81]:
x.takeSample(True, 15)

[1, 1, 2, 3, 1, 1, 1, 1, 5, 3, 2, 3, 5, 2, 1]

In [82]:
x.count()

10

In [83]:
x.countByValue()

defaultdict(int, {1: 2, 2: 4, 3: 2, 5: 2})

In [84]:
x.sum()

26

In [85]:
x.max()

5

In [86]:
x.mean()

2.6000000000000001

In [87]:
x.stats()

(count: 10, mean: 2.6, stdev: 1.35646599663, max: 5.0, min: 1.0)

In [88]:
import os
import shutil
if os.path.exists('/resources/data/Datasets/x'):
    shutil.rmtree('/resources/data/Datasets/x')
x.saveAsTextFile('/resources/data/Datasets/x')

In [89]:
!ls /resources/data/Datasets/x

part-00000  part-00003	part-00006  part-00009	part-00012  part-00015
part-00001  part-00004	part-00007  part-00010	part-00013  _SUCCESS
part-00002  part-00005	part-00008  part-00011	part-00014


In [90]:
!cat /resources/data/Datasets/x/*

1
5
2
3
5
2
3
2
1
2


# Fold, reduce and aggregate actions

### Reduce

In [91]:
# max using reduce
x.reduce(lambda x, y: x if x > y else y)

5

In [92]:
# min using reduce
x.reduce(lambda x, y: x if x < y else y)

1

In [93]:
# sum using reduce
x.reduce(lambda x,y: x+y)

26

In [94]:
# prod using reduce
x.reduce(lambda x,y: x*y)

3600

### Fold

In [95]:
# sum using fold
x.fold(0, lambda x, y: x+y)

26

In [96]:
# product of n numbers using fold
x.fold(1, lambda x, y: x*y)

3600

In [97]:
# max using fold
x.fold(0, lambda x, y : x if x > y else y)

5

In [98]:
# min using fold
x.fold(5, lambda x, y : x if x < y else y) # initial value for comparision is from fold value here 5  

1

In [99]:
# count using fold
x.fold(0, lambda x,y: x+1)  # wrong 

16

### Aggregate

In [100]:
# sum using aggregate
x.aggregate(0, lambda x, y: x + y, lambda x, y: x + y)

26

In [101]:
# aggregate - count
x.aggregate(0, lambda x, y: x + 1, lambda x, y: x+y)

10

In [67]:
# aggregate - sum and count
(x.aggregate((0,0), 
             (lambda acc, val: (acc[0]+val, acc[1] + 1 )), 
             (lambda acc1, acc2: ((acc1[0] + acc2[0]), (acc1[1] + acc2[1])))))   

(31, 10)

In [58]:
# aggregate - sum and count
tup = (x.aggregate((0,0), 
             (lambda x, y: (x[0]+y, x[1] + 1 )), 
             (lambda x, y: (x[0] + y[0], x[1] + y[1])))
)
print(x.collect())
print("Number of elements : {0} \nSum of elements : {1}".format(tup[0],tup[1]))

[5, 3, 1, 4, 2, 2, 2, 2, 2, 3]
Number of elements : 26 
Sum of elements : 10


In [102]:
# mean 
tup[0]/tup[1]

2.6000000000000001

# Map

In [103]:
x = sc.parallelize([3,3,4,6])
x.map(lambda x: x + 1).collect()

[4, 4, 5, 7]

In [104]:
fruits = sc.parallelize(["apple", "banana", "orange"])
fruits.map(lambda x: x.upper()).collect()

['APPLE', 'BANANA', 'ORANGE']

In [105]:
# Think of flatmap as a map follwed by a flatten option 
fruits.flatMap(lambda x: x.upper()).collect()

['A',
 'P',
 'P',
 'L',
 'E',
 'B',
 'A',
 'N',
 'A',
 'N',
 'A',
 'O',
 'R',
 'A',
 'N',
 'G',
 'E']

In [106]:
# filter
x.filter(lambda x: x%3 == 0).collect()

[3, 3, 6]

In [113]:
x.sample(False, 0.5,1).collect()

[3, 6]

In [114]:
fruits.sample(False, 0.3).collect()

['orange']

### Set-like transforms

In [115]:
x = sc.parallelize([1,2,3,4])
y = sc.parallelize([3,3,4,6])

In [116]:
y.distinct().collect()

[3, 4, 6]

In [117]:
x.union(y).collect()

[1, 2, 3, 4, 3, 3, 4, 6]

In [118]:
x.intersection(y).collect()

[3, 4]

In [119]:
x.subtract(y).collect()

[1, 2]

In [120]:
y.subtract(x).collect()

[6]

In [121]:
x.cartesian(y).collect()

[(1, 3),
 (1, 3),
 (1, 4),
 (1, 6),
 (2, 3),
 (2, 3),
 (2, 4),
 (2, 6),
 (3, 3),
 (3, 3),
 (3, 4),
 (3, 6),
 (4, 3),
 (4, 3),
 (4, 4),
 (4, 6)]

# Working with key-value pairs
RDDs consissting of key-value pairs are required for many Spark operatinos. They can be created by using a function that returns an RDD composed of tuples.

In [122]:
data = [('google', 1), ('apple', 2)]

In [123]:
rdd = sc.parallelize(data)

In [127]:
rdd.keys().collect()

['google', 'apple']

In [126]:
rdd.values().collect()

[1, 2]

In [128]:
ulysses = sc.textFile('/resources/data/Datasets/Ulysses.txt')

In [129]:
ulysses.take(10)

['',
 '',
 '',
 'The Project Gutenberg EBook of Ulysses, by James Joyce',
 '',
 'This eBook is for the use of anyone anywhere at no cost and with almost',
 'no restrictions whatsoever. You may copy it, give it away or re-use',
 'it under the terms of the Project Gutenberg License included with this',
 'eBook or online at www.gutenberg.org',
 '']

In [130]:
ulysses.flatMap(lambda line: line.lower().split()).count()

267938

In [152]:
import string
def tokenize(line):
    # ord('a') returns the integer 97
    # string.punctuation => It is a constant '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
    remove_punc_map = dict.fromkeys(map(lambda x: ord(x), string.punctuation),None) 
    return line.translate(remove_punc_map).lower().split()

words = (ulysses.flatMap(lambda line: tokenize(line))
                .map(lambda x: (x, 1))
        )
words.take(5)

[('the', 1), ('project', 1), ('gutenberg', 1), ('ebook', 1), ('of', 1)]

In [153]:
counts = words.reduceByKey(lambda x, y: x+y)
counts.take(5)

[('kyries', 1),
 ('mobile', 2),
 ('rabaiotti’s', 3),
 ('gasteropod', 1),
 ('spurt', 1)]

In [154]:
counts.takeOrdered(10, key= lambda x: -x[1])

[('the', 15010),
 ('of', 8251),
 ('and', 7216),
 ('a', 6510),
 ('to', 5029),
 ('in', 4972),
 ('he', 3998),
 ('his', 3326),
 ('that', 2586),
 ('with', 2557)]

In [158]:
(
ulysses.flatMap(lambda line: tokenize(line))
       .map(lambda word: (word, 1))
       .reduceByKey(lambda x, y: x + y)
       .takeOrdered(10, key=lambda x: -x[1])
)

[('the', 15010),
 ('of', 8251),
 ('and', 7216),
 ('a', 6510),
 ('to', 5029),
 ('in', 4972),
 ('he', 3998),
 ('his', 3326),
 ('that', 2586),
 ('with', 2557)]

# Persisting data

In [161]:
counts.is_cached

False

In [162]:
counts.persist()

PythonRDD[196] at RDD at PythonRDD.scala:43

In [163]:
counts.is_cached

True

In [164]:
counts.takeOrdered(5, lambda x: -x[1])

[('the', 15010), ('of', 8251), ('and', 7216), ('a', 6510), ('to', 5029)]

In [165]:
counts.take(5)

[('kyries', 1),
 ('mobile', 2),
 ('rabaiotti’s', 3),
 ('gasteropod', 1),
 ('spurt', 1)]

In [166]:
counts.takeOrdered(5, lambda x: x[0])

[('0', 1), ('0—0—1', 5), ('0—0—2', 1), ('0—0—3', 2), ('0—0—4', 3)]

In [167]:
counts.keys().take(5)

['kyries', 'mobile', 'rabaiotti’s', 'gasteropod', 'spurt']

In [168]:
counts.values().take(5)

[1, 2, 3, 1, 1]

In [177]:
count_dict = counts.collectAsMap()
count_dict

{'eclipses': 2,
 'cudgel': 1,
 'lose': 16,
 'vinery': 1,
 'myths': 1,
 'needful': 3,
 'revengeful': 1,
 'clergymen’s': 1,
 'spike': 1,
 'gipsylike': 1,
 'shameface': 1,
 'hoping': 5,
 'romps': 1,
 'hayforks': 1,
 'produce': 9,
 'glided': 3,
 'exhortator': 1,
 'abounding': 1,
 'cutty': 1,
 'foregone': 1,
 'o’donoghue': 2,
 'fiddling': 1,
 'scholastica': 1,
 'skirties': 1,
 'ozone': 1,
 'anxiously': 4,
 'bloowhose': 1,
 'jeer': 4,
 'indispensable': 2,
 'garbed': 2,
 'diseases': 2,
 'courtesies': 1,
 'tapped': 9,
 'artisans’': 1,
 'sometime': 5,
 'duke': 14,
 'bolt': 6,
 'readers': 3,
 'gorget': 1,
 'pipe': 16,
 'darling': 22,
 'falls': 13,
 'champing': 4,
 'm’intosh': 4,
 'nullify': 1,
 'eh': 49,
 'door': 139,
 'unmixed': 1,
 'howsomever': 1,
 'ontario': 4,
 'lousy': 2,
 'gold': 92,
 'peru': 1,
 'ferrywash': 1,
 'drove': 23,
 'haggling': 1,
 'tones': 1,
 'tinily': 2,
 'soak': 1,
 'brothers': 16,
 'noone': 34,
 'ripping': 2,
 'drinks': 8,
 'philosophised': 1,
 'o’callaghan': 2,
 'thaumatu

In [178]:
count_dict['circle']

20

# Using cache instead of persist

In [179]:
counts.unpersist()

PythonRDD[196] at RDD at PythonRDD.scala:43

In [180]:
counts.is_cached

False

In [181]:
counts.cache()

PythonRDD[196] at RDD at PythonRDD.scala:43

In [182]:
counts.is_cached

True

# Merging key, value datasets

In [183]:
portrait = sc.textFile('/resources/data/Datasets/Portrait.txt')

In [184]:
portrait.take(1)

["Project Gutenberg's A Portrait of the Artist as a Young Man, by James Joyce"]

In [185]:
count1 = (
        portrait.flatMap(lambda line: tokenize(line))
                .map(lambda word: (word,1))
                .reduceByKey(lambda a, b : a + b)
)

In [186]:
count1.persist()

PythonRDD[214] at RDD at PythonRDD.scala:43

# Combine counts for words found in both books

In [187]:
all_counts = counts.join(count1)
all_counts.take(5)

[('mobile', (2, 1)),
 ('vastness', (1, 1)),
 ('circle', (20, 1)),
 ('temptations', (1, 4)),
 ('withering', (4, 1))]

In [189]:
#sum counts over words
all_counts.mapValues(lambda x: x[0] +x[1]).take(10)

[('mobile', 3),
 ('vastness', 2),
 ('circle', 21),
 ('temptations', 5),
 ('withering', 5),
 ('spoken', 31),
 ('belt', 15),
 ('why', 267),
 ('shallow', 6),
 ('preceding', 12)]

In [188]:
all_counts.map(lambda x: (x[0], x[1][0] +x[1][1])).take(10)

[('mobile', 3),
 ('vastness', 2),
 ('circle', 21),
 ('temptations', 5),
 ('withering', 5),
 ('spoken', 31),
 ('belt', 15),
 ('why', 267),
 ('shallow', 6),
 ('preceding', 12)]

In [190]:
all_counts.mapValues(lambda x: np.mean(x)).take(5)

[('mobile', 1.5),
 ('vastness', 1.0),
 ('circle', 10.5),
 ('temptations', 2.5),
 ('withering', 2.5)]