## Create RDDs

RDDs are created through the SparkContext `sc`, e.g. from existing Python data structures (such as a list) or from external files.

In [7]:
# Distribute a local Python list of the integers 1..5 as an RDD.
numbers = sc.parallelize(list(range(1, 6)))

## Transformations

Functions such as `map`, `filter`, `flatMap`, `join`, `union`, `intersection`, etc., which produce a new RDD from existing data rather than modifying it in place

Note: When transformation functions are called, they are **not executed immediately**; they run only when an action is performed on the RDD

In [8]:
# Lazily define an RDD holding the square of each number (nothing runs yet).
numbersSquared = numbers.map(lambda value: value * value)

## Actions

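Functions such as `count`, `collect`, `reduce`, `take`, and `top`, which aggregate or compute results and return them to the driver. (Note that `reduceByKey`, despite its name, is a transformation: it returns a new RDD.)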

Note: When an action function is called, it is **executed immediately**, which also triggers execution of any pending transformations it depends on

In [9]:
# Action: runs the pending map and returns every element as a Python list.
numbersSquared.collect()

[1, 4, 9, 16, 25]

## Calculate word frequency in a text file

In [10]:
lines = sc.textFile('shakespeare.txt') # reads the file into an RDD with one element per line of text

In [13]:
# Lower-case each line, split it on whitespace, and flatten the per-line
# word lists into a single RDD of words (one word per RDD element).
words = lines.flatMap(lambda line: line.lower().split())
# Note: each entry in the returned list corresponds to one row of the RDD.
words.collect()

['1609',
 'the',
 'sonnets',
 'by',
 'william',
 'shakespeare',
 '1',
 'from',
 'fairest',
 'creatures',
 'we',
 'desire',
 'increase,',
 'that',
 'thereby',
 "beauty's",
 'rose',
 'might',
 'never',
 'die,',
 'but',
 'as',
 'the',
 'riper',
 'should',
 'by',
 'time',
 'decease,',
 'his',
 'tender',
 'heir',
 'might',
 'bear',
 'his',
 'memory:',
 'but',
 'thou',
 'contracted',
 'to',
 'thine',
 'own',
 'bright',
 'eyes,',
 "feed'st",
 'thy',
 "light's",
 'flame',
 'with',
 'self-substantial',
 'fuel,',
 'making',
 'a',
 'famine',
 'where',
 'abundance',
 'lies,',
 'thy',
 'self',
 'thy',
 'foe,',
 'to',
 'thy',
 'sweet',
 'self',
 'too',
 'cruel:',
 'thou',
 'that',
 'art',
 'now',
 'the',
 "world's",
 'fresh',
 'ornament,',
 'and',
 'only',
 'herald',
 'to',
 'the',
 'gaudy',
 'spring,',
 'within',
 'thine',
 'own',
 'bud',
 'buriest',
 'thy',
 'content,',
 'and',
 'tender',
 'churl',
 "mak'st",
 'waste',
 'in',
 'niggarding:',
 'pity',
 'the',
 'world,',
 'or',
 'else',
 'this',
 'g

In [14]:
# Pair every word with an initial count of 1, giving (word, 1) tuples.
wordsMap = words.map(lambda token: (token, 1))
wordsMap.collect()

[('1609', 1),
 ('the', 1),
 ('sonnets', 1),
 ('by', 1),
 ('william', 1),
 ('shakespeare', 1),
 ('1', 1),
 ('from', 1),
 ('fairest', 1),
 ('creatures', 1),
 ('we', 1),
 ('desire', 1),
 ('increase,', 1),
 ('that', 1),
 ('thereby', 1),
 ("beauty's", 1),
 ('rose', 1),
 ('might', 1),
 ('never', 1),
 ('die,', 1),
 ('but', 1),
 ('as', 1),
 ('the', 1),
 ('riper', 1),
 ('should', 1),
 ('by', 1),
 ('time', 1),
 ('decease,', 1),
 ('his', 1),
 ('tender', 1),
 ('heir', 1),
 ('might', 1),
 ('bear', 1),
 ('his', 1),
 ('memory:', 1),
 ('but', 1),
 ('thou', 1),
 ('contracted', 1),
 ('to', 1),
 ('thine', 1),
 ('own', 1),
 ('bright', 1),
 ('eyes,', 1),
 ("feed'st", 1),
 ('thy', 1),
 ("light's", 1),
 ('flame', 1),
 ('with', 1),
 ('self-substantial', 1),
 ('fuel,', 1),
 ('making', 1),
 ('a', 1),
 ('famine', 1),
 ('where', 1),
 ('abundance', 1),
 ('lies,', 1),
 ('thy', 1),
 ('self', 1),
 ('thy', 1),
 ('foe,', 1),
 ('to', 1),
 ('thy', 1),
 ('sweet', 1),
 ('self', 1),
 ('too', 1),
 ('cruel:', 1),
 ('thou', 1)

In [15]:
# Sum the 1s of all pairs that share the same word to get each word's total count.
wordFrequencies = wordsMap.reduceByKey(lambda runningTotal, nextCount: runningTotal + nextCount)
wordFrequencies.collect()

[('shakespeare', 255),
 ('1', 13),
 ('fairest', 39),
 ('creatures', 27),
 ('we', 3201),
 ('increase,', 9),
 ('thereby', 21),
 ("beauty's", 30),
 ('rose', 44),
 ('never', 959),
 ('but', 5881),
 ('as', 5875),
 ('riper', 3),
 ('his', 6749),
 ('tender', 123),
 ('heir', 70),
 ('bear', 465),
 ('memory:', 1),
 ('thou', 5138),
 ('thine', 330),
 ('own', 659),
 ('bright', 52),
 ("feed'st", 3),
 ('thy', 4028),
 ("light's", 1),
 ('flame', 19),
 ('fuel,', 1),
 ('making', 71),
 ('famine', 6),
 ('where', 1151),
 ('abundance', 8),
 ('self', 98),
 ('foe,', 14),
 ('sweet', 707),
 ('cruel:', 1),
 ('now', 1596),
 ("world's", 50),
 ('ornament,', 5),
 ('only', 275),
 ('herald', 27),
 ('churl', 3),
 ("mak'st", 22),
 ('in', 10660),
 ('niggarding:', 1),
 ('pity', 150),
 ('world,', 128),
 ('this', 5859),
 ('due,', 8),
 ('grave', 82),
 ('when', 1990),
 ('forty', 25),
 ('winters', 8),
 ('besiege', 8),
 ('dig', 9),
 ("youth's", 6),
 ('weed', 12),
 ('of', 17952),
 ('worth', 162),
 ('held:', 1),
 ('asked,', 1),
 ('b