In [1]:
# Entry point to the low-level RDD API: the SparkContext held by `spark`
# (presumably the notebook's pre-created SparkSession — confirm).
spark.sparkContext

In [5]:
# Drop from the DataFrame produced by range(10) down to its underlying RDD.
spark.range(10).rdd

MapPartitionsRDD[19] at javaToPython at NativeMethodAccessorImpl.java:0

In [6]:
# Name the single column "id", switch to the RDD of rows, and extract the
# first field of each row so the RDD holds plain values instead of rows.
spark.range(10).toDF("id").rdd.map(lambda row: row[0])

PythonRDD[25] at RDD at PythonRDD.scala:53

In [7]:
# Round trip: convert the RDD of rows back into a DataFrame.
spark.range(10).rdd.toDF()

DataFrame[id: bigint]

In [10]:
# Tokenise the book title on single spaces (the ":" stays as its own token).
myCollection = "Spark The Definitive Guide : Big Data Processing Made Simple".split(" ")
# Distribute the tokens as an RDD spread over two partitions.
words = spark.sparkContext.parallelize(myCollection, numSlices=2)

In [9]:
# Attach a name to the RDD, then read it back to confirm it was set.
words.setName("myWords")
words.name()

'myWords'

In [12]:
# Number of unique tokens in the RDD after de-duplication.
words.distinct().count()

10

In [13]:
def startsWithS(individual):
    """Return True when ``individual`` begins with an uppercase "S".

    Args:
        individual: A string (any object exposing ``startswith``).

    Returns:
        bool: Result of the case-sensitive prefix check.
    """
    prefix = "S"
    return individual.startswith(prefix)

In [14]:
# Keep only the words accepted by startsWithS; the predicate is passed
# directly, no wrapping lambda needed.
words.filter(startsWithS).collect()

['Spark', 'Simple']

In [15]:
# Project each word into a 3-tuple: (word, first character, starts-with-"S" flag).
# Note: word[0] would raise IndexError on an empty string.
words2 = words.map(lambda word: (word, word[0], word.startswith("S")))

In [16]:
# Filter on the boolean flag stored at tuple index 2, then fetch up to 5 matches.
words2.filter(lambda record: record[2]).take(5)

[('Spark', 'S', True), ('Simple', 'S', True)]

In [17]:
# Inspect up to 10 of the projected tuples.
words2.take(10)

[('Spark', 'S', True),
 ('The', 'T', False),
 ('Definitive', 'D', False),
 ('Guide', 'G', False),
 (':', ':', False),
 ('Big', 'B', False),
 ('Data', 'D', False),
 ('Processing', 'P', False),
 ('Made', 'M', False),
 ('Simple', 'S', True)]

In [20]:
# Explode every word into its individual characters and peek at the first 10.
words.flatMap(list).take(10)

['S', 'p', 'a', 'r', 'k', 'T', 'h', 'e', 'D', 'e']

In [21]:
# Sort by negated length so the longest words come first; take the top 5.
words.sortBy(lambda word: -len(word)).take(5)

['Definitive', 'Processing', 'Simple', 'Spark', 'Guide']

In [22]:
# Split the RDD into two random halves. randomSplit takes ONE list of weights
# (one entry per output RDD) plus an optional seed; the previous call passed
# the second weight as the seed argument, so no 50/50 split was produced.
fiftyFiftSplit = words.randomSplit([0.5, 0.5])

### Actions 

In [23]:
# Sum the integers 1..20 by pairwise addition (the output below is 210).
spark.sparkContext.parallelize(range(1, 21)).reduce(lambda x, y: x + y)

210

In [28]:
def wordLengthReducer(leftWord, rightWord):
    """Pick the longer of two words for use as a pairwise reducer.

    Args:
        leftWord: First candidate (anything supporting ``len``).
        rightWord: Second candidate.

    Returns:
        ``leftWord`` only when it is strictly longer; on a tie (or when
        ``rightWord`` is longer) ``rightWord`` is returned, so the winner
        among equal-length words depends on the order of the arguments.
    """
    return leftWord if len(leftWord) > len(rightWord) else rightWord

# Find a longest word. Two words here have length 10 ("Definitive",
# "Processing") and the reducer favours its right argument on ties, so which
# one is returned depends on the order in which elements are combined.
words.reduce(wordLengthReducer)

'Processing'

In [29]:
# Exact number of elements in the RDD.
words.count()

10

In [30]:
# Approximate count: give up after the timeout (in milliseconds) and report
# the estimate at the requested confidence level.
timeoutMilliseconds = 400
confidence = 0.95
words.countApprox(timeout=timeoutMilliseconds, confidence=confidence)

10

In [31]:
# Approximate number of distinct elements; the argument is the relative accuracy.
words.countApproxDistinct(relativeSD=0.05)

10

In [35]:
# Tally occurrences of each distinct value. The result is a local dict on the
# driver, so this suits only RDDs with a small number of distinct values.
words.countByValue()

defaultdict(int,
            {'Spark': 1,
             'The': 1,
             'Definitive': 1,
             'Guide': 1,
             ':': 1,
             'Big': 1,
             'Data': 1,
             'Processing': 1,
             'Made': 1,
             'Simple': 1})

In [37]:
# First element of the RDD.
words.first()

'Spark'

In [39]:
# Largest value among 0..19.
spark.sparkContext.parallelize(range(20)).max()

19

In [40]:
# NOTE: only the last expression of a cell is displayed, so the results of
# take / takeOrdered / top below are computed and then discarded.
words.take(5)
words.takeOrdered(5)
words.top(5)
# Fixed-size random sample; sampling with replacement allows repeated words,
# and the fixed seed is there to make the sample repeatable.
withReplacement = True
numToTake = 6
randomSeed = 100
words.takeSample(withReplacement, numToTake, randomSeed)

['Data', 'Definitive', 'Data', 'The', 'Definitive', 'Spark']

### Saving Files

In [41]:
# Write the RDD as text to a directory on the local filesystem.
# NOTE(review): Spark normally refuses to write into an existing output
# directory, so re-running this cell likely fails until /tmp/bookTitle is
# removed — confirm and clean up before "Run All".
words.saveAsTextFile("file:/tmp/bookTitle")

### Caching

In [42]:
# Mark the RDD for caching; the call returns the same RDD (see repr below).
words.cache()

ParallelCollectionRDD[37] at parallelize at PythonRDD.scala:195

In [43]:
# Inspect the storage level now associated with the RDD.
words.getStorageLevel()

StorageLevel(False, True, False, False, 1)

### Pipe RDDs to System Commands

In [44]:
# Pipe the elements of each partition through the external `wc -l` command;
# the result holds one line count per partition. Requires `wc` to be
# available on the machines running the tasks.
words.pipe("wc -l").collect()

['       5', '       5']

In [45]:
# Emit a single 1 for every partition and sum them, i.e. count the partitions.
words.mapPartitions(lambda part: [1]).sum()

2

In [47]:
def indexedFunc(partitionIndex, withinPartIterator):
    """Label every element of a partition with that partition's index.

    Args:
        partitionIndex: Index of the partition being processed.
        withinPartIterator: Iterable over the elements of that partition.

    Returns:
        list[str]: One "partition: <index> => <element>" string per element,
        in iteration order (empty when the partition has no elements).
    """
    template = "partition: {} => {}"
    labelled = []
    for element in withinPartIterator:
        labelled.append(template.format(partitionIndex, element))
    return labelled
# Apply indexedFunc per partition so each output string shows which partition
# its word came from.
words.mapPartitionsWithIndex(indexedFunc).collect()

['partition: 0 => Spark',
 'partition: 0 => The',
 'partition: 0 => Definitive',
 'partition: 0 => Guide',
 'partition: 0 => :',
 'partition: 1 => Big',
 'partition: 1 => Data',
 'partition: 1 => Processing',
 'partition: 1 => Made',
 'partition: 1 => Simple']

In [49]:
# glom() turns each partition into a list, so collect() returns one list per
# partition. Each list is materialised in memory, so beware large partitions.
spark.sparkContext.parallelize(["Hello", "World"], 2).glom().collect()

[['Hello'], ['World']]