In [0]:
# SparkContext: the low-level (RDD) API entry point, reached through the SparkSession.
# `spark` is not created in this notebook — presumably injected by the runtime; TODO confirm.
sc = spark.sparkContext

In [0]:
# DBFS locations of the sample datasets.
# NOTE(review): neither path is read in the cells visible here — confirm they are used further down.
retailAll = 'dbfs:/mnt/data/data/retail-data/all/'
flightData2010 = 'dbfs:/mnt/data/data/flight-data/parquet/2010-summary.parquet'

### Resilient Distributed Datasets (RDDs)

-  **Spark operates on a per-partition basis when executing code**
-  Basically all DF Spark code compiles down to an RDD
-  When calling a DF transformation, the underlying logic becomes a set of RDD transformations
-  SparkContext is the entry point for low-level API functionality accessed through SparkSession
-  Main reason to use RDDs is for fine grained control over physical distribution of data (**custom partitioning of data**)
-  RDD performance is best via Scala/Java
<br>
-  RDDs:
    -  Caching:
        -  ability to cache or persist an RDD
        -  ability to specify a storage level [org.apache.spark.storage.StorageLevel] (combinations of memory only, disk only, and off heap)
    -  Checkpointing:
        -  saves RDD to disk so future computations on that RDD point to its partitions on disk rather than recomputing the RDD from the original source
        -  similar to caching except checkpointing is stored only on disk and not in memory (like cache)
        -  when checkpointed RDD is referenced it derives from checkpoint instead of source data, which helps improve performance and optimization
        <br>
-  Shared Variables:
    -  broadcast variables
    -  accumulators

### _RDD to DF Example_

In [0]:
# DataFrame.rdd exposes the DataFrame's rows as an RDD of Row objects.
rdd1 = spark.range(3).rdd
rdd1.collect()

[Row(id=0), Row(id=1), Row(id=2)]

In [0]:
# Pull the Row objects back to the driver and print them one per line.
for row in spark.range(3).rdd.collect():
    print(row)

Row(id=0)
Row(id=1)
Row(id=2)


In [0]:
# Round trip: DataFrame -> RDD of Rows -> DataFrame; the `id` column survives.
spark.range(3).rdd.toDF().show()

+---+
| id|
+---+
|  0|
|  1|
|  2|
+---+



### _Local Collection to RDD Example_

In [0]:
# Tokenize the sample sentence on whitespace; punctuation stays attached to its word.
myCollection = "Training on Python, Spark, Scala and on ML".split()


In [0]:
# Inspect the tokens: commas remain part of the words because the split is on spaces only.
myCollection

['Training', 'on', 'Python,', 'Spark,', 'Scala', 'and', 'on', 'ML']

In [0]:
# Distribute the local list as an RDD, explicitly asking for two partitions.
words = spark.sparkContext.parallelize(myCollection, numSlices=2)

In [0]:
# Verify the partition count requested in parallelize.
words.getNumPartitions()

2

In [0]:
# setName labels this RDD (the label is what the Spark UI shows for it);
# it does not rename the application.
words.setName("myWords")
for word in words.collect():
    print(word)

Training
on
Python,
Spark,
Scala
and
on
ML


### _RDD Data Source Read Example_

### RDD Transformation Examples:

In [0]:
# distinct: count the unique words (the duplicated "on" is counted once)
print(words.distinct().count())

7


In [0]:
# Original words next to the de-duplicated set; as the recorded output shows,
# distinct() does not keep the original ordering.
print(words.collect())
print(words.distinct().collect())

['Training', 'on', 'Python,', 'Spark,', 'Scala', 'and', 'on', 'ML']
['Spark,', 'Training', 'on', 'Python,', 'Scala', 'and', 'ML']


In [0]:
# filter
def startsWithS(individual):
  """Return True when the string `individual` begins with an uppercase "S"."""
  has_s_prefix = individual.startswith("S")
  return has_s_prefix

# filter accepts the predicate itself; no wrapping lambda is needed.
print(words.filter(startsWithS).collect())


['Spark,', 'Scala']


In [0]:
# map: turn each word into a (word, first letter, starts-with-"S" flag) tuple
words2 = words.map(lambda w: (w, w[0], w.startswith("S")))

In [0]:
# Materialize the mapped tuples on the driver for inspection.
words2.collect()

[('Training', 'T', False),
 ('on', 'o', False),
 ('Python,', 'P', False),
 ('Spark,', 'S', True),
 ('Scala', 'S', True),
 ('and', 'a', False),
 ('on', 'o', False),
 ('ML', 'M', False)]

In [0]:
# Keep only the tuples whose third field (the starts-with-"S" flag) is truthy.
print(words2.filter(lambda rec: rec[2]).collect())

[('Spark,', 'S', True), ('Scala', 'S', True)]


In [0]:
# sort: longest words first, achieved by negating the length used as the sort key
print(words.sortBy(lambda word: -len(word)).collect())

['Training', 'Python,', 'Spark,', 'Scala', 'and', 'on', 'on', 'ML']


### RDD Actions Examples

In [0]:
# reduce: fold the integers 1..20 into a single sum
print(
    spark.sparkContext
    .parallelize(range(1, 21))
    .reduce(lambda acc, nxt: acc + nxt)
)

210


In [0]:
# Recap the RDD contents before the counting/sampling actions below.
words.collect()

['Training', 'on', 'Python,', 'Spark,', 'Scala', 'and', 'on', 'ML']

In [0]:
# countByValue: tally how often each distinct element occurs; the tallies come
# back to the driver as a dict-like object (a defaultdict in the recorded output)
print(words.countByValue())

defaultdict(<class 'int'>, {'Training': 1, 'on': 2, 'Python,': 1, 'Spark,': 1, 'Scala': 1, 'and': 1, 'ML': 1})


In [0]:
# first: return the first element of the RDD
print(words.first())

Training


In [0]:
# take
print(words.take(3)) # returns the first 3 elements in RDD order
print(words.takeOrdered(3)) # returns the 3 smallest elements in ascending order

['Training', 'on', 'Python,']
['ML', 'Python,', 'Scala']


### _RDD TXT Save (Uncompressed & Compressed) Example_

In [0]:
import shutil
# saveAsTextFile refuses to write into a directory that already exists, so a
# second run of this cell used to fail until the old output was removed by hand.
# Clear any previous output first through the Hadoop FileSystem API, which
# resolves the path against the same filesystem Spark writes to (shutil.rmtree
# would only help if the path resolved to the driver's local disk).
_sc = spark.sparkContext
_out_path = _sc._jvm.org.apache.hadoop.fs.Path("words_atin")
_out_fs = _out_path.getFileSystem(_sc._jsc.hadoopConfiguration())
if _out_fs.exists(_out_path):
    _out_fs.delete(_out_path, True)  # True = recursive

words.saveAsTextFile("words_atin")

In [0]:
import os

In [0]:
import shutil
# saveAsTextFile refuses to write into a directory that already exists, so a
# second run of this cell used to fail until the old output was removed by hand.
# Clear any previous output first through the Hadoop FileSystem API, which
# resolves the path against the same filesystem Spark writes to (shutil.rmtree
# would only help if the path resolved to the driver's local disk).
_sc = spark.sparkContext
_gz_path = _sc._jvm.org.apache.hadoop.fs.Path("wordsCompressed_atin")
_gz_fs = _gz_path.getFileSystem(_sc._jsc.hadoopConfiguration())
if _gz_fs.exists(_gz_path):
    _gz_fs.delete(_gz_path, True)  # True = recursive

# Same text output as the uncompressed save, but each part file is gzip-compressed.
words.saveAsTextFile(
    "wordsCompressed_atin",
    compressionCodecClass="org.apache.hadoop.io.compress.GzipCodec",
)

## Distributed Shared Variables

-  Broadcast Variables:
    -  saves large value on all worker nodes without re-sending to cluster every time (ex: lookup table as function that fits in memory on each executor)
    -  avoids deserialization per task on the worker nodes every time variable is used
    -  shared immutable variables that are cached on every machine in cluster instead of serialized with every single task
    -  the cost of serializing data for every task can be quite expensive thus broadcast variables are a good alternative   
<br>
-  Accumulators:
    -  adds data together from all tasks into a shared result (ex: error logging counter and debugging)
    -  mutable variable that updates value via transformations and sends value to driver node in an efficient manner


In [0]:
# Second sample sentence; note that this rebinds `words` to a new two-partition RDD.
my_collection = "Corporate Training for - Spark, Scala, Python".split()
words = spark.sparkContext.parallelize(my_collection, numSlices=2)

### _Broadcast Example_:

In [0]:
# Small word -> value lookup table that will be broadcast to the executors.
supplementalData = {
    "ATT": 1000,
    "Corporate": 200,
    "Training": -300,
    "days": 100,
}

In [0]:
# Sanity check: the lookup table is an ordinary Python dict.
type(supplementalData)

dict

In [0]:
# Wrap the dict in a broadcast variable so it is cached on the workers instead of
# being serialized with every task.
suppBroadcast = spark.sparkContext.broadcast(supplementalData)

In [0]:
# .value unwraps the broadcast handle back into the original dict.
suppBroadcast.value

{'ATT': 1000, 'Corporate': 200, 'Training': -300, 'days': 100}

In [0]:
# The handle itself is a Broadcast wrapper object, not a dict.
type(suppBroadcast)

pyspark.core.broadcast.Broadcast

In [0]:
# Confirm `words` now holds the second sample sentence.
words.collect()

['Corporate', 'Training', 'for', '-', 'Spark,', 'Scala,', 'Python']

In [0]:
# Pair every word with its value from the broadcast lookup table, defaulting to 0
# for words the table does not contain.
(
    words
    .map(lambda w: (w, suppBroadcast.value.get(w, 0)))
    .collect()
)

[('Corporate', 200),
 ('Training', -300),
 ('for', 0),
 ('-', 0),
 ('Spark,', 0),
 ('Scala,', 0),
 ('Python', 0)]

In [0]:
# NOTE(review): this repeats the broadcast lookup from the cell just above —
# presumably kept to show the broadcast value can be reused across jobs; confirm or remove.
words.map(lambda word: (word, suppBroadcast.value.get(word, 0))).collect()

[('Corporate', 200),
 ('Training', -300),
 ('for', 0),
 ('-', 0),
 ('Spark,', 0),
 ('Scala,', 0),
 ('Python', 0)]