# Chapter 3: Programming with RDDs (Scala)

In [1]:
import org.apache.spark.sql.SparkSession
val spark = SparkSession.builder.appName("Programming with RDDs").master("local[*]").getOrCreate()
val sc = spark.sparkContext

spark = org.apache.spark.sql.SparkSession@7655a3a5
sc = org.apache.spark.SparkContext@5885970d


## Create RDD

Create RDD from list:

In [2]:
val numeric_rdd = sc.parallelize(1 to 10)

numeric_rdd = ParallelCollectionRDD[0] at parallelize at <console>:29


ParallelCollectionRDD[0] at parallelize at <console>:29

In [3]:
println("Numeric RDD (from list): " + numeric_rdd.collect().mkString(", "))

Numeric RDD (from list): 1, 2, 3, 4, 5, 6, 7, 8, 9, 10


Create RDD from external file:

In [4]:
val text_rdd = sc.textFile("../data/README.md")

text_rdd = ../data/README.md MapPartitionsRDD[2] at textFile at <console>:29


../data/README.md MapPartitionsRDD[2] at textFile at <console>:29

In [5]:
println("Text RDD (from external file): " + text_rdd.take(10).mkString(", "))

Text RDD (from external file): # Apache Spark, , Spark is a fast and general cluster computing system for Big Data. It provides, high-level APIs in Scala, Java, Python, and R, and an optimized engine that, supports general computation graphs for data analysis. It also supports a, rich set of higher-level tools including Spark SQL for SQL and DataFrames,, MLlib for machine learning, GraphX for graph processing,, and Spark Streaming for stream processing., , <http://spark.apache.org/>


## RDD actions

`collect()`, `take()`: --> previous section 

In [6]:
val rdd1 = sc.parallelize(List(1,1,2,3,33,1,4,5,8,6))
val rdd2 = sc.parallelize(List(1,2,9,8))

rdd1 = ParallelCollectionRDD[3] at parallelize at <console>:29
rdd2 = ParallelCollectionRDD[4] at parallelize at <console>:30


ParallelCollectionRDD[4] at parallelize at <console>:30

`count()`, `countByValue()`

In [7]:
println("count(): " + rdd1.count())
println("countByValue(): " + rdd1.countByValue())

count(): 10
countByValue(): Map(5 -> 1, 1 -> 3, 6 -> 1, 33 -> 1, 2 -> 1, 3 -> 1, 8 -> 1, 4 -> 1)


`takeOrdered()`, `takeSample()`

In [8]:
println("takeOrdered(): " + rdd2.takeOrdered(3).mkString(", "))
println("takeSample(): " + rdd2.takeSample(false, 2).mkString(", "))

takeOrdered(): 1, 2, 8
takeSample(): 2, 9


`reduce()`, `fold()`

In [9]:
println("Sum of list using reduce(): " + rdd1.reduce(_ + _))
println("Sum of list using fold(): " + rdd1.fold(0)(_ + _))

Sum of list using reduce(): 64
Sum of list using fold(): 64


Calculating average using `reduce()`

In [10]:
println("Average calculated using reduce(): " + rdd1.reduce(_ + _).toFloat/rdd1.count())

Average calculated using reduce(): 6.4


Calculating average using `aggregate()`:

In [11]:
val (sum_values, count) = rdd1.aggregate((0,0))(
                                        (acc, value) => (acc._1 + value, acc._2 + 1),
                                        (acc1, acc2) => (acc1._1 + acc2._1, acc1._2 + acc2._2))

val avg = sum_values.toFloat/count
println("Average calculated using aggregate(): " + avg)

Average calculated using aggregate(): 6.4


sum_values = 64
count = 10
avg = 6.4


6.4

## Basic RDD Transformations

`map()`

In [12]:
val rdd_map = numeric_rdd.map(_^2)
print("RDD obtained using map: " + rdd_map.collect().mkString(", "))

RDD obtained using map: 3, 0, 1, 6, 7, 4, 5, 10, 11, 8

rdd_map = MapPartitionsRDD[10] at map at <console>:31


MapPartitionsRDD[10] at map at <console>:31

`flatMap()`

In [13]:
val rdd_flat_map = text_rdd.flatMap(_.split(" "))
println("RDD obtained using flatMap: " + rdd_flat_map.take(10).mkString(", "))

RDD obtained using flatMap: #, Apache, Spark, , Spark, is, a, fast, and, general


rdd_flat_map = MapPartitionsRDD[11] at flatMap at <console>:31


MapPartitionsRDD[11] at flatMap at <console>:31

`filter()`

In [14]:
val lines_spark = text_rdd.map(_.split(" ")).filter(_.contains("Spark"))
println("Number of lines that contains the word 'Spark': " + lines_spark.count())

Number of lines that contains the word 'Spark': 16


lines_spark = MapPartitionsRDD[13] at filter at <console>:31


MapPartitionsRDD[13] at filter at <console>:31

In [15]:
val words_python = text_rdd.flatMap(_.split(" ")).filter(_.replace(",", "") == "Python")
println("Number of times that the word 'Python' appears: " + words_python.count())

Number of times that the word 'Python' appears: 4


words_python = MapPartitionsRDD[15] at filter at <console>:31


MapPartitionsRDD[15] at filter at <console>:31

`distinct()`

In [16]:
print("RDD from distinct(): " + rdd1.distinct().collect().mkString(", "))

RDD from distinct(): 4, 8, 33, 1, 5, 6, 2, 3

## Pseudo-Set Operations

`union()`

In [17]:
val rdd_union = rdd1.union(rdd2)
println("RDD from union(): " + rdd_union.collect().mkString(", "))

RDD from union(): 1, 1, 2, 3, 33, 1, 4, 5, 8, 6, 1, 2, 9, 8


rdd_union = UnionRDD[19] at union at <console>:32


UnionRDD[19] at union at <console>:32

`subtract()`

In [18]:
val rdd_subtract = rdd1.subtract(rdd2)
println("RDD from subtract(): " + rdd_subtract.collect().mkString(", "))

RDD from subtract(): 4, 33, 5, 6, 3


rdd_subtract = MapPartitionsRDD[23] at subtract at <console>:32


MapPartitionsRDD[23] at subtract at <console>:32

`intersection()`

In [19]:
val rdd_intersection = rdd1.intersection(rdd2)
println("RDD from intersection(): " + rdd_intersection.collect().mkString(", "))

RDD from intersection(): 8, 1, 2


rdd_intersection = MapPartitionsRDD[29] at intersection at <console>:32


MapPartitionsRDD[29] at intersection at <console>:32

`cartesian()`

In [20]:
val rdd_cartesian = rdd1.cartesian(rdd2)
println("RDD from cartesian(): " + rdd_cartesian.collect().mkString(", "))

RDD from cartesian(): (1,1), (1,1), (1,2), (1,2), (1,9), (1,9), (1,8), (1,8), (2,1), (3,1), (33,1), (2,2), (3,2), (33,2), (2,9), (3,9), (33,9), (2,8), (3,8), (33,8), (1,1), (4,1), (1,2), (4,2), (1,9), (4,9), (1,8), (4,8), (5,1), (8,1), (6,1), (5,2), (8,2), (6,2), (5,9), (8,9), (6,9), (5,8), (8,8), (6,8)


rdd_cartesian = CartesianRDD[30] at cartesian at <console>:32


CartesianRDD[30] at cartesian at <console>:32