# Chapter 4: Working with Key/Value Pairs (Scala)

In [1]:
import org.apache.spark.sql.SparkSession
// Get (or create) the SparkSession for this notebook, configured with the
// `local[*]` master, and keep a handle on its SparkContext for the RDD API.
val spark = {
  val sessionBuilder = SparkSession.builder().master("local[*]").appName("Working Key/Value Pairs")
  sessionBuilder.getOrCreate()
}
val sc = spark.sparkContext

spark = org.apache.spark.sql.SparkSession@767b462b
sc = org.apache.spark.SparkContext@189c7db3


## Creating Pair RDDs

Using `map()`

In [2]:
import scala.math.pow

In [3]:
// Source numbers; duplicates are intentional so that several keys repeat.
val numericRdd = sc.parallelize(List(1, 4, 2, 4, 1, 3, 3))
// Build a pair RDD of (n, n squared): the number is the key, its square the value.
val pairRdd = numericRdd.map { n => (n, pow(n, 2).toInt) }

numericRdd = ParallelCollectionRDD[0] at parallelize at <console>:30
pairRdd = MapPartitionsRDD[1] at map at <console>:31


MapPartitionsRDD[1] at map at <console>:31

In [4]:
// Bring the pairs back to the driver and print them on one line.
println(pairRdd.collect().mkString("Pair RDD from map(): ", ", ", ""))

Pair RDD from map(): (1,1), (4,16), (2,4), (4,16), (1,1), (3,9), (3,9)


## Transformations on One Pair RDD

`reduceByKey()`, `mapValues()`

In [5]:
// Add together all values that share a key, giving one (key, sum) pair per distinct key.
val sumValues = pairRdd.reduceByKey((left, right) => left + right)
println(sumValues.collect().mkString("Sum values using reduceByKey(): ", ", ", ""))

Sum values using reduceByKey(): (4,32), (1,2), (2,4), (3,18)


sumValues = ShuffledRDD[2] at reduceByKey at <console>:33


ShuffledRDD[2] at reduceByKey at <console>:33

In [6]:
// Average per key: pair each value with a count of 1, add the sums and the counts
// per key, then divide sum by count. The sum is converted to Float before dividing
// because Int / Int would silently truncate any fractional part of the average;
// this also matches the result type of the combineByKey() version further down.
val avgRedByKey = pairRdd.map(x => (x._1, (x._2, 1))).reduceByKey((x, y) => (x._1 + y._1, x._2 + y._2)).mapValues { case (sum, count) => sum.toFloat / count }
println("Average by key using reduceByKey(): " + avgRedByKey.collect().mkString(", "))

Average by key using reduceByKey(): (4,16.0), (1,1.0), (2,4.0), (3,9.0)


avgRedByKey = MapPartitionsRDD[5] at mapValues at <console>:33


MapPartitionsRDD[5] at mapValues at <console>:33

In [7]:
// Word count: read the file as lines, split each line on single spaces,
// pair every token with 1 and add up the ones per token.
val lines = sc.textFile("../data/README.md")
val words = {
  val tokens = lines.flatMap(line => line.split(" "))
  tokens.map(token => (token, 1)).reduceByKey((a, b) => a + b)
}
// Only the first ten (word, count) pairs are fetched for display.
print(words.take(10).mkString("Word count using reduceByKey(): ", ", ", ""))

Word count using reduceByKey(): (package,1), (this,1), (Version"](http://spark.apache.org/docs/latest/building-spark.html#specifying-the-hadoop-version),1), (Because,1), (Python,2), (page](http://spark.apache.org/documentation.html).,1), (cluster.,1), (its,1), ([run,1), (general,3)

lines = ../data/README.md MapPartitionsRDD[7] at textFile at <console>:30
words = ShuffledRDD[10] at reduceByKey at <console>:31


ShuffledRDD[10] at reduceByKey at <console>:31

`groupByKey()`

In [8]:
// Collect every value that shares a key into one group per key.
val groupedValues = pairRdd.groupByKey()
println(groupedValues.collect().mkString("Grouped RDD using groupByKey(): ", ", ", ""))

Grouped RDD using groupByKey(): (4,CompactBuffer(16, 16)), (1,CompactBuffer(1, 1)), (2,CompactBuffer(4)), (3,CompactBuffer(9, 9))


groupedValues = ShuffledRDD[11] at groupByKey at <console>:33


ShuffledRDD[11] at groupByKey at <console>:33

`combineByKey()`

In [9]:
// Show the contents of pairRdd again as a reference for the combineByKey() example below.
pairRdd.collect()

[(1,1), (4,16), (2,4), (4,16), (1,1), (3,9), (3,9)]

In [10]:
// Build a (sum, count) accumulator per key with combineByKey():
//   1. the first value seen for a key starts the accumulator as (value, 1)
//   2. each further value is added to the sum and bumps the count
//   3. two accumulators for the same key are merged component-wise
val sumKeyValues = pairRdd.combineByKey(
  (first: Int) => (first, 1),
  (running: (Int, Int), next: Int) => (running._1 + next, running._2 + 1),
  (left: (Int, Int), right: (Int, Int)) => (left._1 + right._1, left._2 + right._2)
)

// Divide sum by count; the sum is widened to Float so the division is not integer division.
val avgComByKey = sumKeyValues.mapValues { case (total, count) => total.toFloat / count }

print(avgComByKey.collect().mkString("Average by key using combineByKey(): ", ", ", ""))

Average by key using combineByKey(): (4,16.0), (1,1.0), (2,4.0), (3,9.0)

sumKeyValues = ShuffledRDD[12] at combineByKey at <console>:33
avgComByKey = MapPartitionsRDD[13] at mapValues at <console>:37


MapPartitionsRDD[13] at mapValues at <console>:37

`flatMapValues()`

In [11]:
// Expand each value v into the range 1..v while keeping its key; print the first ten resulting pairs.
println(pairRdd.flatMapValues(v => 1 to v).take(10).mkString("RDD using flatMapValues(): ", ", ", ""))

RDD using flatMapValues(): (1,1), (4,1), (4,2), (4,3), (4,4), (4,5), (4,6), (4,7), (4,8), (4,9)


`keys()`

In [12]:
// Project the pair RDD down to its keys (duplicates are kept) and print them.
println(s"Get keys from key/pair RDD using keys(): ${pairRdd.keys.collect().mkString(", ")}")

Get keys from key/pair RDD using keys(): 1, 4, 2, 4, 1, 3, 3


`values()`

In [13]:
// Project the pair RDD down to its values and print them.
// The label names values(), which is the accessor actually called here.
println("Get values from key/pair RDD using values(): " + pairRdd.values.collect().mkString(", "))

Get values from key/pair RDD using values(): 1, 16, 4, 16, 1, 9, 9


`sortByKey()`

In [14]:
// Two keyed records, deliberately out of key order, used to demonstrate sortByKey().
// NOTE(review): the two values are tuples of different arity (2 and 3), so the value
// type is inferred as a common supertype of both rather than one concrete tuple type.
val rddSort = sc.parallelize(
  List(
    (4, (8, 2)),
    (1, (3, 1, 9))
  )
)

rddSort = ParallelCollectionRDD[17] at parallelize at <console>:30


ParallelCollectionRDD[17] at parallelize at <console>:30

In [15]:
// Sort the records by key using sortByKey()'s default arguments, then print the result.
println(s"Get RDD sorted by keys using sortByKey(): ${rddSort.sortByKey().collect().mkString(", ")}")

Get RDD sorted by keys using sortByKey(): (1,(3,1,9)), (4,(8,2))
