# Chapter 4: Working with Key/Value Pairs (Scala)

In [44]:
import org.apache.spark.sql.SparkSession
val spark = SparkSession.builder.appName("Working Key/Value Pairs").master("local[*]").getOrCreate()
val sc = spark.sparkContext

spark = org.apache.spark.sql.SparkSession@645ae311
sc = org.apache.spark.SparkContext@6c8c5913


## Creating Pair RDDs

Using `map()`

In [45]:
import scala.math.pow

In [46]:
val numericRdd = sc.parallelize(List(1,4,2,4,1,3,3))
val pairRdd = numericRdd.map(x => (x, pow(x, 2).toInt))

numericRdd = ParallelCollectionRDD[72] at parallelize at <console>:35
pairRdd = MapPartitionsRDD[73] at map at <console>:36


MapPartitionsRDD[73] at map at <console>:36

In [47]:
println("Pair RDD from map(): " + pairRdd.collect().mkString(", "))

Pair RDD from map(): (1,1), (4,16), (2,4), (4,16), (1,1), (3,9), (3,9)


## Transformations on one Pair RDDs

`reduceByKey()`, `mapValues()`

In [48]:
val sumValues = pairRdd.reduceByKey(_ + _)
println("Sum values using reduceByKey(): " + sumValues.collect().mkString(", "))

Sum values using reduceByKey(): (4,32), (1,2), (2,4), (3,18)


sumValues = ShuffledRDD[74] at reduceByKey at <console>:38


ShuffledRDD[74] at reduceByKey at <console>:38

In [49]:
val avgRedByKey = pairRdd.map(x => (x._1, (x._2, 1))).reduceByKey((x, y) => (x._1 + y._1, x._2 + y._2)).mapValues(x => x._1/x._2)
println("Average by key using reduceByKey(): " + avgRedByKey.collect().mkString(", "))

Average by key using reduceByKey(): (4,16), (1,1), (2,4), (3,9)


avgRedByKey = MapPartitionsRDD[77] at mapValues at <console>:38


MapPartitionsRDD[77] at mapValues at <console>:38

In [50]:
val lines = sc.textFile("../data/README.md")
val words = lines.flatMap(_.split(" ")).map(x => (x, 1)).reduceByKey(_ + _)
print("Word count using reduceByKey(): " + words.take(10).mkString(", "))

Word count using reduceByKey(): (package,1), (this,1), (Version"](http://spark.apache.org/docs/latest/building-spark.html#specifying-the-hadoop-version),1), (Because,1), (Python,2), (page](http://spark.apache.org/documentation.html).,1), (cluster.,1), (its,1), ([run,1), (general,3)

lines = ../data/README.md MapPartitionsRDD[79] at textFile at <console>:36
words = ShuffledRDD[82] at reduceByKey at <console>:37


ShuffledRDD[82] at reduceByKey at <console>:37

`groupByKey()`

In [51]:
val groupedValues = pairRdd.groupByKey()
println("Grouped RDD using groupByKey(): " + groupedValues.collect().mkString(", "))

Grouped RDD using groupByKey(): (4,CompactBuffer(16, 16)), (1,CompactBuffer(1, 1)), (2,CompactBuffer(4)), (3,CompactBuffer(9, 9))


groupedValues = ShuffledRDD[83] at groupByKey at <console>:38


ShuffledRDD[83] at groupByKey at <console>:38

`combineByKey()`

In [52]:
pairRdd.collect()

[(1,1), (4,16), (2,4), (4,16), (1,1), (3,9), (3,9)]

In [53]:
val sumKeyValues = pairRdd.combineByKey((value: Int) => (value, 1),
                                        (acc:(Int, Int), value: Int) => (acc._1 + value, acc._2 + 1),
                                        (acc1:(Int, Int), acc2:(Int, Int)) => (acc1._1 + acc2._1, acc1._2 + acc2._2))

val avgComByKey = sumKeyValues.mapValues({case(x, y) => x.toFloat/y})

print("Average by key using combineByKey(): " + avgComByKey.collect().mkString(", "))

Average by key using combineByKey(): (4,16.0), (1,1.0), (2,4.0), (3,9.0)

sumKeyValues = ShuffledRDD[84] at combineByKey at <console>:39
avgComByKey = MapPartitionsRDD[85] at mapValues at <console>:43


MapPartitionsRDD[85] at mapValues at <console>:43

`flatMapValues()`

In [54]:
println("RDD using flatMapValues(): " + pairRdd.flatMapValues(x => (1 to x)).take(10).mkString(", "))

RDD using flatMapValues(): (1,1), (4,1), (4,2), (4,3), (4,4), (4,5), (4,6), (4,7), (4,8), (4,9)


`keys()`

In [55]:
println("Get keys from key/pair RDD using keys(): " + pairRdd.keys.collect().mkString(", "))

Get keys from key/pair RDD using keys(): 1, 4, 2, 4, 1, 3, 3


`values()`

In [56]:
println("Get values from key/pair RDD using keys(): " + pairRdd.values.collect().mkString(", "))

Get values from key/pair RDD using keys(): 1, 16, 4, 16, 1, 9, 9


`sortByKey()`

In [57]:
val rddSort = sc.parallelize(List((4, (8, 2)), (1, (3, 1, 9))))

ParallelCollectionRDD[89] at parallelize at <console>:33

rddSort = ParallelCollectionRDD[89] at parallelize at <console>:33


In [58]:
println("Get RDD sorted by keys using sortByKey(): " + rddSort.sortByKey().collect().mkString(", "))

Get RDD sorted by keys using sortByKey(): (1,(3,1,9)), (4,(8,2))


## Transformations on two Pair RDDs

In [59]:
val pairRdd1 = sc.parallelize(List((3, 'A'), (2, 'J'), (5, 'K')))
val pairRdd2 = sc.parallelize(List((5, 'Z'), (3, 'W'), (7, 'B')))

pairRdd1 = ParallelCollectionRDD[93] at parallelize at <console>:33
pairRdd2 = ParallelCollectionRDD[94] at parallelize at <console>:34


ParallelCollectionRDD[94] at parallelize at <console>:34

`subtractByKey()`

In [60]:
val subtractRdd = pairRdd1.subtract(pairRdd2)
println("RDD from subtractByKey(): " + subtractRdd.collect().mkString(", "))

RDD from subtractByKey(): (3,A), (2,J), (5,K)


subtractRdd = MapPartitionsRDD[98] at subtract at <console>:38


MapPartitionsRDD[98] at subtract at <console>:38

`.join()`

In [61]:
val joinRdd = pairRdd1.join(pairRdd2)
println("RDD from join(): " + joinRdd.collect().mkString(", "))

RDD from join(): (5,(K,Z)), (3,(A,W))


joinRdd = MapPartitionsRDD[101] at join at <console>:38


MapPartitionsRDD[101] at join at <console>:38

`.leftOuterJoin()`

In [62]:
val leftOuterJoinRdd = pairRdd1.leftOuterJoin(pairRdd2)
println("RDD from leftOuterJoin(): " + leftOuterJoinRdd.collect().mkString(", "))

RDD from leftOuterJoin(): (5,(K,Some(Z))), (2,(J,None)), (3,(A,Some(W)))


leftOuterJoinRdd = MapPartitionsRDD[104] at leftOuterJoin at <console>:38


MapPartitionsRDD[104] at leftOuterJoin at <console>:38

`.rightOuterJoin()`

In [63]:
val rightOuterJoinRdd = pairRdd1.rightOuterJoin(pairRdd2)
println("RDD from rightOuterJoin(): " + rightOuterJoinRdd.collect().mkString(", "))

RDD from rightOuterJoin(): (5,(Some(K),Z)), (3,(Some(A),W)), (7,(None,B))


rightOuterJoinRdd = MapPartitionsRDD[107] at rightOuterJoin at <console>:38


MapPartitionsRDD[107] at rightOuterJoin at <console>:38

`.cogroup()`

In [64]:
val cogroupedRdd = pairRdd1.cogroup(pairRdd2)
println("RDD from cogroup(): " + cogroupedRdd.collect().mkString(", "))

RDD from cogroup(): (5,(CompactBuffer(K),CompactBuffer(Z))), (2,(CompactBuffer(J),CompactBuffer())), (3,(CompactBuffer(A),CompactBuffer(W))), (7,(CompactBuffer(),CompactBuffer(B)))


cogroupedRdd = MapPartitionsRDD[109] at cogroup at <console>:38


MapPartitionsRDD[109] at cogroup at <console>:38

## Actions Available on Pair RDDs

`countByKey()`

In [65]:
println("countByKey(): " + pairRdd.countByKey())

countByKey(): Map(4 -> 2, 1 -> 2, 2 -> 1, 3 -> 2)


`collectAsMap()`

In [66]:
println("collectAsMap(): " + pairRdd.collectAsMap())

collectAsMap(): Map(2 -> 4, 4 -> 16, 1 -> 1, 3 -> 9)


`lookup()`

In [67]:
println("lookup(4): " + pairRdd.lookup(4))

lookup(4): WrappedArray(16, 16)


## Partitions

`repartition()`

In [68]:
println("Repartition of an RDD: ")
pairRdd.repartition(2).glom().collect()

Repartition of an RDD: 


[[(1,1), (2,4), (4,16), (3,9)], [(4,16), (1,1), (3,9)]]

`partitionBy()`

In [69]:
import org.apache.spark.HashPartitioner

In [70]:
val myPartitioner = new HashPartitioner(2)

myPartitioner = org.apache.spark.HashPartitioner@2


org.apache.spark.HashPartitioner@2

In [71]:
print("Custom partitioning using partitionBy()")
pairRdd.partitionBy(myPartitioner).glom().collect()

Custom partitioning using partitionBy()

[[(4,16), (2,4), (4,16)], [(1,1), (1,1), (3,9), (3,9)]]