# Chapter 4: Working with Key/Value Pairs (Python)

In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("Key-Value-Pairs").master("local[*]").getOrCreate()
sc = spark.sparkContext

## Creating Pair RDDs

Using `map()`

In [2]:
numeric_rdd = sc.parallelize([1,4,2,4,1,3,3])
pair_rdd = numeric_rdd.map(lambda x: (x, x**2))

In [3]:
print("Pair RDD from map(): {0}".format(pair_rdd.collect()))

Pair RDD from map(): [(1, 1), (4, 16), (2, 4), (4, 16), (1, 1), (3, 9), (3, 9)]


## Transformations on one Pair RDDs

`reduceByKey()`, `mapValues()`

In [4]:
sum_values = pair_rdd.reduceByKey(lambda x, y: x+y)
print("Sum values using reduceByKey(): {0}".format(sum_values.collect()))

Sum values using reduceByKey(): [(4, 32), (1, 2), (2, 4), (3, 18)]


In [5]:
avg_red_by_key = pair_rdd.map(lambda x: (x[0], (x[1], 1)))\
.reduceByKey(lambda x, y: (x[0] + y[0], x[1] + y[1])).mapValues(lambda x: x[0]/x[1]).collect()

print("Average by key using reduceByKey(): {0}".format(avg_red_by_key))

Average by key using reduceByKey(): [(4, 16.0), (1, 1.0), (2, 4.0), (3, 9.0)]


In [6]:
lines = sc.textFile("../data/README.md")
words = lines.flatMap(lambda x: x.split(" ")).map(lambda x: (x, 1))
words_count = words.reduceByKey(lambda x, y: x + y)
print("Word count using reduceByKey(): {0}".format(words_count.take(10)))

Word count using reduceByKey(): [('#', 1), ('Apache', 1), ('Spark', 16), ('', 71), ('is', 6), ('It', 2), ('provides', 1), ('high-level', 1), ('APIs', 1), ('in', 6)]


`groupByKey()`

In [7]:
grouped_values = pair_rdd.groupByKey()
print("Grouped RDD using groupByKey(): {0}".format(grouped_values.groupByKey().collect()))

Grouped RDD using groupByKey(): [(4, <pyspark.resultiterable.ResultIterable object at 0x7f47c710b518>), (1, <pyspark.resultiterable.ResultIterable object at 0x7f47c710b630>), (2, <pyspark.resultiterable.ResultIterable object at 0x7f47c710bcc0>), (3, <pyspark.resultiterable.ResultIterable object at 0x7f47c710b400>)]


`combineByKey()`

In [8]:
sum_key_values = pair_rdd.combineByKey(lambda value: (value, 1), 
                                      (lambda acc, value: (acc[0] + value, acc[1] + 1)), 
                                      (lambda acc1, acc2: (acc1[0] + acc2[0], acc1[1] + acc2[1])))

avg_com_by_key = sum_key_values.map(lambda x: (x[0], x[1][0]/x[1][1])).collect()

print("Average by key using combineByKey(): {0}".format(avg_com_by_key))

Average by key using combineByKey(): [(4, 16.0), (1, 1.0), (2, 4.0), (3, 9.0)]


`flatMapValues()`

In [9]:
print("RDD using flatMapValues(): {0}".format(pair_rdd.flatMapValues(lambda x: (list(range(int(x/2))))).take(10)))

RDD using flatMapValues(): [(4, 0), (4, 1), (4, 2), (4, 3), (4, 4), (4, 5), (4, 6), (4, 7), (2, 0), (2, 1)]


`keys()`

In [10]:
print("Get keys from key/pair RDD using keys(): {0}".format(pair_rdd.keys().collect()))

Get keys from key/pair RDD using keys(): [1, 4, 2, 4, 1, 3, 3]


`values()`

In [11]:
print("Get values from key/pair RDD using values(): {0}".format(pair_rdd.values().collect()))

Get values from key/pair RDD using values(): [1, 16, 4, 16, 1, 9, 9]


`sortByKey()`

In [12]:
rdd_sort = sc.parallelize([(4, (8, 2)), (1, (3, 1, 9))])

In [13]:
print("Get RDD sorted by keys using sortByKey(): {0}".format(rdd_sort.sortByKey().collect()))

Get RDD sorted by keys using sortByKey(): [(1, (3, 1, 9)), (4, (8, 2))]


## Transformations on two Pair RDDs

In [14]:
pair_rdd_1 = sc.parallelize([(3, 'A'), (2, 'J'), (5, 'K')]) 
pair_rdd_2 = sc.parallelize([(5, 'Z'), (3, 'W'), (7, 'B')]) 

`subtractByKey()`

In [15]:
subtract_rdd = pair_rdd_1.subtractByKey(pair_rdd_2)
print("RDD from subtractByKey(): {0}".format(subtract_rdd.collect()))

RDD from subtractByKey(): [(2, 'J')]


`.join()`

In [16]:
inner_join_rdd = pair_rdd_1.join(pair_rdd_2)
print("Inner join: {0}".format(inner_join_rdd.collect()))

Inner join: [(3, ('A', 'W')), (5, ('K', 'Z'))]


`.leftOuterJoin()`

In [17]:
left_outer_join_rdd = pair_rdd_1.leftOuterJoin(pair_rdd_2)
print("Left outer join: {0}".format(left_outer_join_rdd.collect()))

Left outer join: [(2, ('J', None)), (3, ('A', 'W')), (5, ('K', 'Z'))]


`.rightOuterJoin()`

In [18]:
right_outer_join_rdd = pair_rdd_1.rightOuterJoin(pair_rdd_2)
print("Right outer join: {0}".format(right_outer_join_rdd.collect()))

Right outer join: [(3, ('A', 'W')), (5, ('K', 'Z')), (7, (None, 'B'))]


`.cogroup()`

In [19]:
cogroup_rdd = pair_rdd_1.cogroup(pair_rdd_2)
print("Cogrouped RDD: {0}".format(cogroup_rdd.collect()))

Cogrouped RDD: [(2, (<pyspark.resultiterable.ResultIterable object at 0x7f47c6f99c18>, <pyspark.resultiterable.ResultIterable object at 0x7f47c6f99c88>)), (3, (<pyspark.resultiterable.ResultIterable object at 0x7f47c6f99eb8>, <pyspark.resultiterable.ResultIterable object at 0x7f47c70fefd0>)), (5, (<pyspark.resultiterable.ResultIterable object at 0x7f47c6f99e10>, <pyspark.resultiterable.ResultIterable object at 0x7f47c70fe550>)), (7, (<pyspark.resultiterable.ResultIterable object at 0x7f47c6f99da0>, <pyspark.resultiterable.ResultIterable object at 0x7f47c70fe128>))]


## Actions Available on Pair RDDs

`countByKey()`

In [20]:
print("countByKey(): {0}".format(pair_rdd.countByKey()))

countByKey(): defaultdict(<class 'int'>, {1: 2, 4: 2, 2: 1, 3: 2})


`collectAsMap()`

In [21]:
print("collectAsMap(): {0}".format(pair_rdd.collectAsMap()))

collectAsMap(): {1: 1, 4: 16, 2: 4, 3: 9}


`lookup()`

In [22]:
print("lookup(4): {0}".format(pair_rdd.lookup(4)))

lookup(4): [16, 16]


## Partitions

`repartition()`

In [23]:
print("Repartition of an RDD: {0}".format(pair_rdd.repartition(2).glom().collect()))

Repartition of an RDD: [[(1, 1), (4, 16), (1, 1), (3, 9), (3, 9)], [(4, 16), (2, 4)]]


`partitionBy()`

In [24]:
print("Custom partition using partitionBy(): {0}".format(pair_rdd.partitionBy(2, lambda x: int(x>2)).glom().collect()))

Custom partition using partitionBy(): [[(1, 1), (2, 4), (1, 1)], [(4, 16), (4, 16), (3, 9), (3, 9)]]
