# Chapter 4: Joins (SQL and Core)

In this chapter, we will study joins in Spark, both in the Core and SQL API.

## Data

First, we create some RDDs, DataFrames and Datasets that will be used in the rest of the sections.

In [1]:
import org.apache.spark.sql.types.{StructField, StructType, 
                                   StringType, IntegerType, DoubleType}

import org.apache.spark.sql.Row

In [2]:
/** Typed record for one row of the people/accounts data: person id, name and account id. */
case class PersonAccount(
  id: Int,
  name: String,
  account_id: Int
)

defined class PersonAccount


In [3]:
// Key/value RDD: person id -> (name, account id).
val peopleAccountsRdd = sc.parallelize(Seq(
  (1, ("John", 11)),
  (2, ("Isabelle", 22)),
  (3, ("Maria", 33)),
  (4, ("Peter", 44)),
  (5, ("Connor", 55)),
  (6, ("Max", 66))))

// Explicit, non-nullable schema used for the DataFrame version of the data.
val peopleAccountsSchema = StructType(Seq(
  StructField("id", IntegerType, nullable = false),
  StructField("Name", StringType, nullable = false),
  StructField("account_id", IntegerType, nullable = false)))

// Untyped view: flatten every pair into a Row that matches the schema above.
val peopleAccountsDf = spark.createDataFrame(
  peopleAccountsRdd.map { case (id, (name, accountId)) => Row(id, name, accountId) },
  peopleAccountsSchema)

// Typed view: flatten every pair into a PersonAccount instance.
val peopleAccountsDs = spark.createDataset(
  peopleAccountsRdd.map { case (id, (name, accountId)) => PersonAccount(id, name, accountId) })

peopleAccountsRdd = ParallelCollectionRDD[0] at parallelize at <console>:33
peopleAccountsSchema = StructType(StructField(id,IntegerType,false), StructField(Name,StringType,false), StructField(account_id,IntegerType,false))
peopleAccountsDf = [id: int, Name: string ... 1 more field]
peopleAccountsDs = [id: int, name: string ... 1 more field]


[id: int, name: string ... 1 more field]

In [4]:
// Peek at the first two (id, (name, account_id)) pairs of the RDD.
peopleAccountsRdd.take(2)

[(1,(John,11)), (2,(Isabelle,22))]

In [5]:
// Print the people/accounts DataFrame as a text table.
peopleAccountsDf.show()

+---+--------+----------+
| id|    Name|account_id|
+---+--------+----------+
|  1|    John|        11|
|  2|Isabelle|        22|
|  3|   Maria|        33|
|  4|   Peter|        44|
|  5|  Connor|        55|
|  6|     Max|        66|
+---+--------+----------+



In [6]:
/** Typed record for one row of the people/emails data: person id and email address. */
case class PersonEmail(
  id: Int,
  email: String
)

defined class PersonEmail


In [7]:
// Key/value RDD: person id -> email. Ids 2 and 6 are intentionally absent,
// so the join examples have unmatched rows to show.
val peopleEmailsRdd = sc.parallelize(Array((1, "john@gmail.com"), (3, "maria@gmail.com"),
                                           (4, "peter@gmail.com"), (5, "connor@gmail.com")))

// Explicit, non-nullable schema used for the DataFrame version of the data.
val peopleEmailsSchema = new StructType(Array(StructField("id", IntegerType, false),
                                              StructField("Email", StringType, false)))

// Untyped view: one Row per (id, email) pair.
val peopleEmailsDf = spark.createDataFrame(peopleEmailsRdd.map(x=> Row(x._1, x._2)),
                                           peopleEmailsSchema)

// Typed view: use createDataset (as for peopleAccountsDs) so this really is a
// Dataset[PersonEmail] and not an untyped DataFrame.
val peopleEmailsDs = spark.createDataset(peopleEmailsRdd.map(x=> PersonEmail(x._1, x._2)))

peopleEmailsRdd = ParallelCollectionRDD[8] at parallelize at <console>:33
peopleEmailsSchema = StructType(StructField(id,IntegerType,false), StructField(Email,StringType,false))
peopleEmailsDf = [id: int, Email: string]
peopleEmailsDs = [id: int, email: string]


[id: int, email: string]

In [8]:
// Peek at the first two (id, email) pairs of the RDD.
peopleEmailsRdd.take(2)

[(1,john@gmail.com), (3,maria@gmail.com)]

In [9]:
// Print the people/emails DataFrame as a text table.
peopleEmailsDf.show()

+---+----------------+
| id|           Email|
+---+----------------+
|  1|  john@gmail.com|
|  3| maria@gmail.com|
|  4| peter@gmail.com|
|  5|connor@gmail.com|
+---+----------------+



In [10]:
// Key/value RDD: account id -> (balance, account type id).
val accountsBalanceTypeRdd = sc.parallelize(Seq(
  (11, (152.0, 1)),
  (22, (3545.3, 2)),
  (33, (12.5, 1)),
  (44, (75.0, 1)),
  (55, (4853.12, 2)),
  (66, (47.0, 1))))

// Explicit, non-nullable schema used for the DataFrame version of the data.
val accountsBalanceTypeSchema = StructType(Seq(
  StructField("account_id", IntegerType, nullable = false),
  StructField("balance", DoubleType, nullable = false),
  StructField("account_type_id", IntegerType, nullable = false)))

// Untyped view: flatten every pair into a Row that matches the schema above.
val accountsBalanceTypeDf = spark.createDataFrame(
  accountsBalanceTypeRdd.map { case (accountId, (balance, typeId)) => Row(accountId, balance, typeId) },
  accountsBalanceTypeSchema)

accountsBalanceTypeRdd = ParallelCollectionRDD[16] at parallelize at <console>:31
accountsBalanceTypeSchema = StructType(StructField(account_id,IntegerType,false), StructField(balance,DoubleType,false), StructField(account_type_id,IntegerType,false))
accountsBalanceTypeDf = [account_id: int, balance: double ... 1 more field]


[account_id: int, balance: double ... 1 more field]

In [11]:
// Peek at the first two (account_id, (balance, account_type_id)) pairs of the RDD.
accountsBalanceTypeRdd.take(2)

[(11,(152.0,1)), (22,(3545.3,2))]

In [12]:
// Print the accounts/balances DataFrame as a text table.
accountsBalanceTypeDf.show()

+----------+-------+---------------+
|account_id|balance|account_type_id|
+----------+-------+---------------+
|        11|  152.0|              1|
|        22| 3545.3|              2|
|        33|   12.5|              1|
|        44|   75.0|              1|
|        55|4853.12|              2|
|        66|   47.0|              1|
+----------+-------+---------------+



In [13]:
// Key/value RDD: account type id -> human-readable description.
val accountsTypeDescriptionRdd = sc.parallelize(Seq(
  (1, "Basic Account"),
  (2, "Premium Account")))

// Explicit, non-nullable schema used for the DataFrame version of the data.
val accountsTypeDescriptionSchema = StructType(Seq(
  StructField("account_type_id", IntegerType, nullable = false),
  StructField("account_description", StringType, nullable = false)))

// Untyped view: one Row per (type id, description) pair.
val accountsTypeDescriptionDf = spark.createDataFrame(
  accountsTypeDescriptionRdd.map { case (typeId, description) => Row(typeId, description) },
  accountsTypeDescriptionSchema)

accountsTypeDescriptionRdd = ParallelCollectionRDD[23] at parallelize at <console>:31
accountsTypeDescriptionSchema = StructType(StructField(account_type_id,IntegerType,false), StructField(account_description,StringType,false))
accountsTypeDescriptionDf = [account_type_id: int, account_description: string]


[account_type_id: int, account_description: string]

In [14]:
// Peek at the first two (account_type_id, description) pairs of the RDD.
accountsTypeDescriptionRdd.take(2)

[(1,Basic Account), (2,Premium Account)]

In [15]:
// Print the account-type descriptions DataFrame as a text table.
accountsTypeDescriptionDf.show()

+---------------+-------------------+
|account_type_id|account_description|
+---------------+-------------------+
|              1|      Basic Account|
|              2|    Premium Account|
+---------------+-------------------+



## Core Spark Joins

We will start with joins of Key / Value RDDs. We can distinguish `join`, `leftOuterJoin` and `rightOuterJoin` joins.

`join()`

In [16]:
// Inner join on the key (person id): only ids present in both RDDs are returned,
// each as (id, ((name, account_id), email)).
peopleAccountsRdd.join(peopleEmailsRdd).collect()

[(1,((John,11),john@gmail.com)), (3,((Maria,33),maria@gmail.com)), (4,((Peter,44),peter@gmail.com)), (5,((Connor,55),connor@gmail.com))]

In [17]:
// Inner join on the person id, then flatten every match to (id, name, email).
peopleAccountsRdd.join(peopleEmailsRdd).map {
  case (id, ((name, _), email)) => (id, name, email)
}.collect()

[(1,John,john@gmail.com), (3,Maria,maria@gmail.com), (4,Peter,peter@gmail.com), (5,Connor,connor@gmail.com)]

`leftOuterJoin()`

In [18]:
// Left outer join: every person of the left RDD is kept. The right-hand value is an
// Option, so people without an email come back with None.
// Note: the order of the collected elements depends on how the keys are partitioned.
peopleAccountsRdd.leftOuterJoin(peopleEmailsRdd).map {
  case (id, ((name, _), email)) => (id, name, email)
}.collect()

[(1,John,Some(john@gmail.com)), (2,Isabelle,None), (3,Maria,Some(maria@gmail.com)), (4,Peter,Some(peter@gmail.com)), (5,Connor,Some(connor@gmail.com)), (6,Max,None)]

In order to speed up joins, especially when the same RDD is going to be joined several times, it is useful to pre-partition the RDDs.

Let's check if the `peopleAccountsRdd` has any partitioner.

In [19]:
// `partitioner` is an Option: None means no partitioner has been set on this RDD.
println(peopleAccountsRdd.partitioner)

None


We can instantiate a partitioner and use it to partition two RDDs that will be joined several times afterwards.

In [20]:
import org.apache.spark.HashPartitioner

In [21]:
// Hash partitioner that spreads the keys over 2 partitions; it is shared by both RDDs below.
val partitioner = new HashPartitioner(2)

partitioner = org.apache.spark.HashPartitioner@2


org.apache.spark.HashPartitioner@2

In [22]:
// Redistribute the people/accounts RDD by key using the partitioner created above.
val peopleAccountsPar = peopleAccountsRdd.partitionBy(partitioner)

peopleAccountsPar = ShuffledRDD[41] at partitionBy at <console>:34


ShuffledRDD[41] at partitionBy at <console>:34

In [23]:
// After partitionBy, the RDD reports the HashPartitioner it was partitioned with.
println(peopleAccountsPar.partitioner)

Some(org.apache.spark.HashPartitioner@2)


In [24]:
// Partition the emails RDD (not the accounts RDD) with the same partitioner, so that
// both sides of the later join are co-partitioned.
val peopleEmailsPar = peopleEmailsRdd.partitionBy(partitioner)

peopleEmailsPar = ShuffledRDD[42] at partitionBy at <console>:34


ShuffledRDD[42] at partitionBy at <console>:34

We perform 10 different joins without any kind of pre-partition on the data.

In [25]:
// Run the join ten times on the RDDs as originally created (before partitionBy)
// and print the wall-clock time of each run.
(1 to 10).foreach { _ =>
  val startMs = System.currentTimeMillis()
  peopleAccountsRdd.join(peopleEmailsRdd).collect()
  val elapsedMs = System.currentTimeMillis() - startMs
  println(s"Join time (ms): $elapsedMs")
}

Join time (ms): 97
Join time (ms): 79
Join time (ms): 59
Join time (ms): 74
Join time (ms): 72
Join time (ms): 79
Join time (ms): 78
Join time (ms): 97
Join time (ms): 106
Join time (ms): 123


In [26]:
// Run the join ten times on the RDDs produced by partitionBy
// and print the wall-clock time of each run.
(1 to 10).foreach { _ =>
  val startMs = System.currentTimeMillis()
  peopleAccountsPar.join(peopleEmailsPar).collect()
  val elapsedMs = System.currentTimeMillis() - startMs
  println(s"Join time (ms): $elapsedMs")
}

Join time (ms): 90
Join time (ms): 39
Join time (ms): 47
Join time (ms): 59
Join time (ms): 44
Join time (ms): 54
Join time (ms): 38
Join time (ms): 47
Join time (ms): 60
Join time (ms): 44


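As we can see, in the first loop (no pre-partitioning) all the times are more or less the same. In the second loop we repeated the process using the pre-partitioned RDDs: after the first iteration, the joins are noticeably faster.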

Finally, when joining large RDDs with small RDDs, it is quite convenient to "broadcast" the small RDDs to all the executors. Let's see an example.

In [27]:
// Re-key every account by its account type id: (account_type_id, (account_id, balance)).
val bigRdd = accountsBalanceTypeRdd.map {
  case (accountId, (balance, accountTypeId)) => (accountTypeId, (accountId, balance))
}

bigRdd = MapPartitionsRDD[103] at map at <console>:32


MapPartitionsRDD[103] at map at <console>:32

In [28]:
// Bring the re-keyed RDD to the driver to inspect it.
bigRdd.collect()

[(1,(11,152.0)), (2,(22,3545.3)), (1,(33,12.5)), (1,(44,75.0)), (2,(55,4853.12)), (1,(66,47.0))]

In [29]:
// Collect the small RDD to the driver as a local Map: account_type_id -> description.
val smallRddLocal = accountsTypeDescriptionRdd.collectAsMap()

smallRddLocal = Map(2 -> Premium Account, 1 -> Basic Account)


Map(2 -> Premium Account, 1 -> Basic Account)

In [30]:
// Display the local map.
smallRddLocal

Map(2 -> Premium Account, 1 -> Basic Account)

In [31]:
// Wrap the local map in a broadcast variable; its content is read back through `.value`.
val smallRddLocalBcast = sc.broadcast(smallRddLocal)

smallRddLocalBcast = Broadcast(74)


Broadcast(74)

In [32]:
// Map-side join: each record looks its key up in the broadcast map, so the two data
// sets are never joined through `join`. Keys missing from the map keep a null description.
bigRdd.mapPartitions { records =>
  val descriptions = smallRddLocalBcast.value
  records.map { case (typeId, account) =>
    (typeId, (account, descriptions.get(typeId).orNull))
  }
}.collect()

[(1,((11,152.0),Basic Account)), (2,((22,3545.3),Premium Account)), (1,((33,12.5),Basic Account)), (1,((44,75.0),Basic Account)), (2,((55,4853.12),Premium Account)), (1,((66,47.0),Basic Account))]

## Spark SQL 

Joining DataFrames using the SQL API is quite simple and efficient. We can highlight the following join types: `inner`, `left_outer`, `right_outer`, `outer`, `left_semi` and `left_anti`. Let's see some examples of them.

`inner`

In [33]:
// Inner join on "id": only the ids present in both DataFrames are kept.
peopleAccountsDf.join(peopleEmailsDf, Seq("id"), "inner").show()

+---+------+----------+----------------+
| id|  Name|account_id|           Email|
+---+------+----------+----------------+
|  1|  John|        11|  john@gmail.com|
|  3| Maria|        33| maria@gmail.com|
|  5|Connor|        55|connor@gmail.com|
|  4| Peter|        44| peter@gmail.com|
+---+------+----------+----------------+



`left_outer`

In [34]:
// Left outer join on "id": every row of peopleAccountsDf is kept; Email is null when there is no match.
peopleAccountsDf.join(peopleEmailsDf, Seq("id"), "left_outer").show()

+---+--------+----------+----------------+
| id|    Name|account_id|           Email|
+---+--------+----------+----------------+
|  1|    John|        11|  john@gmail.com|
|  6|     Max|        66|            null|
|  3|   Maria|        33| maria@gmail.com|
|  5|  Connor|        55|connor@gmail.com|
|  4|   Peter|        44| peter@gmail.com|
|  2|Isabelle|        22|            null|
+---+--------+----------+----------------+



`right_outer`

In [35]:
// Right outer join on "id": every row of peopleEmailsDf is kept.
peopleAccountsDf.join(peopleEmailsDf, Seq("id"), "right_outer").show()

+---+------+----------+----------------+
| id|  Name|account_id|           Email|
+---+------+----------+----------------+
|  1|  John|        11|  john@gmail.com|
|  3| Maria|        33| maria@gmail.com|
|  5|Connor|        55|connor@gmail.com|
|  4| Peter|        44| peter@gmail.com|
+---+------+----------+----------------+



`full`

In [36]:
// Full outer join on "id": rows of both DataFrames are kept, with nulls where one side has no match.
peopleAccountsDf.join(peopleEmailsDf, Seq("id"), "full").show()

+---+--------+----------+----------------+
| id|    Name|account_id|           Email|
+---+--------+----------+----------------+
|  1|    John|        11|  john@gmail.com|
|  6|     Max|        66|            null|
|  3|   Maria|        33| maria@gmail.com|
|  5|  Connor|        55|connor@gmail.com|
|  4|   Peter|        44| peter@gmail.com|
|  2|Isabelle|        22|            null|
+---+--------+----------+----------------+



`left_semi`

In [37]:
// Left semi join: rows of peopleAccountsDf whose id exists in peopleEmailsDf; only the left columns are returned.
peopleAccountsDf.join(peopleEmailsDf, Seq("id"), "left_semi").show()

+---+------+----------+
| id|  Name|account_id|
+---+------+----------+
|  1|  John|        11|
|  3| Maria|        33|
|  5|Connor|        55|
|  4| Peter|        44|
+---+------+----------+



`left_anti`

In [38]:
// Left anti join: rows of peopleAccountsDf whose id does NOT exist in peopleEmailsDf.
peopleAccountsDf.join(peopleEmailsDf, Seq("id"), "left_anti").show()

+---+--------+----------+
| id|    Name|account_id|
+---+--------+----------+
|  6|     Max|        66|
|  2|Isabelle|        22|
+---+--------+----------+



Finally, we can also make use of broadcast joins.

In [39]:
import org.apache.spark.sql.{functions => F}

import org.apache.spark.sql.{functions=>F}


In [40]:
// Join on account_type_id, marking the small descriptions DataFrame with a broadcast hint.
accountsBalanceTypeDf.join(F.broadcast(accountsTypeDescriptionDf), "account_type_id").show()

+---------------+----------+-------+-------------------+
|account_type_id|account_id|balance|account_description|
+---------------+----------+-------+-------------------+
|              1|        11|  152.0|      Basic Account|
|              2|        22| 3545.3|    Premium Account|
|              1|        33|   12.5|      Basic Account|
|              1|        44|   75.0|      Basic Account|
|              2|        55|4853.12|    Premium Account|
|              1|        66|   47.0|      Basic Account|
+---------------+----------+-------+-------------------+



## Datasets Joins

Lastly, we can also join Datasets using the `joinWith` function.

In [41]:
// joinWith keeps each side of a match as a separate object: the result has two columns, _1 and _2.
peopleAccountsDs.joinWith(peopleEmailsDs, peopleAccountsDs("id") === peopleEmailsDs("id")).show()

+---------------+--------------------+
|             _1|                  _2|
+---------------+--------------------+
|  [1, John, 11]| [1, john@gmail.com]|
| [3, Maria, 33]|[3, maria@gmail.com]|
|[5, Connor, 55]|[5, connor@gmail....|
| [4, Peter, 44]|[4, peter@gmail.com]|
+---------------+--------------------+

