## Spark RDD Broadcast variable example

In [3]:
import org.apache.spark.sql.SparkSession

// Build (or reuse) a SparkSession running with a local master for this example.
val spark = SparkSession.builder.appName("SparkByExamples.com").master("local").getOrCreate()

// Small lookup tables: code -> full name.
val states = Map(("NY","New York"),("CA","California"),("FL","Florida"))
val countries = Map(("USA","United States of America"),("IN","India"))

// Wrap the lookup tables in broadcast variables; the map function below reads
// them through `.value` instead of capturing the plain Maps in its closure.
val broadcastStates = spark.sparkContext.broadcast(states)
val broadcastCountries = spark.sparkContext.broadcast(countries)

// Sample records: (firstname, lastname, country code, state code).
val data = Seq(("James","Smith","USA","CA"),
    ("Michael","Rose","USA","NY"),
    ("Robert","Williams","USA","CA"),
    ("Maria","Jones","USA","FL")
  )

val rdd = spark.sparkContext.parallelize(data)

// Replace the country/state codes with their full names.
// `getOrElse` keeps the original code when it is missing from the lookup table,
// rather than calling `.get` on an empty Option, which would throw
// NoSuchElementException and fail the whole job on the first unknown code.
val rdd2 = rdd.map { case (firstName, lastName, countryCode, stateCode) =>
    val fullCountry = broadcastCountries.value.getOrElse(countryCode, countryCode)
    val fullState = broadcastStates.value.getOrElse(stateCode, stateCode)
    (firstName, lastName, fullCountry, fullState)
}

// Bring the results back to the driver and print one record per line.
println(rdd2.collect().mkString("\n"))

(James,Smith,United States of America,California)
(Michael,Rose,United States of America,New York)
(Robert,Williams,United States of America,California)
(Maria,Jones,United States of America,Florida)


import org.apache.spark.sql.SparkSession
spark: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@38132196
states: scala.collection.immutable.Map[String,String] = Map(NY -> New York, CA -> California, FL -> Florida)
countries: scala.collection.immutable.Map[String,String] = Map(USA -> United States of America, IN -> India)
broadcastStates: org.apache.spark.broadcast.Broadcast[scala.collection.immutable.Map[String,String]] = Broadcast(0)
broadcastCountries: org.apache.spark.broadcast.Broadcast[scala.collection.immutable.Map[String,String]] = Broadcast(1)
data: Seq[(String, String, String, String)] = List((James,Smith,USA,CA), (Michael,Rose,USA,NY), (Robert,Williams,USA,CA), (Maria,Jones,USA,FL))
rdd: org.apache.spark.rdd.RDD[(String, String, String, String)] = Paralle...


## Spark DataFrame Broadcast variable example

In [17]:
// Build (or reuse) a SparkSession running with a local master for this example.
val spark = SparkSession.builder()
    .appName("SparkByExamples.com")
    .master("local")
    .getOrCreate()

// Needed for `Seq.toDF` and for the tuple Encoder used by `df.map` below.
// Imported here so the cell does not depend on the kernel pre-importing it.
import spark.implicits._

// Small lookup tables: code -> full name.
val states = Map(("NY","New York"),("CA","California"),("FL","Florida"))
val countries = Map(("USA","United States of America"),("IN","India"))

// Wrap the lookup tables in broadcast variables; the map function below reads
// them through `.value` instead of capturing the plain Maps in its closure.
val broadcastStates = spark.sparkContext.broadcast(states)
val broadcastCountries = spark.sparkContext.broadcast(countries)

// Sample records: (firstname, lastname, country code, state code).
val data = Seq(("James","Smith","USA","CA"),
    ("Michael","Rose","USA","NY"),
    ("Robert","Williams","USA","CA"),
    ("Maria","Jones","USA","FL")
  )

val columns = Seq("firstname","lastname","country","state")

val df = data.toDF(columns:_*)

// Replace the country/state codes (columns 2 and 3) with their full names.
// `getOrElse` keeps the original code when it is missing from the lookup table,
// rather than calling `.get` on an empty Option, which would throw
// NoSuchElementException and fail the whole job on the first unknown code.
val df2 = df.map(row=>{
    val countryCode = row.getString(2)
    val stateCode = row.getString(3)

    val fullCountry = broadcastCountries.value.getOrElse(countryCode, countryCode)
    val fullState = broadcastStates.value.getOrElse(stateCode, stateCode)
    (row.getString(0),row.getString(1),fullCountry,fullState)
  }).toDF(columns:_*)

// Print the result without truncating long column values.
df2.show(false)

+---------+--------+------------------------+----------+
|firstname|lastname|country                 |state     |
+---------+--------+------------------------+----------+
|James    |Smith   |United States of America|California|
|Michael  |Rose    |United States of America|New York  |
|Robert   |Williams|United States of America|California|
|Maria    |Jones   |United States of America|Florida   |
+---------+--------+------------------------+----------+



spark: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@38132196
states: scala.collection.immutable.Map[String,String] = Map(NY -> New York, CA -> California, FL -> Florida)
countries: scala.collection.immutable.Map[String,String] = Map(USA -> United States of America, IN -> India)
broadcastStates: org.apache.spark.broadcast.Broadcast[scala.collection.immutable.Map[String,String]] = Broadcast(23)
broadcastCountries: org.apache.spark.broadcast.Broadcast[scala.collection.immutable.Map[String,String]] = Broadcast(24)
data: Seq[(String, String, String, String)] = List((James,Smith,USA,CA), (Michael,Rose,USA,NY), (Robert,Williams,USA,CA), (Maria,Jones,USA,FL))
columns: Seq[String] = List(firstname, lastname, country, state)
df: org.apache.spark.sql.DataFrame = [firstname...
