## Spark RDD Shuffle

In [8]:
import org.apache.spark.sql.SparkSession
import org.apache.spark.rdd.RDD

// Build (or reuse) a SparkSession running locally with master "local[5]".
val spark: SparkSession =
  SparkSession
    .builder()
    .master("local[5]")
    .appName("SparkByExamples.com")
    .getOrCreate()

val sc = spark.sparkContext

// Load the text file as an RDD of lines and report its partition count.
val rdd: RDD[String] = sc.textFile("../resources/test.txt")
println(s"RDD Parition Count :${rdd.getNumPartitions}")

// Split every line on single spaces and pair each token with a count of 1.
val rdd2 = rdd
  .flatMap(line => line.split(" "))
  .map(word => (word, 1))

// Sum the counts per token; reduceByKey is the transformation that shuffles.
val rdd5 = rdd2.reduceByKey((left, right) => left + right)
println(s"RDD Parition Count :${rdd5.getNumPartitions}")

RDD Parition Count :2
RDD Parition Count :2


import org.apache.spark.sql.SparkSession
import org.apache.spark.rdd.RDD
spark: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@319faa77
sc: org.apache.spark.SparkContext = org.apache.spark.SparkContext@4e0fcd9
rdd: org.apache.spark.rdd.RDD[String] = ../resources/test.txt MapPartitionsRDD[13] at textFile at <console>:45
rdd2: org.apache.spark.rdd.RDD[(String, Int)] = MapPartitionsRDD[15] at map at <console>:49
rdd5: org.apache.spark.rdd.RDD[(String, Int)] = ShuffledRDD[16] at reduceByKey at <console>:52


## Spark SQL DataFrame Shuffle

In [9]:
import spark.implicits._

// Sample rows: (employee_name, department, state, salary, age, bonus).
val simpleData = Seq(
  ("James",   "Sales",     "NY", 90000, 34, 10000),
  ("Michael", "Sales",     "NY", 86000, 56, 20000),
  ("Robert",  "Sales",     "CA", 81000, 30, 23000),
  ("Maria",   "Finance",   "CA", 90000, 24, 23000),
  ("Raman",   "Finance",   "CA", 99000, 40, 24000),
  ("Scott",   "Finance",   "NY", 83000, 36, 19000),
  ("Jen",     "Finance",   "NY", 79000, 53, 15000),
  ("Jeff",    "Marketing", "CA", 80000, 25, 18000),
  ("Kumar",   "Marketing", "NY", 91000, 50, 21000)
)

// Turn the tuples into a DataFrame with named columns.
val df = simpleData.toDF(
  "employee_name",
  "department",
  "state",
  "salary",
  "age",
  "bonus"
)

// Count rows per state; the grouped aggregation is the shuffle being demonstrated.
val df2 = df.groupBy("state").count()

// Print how many partitions the aggregated result ends up with.
println(df2.rdd.partitions.length)

200


import spark.implicits._
simpleData: Seq[(String, String, String, Int, Int, Int)] = List((James,Sales,NY,90000,34,10000), (Michael,Sales,NY,86000,56,20000), (Robert,Sales,CA,81000,30,23000), (Maria,Finance,CA,90000,24,23000), (Raman,Finance,CA,99000,40,24000), (Scott,Finance,NY,83000,36,19000), (Jen,Finance,NY,79000,53,15000), (Jeff,Marketing,CA,80000,25,18000), (Kumar,Marketing,NY,91000,50,21000))
df: org.apache.spark.sql.DataFrame = [employee_name: string, department: string ... 4 more fields]
df2: org.apache.spark.sql.DataFrame = [state: string, count: bigint]


## Changing the default shuffle partition count with `spark.conf.set`

In [13]:
// Override the number of partitions Spark SQL uses when shuffling.
spark.conf.set("spark.sql.shuffle.partitions", 100)

// Re-run the per-state count and print the partition count of its result.
println(
  df.groupBy("state")
    .count()
    .rdd
    .getNumPartitions
)

100
