## RDD Basics

In [1]:
// in Scala: converts a Dataset[Long] to RDD[Long]
spark.range(500).rdd

Intitializing Scala interpreter ...

Spark Web UI available at http://bae5ef2081fd:4042
SparkContext available as 'sc' (version = 3.0.0-preview2, master = local[*], app id = local-1620438466172)
SparkSession available as 'spark'


res0: org.apache.spark.rdd.RDD[Long] = MapPartitionsRDD[5] at rdd at <console>:27


In [2]:
// in Scala: go through a DataFrame to get an RDD of Row objects, then unwrap
// the first (and only) column of every Row as a Long
spark.range(10).toDF().rdd.map(_.getLong(0))


res1: org.apache.spark.rdd.RDD[Long] = MapPartitionsRDD[12] at map at <console>:27


In [3]:
// in Scala: converts the RDD[Long] back into a DataFrame with a single `value` column
spark.range(10).rdd.toDF()


res2: org.apache.spark.sql.DataFrame = [value: bigint]


In [4]:
// in Scala: split a sentence on single spaces and distribute the resulting
// words as an RDD spread over two partitions
val myCollection = "Spark: Big Data Processing Made Simple".split(" ")
val words = spark.sparkContext.parallelize(myCollection, numSlices = 2)


myCollection: Array[String] = Array(Spark:, Big, Data, Processing, Made, Simple)
words: org.apache.spark.rdd.RDD[String] = ParallelCollectionRDD[19] at parallelize at <console>:27


In [5]:
// in Scala: setName hands back the RDD it was called on, so the freshly
// assigned name can be read in the same chain
words.setName("myWords").name // myWords

res3: String = myWords


In [6]:
//spark.sparkContext.textFile("/some/path/withTextFiles")

In [7]:
//spark.sparkContext.wholeTextFiles("/some/path/withTextFiles")

In [8]:
words.distinct().count()

res6: Long = 6


In [9]:
// in Scala
/** Returns true when `individual` begins with an uppercase "S" (case-sensitive). */
def startsWithS(individual: String) = individual.startsWith("S")

startsWithS: (individual: String)Boolean


In [10]:
// in Scala: keep only the words accepted by startsWithS and bring them to the driver
words.filter(startsWithS).collect()

res7: Array[String] = Array(Spark:, Simple)


In [11]:
// in Scala: project each word to (word, first character, starts-with-"S" flag)
val words2 = words.map { word =>
  (word, word.charAt(0), word.startsWith("S"))
}

words2: org.apache.spark.rdd.RDD[(String, Char, Boolean)] = MapPartitionsRDD[24] at map at <console>:27


In [12]:
// in Scala: keep the records whose third field (the Boolean flag) is true
words2.filter { case (_, _, beginsWithS) => beginsWithS }.take(5)

res8: Array[(String, Char, Boolean)] = Array((Spark:,S,true), (Simple,S,true))


In [13]:
// in Scala: explode every word into its characters and flatten them into one RDD[Char]
words.flatMap(_.toSeq).take(5)

res9: Array[Char] = Array(S, p, a, r, k)


In [14]:
// in Scala: sort on the negated length so the longest words come first
words.sortBy(word => -word.length).take(2)

res10: Array[String] = Array(Processing, Spark:)


In [15]:
// in Scala: randomly split the words into two RDDs using equal weights
val fiftyFiftySplit = words.randomSplit(Array(0.5, 0.5))

fiftyFiftySplit: Array[org.apache.spark.rdd.RDD[String]] = Array(MapPartitionsRDD[32] at randomSplit at <console>:27, MapPartitionsRDD[33] at randomSplit at <console>:27)


In [16]:
// in Scala: fold the numbers 1..20 into their sum
spark.sparkContext.parallelize(1 to 20).reduce((left, right) => left + right) // 210

res11: Int = 210


In [17]:
// in Scala
/**
 * Picks the longer of two words.
 *
 * When both words have the same length the right-hand word wins, so the
 * function is not commutative for ties: which of several equally long words
 * survives a `reduce` depends on the order in which they are combined.
 *
 * @param leftWord  first candidate
 * @param rightWord second candidate
 * @return `leftWord` if it is strictly longer than `rightWord`, otherwise `rightWord`
 */
def wordLengthReducer(leftWord:String, rightWord:String): String =
  // `if`/`else` is an expression in Scala, so no explicit `return` is needed.
  if (leftWord.length > rightWord.length) leftWord else rightWord

words.reduce(wordLengthReducer)

wordLengthReducer: (leftWord: String, rightWord: String)String
res12: String = Processing


In [18]:
words.count()


res13: Long = 6


In [19]:
// Approximate count, called with an explicit timeout and confidence value.
val confidence = 0.95
val timeoutMilliseconds = 400
words.countApprox(timeout = timeoutMilliseconds, confidence = confidence)


confidence: Double = 0.95
timeoutMilliseconds: Int = 400
res14: org.apache.spark.partial.PartialResult[org.apache.spark.partial.BoundedDouble] = (final: [6.000, 6.000])


In [20]:
words.countApproxDistinct(relativeSD = 0.05)


res15: Long = 6


In [21]:
words.countApproxDistinct(p = 4, sp = 10)


res16: Long = 6


In [22]:
words.countByValue()


res17: scala.collection.Map[String,Long] = Map(Simple -> 1, Processing -> 1, Spark: -> 1, Made -> 1, Big -> 1, Data -> 1)


In [23]:
words.countByValueApprox(timeout = 1000, confidence = 0.95)


res18: org.apache.spark.partial.PartialResult[scala.collection.Map[String,org.apache.spark.partial.BoundedDouble]] = (final: Map(Simple -> [1.000, 1.000], Processing -> [1.000, 1.000], Spark: -> [1.000, 1.000], Made -> [1.000, 1.000], Big -> [1.000, 1.000], Data -> [1.000, 1.000]))


In [24]:
words.first()


res19: String = Spark:


In [25]:
// Only the value of the last expression in a cell is echoed, so the max()
// result is computed but not displayed; the output below comes from min().
spark.sparkContext.parallelize(1 to 20).max()
spark.sparkContext.parallelize(1 to 20).min()


res20: Int = 1


In [26]:
// The results of take, takeOrdered and top are discarded here: only the
// final takeSample call is echoed as the cell's result.
words.take(5)
words.takeOrdered(5)
words.top(5)
// Named settings for the sample: draw 6 elements, with replacement, using a fixed seed.
val withReplacement = true
val numberToTake = 6
val randomSeed = 100L
words.takeSample(withReplacement, numberToTake, randomSeed)


withReplacement: Boolean = true
numberToTake: Int = 6
randomSeed: Long = 100
res21: Array[String] = Array(Big, Simple, Made, Big, Made, Big)


In [73]:
words.saveAsTextFile("file:/tmp/SparkmyFile")

In [74]:
// in Scala
import org.apache.hadoop.io.compress.BZip2Codec
words.saveAsTextFile("file:/tmp/SparkCompressed", classOf[BZip2Codec])

import org.apache.hadoop.io.compress.BZip2Codec


In [76]:
words.saveAsObjectFile("/tmp/sequenceFilePath")

In [77]:
words.cache()

res67: words.type = myWords ParallelCollectionRDD[77] at parallelize at <console>:30


In [78]:
// in Scala
words.getStorageLevel

res68: org.apache.spark.storage.StorageLevel = StorageLevel(memory, deserialized, 1 replicas)


In [79]:
spark.sparkContext.setCheckpointDir("/tmp/checkpointing")
words.checkpoint()

In [81]:
words.pipe("wc -l").collect()

res71: Array[String] = Array(3, 3)


In [82]:
// in Scala: emit a single 1 per partition, so the sum equals the number of partitions
words.mapPartitions(_ => Iterator(1)).sum() // 2

res72: Double = 2.0


In [83]:
// in Scala
/**
 * Labels every element of a partition with the index of that partition.
 *
 * The input iterator is transformed lazily, so the partition is never
 * materialised into an intermediate collection.
 *
 * @param partitionIndex     index of the partition being processed
 * @param withinPartIterator the elements of that partition
 * @return an iterator of strings of the form `Partition: <index> => <value>`,
 *         in the same order as the input
 */
def indexedFunc(partitionIndex:Int, withinPartIterator: Iterator[String]): Iterator[String] =
  withinPartIterator.map(value => s"Partition: $partitionIndex => $value")
words.mapPartitionsWithIndex(indexedFunc).collect()

indexedFunc: (partitionIndex: Int, withinPartIterator: Iterator[String])Iterator[String]
res73: Array[String] = Array(Partition: 0 => Spark:, Partition: 0 => Big, Partition: 0 => Data, Partition: 1 => Processing, Partition: 1 => Made, Partition: 1 => Simple)


In [84]:
// Side-effecting per-partition action: every partition writes its words to
// its own text file under /tmp.
words.foreachPartition { iter =>
  import java.io._
  import scala.util.Random
  // A random number in the file name makes it unlikely that two partitions pick the same file.
  val randomFileName = new Random().nextInt()
  val pw = new PrintWriter(new File(s"/tmp/random-file-${randomFileName}.txt"))
  // Close in `finally` so the file handle is released even if iterating or writing throws.
  try {
    // Words are written back-to-back; no separator is added between them.
    iter.foreach(word => pw.write(word))
  } finally {
    pw.close()
  }
}


In [85]:
// in Scala
spark.sparkContext.parallelize(Seq("Hello", "World"), 2).glom().collect()
// Array(Array(Hello), Array(World))

res75: Array[Array[String]] = Array(Array(Hello), Array(World))
