# Chapter 8: Testing and Validation

As in any other programming field, testing and validation are essential in Spark data processing. Here we will see some examples of how to perform them.

## General Spark Unit Testing

### Regular Spark Jobs (testing with RDDs)

In [1]:
// Sample record with four Double attributes; the field names become the DataFrame column names.
case class Panda(Happiness: Double, Niceness: Double, Softness: Double, 
                 Sweetness: Double)

defined class Panda


In [2]:
// Four sample Panda rows used as input for the rank-statistics tests.
val df = spark.createDataFrame(Seq(
    Panda(Happiness = 15.0, Niceness = 0.25, Softness = 2467.0, Sweetness = 0.0),
    Panda(Happiness = 2.0, Niceness = 1000.0, Softness = 35.4, Sweetness = 0.0),
    Panda(Happiness = 10.0, Niceness = 2.0, Softness = 50.0, Sweetness = 0.0),
    Panda(Happiness = 3.0, Niceness = 8.5, Softness = 0.2, Sweetness = 98.0)
))

df = [Happiness: double, Niceness: double ... 2 more fields]


[Happiness: double, Niceness: double ... 2 more fields]

Test on DataFrame approx equality failed
Test on DataFrame approx equality failed


In [3]:
// Expected values keyed by 1-based column position: the 2nd and 4th smallest value of each column.
val expectedResults = Map(
    1 -> Set(3.0, 15.0),
    2 -> Set(2.0, 1000.0),
    3 -> Set(35.4, 2467.0),
    4 -> Set(0.0, 98.0)
)

expectedResults = Map(1 -> Set(3.0, 15.0), 2 -> Set(2.0, 1000.0), 3 -> Set(35.4, 2467.0), 4 -> Set(0.0, 98.0))


Map(1 -> Set(3.0, 15.0), 2 -> Set(2.0, 1000.0), 3 -> Set(35.4, 2467.0), 4 -> Set(0.0, 98.0))

In [4]:
// 1-based rank positions to extract from every sorted column.
val rankIndexs = Array(2, 4)
var resultV0 = Map[Int, Iterable[Double]]()

// Process one column at a time: sort its values, keep those at the requested ranks,
// and store them under the 1-based column position.
Range(0, df.schema.length).foreach { colPos =>

    val rankedColumn = df.rdd.map(_.getDouble(colPos)).sortBy(value => value).zipWithIndex()
    val selected = rankedColumn
        .filter { case (_, pos) => rankIndexs.contains(pos + 1) }
        .map { case (value, _) => value }

    resultV0 += ((colPos + 1) -> selected.collect().toSet)

}

rankIndexs = Array(2, 4)
resultV0 = Map(1 -> Set(3.0, 15.0), 2 -> Set(2.0, 1000.0), 3 -> Set(35.4, 2467.0), 4 -> Set(0.0, 98.0))


Map(1 -> Set(3.0, 15.0), 2 -> Set(2.0, 1000.0), 3 -> Set(35.4, 2467.0), 4 -> Set(0.0, 98.0))

In [5]:
// The per-column sorting approach must reproduce the expected rank values.
assert(expectedResults == resultV0)

In [6]:
val rowLength = df.schema.length

// Flatten every row into (0-based column index, value) pairs.
val pairRDD = df.rdd.flatMap { row =>
    (0 until rowLength).map(colIdx => (colIdx, row.getDouble(colIdx)))
}

// Group the values of each column, sort them locally, keep the requested ranks,
// and re-key the result by 1-based column position.
val resultV1 = pairRDD.groupByKey().map { case (colIdx, values) =>
    val selected = values.toArray.sorted.zipWithIndex
        .filter { case (_, pos) => rankIndexs.contains(pos + 1) }
        .map { case (value, _) => value }
    (colIdx + 1, selected.toSet)
}.collectAsMap()

rowLength = 4
pairRDD = MapPartitionsRDD[40] at flatMap at <console>:34
resultV1 = Map(2 -> Set(2.0, 1000.0), 4 -> Set(0.0, 98.0), 1 -> Set(3.0, 15.0), 3 -> Set(35.4, 2467.0))


Map(2 -> Set(2.0, 1000.0), 4 -> Set(0.0, 98.0), 1 -> Set(3.0, 15.0), 3 -> Set(35.4, 2467.0))

In [7]:
// The groupByKey-based approach must reproduce the expected rank values as well.
assert(expectedResults == resultV1)

In [8]:
val rowLength = df.schema.length

// Flatten every row into (0-based column index, value) pairs.
val pairRDD = df.rdd.flatMap { row =>
    (0 until rowLength).map(colIdx => (colIdx, row.getDouble(colIdx)))
}

// Same computation as the groupByKey approach, but the keys are shifted by 2 instead of 1,
// so the resulting map does not line up with the 1-based column positions.
val badResult = pairRDD.groupByKey().map { case (colIdx, values) =>
    val selected = values.toArray.sorted.zipWithIndex
        .filter { case (_, pos) => rankIndexs.contains(pos + 1) }
        .map { case (value, _) => value }
    (colIdx + 2, selected.toSet)
}.collectAsMap()

rowLength = 4
pairRDD = MapPartitionsRDD[44] at flatMap at <console>:37
badResult = Map(2 -> Set(3.0, 15.0), 5 -> Set(0.0, 98.0), 4 -> Set(35.4, 2467.0), 3 -> Set(2.0, 1000.0))


Map(2 -> Set(3.0, 15.0), 5 -> Set(0.0, 98.0), 4 -> Set(35.4, 2467.0), 3 -> Set(2.0, 1000.0))

In [9]:
// Expected to fail: badResult's keys are offset by 2, so it cannot equal expectedResults.
assert(expectedResults == badResult)

Name: java.lang.AssertionError
Message: assertion failed
StackTrace:   at scala.Predef$.assert(Predef.scala:156)

### Streaming

TODO

## Mocking RDDs

### Testing DataFrames

In [10]:
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.Row

lastException: Throwable = null


In [11]:
// Full record used to build the expected and result DataFrames.
case class Person(id: Int, name:String, age:Int, balance:Double)
// Same fields as Person minus balance; yields a DataFrame with a different schema.
case class PersonSimplifed(id: Int, name:String, age:Int)

defined class Person
defined class PersonSimplifed


In [12]:
// Reference DataFrame that the result DataFrames are compared against.
val expectedDf = spark.createDataFrame(Seq(
    Person(id = 1, name = "John", age = 23, balance = 145.2),
    Person(id = 2, name = "Maria", age = 65, balance = 248.3),
    Person(id = 3, name = "Peter", age = 39, balance = 458.3)
))

expectedDf = [id: int, name: string ... 2 more fields]


[id: int, name: string ... 2 more fields]

In [13]:
// Holds exactly the same rows as expectedDf.
val goodResultDf = spark.createDataFrame(Seq(
    Person(id = 1, name = "John", age = 23, balance = 145.2),
    Person(id = 2, name = "Maria", age = 65, balance = 248.3),
    Person(id = 3, name = "Peter", age = 39, balance = 458.3)
))

goodResultDf = [id: int, name: string ... 2 more fields]


[id: int, name: string ... 2 more fields]

In [14]:
// Differs from expectedDf in a Double cell: Peter's balance is 45.3 instead of 458.3.
val badResultDf1 = spark.createDataFrame(Seq(
    Person(id = 1, name = "John", age = 23, balance = 145.2),
    Person(id = 2, name = "Maria", age = 65, balance = 248.3),
    Person(id = 3, name = "Peter", age = 39, balance = 45.3)
))

badResultDf1 = [id: int, name: string ... 2 more fields]


[id: int, name: string ... 2 more fields]

In [15]:
// Differs from expectedDf in an Int cell: Maria's age is 37 instead of 65.
val badResultDf2 = spark.createDataFrame(Seq(
    Person(id = 1, name = "John", age = 23, balance = 145.2),
    Person(id = 2, name = "Maria", age = 37, balance = 248.3),
    Person(id = 3, name = "Peter", age = 39, balance = 458.3)
))

badResultDf2 = [id: int, name: string ... 2 more fields]


[id: int, name: string ... 2 more fields]

In [16]:
// Differs from expectedDf in schema: the balance column is missing.
val badResultDf3 = spark.createDataFrame(Seq(
    PersonSimplifed(id = 1, name = "John", age = 23),
    PersonSimplifed(id = 2, name = "Maria", age = 65),
    PersonSimplifed(id = 3, name = "Peter", age = 39)
))

badResultDf3 = [id: int, name: string ... 1 more field]


[id: int, name: string ... 1 more field]

In [17]:
// Differs from expectedDf in row count: the third row (Peter) is missing.
val badResultDf4 = spark.createDataFrame(Seq(
    Person(id = 1, name = "John", age = 23, balance = 145.2),
    Person(id = 2, name = "Maria", age = 65, balance = 248.3)
))

badResultDf4 = [id: int, name: string ... 2 more fields]


[id: int, name: string ... 2 more fields]

In [18]:
/**
 * Checks exact equality between two values.
 *
 * When the values differ, a failure message is printed to standard output.
 * The comparison is a plain boolean expression instead of a caught `assert`,
 * so the result stays correct even when assertions are elided at compile time.
 *
 * @param a first item to compare
 * @param b second item to compare
 * @return true if `a == b`, false otherwise
 */
def checkEqualityItems(a: Any, b: Any): Boolean = {
    val areEqual = a == b
    // Same message the original emitted on a failed comparison.
    if (!areEqual) println("Test on DataFrame approx equality failed")
    areEqual
}

/**
 * Checks approximate equality between two Doubles.
 *
 * The values are considered equal when `b` lies in the closed interval
 * `[a - tol, a + tol]`. When it does not, a failure message is printed to
 * standard output. NaN never compares as equal.
 *
 * The check is a plain boolean expression instead of a caught `assert`,
 * so the result stays correct even when assertions are elided at compile time.
 *
 * @param a first Double to compare
 * @param b second Double to compare
 * @param tol tolerance (maximum accepted absolute difference)
 * @return true if `b` is within `tol` of `a`, false otherwise
 */
def checkApproximateEqualityItems(a: Double, b: Double, tol: Double): Boolean = {
    // Kept as two bound checks (not math.abs(a - b)) so infinite values behave as before.
    val withinTolerance = b <= a + tol && b >= a - tol
    if (!withinTolerance) println("Test on DataFrame approx equality failed")
    withinTolerance
}


/**
 * Checks whether two DataFrames are equal, allowing a tolerance on Double cells.
 *
 * The checks run in order and stop at the first failure:
 *  1. both schemas are equal,
 *  2. both DataFrames have the same number of rows,
 *  3. every pair of cells at the same row/column position matches
 *     (Double cells within `tol`, every other cell by exact equality).
 *
 * On failure a message is printed to standard output and `false` is returned.
 *
 * @param expectedDf DataFrame holding the expected content
 * @param resultDf DataFrame to validate
 * @param tol tolerance applied when comparing Double cells
 * @return true if the two DataFrames are (approximately) equal, false otherwise
 */
def assertDataFrameApproximateEquals(expectedDf: DataFrame, resultDf: DataFrame, tol: Double): Boolean = {

    // Number of cell pairs that do not match. Rows are paired through their position
    // (zipWithIndex + join) rather than RDD.zip, because zip throws when the two RDDs
    // do not share the same partitioning even if their row counts are equal.
    def mismatchedCells: Long = {
        val expectedByIndex = expectedDf.rdd.zipWithIndex().map(_.swap)
        val resultByIndex = resultDf.rdd.zipWithIndex().map(_.swap)

        expectedByIndex.join(resultByIndex).values
            .flatMap { case (expectedRow, resultRow) => expectedRow.toSeq.zip(resultRow.toSeq) }
            .map {
                case (a: Double, b: Double) => checkApproximateEqualityItems(a, b, tol)
                // Every other cell type (Int, String, Long, null, ...) is compared exactly,
                // so unexpected types no longer raise a MatchError inside the Spark task.
                case (a, b) => checkEqualityItems(a, b)
            }
            .filter(cellMatches => !cellMatches)
            .count()
    }

    // && short-circuits, so the row content is only inspected when schema and counts agree.
    val isEqual =
        expectedDf.schema == resultDf.schema &&
        expectedDf.rdd.count() == resultDf.rdd.count() &&
        mismatchedCells == 0

    if (!isEqual) println("Test on DataFrame approx equality failed")
    isEqual
}

checkEqualityItems: (a: Any, b: Any)Boolean
checkApproximateEqualityItems: (a: Double, b: Double, tol: Double)Boolean
assertDataFrameApproximateEquals: (expectedDf: org.apache.spark.sql.DataFrame, resultDf: org.apache.spark.sql.DataFrame, tol: Double)Boolean


In [19]:
// Identical content: the comparison should succeed.
assertDataFrameApproximateEquals(expectedDf, goodResultDf, 0.01)

true

In [20]:
// A Double cell differs by far more than the 0.01 tolerance: the comparison should fail.
assertDataFrameApproximateEquals(expectedDf, badResultDf1, 0.01)

Test on DataFrame approx equality failed


false

In [21]:
// An Int cell differs: the comparison should fail.
assertDataFrameApproximateEquals(expectedDf, badResultDf2, 0.01)

Test on DataFrame approx equality failed


false

In [22]:
// The schemas differ (one column fewer): the comparison should fail.
assertDataFrameApproximateEquals(expectedDf, badResultDf3, 0.01)

Test on DataFrame approx equality failed


false

In [23]:
// The row counts differ (one row fewer): the comparison should fail.
assertDataFrameApproximateEquals(expectedDf, badResultDf4, 0.01)

Test on DataFrame approx equality failed


false