simplify test code

scwf · Jan 10, 2015 · dd0d0e8 · dd0d0e8
1 parent c627de3
commit dd0d0e8
Showing 1 changed file with 46 additions and 34 deletions.
diff --git a/mllib/src/test/scala/org/apache/spark/ml/recommendation/ALSSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/recommendation/ALSSuite.scala
@@ -216,44 +216,41 @@ class ALSSuite extends FunSuite with MLlibTestSparkContext with Logging {
 
   /**
    * Generates an explicit feedback dataset for testing ALS.
-   *
    * @param numUsers number of users
    * @param numItems number of items
    * @param rank rank
-   * @param trainingFraction fraction for training
-   * @param testFraction fraction for test
-   * @param noiseLevel noise level for additive Gaussian noise on training data
+   * @param noiseStd the standard deviation of additive Gaussian noise on training data
    * @param seed random seed
    * @return (training, test)
    */
   def genExplicitTestData(
       numUsers: Int,
       numItems: Int,
       rank: Int,
-      trainingFraction: Double,
-      testFraction: Double,
-      noiseLevel: Double = 0.0,
+      noiseStd: Double = 0.0,
       seed: Long = 11L): (RDD[Rating], RDD[Rating]) = {
+    val trainingFraction = 0.6
+    val testFraction = 0.3
     val totalFraction = trainingFraction + testFraction
-    require(totalFraction <= 1.0)
     val random = new Random(seed)
     val userFactors = genFactors(numUsers, rank, random)
     val itemFactors = genFactors(numItems, rank, random)
     val training = ArrayBuffer.empty[Rating]
     val test = ArrayBuffer.empty[Rating]
     for ((userId, userFactor) <- userFactors; (itemId, itemFactor) <- itemFactors) {
-      val rating = blas.sdot(rank, userFactor, 1, itemFactor, 1)
       val x = random.nextDouble()
       if (x < totalFraction) {
+        val rating = blas.sdot(rank, userFactor, 1, itemFactor, 1)
         if (x < trainingFraction) {
-          val noise = noiseLevel * random.nextGaussian()
+          val noise = noiseStd * random.nextGaussian()
           training += Rating(userId, itemId, rating + noise.toFloat)
         } else {
           test += Rating(userId, itemId, rating)
         }
       }
     }
-    logInfo(s"Generated ${training.size} ratings for training and ${test.size} for test.")
+    logInfo(s"Generated an explicit feedback dataset with ${training.size} ratings for training " +
+      s"and ${test.size} for test.")
     (sc.parallelize(training, 2), sc.parallelize(test, 2))
   }
 
@@ -262,15 +259,15 @@ class ALSSuite extends FunSuite with MLlibTestSparkContext with Logging {
    * @param numUsers number of users
    * @param numItems number of items
    * @param rank rank
-   * @param noiseLevel standard deviation of Gaussian noise on training data
+   * @param noiseStd the standard deviation of additive Gaussian noise on training data
    * @param seed random seed
    * @return (training, test)
    */
   def genImplicitTestData(
       numUsers: Int,
       numItems: Int,
       rank: Int,
-      noiseLevel: Double = 0.0,
+      noiseStd: Double = 0.0,
       seed: Long = 11L): (RDD[Rating], RDD[Rating]) = {
     // The assumption of the implicit feedback model is that unobserved ratings are more likely to
     // be negatives.
@@ -290,11 +287,13 @@ class ALSSuite extends FunSuite with MLlibTestSparkContext with Logging {
       val observed = random.nextDouble() < threshold
       if (observed) {
         val x = random.nextDouble()
-        if (x < trainingFraction) {
-          val noise = noiseLevel * random.nextGaussian()
-          training += Rating(userId, itemId, rating + noise.toFloat)
-        } else if (x < totalFraction) {
-          test += Rating(userId, itemId, rating)
+        if (x < totalFraction) {
+          if (x < trainingFraction) {
+            val noise = noiseStd * random.nextGaussian()
+            training += Rating(userId, itemId, rating + noise.toFloat)
+          } else {
+            test += Rating(userId, itemId, rating)
+          }
         }
       }
     }
@@ -328,26 +327,37 @@ class ALSSuite extends FunSuite with MLlibTestSparkContext with Logging {
     ids.toSeq.sorted.map(id => (id, Array.fill(rank)(a + random.nextFloat() * width)))
   }
 
+  /**
+   * Test ALS using the given training/test splits and parameters.
+   * @param training training dataset
+   * @param test test dataset
+   * @param rank rank of the matrix factorization
+   * @param maxIter max number of iterations (NOTE(review): never set on the ALS instance below)
+   * @param regParam regularization constant
+   * @param implicitPrefs whether to use implicit preference
+   * @param numUserBlocks number of user blocks
+   * @param numItemBlocks number of item blocks
+   * @param targetRMSE target test RMSE
+   */
   def testALS(
       training: RDD[Rating],
       test: RDD[Rating],
       rank: Int,
       maxIter: Int,
       regParam: Double,
       implicitPrefs: Boolean = false,
-      alpha: Double = 1.0,
-      targetRMSE: Double,
       numUserBlocks: Int = 2,
-      numItemBlocks: Int = 3): Unit = {
+      numItemBlocks: Int = 3,
+      targetRMSE: Double = 0.05): Unit = {
     val sqlContext = this.sqlContext
     import sqlContext.{createSchemaRDD, symbolToUnresolvedAttribute}
     val als = new ALS()
       .setRank(rank)
       .setRegParam(regParam)
       .setImplicitPrefs(implicitPrefs)
-      .setAlpha(alpha)
       .setNumUserBlocks(numUserBlocks)
       .setNumItemBlocks(numItemBlocks)
+    val alpha = als.getAlpha
     val model = als.fit(training)
     val predictions = model.transform(test)
       .select('rating, 'prediction)
@@ -356,6 +366,9 @@ class ALSSuite extends FunSuite with MLlibTestSparkContext with Logging {
       }
     val rmse =
       if (implicitPrefs) {
+        // TODO: Use a better (rank-based?) evaluation metric for implicit feedback.
+        // We limit the ratings and the predictions to the interval [0, 1] and compute the weighted
+        // RMSE with the confidence scores as weights.
         val (totalWeight, weightedSumSq) = predictions.map { case (rating, prediction) =>
           val confidence = 1.0 + alpha * math.abs(rating)
           val rating01 = math.max(math.min(rating, 1.0), 0.0)
@@ -378,45 +391,44 @@ class ALSSuite extends FunSuite with MLlibTestSparkContext with Logging {
   }
 
   test("exact rank-1 matrix") {
-    val (training, test) = genExplicitTestData(numUsers = 20, numItems = 40, rank = 1,
-      trainingFraction = 0.6, testFraction = 0.3)
+    val (training, test) = genExplicitTestData(numUsers = 20, numItems = 40, rank = 1)
     testALS(training, test, maxIter = 1, rank = 1, regParam = 1e-5, targetRMSE = 0.001)
     testALS(training, test, maxIter = 1, rank = 2, regParam = 1e-5, targetRMSE = 0.001)
   }
 
   test("approximate rank-1 matrix") {
-    val (training, test) = genExplicitTestData(numUsers = 20, numItems = 40, rank = 1,
-      trainingFraction = 0.6, testFraction = 0.3, noiseLevel = 0.01)
+    val (training, test) =
+      genExplicitTestData(numUsers = 20, numItems = 40, rank = 1, noiseStd = 0.01)
     testALS(training, test, maxIter = 2, rank = 1, regParam = 0.01, targetRMSE = 0.02)
     testALS(training, test, maxIter = 2, rank = 2, regParam = 0.01, targetRMSE = 0.02)
   }
 
   test("approximate rank-2 matrix") {
-    val (training, test) = genExplicitTestData(numUsers = 20, numItems = 40, rank = 2,
-      trainingFraction = 0.6, testFraction = 0.3, noiseLevel = 0.01)
+    val (training, test) =
+      genExplicitTestData(numUsers = 20, numItems = 40, rank = 2, noiseStd = 0.01)
     testALS(training, test, maxIter = 4, rank = 2, regParam = 0.01, targetRMSE = 0.03)
     testALS(training, test, maxIter = 4, rank = 3, regParam = 0.01, targetRMSE = 0.03)
   }
 
   test("different block settings") {
-    val (training, test) = genExplicitTestData(numUsers = 20, numItems = 40, rank = 2,
-      trainingFraction = 0.6, testFraction = 0.3, noiseLevel = 0.01)
+    val (training, test) =
+      genExplicitTestData(numUsers = 20, numItems = 40, rank = 2, noiseStd = 0.01)
     for ((numUserBlocks, numItemBlocks) <- Seq((1, 1), (1, 2), (2, 1), (2, 2))) {
       testALS(training, test, maxIter = 4, rank = 2, regParam = 0.01, targetRMSE = 0.03,
         numUserBlocks = numUserBlocks, numItemBlocks = numItemBlocks)
     }
   }
 
   test("more blocks than ratings") {
-    val (training, test) = genExplicitTestData(numUsers = 4, numItems = 4, rank = 1,
-      trainingFraction = 0.7, testFraction = 0.3)
+    val (training, test) =
+      genExplicitTestData(numUsers = 4, numItems = 4, rank = 1)
     testALS(training, test, maxIter = 2, rank = 1, regParam = 1e-4, targetRMSE = 0.002,
      numItemBlocks = 5, numUserBlocks = 5)
   }
 
   test("implicit feedback") {
-    val (training, test) = genImplicitTestData(numUsers = 20, numItems = 40, rank = 2,
-      noiseLevel = 0.01)
+    val (training, test) =
+      genImplicitTestData(numUsers = 20, numItems = 40, rank = 2, noiseStd = 0.01)
     testALS(training, test, maxIter = 4, rank = 2, regParam = 0.01, implicitPrefs = true,
       targetRMSE = 0.3)
   }