Skip to content

Commit

Permalink
simplify test code
Browse files: browse the repository at this point in the history
  • Loading branch information
mengxr committed Jan 10, 2015
1 parent c627de3 commit dd0d0e8
Showing 1 changed file with 46 additions and 34 deletions.
Expand Up @@ -216,44 +216,41 @@ class ALSSuite extends FunSuite with MLlibTestSparkContext with Logging {

/**
 * Generates an explicit feedback dataset for testing ALS.
 *
 * User and item factors are produced by `genFactors`; the rating of every (user, item) pair is
 * the dot product of the two factor vectors. Each pair is assigned to the training set with
 * probability `trainingFraction`, to the test set with probability `testFraction`, and is
 * dropped otherwise. Gaussian noise is added to training ratings only; test ratings are exact.
 *
 * @param numUsers number of users
 * @param numItems number of items
 * @param rank rank
 * @param trainingFraction fraction for training (defaults to 0.6)
 * @param testFraction fraction for test (defaults to 0.3)
 * @param noiseStd the standard deviation of additive Gaussian noise on training data
 * @param seed random seed
 * @return (training, test)
 */
def genExplicitTestData(
    numUsers: Int,
    numItems: Int,
    rank: Int,
    trainingFraction: Double = 0.6,
    testFraction: Double = 0.3,
    noiseStd: Double = 0.0,
    seed: Long = 11L): (RDD[Rating], RDD[Rating]) = {
  val totalFraction = trainingFraction + testFraction
  require(totalFraction <= 1.0,
    s"trainingFraction + testFraction must not exceed 1.0 but got $totalFraction.")
  val random = new Random(seed)
  val userFactors = genFactors(numUsers, rank, random)
  val itemFactors = genFactors(numItems, rank, random)
  val training = ArrayBuffer.empty[Rating]
  val test = ArrayBuffer.empty[Rating]
  for ((userId, userFactor) <- userFactors; (itemId, itemFactor) <- itemFactors) {
    // One uniform draw decides whether the pair is used for training, for test, or dropped.
    val x = random.nextDouble()
    if (x < totalFraction) {
      // Only compute the rating for pairs that are actually kept.
      val rating = blas.sdot(rank, userFactor, 1, itemFactor, 1)
      if (x < trainingFraction) {
        val noise = noiseStd * random.nextGaussian()
        training += Rating(userId, itemId, rating + noise.toFloat)
      } else {
        test += Rating(userId, itemId, rating)
      }
    }
  }
  logInfo(s"Generated an explicit feedback dataset with ${training.size} ratings for training " +
    s"and ${test.size} for test.")
  (sc.parallelize(training, 2), sc.parallelize(test, 2))
}

Expand All @@ -262,15 +259,15 @@ class ALSSuite extends FunSuite with MLlibTestSparkContext with Logging {
* @param numUsers number of users
* @param numItems number of items
* @param rank rank
* @param noiseLevel standard deviation of Gaussian noise on training data
* @param noiseStd the standard deviation of additive Gaussian noise on training data
* @param seed random seed
* @return (training, test)
*/
def genImplicitTestData(
numUsers: Int,
numItems: Int,
rank: Int,
noiseLevel: Double = 0.0,
noiseStd: Double = 0.0,
seed: Long = 11L): (RDD[Rating], RDD[Rating]) = {
// The assumption of the implicit feedback model is that unobserved ratings are more likely to
// be negatives.
Expand All @@ -290,11 +287,13 @@ class ALSSuite extends FunSuite with MLlibTestSparkContext with Logging {
val observed = random.nextDouble() < threshold
if (observed) {
val x = random.nextDouble()
if (x < trainingFraction) {
val noise = noiseLevel * random.nextGaussian()
training += Rating(userId, itemId, rating + noise.toFloat)
} else if (x < totalFraction) {
test += Rating(userId, itemId, rating)
if (x < totalFraction) {
if (x < trainingFraction) {
val noise = noiseStd * random.nextGaussian()
training += Rating(userId, itemId, rating + noise.toFloat)
} else {
test += Rating(userId, itemId, rating)
}
}
}
}
Expand Down Expand Up @@ -328,26 +327,37 @@ class ALSSuite extends FunSuite with MLlibTestSparkContext with Logging {
ids.toSeq.sorted.map(id => (id, Array.fill(rank)(a + random.nextFloat() * width)))
}

/**
* Test ALS using the given training/test splits and parameters.
* @param training training dataset
* @param test test dataset
* @param rank rank of the matrix factorization
* @param maxIter max number of iterations
* @param regParam regularization constant
* @param implicitPrefs whether to use implicit preference
* @param numUserBlocks number of user blocks
* @param numItemBlocks number of item blocks
* @param targetRMSE target test RMSE
*/
def testALS(
training: RDD[Rating],
test: RDD[Rating],
rank: Int,
maxIter: Int,
regParam: Double,
implicitPrefs: Boolean = false,
alpha: Double = 1.0,
targetRMSE: Double,
numUserBlocks: Int = 2,
numItemBlocks: Int = 3): Unit = {
numItemBlocks: Int = 3,
targetRMSE: Double = 0.05): Unit = {
val sqlContext = this.sqlContext
import sqlContext.{createSchemaRDD, symbolToUnresolvedAttribute}
val als = new ALS()
.setRank(rank)
.setRegParam(regParam)
.setImplicitPrefs(implicitPrefs)
.setAlpha(alpha)
.setNumUserBlocks(numUserBlocks)
.setNumItemBlocks(numItemBlocks)
val alpha = als.getAlpha
val model = als.fit(training)
val predictions = model.transform(test)
.select('rating, 'prediction)
Expand All @@ -356,6 +366,9 @@ class ALSSuite extends FunSuite with MLlibTestSparkContext with Logging {
}
val rmse =
if (implicitPrefs) {
// TODO: Use a better (rank-based?) evaluation metric for implicit feedback.
// We limit the ratings and the predictions to interval [0, 1] and compute the weighted RMSE
// with the confidence scores as weights.
val (totalWeight, weightedSumSq) = predictions.map { case (rating, prediction) =>
val confidence = 1.0 + alpha * math.abs(rating)
val rating01 = math.max(math.min(rating, 1.0), 0.0)
Expand All @@ -378,45 +391,44 @@ class ALSSuite extends FunSuite with MLlibTestSparkContext with Logging {
}

test("exact rank-1 matrix") {
  // Noise-free rank-1 data (noiseStd is left at its default of 0.0), fitted once with the
  // generating rank and once with a larger rank, against the same target test RMSE.
  val (training, test) = genExplicitTestData(numUsers = 20, numItems = 40, rank = 1)
  testALS(training, test, maxIter = 1, rank = 1, regParam = 1e-5, targetRMSE = 0.001)
  testALS(training, test, maxIter = 1, rank = 2, regParam = 1e-5, targetRMSE = 0.001)
}

test("approximate rank-1 matrix") {
  // Rank-1 data with Gaussian noise (std 0.01) on the training ratings, fitted with the
  // generating rank and with a larger rank.
  val (training, test) =
    genExplicitTestData(numUsers = 20, numItems = 40, rank = 1, noiseStd = 0.01)
  testALS(training, test, maxIter = 2, rank = 1, regParam = 0.01, targetRMSE = 0.02)
  testALS(training, test, maxIter = 2, rank = 2, regParam = 0.01, targetRMSE = 0.02)
}

test("approximate rank-2 matrix") {
  // Rank-2 data with Gaussian noise (std 0.01) on the training ratings, fitted with the
  // generating rank and with a larger rank.
  val (training, test) =
    genExplicitTestData(numUsers = 20, numItems = 40, rank = 2, noiseStd = 0.01)
  testALS(training, test, maxIter = 4, rank = 2, regParam = 0.01, targetRMSE = 0.03)
  testALS(training, test, maxIter = 4, rank = 3, regParam = 0.01, targetRMSE = 0.03)
}

test("different block settings") {
  // The same dataset is fitted under several user/item block counts, all against the same
  // target test RMSE.
  val (training, test) =
    genExplicitTestData(numUsers = 20, numItems = 40, rank = 2, noiseStd = 0.01)
  for ((numUserBlocks, numItemBlocks) <- Seq((1, 1), (1, 2), (2, 1), (2, 2))) {
    testALS(training, test, maxIter = 4, rank = 2, regParam = 0.01, targetRMSE = 0.03,
      numUserBlocks = numUserBlocks, numItemBlocks = numItemBlocks)
  }
}

test("more blocks than ratings") {
  // A 4 x 4 dataset fitted with 5 user blocks and 5 item blocks, i.e. more blocks than there
  // are users or items.
  val (training, test) =
    genExplicitTestData(numUsers = 4, numItems = 4, rank = 1)
  testALS(training, test, maxIter = 2, rank = 1, regParam = 1e-4, targetRMSE = 0.002,
    numItemBlocks = 5, numUserBlocks = 5)
}

test("implicit feedback") {
  // Implicit-preference dataset with Gaussian noise (std 0.01) on the training ratings,
  // fitted with implicitPrefs enabled.
  val (training, test) =
    genImplicitTestData(numUsers = 20, numItems = 40, rank = 2, noiseStd = 0.01)
  testALS(training, test, maxIter = 4, rank = 2, regParam = 0.01, implicitPrefs = true,
    targetRMSE = 0.3)
}
Expand Down

0 comments on commit dd0d0e8

Please sign in to comment.