Skip to content

Commit

Permalink
Updated changes re-comments. Got rid of verbose populateMatrix method…
Browse files Browse the repository at this point in the history
…. Public api now has string instead of enumeration. Docs are updated."
  • Loading branch information
leahmcguire committed Jan 21, 2015
1 parent ce73c63 commit 4a3676d
Show file tree
Hide file tree
Showing 3 changed files with 20 additions and 30 deletions.
17 changes: 10 additions & 7 deletions docs/mllib-naive-bayes.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,15 @@ compute the conditional probability distribution of label given an observation
and use it for prediction.

MLlib supports [multinomial naive
Bayes](http://en.wikipedia.org/wiki/Naive_Bayes_classifier#Multinomial_naive_Bayes),
which is typically used for [document
classification](http://nlp.stanford.edu/IR-book/html/htmledition/naive-bayes-text-classification-1.html).
Bayes](http://en.wikipedia.org/wiki/Naive_Bayes_classifier#Multinomial_naive_Bayes)
and [Bernoulli naive Bayes](http://nlp.stanford.edu/IR-book/html/htmledition/the-bernoulli-model-1.html).
These models are typically used for [document
classification](http://nlp.stanford.edu/IR-book/html/htmledition/naive-bayes-text-classification-1.html).
Within that context, each observation is a document and each
feature represents a term whose value is the frequency of the term.
Feature values must be nonnegative to represent term frequencies.
feature represents a term whose value is the frequency of the term (in multinomial naive Bayes) or
a zero or one indicating whether the term was found in the document (in Bernoulli naive Bayes).
Feature values must be nonnegative. The model type is selected with an optional parameter
"Multinomial" or "Bernoulli", with "Multinomial" as the default.
[Additive smoothing](http://en.wikipedia.org/wiki/Lidstone_smoothing) can be used by
setting the parameter $\lambda$ (default to $1.0$). For document classification, the input feature
vectors are usually sparse, and sparse vectors should be supplied as input to take advantage of
Expand All @@ -32,7 +35,7 @@ sparsity. Since the training data is only used once, it is not necessary to cach
[NaiveBayes](api/scala/index.html#org.apache.spark.mllib.classification.NaiveBayes$) implements
multinomial naive Bayes. It takes an RDD of
[LabeledPoint](api/scala/index.html#org.apache.spark.mllib.regression.LabeledPoint) and an optional
smoothing parameter `lambda` as input, and output a
smoothing parameter `lambda` as input, along with an optional model type parameter (default is Multinomial), and outputs a
[NaiveBayesModel](api/scala/index.html#org.apache.spark.mllib.classification.NaiveBayesModel), which
can be used for evaluation and prediction.

Expand All @@ -51,7 +54,7 @@ val splits = parsedData.randomSplit(Array(0.6, 0.4), seed = 11L)
val training = splits(0)
val test = splits(1)

val model = NaiveBayes.train(training, lambda = 1.0)
val model = NaiveBayes.train(training, lambda = 1.0, model = "Multinomial")

val predictionAndLabel = test.map(p => (model.predict(p.features), p.label))
val accuracy = 1.0 * predictionAndLabel.filter(x => x._1 == x._2).count() / test.count()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,13 @@
package org.apache.spark.mllib.classification

import breeze.linalg.{DenseMatrix => BDM, DenseVector => BDV, argmax => brzArgmax, sum => brzSum, Axis}
import org.apache.spark.mllib.classification.NaiveBayesModels.NaiveBayesModels
import breeze.numerics.{exp => brzExp, log => brzLog}

import org.apache.spark.{SparkException, Logging}
import org.apache.spark.SparkContext._
import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.classification.NaiveBayesModels.NaiveBayesModels
import org.apache.spark.rdd.RDD


Expand Down Expand Up @@ -52,29 +53,14 @@ class NaiveBayesModel private[mllib] (
val theta: Array[Array[Double]],
val model: NaiveBayesModels) extends ClassificationModel with Serializable {

def populateMatrix(arrayIn: Array[Array[Double]],
matrixIn: BDM[Double],
transformation: (Double) => Double = (x) => x) = {
var i = 0
while (i < arrayIn.length) {
var j = 0
while (j < arrayIn(i).length) {
matrixIn(i, j) = transformation(theta(i)(j))
j += 1
}
i += 1
}
}

private val brzPi = new BDV[Double](pi)
private val brzTheta = new BDM[Double](theta.length, theta(0).length)
populateMatrix(theta, brzTheta)
private val brzTheta = new BDM(theta(0).length, theta.length, theta.flatten).t

private val brzNegTheta: Option[BDM[Double]] = model match {
case NaiveBayesModels.Multinomial => None
case NaiveBayesModels.Bernoulli =>
val negTheta = new BDM[Double](theta.length, theta(0).length)
populateMatrix(theta, negTheta, (x) => math.log(1.0 - math.exp(x)))
val negTheta = brzLog((brzExp(brzTheta.copy) :*= (-1.0)) :+= 1.0)
//((x) => math.log(1.0 - math.exp(x))
Option(negTheta)
}

Expand Down Expand Up @@ -244,7 +230,7 @@ object NaiveBayes {
* @param model The type of NB model to fit from the enumeration NaiveBayesModels, can be
* Multinomial or Bernoulli
*/
def train(input: RDD[LabeledPoint], lambda: Double, model: NaiveBayesModels): NaiveBayesModel = {
new NaiveBayes(lambda, model).run(input)
def train(input: RDD[LabeledPoint], lambda: Double, model: String): NaiveBayesModel = {
new NaiveBayes(lambda, NaiveBayesModels.withName(model)).run(input)
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@ class NaiveBayesSuite extends FunSuite with MLlibTestSparkContext {
val testRDD = sc.parallelize(testData, 2)
testRDD.cache()

val model = NaiveBayes.train(testRDD, 1.0, NaiveBayesModels.Multinomial)
val model = NaiveBayes.train(testRDD, 1.0, "Multinomial")
validateModelFit(pi, theta, model)

val validationData = NaiveBayesSuite.generateNaiveBayesInput(pi, theta, nPoints, 17, NaiveBayesModels.Multinomial)
Expand All @@ -140,11 +140,12 @@ class NaiveBayesSuite extends FunSuite with MLlibTestSparkContext {
Array(0.02, 0.02, 0.60, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.30) // label 2
).map(_.map(math.log))


val testData = NaiveBayesSuite.generateNaiveBayesInput(pi, theta, nPoints, 45, NaiveBayesModels.Bernoulli)
val testRDD = sc.parallelize(testData, 2)
testRDD.cache()

val model = NaiveBayes.train(testRDD, 1.0, NaiveBayesModels.Bernoulli) ///!!! this gives same result on both models check the math
val model = NaiveBayes.train(testRDD, 1.0, "Bernoulli") ///!!! this gives same result on both models check the math
validateModelFit(pi, theta, model)

val validationData = NaiveBayesSuite.generateNaiveBayesInput(pi, theta, nPoints, 20, NaiveBayesModels.Bernoulli)
Expand Down

0 comments on commit 4a3676d

Please sign in to comment.