diff --git a/docs/mllib-naive-bayes.md b/docs/mllib-naive-bayes.md
index d5b044d94fdd7..a71b93fe0daf4 100644
--- a/docs/mllib-naive-bayes.md
+++ b/docs/mllib-naive-bayes.md
@@ -13,12 +13,15 @@ compute the conditional probability distribution of label given an observation
 and use it for prediction.
 
 MLlib supports [multinomial naive
-Bayes](http://en.wikipedia.org/wiki/Naive_Bayes_classifier#Multinomial_naive_Bayes),
-which is typically used for [document
-classification](http://nlp.stanford.edu/IR-book/html/htmledition/naive-bayes-text-classification-1.html).
+Bayes](http://en.wikipedia.org/wiki/Naive_Bayes_classifier#Multinomial_naive_Bayes)
+and [Bernoulli naive Bayes](http://nlp.stanford.edu/IR-book/html/htmledition/the-bernoulli-model-1.html),
+which are typically used for [document
+classification](http://nlp.stanford.edu/IR-book/html/htmledition/naive-bayes-text-classification-1.html).
 Within that context, each observation is a document and each
-feature represents a term whose value is the frequency of the term.
-Feature values must be nonnegative to represent term frequencies.
+feature represents a term whose value is the frequency of the term (in multinomial naive Bayes) or
+a zero or one indicating whether the term was found in the document (in Bernoulli naive Bayes).
+Feature values must be nonnegative. The model type is selected with an optional parameter,
+"Multinomial" or "Bernoulli", with "Multinomial" as the default.
 [Additive smoothing](http://en.wikipedia.org/wiki/Lidstone_smoothing) can be used by
 setting the parameter $\lambda$ (default to $1.0$). For document classification, the input feature
 vectors are usually sparse, and sparse vectors should be supplied as input to take advantage of
@@ -32,7 +35,7 @@ sparsity. Since the training data is only used once, it is not necessary to cach
 [NaiveBayes](api/scala/index.html#org.apache.spark.mllib.classification.NaiveBayes$) implements
 multinomial naive Bayes. It takes an RDD of
 [LabeledPoint](api/scala/index.html#org.apache.spark.mllib.regression.LabeledPoint) and an optional
-smoothing parameter `lambda` as input, and output a
+smoothing parameter `lambda` as input, an optional model type parameter (default is "Multinomial"), and outputs a
 [NaiveBayesModel](api/scala/index.html#org.apache.spark.mllib.classification.NaiveBayesModel), which
 can be used for evaluation and prediction.
 
@@ -51,7 +54,7 @@ val splits = parsedData.randomSplit(Array(0.6, 0.4), seed = 11L)
 val training = splits(0)
 val test = splits(1)
 
-val model = NaiveBayes.train(training, lambda = 1.0)
+val model = NaiveBayes.train(training, lambda = 1.0, model = "Multinomial")
 
 val predictionAndLabel = test.map(p => (model.predict(p.features), p.label))
 val accuracy = 1.0 * predictionAndLabel.filter(x => x._1 == x._2).count() / test.count()
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala
index 12b5a0f4178f6..cadfe85c76a19 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala
@@ -18,12 +18,13 @@
 package org.apache.spark.mllib.classification
 
 import breeze.linalg.{DenseMatrix => BDM, DenseVector => BDV, argmax => brzArgmax, sum => brzSum, Axis}
-import org.apache.spark.mllib.classification.NaiveBayesModels.NaiveBayesModels
+import breeze.numerics.{exp => brzExp, log => brzLog}
 
 import org.apache.spark.{SparkException, Logging}
 import org.apache.spark.SparkContext._
 import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector}
 import org.apache.spark.mllib.regression.LabeledPoint
+import org.apache.spark.mllib.classification.NaiveBayesModels.NaiveBayesModels
 import org.apache.spark.rdd.RDD
 
@@ -52,29 +53,14 @@ class NaiveBayesModel private[mllib] (
     val theta: Array[Array[Double]],
     val model: NaiveBayesModels) extends ClassificationModel with Serializable {
 
-  def populateMatrix(arrayIn: Array[Array[Double]],
-                     matrixIn: BDM[Double],
-                     transformation: (Double) => Double = (x) => x) = {
-    var i = 0
-    while (i < arrayIn.length) {
-      var j = 0
-      while (j < arrayIn(i).length) {
-        matrixIn(i, j) = transformation(theta(i)(j))
-        j += 1
-      }
-      i += 1
-    }
-  }
-
   private val brzPi = new BDV[Double](pi)
-  private val brzTheta = new BDM[Double](theta.length, theta(0).length)
-  populateMatrix(theta, brzTheta)
+  private val brzTheta = new BDM(theta(0).length, theta.length, theta.flatten).t
 
   private val brzNegTheta: Option[BDM[Double]] = model match {
     case NaiveBayesModels.Multinomial => None
     case NaiveBayesModels.Bernoulli =>
-      val negTheta = new BDM[Double](theta.length, theta(0).length)
-      populateMatrix(theta, negTheta, (x) => math.log(1.0 - math.exp(x)))
+      val negTheta = brzLog((brzExp(brzTheta.copy) :*= (-1.0)) :+= 1.0)
+      // log(1.0 - exp(theta)), computed element-wise for the Bernoulli model
       Option(negTheta)
   }
 
@@ -244,7 +230,7 @@ object NaiveBayes {
    * @param model The type of NB model to fit from the enumeration NaiveBayesModels, can be
    *              Multinomial or Bernoulli
    */
-  def train(input: RDD[LabeledPoint], lambda: Double, model: NaiveBayesModels): NaiveBayesModel = {
-    new NaiveBayes(lambda, model).run(input)
+  def train(input: RDD[LabeledPoint], lambda: Double, model: String): NaiveBayesModel = {
+    new NaiveBayes(lambda, NaiveBayesModels.withName(model)).run(input)
   }
 }
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/classification/NaiveBayesSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/classification/NaiveBayesSuite.scala
index 44ba6118eb61d..d269377c7c9d7 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/classification/NaiveBayesSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/classification/NaiveBayesSuite.scala
@@ -117,7 +117,7 @@ class NaiveBayesSuite extends FunSuite with MLlibTestSparkContext {
     val testRDD = sc.parallelize(testData, 2)
     testRDD.cache()
 
-    val model = NaiveBayes.train(testRDD, 1.0, NaiveBayesModels.Multinomial)
+    val model = NaiveBayes.train(testRDD, 1.0, "Multinomial")
     validateModelFit(pi, theta, model)
     val validationData = NaiveBayesSuite.generateNaiveBayesInput(pi, theta, nPoints, 17,
       NaiveBayesModels.Multinomial)
@@ -140,11 +140,12 @@ class NaiveBayesSuite extends FunSuite with MLlibTestSparkContext {
       Array(0.02, 0.02, 0.60, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.30) // label 2
     ).map(_.map(math.log))
 
+
     val testData = NaiveBayesSuite.generateNaiveBayesInput(pi, theta, nPoints, 45,
       NaiveBayesModels.Bernoulli)
     val testRDD = sc.parallelize(testData, 2)
     testRDD.cache()
-    val model = NaiveBayes.train(testRDD, 1.0, NaiveBayesModels.Bernoulli) ///!!! this gives same result on both models check the math
+    val model = NaiveBayes.train(testRDD, 1.0, "Bernoulli") // TODO: this gives the same result for both model types; check the math
     validateModelFit(pi, theta, model)
     val validationData = NaiveBayesSuite.generateNaiveBayesInput(pi, theta, nPoints, 20,
       NaiveBayesModels.Bernoulli)
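
Regarding the "check the math" note above: the removed populateMatrix call filled negTheta with math.log(1.0 - math.exp(x)) element-wise over theta, and the new Breeze expression is meant to produce the same matrix. The following standalone sketch is not part of the patch; it checks that equivalence on a small hand-made theta, assuming the Breeze element-wise exp/log and in-place :*= / :+= operators that the patch itself uses. The object name NegThetaCheck and the sample values are hypothetical.

// Standalone equivalence check (not part of the patch).
import breeze.linalg.{DenseMatrix => BDM}
import breeze.numerics.{exp => brzExp, log => brzLog}

object NegThetaCheck {
  def main(args: Array[String]): Unit = {
    // Hypothetical log-probabilities; entries are negative, so 1 - exp(x) stays in (0, 1).
    val theta = Array(Array(-0.5, -1.2, -2.3), Array(-2.0, -0.1, -1.7))

    // Same construction as in the patch: column-major fill, then transpose,
    // so brzTheta(i, j) == theta(i)(j).
    val brzTheta = new BDM(theta(0).length, theta.length, theta.flatten).t

    // Expression from the patch.
    val negTheta = brzLog((brzExp(brzTheta.copy) :*= (-1.0)) :+= 1.0)

    // Scalar form that the removed populateMatrix call used.
    for (i <- theta.indices; j <- theta(i).indices) {
      val expected = math.log(1.0 - math.exp(theta(i)(j)))
      assert(math.abs(negTheta(i, j) - expected) < 1e-10,
        s"mismatch at ($i, $j): ${negTheta(i, j)} vs $expected")
    }
    println("negTheta matches element-wise log(1 - exp(theta))")
  }
}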
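
The updated guide describes Bernoulli features as zero/one term-presence indicators and selects the model type through the new String parameter. Below is a minimal usage sketch of that String-based train overload, assuming a running SparkContext sc (for example in spark-shell); the tiny dataset and variable names are made up for illustration.

// Minimal usage sketch of the String-based overload (not from the patch);
// assumes a running SparkContext `sc`.
import org.apache.spark.mllib.classification.NaiveBayes
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint

// 0/1 term-presence features, as the Bernoulli model expects.
val docs = sc.parallelize(Seq(
  LabeledPoint(0.0, Vectors.dense(1.0, 0.0, 1.0)),
  LabeledPoint(1.0, Vectors.dense(0.0, 1.0, 0.0)),
  LabeledPoint(1.0, Vectors.dense(0.0, 1.0, 1.0))))

// The string is resolved via NaiveBayesModels.withName in the new train overload.
val bernoulliModel = NaiveBayes.train(docs, lambda = 1.0, model = "Bernoulli")
println(bernoulliModel.predict(Vectors.dense(1.0, 0.0, 0.0)))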