Skip to content

Commit

Permalink
made fixes from code review
Browse files Browse the repository at this point in the history
  • Loading branch information
leahmcguire committed Mar 11, 2015
1 parent fb0a5c7 commit 01baad7
Show file tree
Hide file tree
Showing 3 changed files with 14 additions and 26 deletions.
4 changes: 2 additions & 2 deletions docs/mllib-naive-bayes.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,12 @@ and use it for prediction.
MLlib supports [multinomial naive
Bayes](http://en.wikipedia.org/wiki/Naive_Bayes_classifier#Multinomial_naive_Bayes)
and [Bernoulli naive Bayes](http://nlp.stanford.edu/IR-book/html/htmledition/the-bernoulli-model-1.html).
Which are typically used for [document classification]
These models are typically used for
[document classification](http://nlp.stanford.edu/IR-book/html/htmledition/naive-bayes-text-classification-1.html).
Within that context, each observation is a document and each
feature represents a term whose value is the frequency of the term (in multinomial naive Bayes) or
a zero or one indicating whether the term was found in the document (in Bernoulli naive Bayes).
Feature values must be nonnegative.The model type is selected with on optional parameter
Feature values must be nonnegative. The model type is selected with an optional parameter,
"Multinomial" or "Bernoulli", with "Multinomial" as the default.
[Additive smoothing](http://en.wikipedia.org/wiki/Lidstone_smoothing) can be used by
setting the parameter $\lambda$ (defaults to $1.0$). For document classification, the input feature
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -49,15 +49,15 @@ class NaiveBayesModel private[mllib] (
val modelType: String)
extends ClassificationModel with Serializable with Saveable {

def this(labels: Array[Double], pi: Array[Double], theta: Array[Array[Double]]) =
private[mllib] def this(labels: Array[Double], pi: Array[Double], theta: Array[Array[Double]]) =
this(labels, pi, theta, NaiveBayes.Multinomial.toString)

private val brzPi = new BDV[Double](pi)
private val brzTheta = new BDM(theta(0).length, theta.length, theta.flatten).t

// Bernoulli scoring requires log(condprob) if 1 log(1-condprob) if 0
// this precomputes log(1.0 - exp(theta)) and its sum for linear algebra application
// of this condition in predict function
// Bernoulli scoring requires log(condprob) if 1, log(1-condprob) if 0.
// This precomputes log(1.0 - exp(theta)) and its sum which are used for the linear algebra
// application of this condition (in predict function).
private val (brzNegTheta, brzNegThetaSum) = NaiveBayes.ModelType.fromString(modelType) match {
case NaiveBayes.Multinomial => (None, None)
case NaiveBayes.Bernoulli =>
Expand Down Expand Up @@ -186,8 +186,6 @@ class NaiveBayes private (
private var lambda: Double,
private var modelType: NaiveBayes.ModelType) extends Serializable with Logging {

def this(lambda: Double) = this(lambda, NaiveBayes.Multinomial)

def this() = this(1.0, NaiveBayes.Multinomial)

/** Set the smoothing parameter. Default: 1.0. */
Expand All @@ -202,6 +200,7 @@ class NaiveBayes private (
this
}

def getModelType(): NaiveBayes.ModelType = this.modelType

/**
* Run the algorithm with the configured parameters on an input RDD of LabeledPoint entries.
Expand Down Expand Up @@ -301,10 +300,9 @@ object NaiveBayes {
* @param lambda The smoothing parameter
*/
def train(input: RDD[LabeledPoint], lambda: Double): NaiveBayesModel = {
new NaiveBayes(lambda).run(input)
new NaiveBayes(lambda, NaiveBayes.Multinomial).run(input)
}


/**
* Trains a Naive Bayes model given an RDD of `(label, features)` pairs.
*
Expand All @@ -327,11 +325,7 @@ object NaiveBayes {
new NaiveBayes(lambda, MODELTYPE.fromString(modelType)).run(input)
}


/**
* Model types supported in Naive Bayes:
* multinomial and Bernoulli currently supported
*/
/** Provides static methods for using ModelType. */
sealed abstract class ModelType

object MODELTYPE {
Expand All @@ -348,10 +342,12 @@ object NaiveBayes {

final val ModelType = MODELTYPE

/** Constant for specifying ModelType parameter: multinomial model */
final val Multinomial: ModelType = new ModelType {
override def toString: String = ModelType.MULTINOMIAL_STRING
}

/** Constant for specifying ModelType parameter: Bernoulli model */
final val Bernoulli: ModelType = new ModelType {
override def toString: String = ModelType.BERNOULLI_STRING
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ object NaiveBayesSuite {
for (i <- 0 until nPoints) yield {
val y = calcLabel(rnd.nextDouble(), _pi)
val xi = dataModel match {
case NaiveBayes.Bernoulli => Array.tabulate[Double] (D) {j =>
case NaiveBayes.Bernoulli => Array.tabulate[Double] (D) { j =>
if (rnd.nextDouble () < _theta(y)(j) ) 1 else 0
}
case NaiveBayes.Multinomial =>
Expand Down Expand Up @@ -118,23 +118,15 @@ class NaiveBayesSuite extends FunSuite with MLlibTestSparkContext {
).map(_.map(math.log))

val testData = NaiveBayesSuite.generateNaiveBayesInput(
pi,
theta,
nPoints,
42,
NaiveBayes.Multinomial)
pi, theta, nPoints, 42, NaiveBayes.Multinomial)
val testRDD = sc.parallelize(testData, 2)
testRDD.cache()

val model = NaiveBayes.train(testRDD, 1.0, "multinomial")
validateModelFit(pi, theta, model)

val validationData = NaiveBayesSuite.generateNaiveBayesInput(
pi,
theta,
nPoints,
17,
NaiveBayes.Multinomial)
pi, theta, nPoints, 17, NaiveBayes.Multinomial)
val validationRDD = sc.parallelize(validationData, 2)

// Test prediction on RDD.
Expand Down

0 comments on commit 01baad7

Please sign in to comment.