Skip to content

Commit

Permalink
Improvements of Vectorizer tests (#291)
Browse files Browse the repository at this point in the history
  • Loading branch information
wsuchy committed Apr 17, 2019
1 parent 462226a commit f228f26
Show file tree
Hide file tree
Showing 3 changed files with 128 additions and 43 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ abstract class OpOneHotVectorizer[T <: FeatureType]
uid: String = UID[OpOneHotVectorizer[_]]
)(implicit tti: TypeTag[T], ttiv: TypeTag[T#Value])
extends SequenceEstimator[T, OPVector](operationName = operationName, uid = uid)
with VectorizerDefaults with PivotParams with CleanTextFun with SaveOthersParams
with PivotParams with CleanTextFun with SaveOthersParams
with TrackNullsParam with MinSupportParam with OneHotFun with MaxPctCardinalityParams {

protected def convertToSeqOfMaps(dataset: Dataset[Seq[T#Value]]): RDD[Seq[Map[String, Int]]]
Expand Down Expand Up @@ -146,7 +146,7 @@ abstract class OpOneHotVectorizerModel[T <: FeatureType]
uid: String
)(implicit tti: TypeTag[T])
extends SequenceModel[T, OPVector](operationName = operationName, uid = uid)
with VectorizerDefaults with CleanTextFun with OneHotModelFun[T] {
with CleanTextFun with OneHotModelFun[T] {

def transformFn: Seq[T] => OPVector = pivotFn(topValues, shouldCleanText, shouldTrackNulls)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,19 +35,22 @@ import com.salesforce.op.features.FeatureLike
import com.salesforce.op.features.types._
import com.salesforce.op.stages.base.sequence.SequenceModel
import com.salesforce.op.test.TestOpVectorColumnType.IndCol
import com.salesforce.op.test.{TestFeatureBuilder, TestOpVectorMetadataBuilder, TestOpWorkflowBuilder, TestSparkContext}
import com.salesforce.op.test._
import com.salesforce.op.utils.spark.OpVectorMetadata
import com.salesforce.op.utils.spark.RichDataset._
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.{Estimator, Transformer}
import org.apache.spark.sql.Dataset
import org.junit.runner.RunWith
import org.scalatest.FlatSpec
import org.scalatest.junit.JUnitRunner
import org.slf4j.LoggerFactory


@RunWith(classOf[JUnitRunner])
class OpSetVectorizerTest extends FlatSpec with TestSparkContext with AttributeAsserts {
class OpSetVectorizerTest extends
OpEstimatorSpec[OPVector, SequenceModel[MultiPickList, OPVector], OpSetVectorizer[MultiPickList]]
with AttributeAsserts {

val log = LoggerFactory.getLogger(this.getClass)

Expand All @@ -57,15 +60,25 @@ class OpSetVectorizerTest extends FlatSpec with TestSparkContext with AttributeA
(Seq("c"), Seq("x", "y")),
(Seq("C ", "A."), Seq("Z", "Z", "Z"))
)
val expectedData = Array(

override val expectedResult = Seq(
Vectors.sparse(10, Array(0, 2, 5), Array(1.0, 1.0, 1.0)),
Vectors.sparse(10, Array(0, 6, 7), Array(1.0, 1.0, 1.0)),
Vectors.sparse(10, Array(1, 5, 6), Array(1.0, 1.0, 1.0)),
Vectors.sparse(10, Array(0, 1, 7), Array(1.0, 1.0, 1.0))
).map(_.toOPVector)

val expectedData = Seq(
Vectors.dense(1.0, 1.0, 0.0, 1.0, 0.0, 0.0),
Vectors.dense(1.0, 0.0, 0.0, 0.0, 2.0, 0.0),
Vectors.dense(0.0, 1.0, 0.0, 1.0, 1.0, 0.0),
Vectors.dense(1.0, 1.0, 0.0, 0.0, 1.0, 0.0)
).map(_.toOPVector)

val (dataSet, top, bot) = TestFeatureBuilder("top", "bot", data.map(v =>

val (inputData, top, bot) = TestFeatureBuilder("top", "bot", data.map(v =>
v._1.toMultiPickList -> v._2.toMultiPickList))

val (dataSetEmpty, _, _) = TestFeatureBuilder(top.name, bot.name,
Seq[(MultiPickList, MultiPickList)](
(Seq("a", "b").toMultiPickList, MultiPickList.empty),
Expand All @@ -77,38 +90,37 @@ class OpSetVectorizerTest extends FlatSpec with TestSparkContext with AttributeA
val (dataSetAllEmpty, _) =
TestFeatureBuilder(top.name, Seq[MultiPickList](MultiPickList.empty, MultiPickList.empty, MultiPickList.empty))

val vectorizer = new OpSetVectorizer[MultiPickList]().setInput(top, bot).setMinSupport(0).setTopK(10)

val estimator = new OpSetVectorizer[MultiPickList]().setInput(top, bot).setMinSupport(0).setTopK(10)

Spec[OpSetVectorizer[_]] should "take an array of features as input and return a single vector feature" in {
val vector = vectorizer.getOutput()
vector.name shouldBe vectorizer.getOutputFeatureName
val vector = estimator.getOutput()
vector.name shouldBe estimator.getOutputFeatureName
vector.typeName shouldBe FeatureType.typeName[OPVector]
vector.isResponse shouldBe false
vector.originStage shouldBe vectorizer
vector.originStage shouldBe estimator
vector.parents should contain theSameElementsAs Array(top, bot)
}

it should "return the a fitted vectorizer with the correct parameters" in {
val fitted = vectorizer.fit(dataSet)
val fitted = estimator.fit(inputData)
fitted.isInstanceOf[SequenceModel[_, _]]
val vectorMetadata = fitted.getMetadata()
val expectedMeta = TestOpVectorMetadataBuilder(
vectorizer,
estimator,
top -> List(IndCol(Some("A")), IndCol(Some("C")), IndCol(Some("B")), IndCol(Some("OTHER")),
IndCol(Some(TransmogrifierDefaults.NullString))),
bot -> List(IndCol(Some("X")), IndCol(Some("Y")), IndCol(Some("Z")), IndCol(Some("OTHER")),
IndCol(Some(TransmogrifierDefaults.NullString)))
)
OpVectorMetadata(vectorizer.getOutputFeatureName, vectorMetadata) shouldEqual expectedMeta
OpVectorMetadata(estimator.getOutputFeatureName, vectorMetadata) shouldEqual expectedMeta
fitted.getInputFeatures() shouldBe Array(top, bot)
fitted.parent shouldBe vectorizer
fitted.parent shouldBe estimator
}

it should "return the expected vector with the default param settings" in {
val fitted = vectorizer.fit(dataSet)
val transformed = fitted.transform(dataSet)
val vector = vectorizer.getOutput()
val fitted = estimator.fit(inputData)
val transformed = fitted.transform(inputData)
val vector = estimator.getOutput()
val result = transformed.collect(vector)
val expected = Array(
Vectors.sparse(10, Array(0, 2, 5), Array(1.0, 1.0, 1.0)),
Expand All @@ -124,9 +136,9 @@ class OpSetVectorizerTest extends FlatSpec with TestSparkContext with AttributeA
}

it should "not clean the variable names when clean text is set to false" in {
val fitted = vectorizer.setCleanText(false).fit(dataSet)
val transformed = fitted.transform(dataSet)
val vector = vectorizer.getOutput()
val fitted = estimator.setCleanText(false).fit(inputData)
val transformed = fitted.transform(inputData)
val vector = estimator.getOutput()
val result = transformed.collect(vector)
val expected = Array(
Vectors.sparse(13, Array(0, 3, 7), Array(1.0, 1.0, 1.0)),
Expand All @@ -140,36 +152,36 @@ class OpSetVectorizerTest extends FlatSpec with TestSparkContext with AttributeA
result shouldBe expected
val vectorMetadata = fitted.getMetadata()
val expectedMeta = TestOpVectorMetadataBuilder(
vectorizer,
estimator,
top -> List(IndCol(Some("a")), IndCol(Some("A.")), IndCol(Some("C ")), IndCol(Some("b")), IndCol(Some("c")),
IndCol(Some("OTHER")), IndCol(Some(TransmogrifierDefaults.NullString))),
bot -> List(IndCol(Some("x")), IndCol(Some("y")), IndCol(Some("Z")), IndCol(Some("z")), IndCol(Some("OTHER")),
IndCol(Some(TransmogrifierDefaults.NullString)))
)
OpVectorMetadata(vectorizer.getOutputFeatureName, vectorMetadata) shouldEqual expectedMeta
OpVectorMetadata(estimator.getOutputFeatureName, vectorMetadata) shouldEqual expectedMeta
}

it should "throw an error if you try to set the topK to 0 or a negative number" in {
intercept[java.lang.IllegalArgumentException](vectorizer.setTopK(0))
intercept[java.lang.IllegalArgumentException](vectorizer.setTopK(-1))
intercept[java.lang.IllegalArgumentException](estimator.setTopK(0))
intercept[java.lang.IllegalArgumentException](estimator.setTopK(-1))
}

it should "return only the specified number of elements when top K is set" in {
val fitted = vectorizer.setCleanText(true).setTopK(1).fit(dataSet)
val transformed = fitted.transform(dataSet)
val vector = vectorizer.getOutput()
val fitted = estimator.setCleanText(true).setTopK(1).fit(inputData)
val transformed = fitted.transform(inputData)
val vector = estimator.getOutput()
val result = transformed.collect(vector)
val field = transformed.schema(vector.name)
val expect = OpVectorMetadata("", field.metadata).columns.map(c => !c.isOtherIndicator)
assertNominal(field, expect, result)
result shouldBe expectedData
vectorizer.setTopK(10)
estimator.setTopK(10)
}

it should "return only elements that exceed the min support value" in {
val fitted = vectorizer.setCleanText(true).setMinSupport(3).fit(dataSet)
val transformed = fitted.transform(dataSet)
val vector = vectorizer.getOutput()
val fitted = estimator.setCleanText(true).setMinSupport(3).fit(inputData)
val transformed = fitted.transform(inputData)
val vector = estimator.getOutput()
val result = transformed.collect(vector)
transformed.collect(vector) shouldBe Array(
Vectors.dense(1.0, 1.0, 0.0, 1.0, 0.0),
Expand All @@ -184,7 +196,7 @@ class OpSetVectorizerTest extends FlatSpec with TestSparkContext with AttributeA

it should "return a vector with elements only in the other & null columns and not throw errors when passed data" +
" it was not trained with" in {
val fitted = vectorizer.setMinSupport(0).setTopK(10).fit(dataSetEmpty)
val fitted = estimator.setMinSupport(0).setTopK(10).fit(dataSetEmpty)
val vector = fitted.getOutput()
val transformed = fitted.transform(dataSetEmpty)
val result = transformed.collect(vector)
Expand All @@ -199,20 +211,20 @@ class OpSetVectorizerTest extends FlatSpec with TestSparkContext with AttributeA
result shouldBe expected
val vectorMetadata = fitted.getMetadata()
val expectedMeta = TestOpVectorMetadataBuilder(
vectorizer,
estimator,
top -> List(
IndCol(Some("A")), IndCol(Some("B")), IndCol(Some("OTHER")), IndCol(Some(TransmogrifierDefaults.NullString))
),
bot -> List(IndCol(Some("OTHER")), IndCol(Some(TransmogrifierDefaults.NullString)))
)
OpVectorMetadata(vectorizer.getOutputFeatureName, vectorMetadata) shouldEqual expectedMeta
OpVectorMetadata(estimator.getOutputFeatureName, vectorMetadata) shouldEqual expectedMeta
val expected2 = Array(
Vectors.dense(1.0, 1.0, 0.0, 0.0, 1.0, 0.0),
Vectors.dense(1.0, 0.0, 0.0, 0.0, 2.0, 0.0),
Vectors.dense(0.0, 0.0, 1.0, 0.0, 2.0, 0.0),
Vectors.dense(1.0, 0.0, 1.0, 0.0, 1.0, 0.0)
).map(_.toOPVector)
val transformed2 = fitted.transform(dataSet)
val transformed2 = fitted.transform(inputData)
transformed2.collect(vector) shouldBe expected2
}

Expand Down Expand Up @@ -246,12 +258,12 @@ class OpSetVectorizerTest extends FlatSpec with TestSparkContext with AttributeA
Vectors.dense(0.0, 0.0, 1.0, 2.0),
Vectors.dense(1.0, 0.0, 1.0, 1.0)
).map(_.toOPVector)
val transformed2 = fitted.transform(dataSet)
val transformed2 = fitted.transform(inputData)
transformed2.collect(vector) shouldBe expected2
}

it should "work even if all features passed in are empty" in {
val fitted = vectorizer.setInput(top).setTopK(10).fit(dataSetAllEmpty)
val fitted = estimator.setInput(top).setTopK(10).fit(dataSetAllEmpty)
val vector = fitted.getOutput()
val transformed = fitted.transform(dataSetAllEmpty)
val expected = Array(Vectors.dense(0.0, 1.0), Vectors.dense(0.0, 1.0), Vectors.dense(0.0, 1.0)).map(_.toOPVector)
Expand All @@ -262,17 +274,17 @@ class OpSetVectorizerTest extends FlatSpec with TestSparkContext with AttributeA
result shouldBe expected
val vectorMetadata = fitted.getMetadata()
val expectedMeta = TestOpVectorMetadataBuilder(
vectorizer,
estimator,
top -> List(IndCol(Some("OTHER")), IndCol(Some(TransmogrifierDefaults.NullString)))
)
OpVectorMetadata(vectorizer.getOutputFeatureName, vectorMetadata) shouldEqual expectedMeta
OpVectorMetadata(estimator.getOutputFeatureName, vectorMetadata) shouldEqual expectedMeta
}

it should "be implemented as 'pivot' shortcut" in {
val result = top.pivot(others = Array(bot), topK = 1, cleanText = true, minSupport = 0, trackNulls = true)
val df = result.originStage
.asInstanceOf[Estimator[_]].fit(dataSet)
.asInstanceOf[Transformer].transform(dataSet)
.asInstanceOf[Estimator[_]].fit(inputData)
.asInstanceOf[Transformer].transform(inputData)

result.originStage shouldBe a[OpSetVectorizer[_]]
val actual = df.collect(result)
Expand Down Expand Up @@ -410,7 +422,7 @@ class OpSetVectorizerTest extends FlatSpec with TestSparkContext with AttributeA
.map(c => !(c.isOtherIndicator && c.parentFeatureType.head == FeatureType.typeName[MultiPickList]))
assertNominal(field, expect, result)
val expected = Array.fill(6)(OPVector.empty)
result should contain theSameElementsAs expected
result should contain theSameElementsAs expected
}

it should "process multiple columns of numerics, PickLists, and MultiPickLists using the vectorize shortcut" in {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
/*
* Copyright (c) 2017, Salesforce.com, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

package com.salesforce.op.stages.impl.feature

import com.salesforce.op.features.types._
import com.salesforce.op.stages.base.sequence.SequenceModel
import com.salesforce.op.test.{OpEstimatorSpec, TestFeatureBuilder}
import org.apache.spark.ml.linalg.Vectors
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner


@RunWith(classOf[JUnitRunner])
class OpTextPivotVectorizerTest
  extends OpEstimatorSpec[OPVector, SequenceModel[Text, OPVector], OpTextPivotVectorizer[Text]] {

  /**
   * Input dataset and the two raw text features ("text1", "text2") built from it.
   *
   * Four rows carry text in both columns; the last row is empty in both columns.
   * Kept lazy, as in the original spec, so the fixture is only built on first access.
   */
  lazy val (inputData, f1, f2) = {
    val populatedRows: Seq[(Text, Text)] = Seq(
      "hello world" -> "Hello world!",
      "hello world" -> "What's up",
      "good evening" -> "How are you doing, my friend?",
      "hello world" -> "Not bad, my friend."
    ).map { case (left, right) => left.toText -> right.toText }

    val allRows: Seq[(Text, Text)] = populatedRows :+ (Text.empty -> Text.empty)

    TestFeatureBuilder("text1", "text2", allRows)
  }

  /**
   * Estimator instance to be tested: a text pivot vectorizer over both features,
   * configured with a minimum support of 1 and a top K of 2.
   */
  override val estimator: OpTextPivotVectorizer[Text] = {
    val vectorizer = new OpTextPivotVectorizer[Text]()
    vectorizer.setMinSupport(1).setTopK(2).setInput(f1, f2)
  }

  /**
   * Expected result of the transformer applied on the input dataset.
   *
   * Every expected vector has size 8 with exactly two entries set to 1.0; only the
   * positions of those two entries differ from row to row.
   * NOTE(review): presumably each feature contributes 4 slots (top K values, OTHER, null
   * indicator), which would make indices 3 and 7 the null indicators hit by the empty
   * row - confirm against the vectorizer implementation.
   */
  override val expectedResult: Seq[OPVector] = {
    // One (first, second) pair of active indices per input row, in row order.
    val activeIndices = Seq((0, 4), (0, 6), (1, 5), (0, 6), (3, 7))
    activeIndices.map { case (first, second) =>
      Vectors.sparse(8, Array(first, second), Array(1.0, 1.0)).toOPVector
    }
  }
}

0 comments on commit f228f26

Please sign in to comment.