salesforce · wsuchy · Apr 17, 2019 · Apr 17, 2019 · Apr 17, 2019 · Apr 17, 2019
@@ -64,7 +64,7 @@ abstract class OpOneHotVectorizer[T <: FeatureType]
   uid: String = UID[OpOneHotVectorizer[_]]
 )(implicit tti: TypeTag[T], ttiv: TypeTag[T#Value])
   extends SequenceEstimator[T, OPVector](operationName = operationName, uid = uid)
-    with VectorizerDefaults with PivotParams with CleanTextFun with SaveOthersParams
+    with PivotParams with CleanTextFun with SaveOthersParams
     with TrackNullsParam with MinSupportParam with OneHotFun with MaxPctCardinalityParams {
 
   protected def convertToSeqOfMaps(dataset: Dataset[Seq[T#Value]]): RDD[Seq[Map[String, Int]]]
@@ -146,7 +146,7 @@ abstract class OpOneHotVectorizerModel[T <: FeatureType]
   uid: String
 )(implicit tti: TypeTag[T])
   extends SequenceModel[T, OPVector](operationName = operationName, uid = uid)
-    with VectorizerDefaults with CleanTextFun with OneHotModelFun[T] {
+    with CleanTextFun with OneHotModelFun[T] {
 
   def transformFn: Seq[T] => OPVector = pivotFn(topValues, shouldCleanText, shouldTrackNulls)
 

@@ -35,19 +35,22 @@ import com.salesforce.op.features.FeatureLike
 import com.salesforce.op.features.types._
 import com.salesforce.op.stages.base.sequence.SequenceModel
 import com.salesforce.op.test.TestOpVectorColumnType.IndCol
-import com.salesforce.op.test.{TestFeatureBuilder, TestOpVectorMetadataBuilder, TestOpWorkflowBuilder, TestSparkContext}
+import com.salesforce.op.test._
 import com.salesforce.op.utils.spark.OpVectorMetadata
 import com.salesforce.op.utils.spark.RichDataset._
 import org.apache.spark.ml.linalg.Vectors
 import org.apache.spark.ml.{Estimator, Transformer}
+import org.apache.spark.sql.Dataset
 import org.junit.runner.RunWith
 import org.scalatest.FlatSpec
 import org.scalatest.junit.JUnitRunner
 import org.slf4j.LoggerFactory
 
 
 @RunWith(classOf[JUnitRunner])
-class OpSetVectorizerTest extends FlatSpec with TestSparkContext with AttributeAsserts {
+class OpSetVectorizerTest extends
+  OpEstimatorSpec[OPVector, SequenceModel[MultiPickList, OPVector], OpSetVectorizer[MultiPickList]]
+  with AttributeAsserts {
 
   val log = LoggerFactory.getLogger(this.getClass)
 
@@ -57,15 +60,25 @@ class OpSetVectorizerTest extends FlatSpec with TestSparkContext with AttributeA
     (Seq("c"), Seq("x", "y")),
     (Seq("C ", "A."), Seq("Z", "Z", "Z"))
   )
-  val expectedData = Array(
+
+  override val expectedResult = Seq(
+    Vectors.sparse(10, Array(0, 2, 5), Array(1.0, 1.0, 1.0)),
+    Vectors.sparse(10, Array(0, 6, 7), Array(1.0, 1.0, 1.0)),
+    Vectors.sparse(10, Array(1, 5, 6), Array(1.0, 1.0, 1.0)),
+    Vectors.sparse(10, Array(0, 1, 7), Array(1.0, 1.0, 1.0))
+  ).map(_.toOPVector)
+
+  val expectedData = Seq(
     Vectors.dense(1.0, 1.0, 0.0, 1.0, 0.0, 0.0),
     Vectors.dense(1.0, 0.0, 0.0, 0.0, 2.0, 0.0),
     Vectors.dense(0.0, 1.0, 0.0, 1.0, 1.0, 0.0),
     Vectors.dense(1.0, 1.0, 0.0, 0.0, 1.0, 0.0)
   ).map(_.toOPVector)
 
-  val (dataSet, top, bot) = TestFeatureBuilder("top", "bot", data.map(v =>
+
+  val (inputData, top, bot) = TestFeatureBuilder("top", "bot", data.map(v =>
     v._1.toMultiPickList -> v._2.toMultiPickList))
+
   val (dataSetEmpty, _, _) = TestFeatureBuilder(top.name, bot.name,
     Seq[(MultiPickList, MultiPickList)](
       (Seq("a", "b").toMultiPickList, MultiPickList.empty),
@@ -77,38 +90,37 @@ class OpSetVectorizerTest extends FlatSpec with TestSparkContext with AttributeA
   val (dataSetAllEmpty, _) =
     TestFeatureBuilder(top.name, Seq[MultiPickList](MultiPickList.empty, MultiPickList.empty, MultiPickList.empty))
 
-  val vectorizer = new OpSetVectorizer[MultiPickList]().setInput(top, bot).setMinSupport(0).setTopK(10)
-
+  val estimator = new OpSetVectorizer[MultiPickList]().setInput(top, bot).setMinSupport(0).setTopK(10)
 
   Spec[OpSetVectorizer[_]] should "take an array of features as input and return a single vector feature" in {
-    val vector = vectorizer.getOutput()
-    vector.name shouldBe vectorizer.getOutputFeatureName
+    val vector = estimator.getOutput() // no fit here: only the output feature's wiring is checked
+    vector.name shouldBe estimator.getOutputFeatureName
     vector.typeName shouldBe FeatureType.typeName[OPVector]
     vector.isResponse shouldBe false
-    vector.originStage shouldBe vectorizer
+    vector.originStage shouldBe estimator // the output feature must point back at the stage that created it
     vector.parents should contain theSameElementsAs Array(top, bot)
   }
 
   it should "return the a fitted vectorizer with the correct parameters" in {
-    val fitted = vectorizer.fit(dataSet)
+    val fitted = estimator.fit(inputData) // fit on the shared top/bot MultiPickList fixture
-    fitted.isInstanceOf[SequenceModel[_, _]]
+    fitted shouldBe a[SequenceModel[_, _]] // real assertion: the bare isInstanceOf result was discarded
     val vectorMetadata = fitted.getMetadata()
     val expectedMeta = TestOpVectorMetadataBuilder(
-      vectorizer,
+      estimator,
       top -> List(IndCol(Some("A")), IndCol(Some("C")), IndCol(Some("B")), IndCol(Some("OTHER")),
         IndCol(Some(TransmogrifierDefaults.NullString))),
       bot -> List(IndCol(Some("X")), IndCol(Some("Y")), IndCol(Some("Z")), IndCol(Some("OTHER")),
         IndCol(Some(TransmogrifierDefaults.NullString)))
     )
-    OpVectorMetadata(vectorizer.getOutputFeatureName, vectorMetadata) shouldEqual expectedMeta
+    OpVectorMetadata(estimator.getOutputFeatureName, vectorMetadata) shouldEqual expectedMeta
     fitted.getInputFeatures() shouldBe Array(top, bot)
-    fitted.parent shouldBe vectorizer
+    fitted.parent shouldBe estimator // the model must reference the estimator that produced it
   }
 
   it should "return the expected vector with the default param settings" in {
-    val fitted = vectorizer.fit(dataSet)
-    val transformed = fitted.transform(dataSet)
-    val vector = vectorizer.getOutput()
+    val fitted = estimator.fit(inputData)
+    val transformed = fitted.transform(inputData)
+    val vector = estimator.getOutput()
     val result = transformed.collect(vector)
     val expected = Array(
       Vectors.sparse(10, Array(0, 2, 5), Array(1.0, 1.0, 1.0)),
@@ -124,9 +136,9 @@ class OpSetVectorizerTest extends FlatSpec with TestSparkContext with AttributeA
   }
 
   it should "not clean the variable names when clean text is set to false" in {
-    val fitted = vectorizer.setCleanText(false).fit(dataSet)
-    val transformed = fitted.transform(dataSet)
-    val vector = vectorizer.getOutput()
+    val fitted = estimator.setCleanText(false).fit(inputData)
+    val transformed = fitted.transform(inputData)
+    val vector = estimator.getOutput()
     val result = transformed.collect(vector)
     val expected = Array(
       Vectors.sparse(13, Array(0, 3, 7), Array(1.0, 1.0, 1.0)),
@@ -140,36 +152,36 @@ class OpSetVectorizerTest extends FlatSpec with TestSparkContext with AttributeA
     result shouldBe expected
     val vectorMetadata = fitted.getMetadata()
     val expectedMeta = TestOpVectorMetadataBuilder(
-      vectorizer,
+      estimator,
       top -> List(IndCol(Some("a")), IndCol(Some("A.")), IndCol(Some("C ")), IndCol(Some("b")), IndCol(Some("c")),
         IndCol(Some("OTHER")), IndCol(Some(TransmogrifierDefaults.NullString))),
       bot -> List(IndCol(Some("x")), IndCol(Some("y")), IndCol(Some("Z")), IndCol(Some("z")), IndCol(Some("OTHER")),
         IndCol(Some(TransmogrifierDefaults.NullString)))
     )
-    OpVectorMetadata(vectorizer.getOutputFeatureName, vectorMetadata) shouldEqual expectedMeta
+    OpVectorMetadata(estimator.getOutputFeatureName, vectorMetadata) shouldEqual expectedMeta
   }
 
   it should "throw an error if you try to set the topK to 0 or a negative number" in {
-    intercept[java.lang.IllegalArgumentException](vectorizer.setTopK(0))
-    intercept[java.lang.IllegalArgumentException](vectorizer.setTopK(-1))
+    intercept[java.lang.IllegalArgumentException](estimator.setTopK(0)) // boundary value: zero
+    intercept[java.lang.IllegalArgumentException](estimator.setTopK(-1)) // negative value
   }
 
   it should "return only the specified number of elements when top K is set" in {
-    val fitted = vectorizer.setCleanText(true).setTopK(1).fit(dataSet)
-    val transformed = fitted.transform(dataSet)
-    val vector = vectorizer.getOutput()
+    val fitted = estimator.setCleanText(true).setTopK(1).fit(inputData) // changes the shared estimator; reset below
+    val transformed = fitted.transform(inputData)
+    val vector = estimator.getOutput()
     val result = transformed.collect(vector)
     val field = transformed.schema(vector.name)
     val expect = OpVectorMetadata("", field.metadata).columns.map(c => !c.isOtherIndicator)
     assertNominal(field, expect, result)
     result shouldBe expectedData
-    vectorizer.setTopK(10)
+    estimator.setTopK(10) // restore the shared estimator's topK; skipped if an assertion above throws
   }
 
   it should "return only elements that exceed the min support value" in {
-    val fitted = vectorizer.setCleanText(true).setMinSupport(3).fit(dataSet)
-    val transformed = fitted.transform(dataSet)
-    val vector = vectorizer.getOutput()
+    val fitted = estimator.setCleanText(true).setMinSupport(3).fit(inputData)
+    val transformed = fitted.transform(inputData)
+    val vector = estimator.getOutput()
     val result = transformed.collect(vector)
     transformed.collect(vector) shouldBe Array(
       Vectors.dense(1.0, 1.0, 0.0, 1.0, 0.0),
@@ -184,7 +196,7 @@ class OpSetVectorizerTest extends FlatSpec with TestSparkContext with AttributeA
 
   it should "return a vector with elements only in the other & null columns and not throw errors when passed data" +
     " it was not trained with" in {
-    val fitted = vectorizer.setMinSupport(0).setTopK(10).fit(dataSetEmpty)
+    val fitted = estimator.setMinSupport(0).setTopK(10).fit(dataSetEmpty)
     val vector = fitted.getOutput()
     val transformed = fitted.transform(dataSetEmpty)
     val result = transformed.collect(vector)
@@ -199,20 +211,20 @@ class OpSetVectorizerTest extends FlatSpec with TestSparkContext with AttributeA
     result shouldBe expected
     val vectorMetadata = fitted.getMetadata()
     val expectedMeta = TestOpVectorMetadataBuilder(
-      vectorizer,
+      estimator,
       top -> List(
         IndCol(Some("A")), IndCol(Some("B")), IndCol(Some("OTHER")), IndCol(Some(TransmogrifierDefaults.NullString))
       ),
       bot -> List(IndCol(Some("OTHER")), IndCol(Some(TransmogrifierDefaults.NullString)))
     )
-    OpVectorMetadata(vectorizer.getOutputFeatureName, vectorMetadata) shouldEqual expectedMeta
+    OpVectorMetadata(estimator.getOutputFeatureName, vectorMetadata) shouldEqual expectedMeta
     val expected2 = Array(
       Vectors.dense(1.0, 1.0, 0.0, 0.0, 1.0, 0.0),
       Vectors.dense(1.0, 0.0, 0.0, 0.0, 2.0, 0.0),
       Vectors.dense(0.0, 0.0, 1.0, 0.0, 2.0, 0.0),
       Vectors.dense(1.0, 0.0, 1.0, 0.0, 1.0, 0.0)
     ).map(_.toOPVector)
-    val transformed2 = fitted.transform(dataSet)
+    val transformed2 = fitted.transform(inputData)
     transformed2.collect(vector) shouldBe expected2
   }
 
@@ -246,12 +258,12 @@ class OpSetVectorizerTest extends FlatSpec with TestSparkContext with AttributeA
       Vectors.dense(0.0, 0.0, 1.0, 2.0),
       Vectors.dense(1.0, 0.0, 1.0, 1.0)
     ).map(_.toOPVector)
-    val transformed2 = fitted.transform(dataSet)
+    val transformed2 = fitted.transform(inputData)
     transformed2.collect(vector) shouldBe expected2
   }
 
   it should "work even if all features passed in are empty" in {
-    val fitted = vectorizer.setInput(top).setTopK(10).fit(dataSetAllEmpty)
+    val fitted = estimator.setInput(top).setTopK(10).fit(dataSetAllEmpty)
     val vector = fitted.getOutput()
     val transformed = fitted.transform(dataSetAllEmpty)
     val expected = Array(Vectors.dense(0.0, 1.0), Vectors.dense(0.0, 1.0), Vectors.dense(0.0, 1.0)).map(_.toOPVector)
@@ -262,17 +274,17 @@ class OpSetVectorizerTest extends FlatSpec with TestSparkContext with AttributeA
     result shouldBe expected
     val vectorMetadata = fitted.getMetadata()
     val expectedMeta = TestOpVectorMetadataBuilder(
-      vectorizer,
+      estimator,
       top -> List(IndCol(Some("OTHER")), IndCol(Some(TransmogrifierDefaults.NullString)))
     )
-    OpVectorMetadata(vectorizer.getOutputFeatureName, vectorMetadata) shouldEqual expectedMeta
+    OpVectorMetadata(estimator.getOutputFeatureName, vectorMetadata) shouldEqual expectedMeta
   }
 
   it should "be implemented as 'pivot' shortcut" in {
     val result = top.pivot(others = Array(bot), topK = 1, cleanText = true, minSupport = 0, trackNulls = true)
     val df = result.originStage
-      .asInstanceOf[Estimator[_]].fit(dataSet)
-      .asInstanceOf[Transformer].transform(dataSet)
+      .asInstanceOf[Estimator[_]].fit(inputData)
+      .asInstanceOf[Transformer].transform(inputData)
 
     result.originStage shouldBe a[OpSetVectorizer[_]]
     val actual = df.collect(result)
@@ -410,7 +422,7 @@ class OpSetVectorizerTest extends FlatSpec with TestSparkContext with AttributeA
       .map(c => !(c.isOtherIndicator && c.parentFeatureType.head == FeatureType.typeName[MultiPickList]))
     assertNominal(field, expect, result)
     val expected = Array.fill(6)(OPVector.empty)
-    result should contain theSameElementsAs  expected
+    result should contain theSameElementsAs expected
   }
 
   it should "process multiple columns of numerics, PickLists, and MultiPickLists using the vectorize shortcut" in {

@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2017, Salesforce.com, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice, this
+ *   list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright notice,
+ *   this list of conditions and the following disclaimer in the documentation
+ *   and/or other materials provided with the distribution.
+ *
+ * * Neither the name of the copyright holder nor the names of its
+ *   contributors may be used to endorse or promote products derived from
+ *   this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+package com.salesforce.op.stages.impl.feature
+
+import com.salesforce.op.features.types._
+import com.salesforce.op.stages.base.sequence.SequenceModel
+import com.salesforce.op.test.{OpEstimatorSpec, TestFeatureBuilder}
+import org.apache.spark.ml.linalg.Vectors
+import org.junit.runner.RunWith
+import org.scalatest.junit.JUnitRunner
+
+/** Spec for [[OpTextPivotVectorizer]] over two Text features, built on the [[OpEstimatorSpec]] base spec. */
+@RunWith(classOf[JUnitRunner])
+class OpTextPivotVectorizerTest
+  extends OpEstimatorSpec[OPVector, SequenceModel[Text, OPVector], OpTextPivotVectorizer[Text]] {
+  /** Five-row fixture: Text columns "text1"/"text2" with features f1/f2; the last row is empty in both columns. */
+  lazy val (inputData, f1, f2) = TestFeatureBuilder("text1", "text2",
+    Seq[(Text, Text)](
+      ("hello world".toText, "Hello world!".toText),
+      ("hello world".toText, "What's up".toText),
+      ("good evening".toText, "How are you doing, my friend?".toText),
+      ("hello world".toText, "Not bad, my friend.".toText),
+      (Text.empty, Text.empty)
+    )
+  )
+
+  /**
+   * Estimator under test: an OpTextPivotVectorizer over f1 and f2 with minSupport = 1 and topK = 2
+   */
+  override val estimator: OpTextPivotVectorizer[Text] = new OpTextPivotVectorizer()
+    .setMinSupport(1)
+    .setTopK(2)
+    .setInput(f1, f2)
+
+  /**
+   * Expected transformer output: one sparse 8-element vector per input row, each with two entries set to 1.0
+   */
+  override val expectedResult: Seq[OPVector] = Seq(
+    Vectors.sparse(8, Array(0, 4), Array(1.0, 1.0)),
+    Vectors.sparse(8, Array(0, 6), Array(1.0, 1.0)),
+    Vectors.sparse(8, Array(1, 5), Array(1.0, 1.0)),
+    Vectors.sparse(8, Array(0, 6), Array(1.0, 1.0)),
+    Vectors.sparse(8, Array(3, 7), Array(1.0, 1.0)) // both-empty row; presumably the null columns (confirm)
+  ).map(_.toOPVector)
+}