From 849c37c94c907e22dd193041af3738e7b773ab03 Mon Sep 17 00:00:00 2001 From: Christopher Suchanek Date: Thu, 18 Apr 2019 12:23:47 -0700 Subject: [PATCH 1/3] improved test: --- .../feature/SmartTextMapVectorizerTest.scala | 60 ++++++++++++------- 1 file changed, 40 insertions(+), 20 deletions(-) diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizerTest.scala index 1082634443..bb32382113 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizerTest.scala @@ -31,19 +31,21 @@ package com.salesforce.op.stages.impl.feature import com.salesforce.op._ -import com.salesforce.op.features.types._ -import com.salesforce.op.test.{TestFeatureBuilder, TestSparkContext} +import com.salesforce.op.stages.base.sequence.SequenceModel +import com.salesforce.op.test.{OpEstimatorSpec, TestFeatureBuilder, TestSparkContext} import com.salesforce.op.utils.spark.{OpVectorColumnMetadata, OpVectorMetadata} import com.salesforce.op.utils.spark.RichDataset._ -import com.salesforce.op.utils.spark.RichMetadata._ +import org.apache.spark.ml.linalg.Vectors import org.junit.runner.RunWith -import org.scalatest.FlatSpec import org.scalatest.junit.JUnitRunner - +import com.salesforce.op.features.types._ @RunWith(classOf[JUnitRunner]) -class SmartTextMapVectorizerTest extends FlatSpec with TestSparkContext with AttributeAsserts { - lazy val (data, m1, m2, f1, f2) = TestFeatureBuilder("textMap1", "textMap2", "text1", "text2", +class SmartTextMapVectorizerTest + extends OpEstimatorSpec[OPVector, SequenceModel[TextMap, OPVector], SmartTextMapVectorizer[TextMap]] + with AttributeAsserts { + + lazy val (inputData, m1, m2, f1, f2) = TestFeatureBuilder("textMap1", "textMap2", "text1", "text2", Seq[(TextMap, TextMap, Text, Text)]( (TextMap(Map("text1" -> "hello world", "text2" -> "Hello world!")), TextMap.empty, "hello world".toText, "Hello world!".toText), @@ -71,6 +73,26 @@ class SmartTextMapVectorizerTest extends FlatSpec with TestSparkContext with Att ) ) + /** + * Estimator instance to be tested + */ + override val estimator: SmartTextMapVectorizer[TextMap] = new SmartTextMapVectorizer[TextMap]() + .setMaxCardinality(2).setNumFeatures(4).setMinSupport(1).setTopK(2).setPrependFeatureName(true) + .setCleanKeys(false) + .setInput(m1, m2) + + /** + * Expected result of the transformer applied on the Input Dataset + */ + override val expectedResult: Seq[OPVector] = Seq( + Vectors.sparse(9, Array(0, 5, 7), Array(1.0, 1.0, 1.0)), + Vectors.sparse(9, Array(0, 8), Array(1.0, 1.0)), + Vectors.sparse(9, Array(1, 4), Array(1.0, 1.0)), + Vectors.sparse(9, Array(0, 4), Array(1.0, 2.0)), + Vectors.sparse(9, Array(3, 8), Array(1.0, 1.0)) + ).map(_.toOPVector) + + Spec[TextMapStats] should "provide a proper semigroup" in { val data = Seq( TextMapStats(Map( @@ -93,17 +115,14 @@ class SmartTextMapVectorizerTest extends FlatSpec with TestSparkContext with Att ))) } - Spec[SmartTextMapVectorizer[_]] should "detect one categorical and one non-categorical text feature" in { - val smartMapVectorized = new SmartTextMapVectorizer[TextMap]() - .setMaxCardinality(2).setNumFeatures(4).setMinSupport(1).setTopK(2).setPrependFeatureName(true) - .setCleanKeys(false) - .setInput(m1, m2).getOutput() + it should "detect one categorical and one non-categorical text feature" in { + val smartMapVectorized = estimator.getOutput() val smartVectorized = new SmartTextVectorizer() .setMaxCardinality(2).setNumFeatures(4).setMinSupport(1).setTopK(2).setPrependFeatureName(true) .setInput(f1, f2).getOutput() - val transformed = new OpWorkflow().setResultFeatures(smartMapVectorized, smartVectorized).transform(data) + val transformed = new OpWorkflow().setResultFeatures(smartMapVectorized, smartVectorized).transform(inputData) val result = transformed.collect(smartMapVectorized, smartVectorized) val field = transformed.schema(smartVectorized.name) assertNominal(field, Array.fill(4)(true) ++ Array.fill(4)(false) :+ true, transformed.collect(smartVectorized)) @@ -136,7 +155,7 @@ class SmartTextMapVectorizerTest extends FlatSpec with TestSparkContext with Att .setMaxCardinality(10).setNumFeatures(4).setMinSupport(1).setTopK(2).setPrependFeatureName(true) .setInput(f1, f2).getOutput() - val transformed = new OpWorkflow().setResultFeatures(smartMapVectorized, smartVectorized).transform(data) + val transformed = new OpWorkflow().setResultFeatures(smartMapVectorized, smartVectorized).transform(inputData) val result = transformed.collect(smartMapVectorized, smartVectorized) val field = transformed.schema(smartVectorized.name) val rSmart = transformed.collect(smartVectorized) @@ -171,7 +190,7 @@ class SmartTextMapVectorizerTest extends FlatSpec with TestSparkContext with Att .setHashSpaceStrategy(HashSpaceStrategy.Separate) .setInput(f1, f2).getOutput() - val transformed = new OpWorkflow().setResultFeatures(smartMapVectorized, smartVectorized).transform(data) + val transformed = new OpWorkflow().setResultFeatures(smartMapVectorized, smartVectorized).transform(inputData) val result = transformed.collect(smartMapVectorized, smartVectorized) val field = transformed.schema(smartVectorized.name) assertNominal(field, Array.fill(8)(false) ++ Array(true, true), transformed.collect(smartVectorized)) @@ -205,7 +224,7 @@ class SmartTextMapVectorizerTest extends FlatSpec with TestSparkContext with Att .setNumFeatures(4).setHashSpaceStrategy(HashSpaceStrategy.Shared) .setInput(f1, f2).getOutput() - val transformed = new OpWorkflow().setResultFeatures(smartMapVectorized, smartVectorized).transform(data) + val transformed = new OpWorkflow().setResultFeatures(smartMapVectorized, smartVectorized).transform(inputData) val result = transformed.collect(smartMapVectorized, smartVectorized) val field = transformed.schema(smartVectorized.name) assertNominal(field, Array.fill(4)(false) ++ Array(true, true), transformed.collect(smartVectorized)) @@ -242,7 +261,7 @@ class SmartTextMapVectorizerTest extends FlatSpec with TestSparkContext with Att .setNumFeatures(TransmogrifierDefaults.MaxNumOfFeatures).setHashSpaceStrategy(HashSpaceStrategy.Auto) .setInput(f1, f2).getOutput() - val transformed = new OpWorkflow().setResultFeatures(smartMapVectorized, smartVectorized).transform(data) + val transformed = new OpWorkflow().setResultFeatures(smartMapVectorized, smartVectorized).transform(inputData) val result = transformed.collect(smartMapVectorized, smartVectorized) val field = transformed.schema(smartVectorized.name) val rSmart = transformed.collect(smartVectorized) @@ -282,7 +301,7 @@ class SmartTextMapVectorizerTest extends FlatSpec with TestSparkContext with Att others = Array(m2) ) - val transformed = new OpWorkflow().setResultFeatures(smartMapVectorized, shortcutMapVectorized).transform(data) + val transformed = new OpWorkflow().setResultFeatures(smartMapVectorized, shortcutMapVectorized).transform(inputData) val result = transformed.collect(smartMapVectorized, shortcutMapVectorized) val field = transformed.schema(shortcutMapVectorized.name) assertNominal(field, Array.fill(4)(true) ++ Array.fill(4)(false) :+ true, @@ -316,7 +335,7 @@ class SmartTextMapVectorizerTest extends FlatSpec with TestSparkContext with Att .setCleanKeys(false) .setInput(m3, m4).getOutput() - val transformed = new OpWorkflow().setResultFeatures(textMapVectorized, textAreaMapVectorized).transform(data) + val transformed = new OpWorkflow().setResultFeatures(textMapVectorized, textAreaMapVectorized).transform(inputData) val result = transformed.collect(textMapVectorized, textAreaMapVectorized) val field = transformed.schema(textMapVectorized.name) assertNominal(field, Array.fill(4)(true) ++ Array.fill(4)(false) :+ true, @@ -352,7 +371,7 @@ class SmartTextMapVectorizerTest extends FlatSpec with TestSparkContext with Att .setTrackTextLen(true) .setInput(f1, f2).getOutput() - val transformed = new OpWorkflow().setResultFeatures(smartMapVectorized, smartVectorized).transform(data) + val transformed = new OpWorkflow().setResultFeatures(smartMapVectorized, smartVectorized).transform(inputData) val result = transformed.collect(smartMapVectorized, smartVectorized) val field = transformed.schema(smartVectorized.name) @@ -376,4 +395,5 @@ class SmartTextMapVectorizerTest extends FlatSpec with TestSparkContext with Att result.foreach { case (vec1, vec2) => vec1 shouldBe vec2 } } + } From aad25d1ff71ad70611f049ce7df8eb88e9dc1359 Mon Sep 17 00:00:00 2001 From: Christopher Suchanek Date: Mon, 29 Apr 2019 17:00:53 -0700 Subject: [PATCH 2/3] fixed test --- .../impl/feature/SmartTextMapVectorizerTest.scala | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizerTest.scala index bb32382113..1e1b6b774a 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizerTest.scala @@ -77,7 +77,6 @@ class SmartTextMapVectorizerTest * Estimator instance to be tested */ override val estimator: SmartTextMapVectorizer[TextMap] = new SmartTextMapVectorizer[TextMap]() - .setMaxCardinality(2).setNumFeatures(4).setMinSupport(1).setTopK(2).setPrependFeatureName(true) .setCleanKeys(false) .setInput(m1, m2) @@ -85,11 +84,11 @@ class SmartTextMapVectorizerTest * Expected result of the transformer applied on the Input Dataset */ override val expectedResult: Seq[OPVector] = Seq( - Vectors.sparse(9, Array(0, 5, 7), Array(1.0, 1.0, 1.0)), - Vectors.sparse(9, Array(0, 8), Array(1.0, 1.0)), - Vectors.sparse(9, Array(1, 4), Array(1.0, 1.0)), - Vectors.sparse(9, Array(0, 4), Array(1.0, 2.0)), - Vectors.sparse(9, Array(3, 8), Array(1.0, 1.0)) + Vectors.dense(Array(1.0, 0.0, 1.0, 0.0)), + Vectors.dense(Array(1.0, 0.0, 1.0, 0.0)), + Vectors.dense(Array(1.0, 0.0, 1.0, 0.0)), + Vectors.dense(Array(1.0, 0.0, 1.0, 0.0)), + Vectors.dense(Array(0.0, 1.0, 0.0, 1.0)) ).map(_.toOPVector) @@ -116,6 +115,10 @@ class SmartTextMapVectorizerTest } it should "detect one categorical and one non-categorical text feature" in { + val estimator: SmartTextMapVectorizer[TextMap] = new SmartTextMapVectorizer[TextMap]() + .setMaxCardinality(2).setNumFeatures(4).setMinSupport(1).setTopK(2).setPrependFeatureName(true) + .setCleanKeys(false) + .setInput(m1, m2) val smartMapVectorized = estimator.getOutput() val smartVectorized = new SmartTextVectorizer() From d75d5a810accd52ac65a4bd02720a3ec5f04850b Mon Sep 17 00:00:00 2001 From: Christopher Suchanek Date: Mon, 29 Apr 2019 17:02:48 -0700 Subject: [PATCH 3/3] fixed test --- .../op/stages/impl/feature/SmartTextMapVectorizerTest.scala | 1 - 1 file changed, 1 deletion(-) diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizerTest.scala index 1e1b6b774a..b42a9b7dc5 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizerTest.scala @@ -77,7 +77,6 @@ class SmartTextMapVectorizerTest * Estimator instance to be tested */ override val estimator: SmartTextMapVectorizer[TextMap] = new SmartTextMapVectorizer[TextMap]() - .setCleanKeys(false) .setInput(m1, m2) /**