From 12ce67746b3093e2e242f53cfe7d34980b1ec2a8 Mon Sep 17 00:00:00 2001 From: mweilsalesforce Date: Tue, 4 Sep 2018 17:09:35 -0700 Subject: [PATCH 01/15] Adding Attributes when converting to Metadata --- .../op/utils/spark/OpVectorMetadata.scala | 30 ++++++++++++++----- 1 file changed, 23 insertions(+), 7 deletions(-) diff --git a/features/src/main/scala/com/salesforce/op/utils/spark/OpVectorMetadata.scala b/features/src/main/scala/com/salesforce/op/utils/spark/OpVectorMetadata.scala index 5c693f5e3a..7b960eb439 100644 --- a/features/src/main/scala/com/salesforce/op/utils/spark/OpVectorMetadata.scala +++ b/features/src/main/scala/com/salesforce/op/utils/spark/OpVectorMetadata.scala @@ -31,6 +31,7 @@ package com.salesforce.op.utils.spark import com.salesforce.op.FeatureHistory +import org.apache.spark.ml.attribute.{AttributeGroup, BinaryAttribute, NominalAttribute, NumericAttribute} import org.apache.spark.ml.linalg.SQLDataTypes._ import org.apache.spark.sql.types.{Metadata, MetadataBuilder, StructField} @@ -38,8 +39,9 @@ import org.apache.spark.sql.types.{Metadata, MetadataBuilder, StructField} * Represents a metadata wrapper that includes parent feature information. * * The metadata includes a columns field that describes the columns in the vector. - * @param name name of the feature vector - * @param col information about each element in the vector + * + * @param name name of the feature vector + * @param col information about each element in the vector * @param history history of parent features used to create the vector map is from * OpVectorColumnMetadata.parentFeatureName (String) to FeatureHistory */ @@ -53,7 +55,7 @@ class OpVectorMetadata private /** * Column metadata with indicies fixed to match order passed in */ - val columns: Array[OpVectorColumnMetadata] = col.zipWithIndex.map{ case(c, i) => c.copy(index = i) } + val columns: Array[OpVectorColumnMetadata] = col.zipWithIndex.map { case (c, i) => c.copy(index = i) } /** * Get the number of columns in vectors of this type @@ -82,12 +84,24 @@ class OpVectorMetadata private val groupedCol = columns .groupBy(c => (c.parentFeatureName, c.parentFeatureType, c.grouping, c.indicatorValue, c.descriptorValue)) val colData = groupedCol.toSeq - .map{ case (_, g) => g.head -> g.map(_.index) } - val colMeta = colData.map{ case (c, i) => c.toMetadata(i) } - new MetadataBuilder() + .map { case (_, g) => g.head -> g.map(_.index) } + val colMeta = colData.map { case (c, i) => + c.toMetadata(i) + } + val meta = new MetadataBuilder() .putMetadataArray(OpVectorMetadata.ColumnsKey, colMeta.toArray) .putMetadata(OpVectorMetadata.HistoryKey, FeatureHistory.toMetadata(history)) .build() + val nominalValues = columns + .groupBy(c => (c.parentFeatureName, c.parentFeatureType, c.grouping)) + .mapValues(_.filter(_.indicatorValue.isDefined).map(_.indicatorValue.get).distinct) + val attributes = columns.map { c => + if (c.indicatorValue.isDefined) BinaryAttribute.defaultAttr.withName(c.makeColName()).withIndex(c.index) + else { + NumericAttribute.defaultAttr.withName(c.makeColName()).withIndex(c.index) + } + } + new AttributeGroup(name, attributes).toMetadata(meta) } /** @@ -102,10 +116,11 @@ class OpVectorMetadata private /** * Extract the full history for each element of the vector + * * @return Sequence of [[OpVectorColumnHistory]] */ def getColumnHistory(): Seq[OpVectorColumnHistory] = { - columns.map{ c => + columns.map { c => val hist = c.parentFeatureName.map(pn => history.getOrElse(pn, throw new RuntimeException(s"Parent feature name '${pn}' has no 
associated history"))) val histComb = hist.head.merge(hist.tail: _*) @@ -161,6 +176,7 @@ class OpVectorMetadata private } object OpVectorMetadata { + import com.salesforce.op.utils.spark.RichMetadata._ val ColumnsKey = "vector_columns" From 1ac45ebdb848b859ec83dfb8dac11d686113724f Mon Sep 17 00:00:00 2001 From: mweilsalesforce Date: Wed, 5 Sep 2018 13:54:52 -0700 Subject: [PATCH 02/15] Treated Text types as special uses cases --- .../com/salesforce/op/utils/spark/OpVectorMetadata.scala | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/features/src/main/scala/com/salesforce/op/utils/spark/OpVectorMetadata.scala b/features/src/main/scala/com/salesforce/op/utils/spark/OpVectorMetadata.scala index 7b960eb439..874a4bf6bd 100644 --- a/features/src/main/scala/com/salesforce/op/utils/spark/OpVectorMetadata.scala +++ b/features/src/main/scala/com/salesforce/op/utils/spark/OpVectorMetadata.scala @@ -31,6 +31,7 @@ package com.salesforce.op.utils.spark import com.salesforce.op.FeatureHistory +import com.salesforce.op.features.types.{Binary, BinaryMap, Text, TextArea, TextAreaMap, TextMap} import org.apache.spark.ml.attribute.{AttributeGroup, BinaryAttribute, NominalAttribute, NumericAttribute} import org.apache.spark.ml.linalg.SQLDataTypes._ import org.apache.spark.sql.types.{Metadata, MetadataBuilder, StructField} @@ -74,7 +75,7 @@ class OpVectorMetadata private newColumns: Array[OpVectorColumnMetadata] ): OpVectorMetadata = OpVectorMetadata(name, newColumns, history) - + val textTypes = Seq(Text, TextArea, TextAreaMap, TextMap, Binary, BinaryMap).map(_.getClass.toString.dropRight(1)) /** * Serialize to spark metadata * @@ -92,11 +93,9 @@ class OpVectorMetadata private .putMetadataArray(OpVectorMetadata.ColumnsKey, colMeta.toArray) .putMetadata(OpVectorMetadata.HistoryKey, FeatureHistory.toMetadata(history)) .build() - val nominalValues = columns - .groupBy(c => (c.parentFeatureName, c.parentFeatureType, c.grouping)) - .mapValues(_.filter(_.indicatorValue.isDefined).map(_.indicatorValue.get).distinct) val attributes = columns.map { c => - if (c.indicatorValue.isDefined) BinaryAttribute.defaultAttr.withName(c.makeColName()).withIndex(c.index) + if (c.indicatorValue.isDefined || textTypes.exists(c.parentFeatureType.contains)) + BinaryAttribute.defaultAttr.withName(c.makeColName()).withIndex(c.index) else { NumericAttribute.defaultAttr.withName(c.makeColName()).withIndex(c.index) } From 9c7d9ff0356b7663523c8f29a6d853dabd591989 Mon Sep 17 00:00:00 2001 From: mweilsalesforce Date: Wed, 5 Sep 2018 16:25:27 -0700 Subject: [PATCH 03/15] FirstTest --- .../impl/feature/AttributeTestUtils.scala | 16 ++++++++++ .../impl/feature/DateListVectorizerTest.scala | 32 +++++++++++++++---- .../op/utils/spark/OpVectorMetadata.scala | 4 +-- 3 files changed, 43 insertions(+), 9 deletions(-) create mode 100644 core/src/test/scala/com/salesforce/op/stages/impl/feature/AttributeTestUtils.scala diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/AttributeTestUtils.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/AttributeTestUtils.scala new file mode 100644 index 0000000000..a114d2aef2 --- /dev/null +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/AttributeTestUtils.scala @@ -0,0 +1,16 @@ +package com.salesforce.op.stages.impl.feature + +import org.apache.spark.ml.attribute.AttributeGroup +import org.apache.spark.sql.types.StructField +import org.junit.runner.RunWith +import org.scalatest.Matchers +import org.scalatest.junit.JUnitRunner + 
+@RunWith(classOf[JUnitRunner]) +object AttributeTestUtils extends Matchers{ + + final def assert(schema: StructField, expectedNominal: Array[Boolean]) = { + val attributes = AttributeGroup.fromStructField(schema).attributes.get + attributes.map(_.isNominal) shouldBe expectedNominal + } +} diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/DateListVectorizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/DateListVectorizerTest.scala index 4cdf0413c0..92d99ae850 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/DateListVectorizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/DateListVectorizerTest.scala @@ -37,6 +37,7 @@ import com.salesforce.op.test.{OpTransformerSpec, TestFeatureBuilder, TestOpVect import com.salesforce.op.utils.date.DateTimeUtils import com.salesforce.op.utils.spark.OpVectorMetadata import com.salesforce.op.utils.spark.RichDataset._ +import org.apache.spark.ml.attribute.AttributeGroup import org.apache.spark.ml.linalg.Vectors import org.joda.time.{DateTime, DateTimeConstants} import org.junit.runner.RunWith @@ -51,7 +52,9 @@ class DateListVectorizerTest extends OpTransformerSpec[OPVector, DateListVectori val now = TransmogrifierDefaults.ReferenceDate.minusMillis(1).getMillis // make date time be in the past private def daysToMilliseconds(n: Int): Long = n * DateTimeConstants.MILLIS_PER_DAY + private def monthsToMilliseconds(n: Int): Long = n * 2628000000L + private def hoursToMilliseconds(n: Int): Long = n * DateTimeConstants.MILLIS_PER_HOUR val (testData, clicks, opens, purchases) = TestFeatureBuilder("clicks", "opens", "purchases", @@ -122,7 +125,9 @@ class DateListVectorizerTest extends OpTransformerSpec[OPVector, DateListVectori Vectors.dense(2.0, 1.0, -1.0).toOPVector ) - val fieldMetadata = transformed.schema(output.name).metadata + val schema = transformed.schema(output.name) + val fieldMetadata = schema.metadata + AttributeTestUtils.assert(schema, Array.fill(testModelTimeSinceFirst.getInputFeatures().size)(false)) testModelTimeSinceFirst.getMetadata() shouldEqual fieldMetadata } @@ -148,7 +153,10 @@ class DateListVectorizerTest extends OpTransformerSpec[OPVector, DateListVectori Vectors.dense(2.0, 0.0, 1.0, 0.0, -1.0, 0.0).toOPVector ) - val fieldMetadata = transformed.schema(output.name).metadata + val schema = transformed.schema(output.name) + val fieldMetadata = schema.metadata + AttributeTestUtils.assert(schema, Array.fill(testModelTimeSinceFirst.getInputFeatures().size) + (Seq(false, true)).flatten) testModelTimeSinceFirst.getMetadata() shouldEqual fieldMetadata } @@ -174,7 +182,9 @@ class DateListVectorizerTest extends OpTransformerSpec[OPVector, DateListVectori Vectors.dense(-28.0, -29.0, -31.0).toOPVector ) - val fieldMetadata = transformed.schema(output.name).metadata + val schema = transformed.schema(output.name) + val fieldMetadata = schema.metadata + AttributeTestUtils.assert(schema, Array.fill(testModelTimeSinceFirst.getInputFeatures().size)(false)) testModelTimeSinceFirst.getMetadata() shouldEqual fieldMetadata } @@ -196,7 +206,9 @@ class DateListVectorizerTest extends OpTransformerSpec[OPVector, DateListVectori Vectors.sparse(21, Array(), Array()).toOPVector ) - val fieldMetadata = transformed.schema(output.name).metadata + val schema = transformed.schema(output.name) + val fieldMetadata = schema.metadata + AttributeTestUtils.assert(schema, Array.fill(testModelModeDay.getInputFeatures().size * 7)(true)) testModelModeDay.getMetadata() shouldEqual 
fieldMetadata val daysOfWeek = List("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday").map(s => @@ -225,7 +237,9 @@ class DateListVectorizerTest extends OpTransformerSpec[OPVector, DateListVectori Vectors.sparse(24, Array(7, 15, 23), Array(1.0, 1.0, 1.0)).toOPVector ) - val fieldMetadata = transformed.schema(output.name).metadata + val schema = transformed.schema(output.name) + val fieldMetadata = schema.metadata + AttributeTestUtils.assert(schema, Array.fill(testModelModeDay.getInputFeatures().size * 8)(true)) testModelModeDay.getMetadata() shouldEqual fieldMetadata val daysOfWeek = List("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday", @@ -253,7 +267,9 @@ class DateListVectorizerTest extends OpTransformerSpec[OPVector, DateListVectori Vectors.sparse(36, Array(), Array()).toOPVector ) - val fieldMetadata = transformed.schema(output.name).metadata + val schema = transformed.schema(output.name) + val fieldMetadata = schema.metadata + AttributeTestUtils.assert(schema, Array.fill(testModelModeMonth.getInputFeatures().size * 12)(true)) testModelModeMonth.getMetadata() shouldEqual fieldMetadata val months = List( @@ -283,7 +299,9 @@ class DateListVectorizerTest extends OpTransformerSpec[OPVector, DateListVectori Vectors.sparse(72, Array(), Array()).toOPVector ) - val fieldMetadata = transformed.schema(output.name).metadata + val schema = transformed.schema(output.name) + val fieldMetadata = schema.metadata + AttributeTestUtils.assert(schema, Array.fill(testModelModeHour.getInputFeatures().size * 24)(true)) testModelModeHour.getMetadata() shouldEqual fieldMetadata val hours = (0 until 24).map(i => IndCol(Some(s"$i:00"))).toList diff --git a/features/src/main/scala/com/salesforce/op/utils/spark/OpVectorMetadata.scala b/features/src/main/scala/com/salesforce/op/utils/spark/OpVectorMetadata.scala index 874a4bf6bd..0c4288a7c6 100644 --- a/features/src/main/scala/com/salesforce/op/utils/spark/OpVectorMetadata.scala +++ b/features/src/main/scala/com/salesforce/op/utils/spark/OpVectorMetadata.scala @@ -94,9 +94,9 @@ class OpVectorMetadata private .putMetadata(OpVectorMetadata.HistoryKey, FeatureHistory.toMetadata(history)) .build() val attributes = columns.map { c => - if (c.indicatorValue.isDefined || textTypes.exists(c.parentFeatureType.contains)) + if (c.indicatorValue.isDefined || textTypes.exists(c.parentFeatureType.contains)) { BinaryAttribute.defaultAttr.withName(c.makeColName()).withIndex(c.index) - else { + } else { NumericAttribute.defaultAttr.withName(c.makeColName()).withIndex(c.index) } } From f7546db8985fee486514ba7eedf1d1030e4fa9ce Mon Sep 17 00:00:00 2001 From: mweilsalesforce Date: Thu, 6 Sep 2018 11:25:39 -0700 Subject: [PATCH 04/15] Tested on DateVectorizers --- .../stages/impl/feature/AttributeTestUtils.scala | 2 +- .../stages/impl/feature/Base64VectorizerTest.scala | 2 ++ .../impl/feature/BinaryMapVectorizerTest.scala | 2 ++ .../stages/impl/feature/BinaryVectorizerTest.scala | 8 ++++++++ .../impl/feature/DateListVectorizerTest.scala | 14 +++++++------- .../DateMapToUnitCircleVectorizerTest.scala | 4 ++++ .../impl/feature/DateMapVectorizerTest.scala | 7 +++++++ .../impl/feature/DateTimeVectorizerTest.scala | 7 +++++++ .../feature/DateToUnitCircleTransformerTest.scala | 11 ++++++++++- .../stages/impl/feature/DateVectorizerTest.scala | 7 +++++++ .../op/utils/spark/OpVectorMetadata.scala | 2 +- 11 files changed, 56 insertions(+), 10 deletions(-) diff --git 
a/core/src/test/scala/com/salesforce/op/stages/impl/feature/AttributeTestUtils.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/AttributeTestUtils.scala index a114d2aef2..7d4886a137 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/AttributeTestUtils.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/AttributeTestUtils.scala @@ -9,7 +9,7 @@ import org.scalatest.junit.JUnitRunner @RunWith(classOf[JUnitRunner]) object AttributeTestUtils extends Matchers{ - final def assert(schema: StructField, expectedNominal: Array[Boolean]) = { + final def assertNominal(schema: StructField, expectedNominal: Array[Boolean]) = { val attributes = AttributeGroup.fromStructField(schema).attributes.get attributes.map(_.isNominal) shouldBe expectedNominal } diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/Base64VectorizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/Base64VectorizerTest.scala index c518495c4b..d2ca4864f4 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/Base64VectorizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/Base64VectorizerTest.scala @@ -63,6 +63,8 @@ class Base64VectorizerTest extends FlatSpec with TestSparkContext with Base64Tes def assertVectorizer(vec: FeatureLike[OPVector], expected: Seq[Text]): Unit = { val result = new OpWorkflow().setResultFeatures(vec).transform(realData) val vectors = result.collect(vec) + val schema = result.schema(vec.name) + AttributeTestUtils.assertNominal(schema, Array.fill(vectors.head.value.size)(true)) vectors.length shouldBe expected.length // TODO add a more robust check diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/BinaryMapVectorizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/BinaryMapVectorizerTest.scala index c4e79457b9..8d2900897f 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/BinaryMapVectorizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/BinaryMapVectorizerTest.scala @@ -73,6 +73,7 @@ class BinaryMapVectorizerTest transformed.collect(vector) shouldBe expectedResult val field = transformed.schema(estimator.getOutputFeatureName) + AttributeTestUtils.assertNominal(field, Array.fill(expectedResult.head.value.size)(true)) OpVectorMetadata(field) shouldEqual expectedMeta val vectorMetadata = estimator.getMetadata() OpVectorMetadata(field.copy(metadata = vectorMetadata)) shouldEqual expectedMeta @@ -100,6 +101,7 @@ class BinaryMapVectorizerTest transformed.collect(vector) shouldBe expected val field = transformed.schema(estimator.getOutputFeatureName) + AttributeTestUtils.assertNominal(field, Array.fill(expected.head.value.size)(true)) OpVectorMetadata(field) shouldEqual expectedMeta val vectorMetadata = estimator.getMetadata() OpVectorMetadata(field.copy(metadata = vectorMetadata)) shouldEqual expectedMeta diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/BinaryVectorizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/BinaryVectorizerTest.scala index 47b2ff50cb..40a8299ff2 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/BinaryVectorizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/BinaryVectorizerTest.scala @@ -93,6 +93,8 @@ class BinaryVectorizerTest extends OpTransformerSpec[OPVector, BinaryVectorizer] f1 -> List(RootCol, IndCol(Some(TransmogrifierDefaults.NullString))), f2 -> List(RootCol, 
IndCol(Some(TransmogrifierDefaults.NullString))) ) + val field = transformed.schema(vector.name) + AttributeTestUtils.assertNominal(field, Array.fill(expected.head.value.size)(true)) } it should "transform the data correctly [trackNulls=true,fillValue=true]" in { @@ -117,6 +119,8 @@ class BinaryVectorizerTest extends OpTransformerSpec[OPVector, BinaryVectorizer] f1 -> List(RootCol, IndCol(Some(TransmogrifierDefaults.NullString))), f2 -> List(RootCol, IndCol(Some(TransmogrifierDefaults.NullString))) ) + val field = transformed.schema(vector.name) + AttributeTestUtils.assertNominal(field, Array.fill(expected.head.value.size)(true)) } it should "transform the data correctly [trackNulls=false,fillValue=false]" in { @@ -141,6 +145,8 @@ class BinaryVectorizerTest extends OpTransformerSpec[OPVector, BinaryVectorizer] f1 -> List(RootCol), f2 -> List(RootCol) ) + val field = transformed.schema(vector.name) + AttributeTestUtils.assertNominal(field, Array.fill(expected.head.value.size)(true)) } it should "transform the data correctly [trackNulls=false,fillValue=true]" in { @@ -165,5 +171,7 @@ class BinaryVectorizerTest extends OpTransformerSpec[OPVector, BinaryVectorizer] f1 -> List(RootCol), f2 -> List(RootCol) ) + val field = transformed.schema(vector.name) + AttributeTestUtils.assertNominal(field, Array.fill(expected.head.value.size)(true)) } } diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/DateListVectorizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/DateListVectorizerTest.scala index 92d99ae850..3e4dc909e7 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/DateListVectorizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/DateListVectorizerTest.scala @@ -127,7 +127,7 @@ class DateListVectorizerTest extends OpTransformerSpec[OPVector, DateListVectori val schema = transformed.schema(output.name) val fieldMetadata = schema.metadata - AttributeTestUtils.assert(schema, Array.fill(testModelTimeSinceFirst.getInputFeatures().size)(false)) + AttributeTestUtils.assertNominal(schema, Array.fill(testModelTimeSinceFirst.getInputFeatures().size)(false)) testModelTimeSinceFirst.getMetadata() shouldEqual fieldMetadata } @@ -155,7 +155,7 @@ class DateListVectorizerTest extends OpTransformerSpec[OPVector, DateListVectori val schema = transformed.schema(output.name) val fieldMetadata = schema.metadata - AttributeTestUtils.assert(schema, Array.fill(testModelTimeSinceFirst.getInputFeatures().size) + AttributeTestUtils.assertNominal(schema, Array.fill(testModelTimeSinceFirst.getInputFeatures().size) (Seq(false, true)).flatten) testModelTimeSinceFirst.getMetadata() shouldEqual fieldMetadata } @@ -184,7 +184,7 @@ class DateListVectorizerTest extends OpTransformerSpec[OPVector, DateListVectori val schema = transformed.schema(output.name) val fieldMetadata = schema.metadata - AttributeTestUtils.assert(schema, Array.fill(testModelTimeSinceFirst.getInputFeatures().size)(false)) + AttributeTestUtils.assertNominal(schema, Array.fill(testModelTimeSinceFirst.getInputFeatures().size)(false)) testModelTimeSinceFirst.getMetadata() shouldEqual fieldMetadata } @@ -208,7 +208,7 @@ class DateListVectorizerTest extends OpTransformerSpec[OPVector, DateListVectori val schema = transformed.schema(output.name) val fieldMetadata = schema.metadata - AttributeTestUtils.assert(schema, Array.fill(testModelModeDay.getInputFeatures().size * 7)(true)) + AttributeTestUtils.assertNominal(schema, Array.fill(testModelModeDay.getInputFeatures().size * 
7)(true)) testModelModeDay.getMetadata() shouldEqual fieldMetadata val daysOfWeek = List("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday").map(s => @@ -239,7 +239,7 @@ class DateListVectorizerTest extends OpTransformerSpec[OPVector, DateListVectori val schema = transformed.schema(output.name) val fieldMetadata = schema.metadata - AttributeTestUtils.assert(schema, Array.fill(testModelModeDay.getInputFeatures().size * 8)(true)) + AttributeTestUtils.assertNominal(schema, Array.fill(testModelModeDay.getInputFeatures().size * 8)(true)) testModelModeDay.getMetadata() shouldEqual fieldMetadata val daysOfWeek = List("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday", @@ -269,7 +269,7 @@ class DateListVectorizerTest extends OpTransformerSpec[OPVector, DateListVectori val schema = transformed.schema(output.name) val fieldMetadata = schema.metadata - AttributeTestUtils.assert(schema, Array.fill(testModelModeMonth.getInputFeatures().size * 12)(true)) + AttributeTestUtils.assertNominal(schema, Array.fill(testModelModeMonth.getInputFeatures().size * 12)(true)) testModelModeMonth.getMetadata() shouldEqual fieldMetadata val months = List( @@ -301,7 +301,7 @@ class DateListVectorizerTest extends OpTransformerSpec[OPVector, DateListVectori val schema = transformed.schema(output.name) val fieldMetadata = schema.metadata - AttributeTestUtils.assert(schema, Array.fill(testModelModeHour.getInputFeatures().size * 24)(true)) + AttributeTestUtils.assertNominal(schema, Array.fill(testModelModeHour.getInputFeatures().size * 24)(true)) testModelModeHour.getMetadata() shouldEqual fieldMetadata val hours = (0 until 24).map(i => IndCol(Some(s"$i:00"))).toList diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/DateMapToUnitCircleVectorizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/DateMapToUnitCircleVectorizerTest.scala index 046d8f7f0f..a1ac9081f7 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/DateMapToUnitCircleVectorizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/DateMapToUnitCircleVectorizerTest.scala @@ -77,7 +77,9 @@ class DateMapToUnitCircleVectorizerTest extends OpEstimatorSpec[OPVector, Sequen val output = f1.toUnitCircle(TimePeriod.HourOfDay) val transformed = output.originStage.asInstanceOf[DateMapToUnitCircleVectorizer[DateMap]] .fit(inputData).transform(inputData) + val field = transformed.schema(output.name) val actual = transformed.collect(output) + AttributeTestUtils.assertNominal(field, Array.fill(actual.head.value.size)(false)) all (actual.zip(expectedResult).map(g => Vectors.sqdist(g._1.value, g._2.value))) should be < eps } @@ -88,7 +90,9 @@ class DateMapToUnitCircleVectorizerTest extends OpEstimatorSpec[OPVector, Sequen val output = f1DT.toUnitCircle(TimePeriod.HourOfDay) val transformed = output.originStage.asInstanceOf[DateMapToUnitCircleVectorizer[DateMap]] .fit(inputData).transform(inputData) + val field = transformed.schema(output.name) val actual = transformed.collect(output) + AttributeTestUtils.assertNominal(field, Array.fill(actual.head.value.size)(false)) all (actual.zip(expectedResult).map(g => Vectors.sqdist(g._1.value, g._2.value))) should be < eps } diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/DateMapVectorizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/DateMapVectorizerTest.scala index c68b25ae5b..87eda4639f 100644 --- 
a/core/src/test/scala/com/salesforce/op/stages/impl/feature/DateMapVectorizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/DateMapVectorizerTest.scala @@ -71,6 +71,8 @@ class DateMapVectorizerTest extends FlatSpec with TestSparkContext { val meta = OpVectorMetadata(vector.name, transformed.schema(vector.name).metadata) meta.columns.length shouldBe 3 meta.columns.map(_.grouping) should contain theSameElementsAs Array(Option("a"), Option("b"), Option("c")) + val field = transformed.schema(vector.name) + AttributeTestUtils.assertNominal(field, Array.fill(expected(moment).head.value.size)(false)) val vector2 = f1.vectorize(defaultValue = 0, referenceDate = moment, trackNulls = true, circularDateReps = Seq()) @@ -80,6 +82,8 @@ class DateMapVectorizerTest extends FlatSpec with TestSparkContext { val meta2 = OpVectorMetadata(vector2.name, transformed2.schema(vector2.name).metadata) meta2.columns.length shouldBe 6 meta2.history.keys.size shouldBe 1 + val field2 = transformed2.schema(vector2.name) + AttributeTestUtils.assertNominal(field2, Array.fill(expected(moment).head.value.size)(Seq(false, true)).flatten) val vector3 = f1.vectorize(defaultValue = 0) val transformed3 = new OpWorkflow().setResultFeatures(vector3).transform(ds) @@ -88,6 +92,9 @@ class DateMapVectorizerTest extends FlatSpec with TestSparkContext { val meta3 = OpVectorMetadata(vector3.name, transformed3.schema(vector3.name).metadata) meta3.columns.length shouldBe 30 meta2.history.keys.size shouldBe 1 + val field3 = transformed3.schema(vector3.name) + val expectedNominal = Array.fill(24)(false) ++ Array.fill(3)(Seq(false, true)).flatten.asInstanceOf[Array[Boolean]] + AttributeTestUtils.assertNominal(field3, expectedNominal) } private def expected(moment: JDateTime) = { diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/DateTimeVectorizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/DateTimeVectorizerTest.scala index 2dbf2603dd..762fd1ed2f 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/DateTimeVectorizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/DateTimeVectorizerTest.scala @@ -91,6 +91,8 @@ class DateTimeVectorizerTest extends FlatSpec with TestSparkContext { val meta = OpVectorMetadata(vector.name, transformed.schema(vector.name).metadata) meta.columns.length shouldBe 3 meta.history.keys.size shouldBe 3 + val field = transformed.schema(vector.name) + AttributeTestUtils.assertNominal(field, Array.fill(expected(moment).head.value.size)(false)) val vector2 = f1.vectorize( dateListPivot = TransmogrifierDefaults.DateListDefault, @@ -105,6 +107,8 @@ class DateTimeVectorizerTest extends FlatSpec with TestSparkContext { val meta2 = OpVectorMetadata(vector2.name, transformed2.schema(vector2.name).metadata) meta2.columns.length shouldBe 6 meta2.history.keys.size shouldBe 3 + val field2 = transformed2.schema(vector2.name) + AttributeTestUtils.assertNominal(field2, Array.fill(expected(moment).head.value.size)(Seq(false, true)).flatten) val vector3 = f1.vectorize( dateListPivot = TransmogrifierDefaults.DateListDefault, @@ -117,6 +121,9 @@ class DateTimeVectorizerTest extends FlatSpec with TestSparkContext { val meta3 = OpVectorMetadata(vector3.name, transformed3.schema(vector3.name).metadata) meta3.columns.length shouldBe 30 meta3.history.keys.size shouldBe 6 + val field3 = transformed3.schema(vector3.name) + val expectedNominal = Array.fill(24)(false) ++ Array.fill(3)(Seq(false, 
true)).flatten.asInstanceOf[Array[Boolean]] + AttributeTestUtils.assertNominal(field3, expectedNominal) } it should "vectorize dates correctly any time" in { diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/DateToUnitCircleTransformerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/DateToUnitCircleTransformerTest.scala index 4a3de8f06b..565decc3be 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/DateToUnitCircleTransformerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/DateToUnitCircleTransformerTest.scala @@ -66,7 +66,10 @@ class DateToUnitCircleTransformerTest extends OpTransformerSpec[OPVector, DateTo val vectorizer = new DateToUnitCircleTransformer().setTimePeriod(timePeriod).setInput(f) val transformed = vectorizer.transform(ds) val vector = vectorizer.getOutput() - transformed.collect(vector) + val actual = transformed.collect(vector) + val field = transformed.schema(vector.name) + AttributeTestUtils.assertNominal(field, Array.fill(actual.head.value.size)(false)) + actual } def indexSeqToUnitCircle(indices: Seq[Int], numIndices: Int): Seq[OPVector] = { @@ -81,6 +84,8 @@ class DateToUnitCircleTransformerTest extends OpTransformerSpec[OPVector, DateTo val transformed = output.originStage.asInstanceOf[Transformer].transform(ds) val actual = transformed.collect(output) all (actual.zip(expectedResult).map(g => Vectors.sqdist(g._1.value, g._2.value))) should be < eps + val field = transformed.schema(output.name) + AttributeTestUtils.assertNominal(field, Array.fill(actual.head.value.size)(false)) } it should "work with its DateTime shortcut" in { @@ -90,6 +95,8 @@ class DateToUnitCircleTransformerTest extends OpTransformerSpec[OPVector, DateTo val transformed = output.originStage.asInstanceOf[Transformer].transform(ds) val actual = transformed.collect(output) all (actual.zip(expectedResult).map(g => Vectors.sqdist(g._1.value, g._2.value))) should be < eps + val field = transformed.schema(output.name) + AttributeTestUtils.assertNominal(field, Array.fill(actual.head.value.size)(false)) } it should "store the proper meta data" in { @@ -118,6 +125,8 @@ class DateToUnitCircleTransformerTest extends OpTransformerSpec[OPVector, DateTo Array(1.0, 0.0) ).map(Vectors.dense(_).toOPVector) all (actual.zip(expected).map(g => Vectors.sqdist(g._1.value, g._2.value))) should be < eps + val field = transformed.schema(vector.name) + AttributeTestUtils.assertNominal(field, Array.fill(actual.head.value.size)(false)) } it should "transform the data correctly when the timePeriod is HourOfDay" in { diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/DateVectorizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/DateVectorizerTest.scala index a44b75a22d..730394a62c 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/DateVectorizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/DateVectorizerTest.scala @@ -69,6 +69,8 @@ class DateVectorizerTest extends FlatSpec with TestSparkContext { val meta = OpVectorMetadata(vector.name, transformed.schema(vector.name).metadata) meta.columns.length shouldBe 3 meta.history.keys.size shouldBe 3 + val field = transformed.schema(vector.name) + AttributeTestUtils.assertNominal(field, Array.fill(expectedAt(moment).head.value.size)(false)) val vector2 = f1.vectorize( dateListPivot = TransmogrifierDefaults.DateListDefault, @@ -83,6 +85,8 @@ class DateVectorizerTest extends FlatSpec with TestSparkContext { val 
meta2 = OpVectorMetadata(vector2.name, transformed2.schema(vector2.name).metadata) meta2.columns.length shouldBe 6 meta2.history.keys.size shouldBe 3 + val field2 = transformed2.schema(vector2.name) + AttributeTestUtils.assertNominal(field2, Array.fill(expectedAt(moment).head.value.size)(Seq(false, true)).flatten) val vector3 = f1.vectorize( dateListPivot = TransmogrifierDefaults.DateListDefault, @@ -94,6 +98,9 @@ class DateVectorizerTest extends FlatSpec with TestSparkContext { val meta3 = OpVectorMetadata(vector3.name, transformed3.schema(vector3.name).metadata) meta3.columns.length shouldBe 30 meta3.history.keys.size shouldBe 6 + val field3 = transformed3.schema(vector3.name) + val expectedNominal = Array.fill(24)(false) ++ Array.fill(3)(Seq(false, true)).flatten.asInstanceOf[Array[Boolean]] + AttributeTestUtils.assertNominal(field3, expectedNominal) } private def buildTestData(moment: DateTime) = { diff --git a/features/src/main/scala/com/salesforce/op/utils/spark/OpVectorMetadata.scala b/features/src/main/scala/com/salesforce/op/utils/spark/OpVectorMetadata.scala index 0c4288a7c6..5cc0f5a1e4 100644 --- a/features/src/main/scala/com/salesforce/op/utils/spark/OpVectorMetadata.scala +++ b/features/src/main/scala/com/salesforce/op/utils/spark/OpVectorMetadata.scala @@ -75,7 +75,7 @@ class OpVectorMetadata private newColumns: Array[OpVectorColumnMetadata] ): OpVectorMetadata = OpVectorMetadata(name, newColumns, history) - val textTypes = Seq(Text, TextArea, TextAreaMap, TextMap, Binary, BinaryMap).map(_.getClass.toString.dropRight(1)) + val textTypes = Seq(Text, TextArea, TextAreaMap, TextMap, Binary, BinaryMap).map(_.getClass.getName.dropRight(1)) /** * Serialize to spark metadata * From 96a1d8c5459e704f9d0629e5dc1a01dce7935538 Mon Sep 17 00:00:00 2001 From: mweilsalesforce Date: Thu, 6 Sep 2018 13:24:42 -0700 Subject: [PATCH 05/15] Up to GeoLocationTests --- .../DecisionTreeNumericBucketizerTest.scala | 7 +++-- ...DecisionTreeNumericMapBucketizerTest.scala | 2 ++ .../DropIndicesByTransformerTest.scala | 10 +++++-- .../impl/feature/EmailVectorizerTest.scala | 9 ++++-- .../GeolocationMapVectorizerTest.scala | 28 ++++++++++++++++++- .../feature/GeolocationVectorizerTest.scala | 15 ++++++++-- 6 files changed, 62 insertions(+), 9 deletions(-) diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/DecisionTreeNumericBucketizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/DecisionTreeNumericBucketizerTest.scala index b2d916e8f1..a047277aaa 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/DecisionTreeNumericBucketizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/DecisionTreeNumericBucketizerTest.scala @@ -203,8 +203,11 @@ class DecisionTreeNumericBucketizerTest extends OpEstimatorSpec[OPVector, val splits = model.splits assertSplits(splits = splits, expectedSplits = expectedSplits, expectedTolerance) - val res = model.transform(data).collect(out) - assertMetadata( + val transformed = model.transform(data) + val res = transformed.collect(out) + val field = transformed.schema(out.name) + AttributeTestUtils.assertNominal(field, Array.fill(res.head.value.size)(true)) + assertMetadata( shouldSplit = Array(shouldSplit), splits = Array(splits), trackNulls = trackNulls, trackInvalid = trackInvalid, diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/DecisionTreeNumericMapBucketizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/DecisionTreeNumericMapBucketizerTest.scala 
index 512a30c01e..744546117c 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/DecisionTreeNumericMapBucketizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/DecisionTreeNumericMapBucketizerTest.scala @@ -228,6 +228,8 @@ class DecisionTreeNumericMapBucketizerTest extends OpEstimatorSpec[OPVector, ) val scored = model.setInputDataset(data).score(keepIntermediateFeatures = true) val res = scored.collect(out) + val field = scored.schema(out.name) + AttributeTestUtils.assertNominal(field, Array.fill(res.head.value.size)(true)) assertMetadata( shouldSplit = stage.shouldSplitByKey.toArray.sortBy(_._1).map(_._2), splits = stage.splitsByKey.toArray.sortBy(_._1).map(_._2), diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/DropIndicesByTransformerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/DropIndicesByTransformerTest.scala index f73cedccc0..d56c50cbdf 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/DropIndicesByTransformerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/DropIndicesByTransformerTest.scala @@ -73,8 +73,11 @@ class DropIndicesByTransformerTest extends OpTransformerSpec[OPVector, DropIndic .setInput(vectorizedPicklist) .getOutput() val materializedFeatures = new OpWorkflow().setResultFeatures(vectorizedPicklist, prunedVector).transform(df) + val field = materializedFeatures.schema(prunedVector.name) + val collectedFeatures = materializedFeatures.collect(prunedVector) + AttributeTestUtils.assertNominal(field, Array.fill(collectedFeatures.head.value.size)(true)) - materializedFeatures.collect(prunedVector).foreach(_.value.size shouldBe 4) + collectedFeatures.foreach(_.value.size shouldBe 4) materializedFeatures.collect().foreach { r => if (r.getString(0) == "Red") r.getAs[Vector](2).toArray.forall(_ == 0) shouldBe true else r.getAs[Vector](2).toArray.max shouldBe 1 @@ -89,8 +92,11 @@ class DropIndicesByTransformerTest extends OpTransformerSpec[OPVector, DropIndic val vectorizedPicklist = picklistFeature.vectorize(topK = 10, minSupport = 3, cleanText = false) val prunedVector = vectorizedPicklist.dropIndicesBy(_.isNullIndicator) val materializedFeatures = new OpWorkflow().setResultFeatures(vectorizedPicklist, prunedVector).transform(df) + val field = materializedFeatures.schema(prunedVector.name) + val collectedFeatures = materializedFeatures.collect(prunedVector) + AttributeTestUtils.assertNominal(field, Array.fill(collectedFeatures.head.value.size)(true)) - materializedFeatures.collect(prunedVector).foreach(_.value.size shouldBe 4) + collectedFeatures.foreach(_.value.size shouldBe 4) materializedFeatures.collect().foreach( _.getAs[Vector](2).toArray.max shouldBe 1) val rawMeta = OpVectorMetadata(vectorizedPicklist.name, vectorizedPicklist.originStage.getMetadata()) val trimmedMeta = OpVectorMetadata(materializedFeatures.schema(prunedVector.name)) diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/EmailVectorizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/EmailVectorizerTest.scala index b193403327..2bcb98880d 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/EmailVectorizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/EmailVectorizerTest.scala @@ -80,8 +80,13 @@ class EmailVectorizerTest ).map(_.toOPVector) - def transformAndCollect(ds: DataFrame, feature: FeatureLike[OPVector]): Array[OPVector] = - new 
OpWorkflow().setResultFeatures(feature).transform(ds).collect(feature) + def transformAndCollect(ds: DataFrame, feature: FeatureLike[OPVector]): Array[OPVector] = { + val transformed = new OpWorkflow().setResultFeatures(feature).transform(ds) + val field = transformed.schema(feature.name) + val collected = transformed.collect(feature) + AttributeTestUtils.assertNominal(field, Array.fill(collected.head.value.size)(true)) + collected + } Spec[RichEmailMapFeature] should "vectorize EmailMaps correctly" in { val (ds1, f1) = TestFeatureBuilder(emails.map(e => Map(emailKey -> e.value.get).toEmailMap)) diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/GeolocationMapVectorizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/GeolocationMapVectorizerTest.scala index bdf3674c58..2072936cea 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/GeolocationMapVectorizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/GeolocationMapVectorizerTest.scala @@ -93,7 +93,8 @@ class GeolocationMapVectorizerTest val vectorizer = estimator.fit(inputData) val transformed = vectorizer.transform(inputData) val vector = vectorizer.getOutput() - + val field = transformed.schema(vector.name) + AttributeTestUtils.assertNominal(field, Array.fill(expectedResult.head.value.size)(false)) transformed.collect(vector) shouldBe expectedResult transformed.schema.toOpVectorMetadata(vectorizer.getOutputFeatureName) shouldEqual expectedMeta val vectorMetadata = vectorizer.getMetadata() @@ -112,6 +113,9 @@ class GeolocationMapVectorizerTest Vectors.sparse(24, Array(3, 7, 11, 15, 19, 23), Array(1.0, 1.0, 1.0, 1.0, 1.0, 1.0)) ).map(_.toOPVector) + val field = transformed.schema(vector.name) + AttributeTestUtils.assertNominal(field, + Array.fill(expected.head.value.size / 4)(Seq(false, false, false, true)).flatten) transformed.collect(vector) shouldBe expected transformed.schema.toOpVectorMetadata(vectorizer.getOutputFeatureName) shouldEqual expectedMetaTrackNulls val vectorMetadata = vectorizer.getMetadata() @@ -128,6 +132,8 @@ class GeolocationMapVectorizerTest Array(6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 33.8, -108.7, 2.0, 40.4, -116.3, 2.0, 42.5, -95.4, 4.0, 6.0, 6.0, 6.0), Array.fill(18)(6.0) ).map(v => Vectors.dense(v).toOPVector) + val field = transformed.schema(vector.name) + AttributeTestUtils.assertNominal(field, Array.fill(expected.head.value.size)(false)) transformed.collect(vector) shouldBe expected transformed.schema.toOpVectorMetadata(vectorizer.getOutputFeatureName) shouldEqual expectedMeta @@ -147,6 +153,9 @@ class GeolocationMapVectorizerTest 0.0, 6.0, 6.0, 6.0, 1.0), (0 until 6).flatMap(k => Seq.fill(3)(6.0) :+ 1.0).toArray ).map(v => Vectors.dense(v).toOPVector) + val field = transformed.schema(vector.name) + AttributeTestUtils.assertNominal(field, + Array.fill(expected.head.value.size / 4)(Seq(false, false, false, true)).flatten) transformed.collect(vector) shouldBe expected transformed.schema.toOpVectorMetadata(vectorizer.getOutputFeatureName) shouldEqual expectedMetaTrackNulls @@ -164,6 +173,9 @@ class GeolocationMapVectorizerTest Vectors.sparse(9, Array(), Array()), Vectors.sparse(9, Array(), Array()) ).map(_.toOPVector) + val field = transformed.schema(vector.name) + AttributeTestUtils.assertNominal(field, Array.fill(expected.head.value.size)(false)) + val expectedMeta = TestOpVectorMetadataBuilder( vectorizer, m1 -> (Geolocation.Names.map(n => DescColWithGroup(Option(n), "A")) ++ @@ -188,6 +200,10 @@ class GeolocationMapVectorizerTest 
Vectors.sparse(12, Array(3, 7, 11), Array(1.0, 1.0, 1.0)), Vectors.sparse(12, Array(3, 7, 11), Array(1.0, 1.0, 1.0)) ).map(_.toOPVector) + val field = transformed.schema(vector.name) + AttributeTestUtils.assertNominal(field, + Array.fill(expected.head.value.size / 4)(Seq(false, false, false, true)).flatten) + val expectedMeta = TestOpVectorMetadataBuilder( vectorizer, m1 -> ( @@ -215,6 +231,9 @@ class GeolocationMapVectorizerTest Vectors.dense(Array(0.0, 0.0, 0.0, 33.8, -108.7, 2.0, 40.4, -116.3, 2.0, 42.5, -95.4, 4.0)), Vectors.sparse(12, Array(), Array()) ).map(_.toOPVector) + val field = transformed.schema(vector.name) + AttributeTestUtils.assertNominal(field, Array.fill(expected.head.value.size)(false)) + val expectedMeta = TestOpVectorMetadataBuilder( vectorizer, m1 -> (Geolocation.Names.map(n => DescColWithGroup(Option(n), "B")) ++ @@ -239,6 +258,10 @@ class GeolocationMapVectorizerTest Vectors.dense(Array(0.0, 0.0, 0.0, 1.0, 33.8, -108.7, 2.0, 0.0, 40.4, -116.3, 2.0, 0.0, 42.5, -95.4, 4.0, 0.0)), Vectors.sparse(16, Array(3, 7, 11, 15), Array(1.0, 1.0, 1.0, 1.0)) ).map(_.toOPVector) + val field = transformed.schema(vector.name) + AttributeTestUtils.assertNominal(field, + Array.fill(expected.head.value.size / 4)(Seq(false, false, false, true)).flatten) + val expectedMeta = TestOpVectorMetadataBuilder( vectorizer, m1 -> ( @@ -263,6 +286,9 @@ class GeolocationMapVectorizerTest val vector = vectorizer.getOutput() val expectedOutput = transformed.collect() + val field = transformed.schema(vector.name) + AttributeTestUtils.assertNominal(field, Array.fill(transformed.collect(vector).head.value.size / 4) + (Seq(false, false, false, true)).flatten) // Now using the shortcut val res = m1.vectorize(cleanKeys = TransmogrifierDefaults.CleanKeys, others = Array(m2)) res.originStage shouldBe a[GeolocationMapVectorizer] diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/GeolocationVectorizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/GeolocationVectorizerTest.scala index 305d6749c5..ee0bc32fb2 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/GeolocationVectorizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/GeolocationVectorizerTest.scala @@ -39,6 +39,7 @@ import org.apache.spark.ml.linalg.{DenseVector, Vectors} import org.junit.runner.RunWith import org.scalatest.junit.JUnitRunner import org.scalatest.{Assertions, FlatSpec, Matchers} +import com.salesforce.op.utils.spark.RichDataset._ @RunWith(classOf[JUnitRunner]) @@ -97,6 +98,10 @@ class GeolocationVectorizerTest extends FlatSpec with TestSparkContext { (Array(45.0, -105.5, 4.0), null, null, null, Vectors.dense(45.0, -105.5, 4.0, 50.0, 50.0, 4.0, 50.0, 50.0, 4.0, 50.0, 50.0, 4.0)) ) + val output = testModelConstant.getOutputFeatureName + val field = testDataTransformedConstant.schema(output) + AttributeTestUtils.assertNominal( + field, Array.fill(expectedConstant.head._5.size)(false)) transformedValuesConstant.map(_.get(0)) shouldEqual expectedConstant.map(_._1) transformedValuesConstant.map(_.get(1)) shouldEqual expectedConstant.map(_._2) @@ -129,7 +134,10 @@ class GeolocationVectorizerTest extends FlatSpec with TestSparkContext { (Array(45.0, -105.5, 4.0), null, null, null, Vectors.dense(Array(45.0, -105.5, 4.0) ++ mean1 ++ mean2 ++ mean3)) ) - + val output = testModelMean.getOutputFeatureName + val field = testDataTransformedMean.schema(output) + AttributeTestUtils.assertNominal( + field, Array.fill(expectedMean.head._5.size)(false)) 
transformedValuesMean.map(_.get(0)) shouldEqual expectedMean.map(_._1) transformedValuesMean.map(_.get(1)) shouldEqual expectedMean.map(_._2) transformedValuesMean.map(_.get(2)) shouldEqual expectedMean.map(_._3) @@ -169,7 +177,10 @@ class GeolocationVectorizerTest extends FlatSpec with TestSparkContext { (Array(45.0, -105.5, 4.0), null, null, null, Vectors.dense(Array(45.0, -105.5, 4.0, 0.0) ++ mean1 ++ mean2 ++ mean3)) ) - + val output = testModelMean.getOutputFeatureName + val field = testDataTransformedMean.schema(output) + AttributeTestUtils.assertNominal( + field, Array.fill(expectedMean.head._5.size / 4)(Seq(false, false, false, true)).flatten) transformedValuesMean.map(_.get(0)) shouldEqual expectedMean.map(_._1) transformedValuesMean.map(_.get(1)) shouldEqual expectedMean.map(_._2) transformedValuesMean.map(_.get(2)) shouldEqual expectedMean.map(_._3) From 0531c68c0ebd33e1ab1a75435dbc9a76f9a676fe Mon Sep 17 00:00:00 2001 From: mweilsalesforce Date: Thu, 6 Sep 2018 15:38:09 -0700 Subject: [PATCH 06/15] Up to NumericVectorizerTest --- .../feature/IntegralMapVectorizerTest.scala | 16 ++++ .../impl/feature/IntegralVectorizerTest.scala | 16 +++- .../feature/MultiPickMapVectorizerTest.scala | 46 +++++++++++ .../impl/feature/NumericBucketizerTest.scala | 76 +++++++++++++++---- .../impl/feature/NumericVectorizerTest.scala | 16 +++- 5 files changed, 153 insertions(+), 17 deletions(-) diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/IntegralMapVectorizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/IntegralMapVectorizerTest.scala index 2f54a056be..613afa07ca 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/IntegralMapVectorizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/IntegralMapVectorizerTest.scala @@ -83,6 +83,8 @@ class IntegralMapVectorizerTest it should "return a model that correctly transforms the data and produces metadata" in { val vector = estimator.getOutput() val transformed = model.transform(inputData) + val field = transformed.schema(vector.name) + AttributeTestUtils.assertNominal(field, Array.fill(expectedResult.head.value.size)(false)) transformed.collect(vector) shouldBe expectedResult transformed.schema.toOpVectorMetadata(estimator.getOutputFeatureName) shouldEqual expectedMeta @@ -100,6 +102,8 @@ class IntegralMapVectorizerTest Vectors.sparse(12, Array(1, 3, 4, 8, 11), Array(1.0, 1.0, 11.0, 3.0, 1.0)), Vectors.sparse(12, Array(1, 3, 5, 7, 9, 11), Array(1.0, 1.0, 1.0, 1.0, 1.0, 1.0)) ).map(_.toOPVector) + val field = transformed.schema(vector.name) + AttributeTestUtils.assertNominal(field, Array.fill(6)(Seq(false, true)).flatten) transformed.collect(vector) shouldBe expected transformed.schema.toOpVectorMetadata(vectorizer.getOutputFeatureName) shouldEqual expectedMetaTrackNulls @@ -117,6 +121,8 @@ class IntegralMapVectorizerTest Vectors.dense(Array(100.0, 100.0, 11.0, 0.0, 3.0, 100.0)), Vectors.dense(Array(100.0, 100.0, 100.0, 100.0, 100.0, 100.0)) ).map(_.toOPVector) + val field = transformed.schema(vector.name) + AttributeTestUtils.assertNominal(field, Array.fill(expected.head.value.size)(false)) transformed.collect(vector) shouldBe expected transformed.schema.toOpVectorMetadata(vectorizer.getOutputFeatureName) shouldEqual expectedMeta @@ -134,6 +140,8 @@ class IntegralMapVectorizerTest Vectors.dense(Array(100.0, 1.0, 100.0, 1.0, 11.0, 0.0, 0.0, 0.0, 3.0, 0.0, 100.0, 1.0)), Vectors.dense(Array(100.0, 1.0, 100.0, 1.0, 100.0, 1.0, 100.0, 1.0, 100.0, 1.0, 100.0, 1.0)) 
).map(_.toOPVector) + val field = transformed.schema(vector.name) + AttributeTestUtils.assertNominal(field, Array.fill(6)(Seq(false, true)).flatten) transformed.collect(vector) shouldBe expected transformed.schema.toOpVectorMetadata(vectorizer.getOutputFeatureName) shouldEqual expectedMetaTrackNulls @@ -153,6 +161,8 @@ class IntegralMapVectorizerTest Vectors.sparse(3, Array(), Array()), Vectors.sparse(3, Array(), Array()) ).map(_.toOPVector) + val field = transformed.schema(vector.name) + AttributeTestUtils.assertNominal(field, Array.fill(expected.head.value.size)(false)) val expectedMeta = TestOpVectorMetadataBuilder( vectorizer, m1 -> List(IndColWithGroup(None, "A"), IndColWithGroup(None, "B")), @@ -177,6 +187,8 @@ class IntegralMapVectorizerTest Vectors.sparse(6, Array(1, 3, 5), Array(1.0, 1.0, 1.0)), Vectors.sparse(6, Array(1, 3, 5), Array(1.0, 1.0, 1.0)) ).map(_.toOPVector) + val field = transformed.schema(vector.name) + AttributeTestUtils.assertNominal(field, Array.fill(3)(Seq(false, true)).flatten) val expectedMeta = TestOpVectorMetadataBuilder( vectorizer, m1 -> List(IndColWithGroup(None, "A"), IndColWithGroup(nullIndicatorValue, "A"), @@ -201,6 +213,8 @@ class IntegralMapVectorizerTest Vectors.dense(Array(0.0, 11.0, 0.0, 3.0)), Vectors.sparse(4, Array(), Array()) ).map(_.toOPVector) + val field = transformed.schema(vector.name) + AttributeTestUtils.assertNominal(field, Array.fill(expected.head.value.size)(false)) val expectedMeta = TestOpVectorMetadataBuilder( vectorizer, m1 -> List(IndColWithGroup(None, "B"), IndColWithGroup(None, "C")), @@ -224,6 +238,8 @@ class IntegralMapVectorizerTest Vectors.sparse(8, Array(1, 2, 6), Array(1.0, 11.0, 3.0)), Vectors.sparse(8, Array(1, 3, 5, 7), Array(1.0, 1.0, 1.0, 1.0)) ).map(_.toOPVector) + val field = transformed.schema(vector.name) + AttributeTestUtils.assertNominal(field, Array.fill(4)(Seq(false, true)).flatten) val expectedMeta = TestOpVectorMetadataBuilder( vectorizer, m1 -> List(IndColWithGroup(None, "B"), IndColWithGroup(nullIndicatorValue, "B"), diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/IntegralVectorizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/IntegralVectorizerTest.scala index 6d6db538a0..d7946fb782 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/IntegralVectorizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/IntegralVectorizerTest.scala @@ -113,7 +113,8 @@ class IntegralVectorizerTest extends FlatSpec with TestSparkContext { (null, 2L, 2L, null, Vectors.dense(3.0, 2.0, 2.0, 3.0)), (null, null, null, null, Vectors.dense(3.0, 3.0, 3.0, 3.0)) ) - + val field = testDataTransformedConstant.schema(testModelConstant.getOutputFeatureName) + AttributeTestUtils.assertNominal(field, Array.fill(expectedZero.head._5.size)(false)) transformedValuesConstant.map(_.get(0)) shouldEqual expectedZero.map(_._1) transformedValuesConstant.map(_.get(1)) shouldEqual expectedZero.map(_._2) transformedValuesConstant.map(_.get(2)) shouldEqual expectedZero.map(_._3) @@ -142,7 +143,8 @@ class IntegralVectorizerTest extends FlatSpec with TestSparkContext { (null, 2.0, 2.0, null, Vectors.dense(4.0, 2.0, 2.0, 0.0)), (null, null, null, null, Vectors.dense(4.0, 2.0, 1.0, 0.0)) ) - + val field = testDataTransformedMode.schema(testModelMode.getOutputFeatureName) + AttributeTestUtils.assertNominal(field, Array.fill(expectedMode.head._5.size)(false)) transformedValuesMode.map(_.get(0)) shouldEqual expectedMode.map(_._1) transformedValuesMode.map(_.get(1)) 
shouldEqual expectedMode.map(_._2) transformedValuesMode.map(_.get(2)) shouldEqual expectedMode.map(_._3) @@ -168,6 +170,8 @@ class IntegralVectorizerTest extends FlatSpec with TestSparkContext { (null, 2.0, 2.0, null, Vectors.dense(0.0, 1.0, 2.0, 0.0, 2.0, 0.0, 0.0, 1.0)), (null, null, null, null, Vectors.dense(0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0)) ) + val field = testDataTransformedConstantTracked.schema(testModelConstantTracked.getOutputFeatureName) + AttributeTestUtils.assertNominal(field, Array.fill(expectedZeroTracked.head._5.size / 2)(Seq(false, true)).flatten) transformedValuesZeroTracked.map(_.get(0)) shouldEqual expectedZeroTracked.map(_._1) transformedValuesZeroTracked.map(_.get(1)) shouldEqual expectedZeroTracked.map(_._2) @@ -206,6 +210,8 @@ class IntegralVectorizerTest extends FlatSpec with TestSparkContext { (null, 2.0, 2.0, null, Vectors.dense(4.0, 1.0, 2.0, 0.0, 2.0, 0.0, 0.0, 1.0)), (null, null, null, null, Vectors.dense(4.0, 1.0, 2.0, 1.0, 1.0, 1.0, 0.0, 1.0)) ) + val field = testDataTransformedModeTracked.schema(testModelModeTracked.getOutputFeatureName) + AttributeTestUtils.assertNominal(field, Array.fill(expectedModeTracked.head._5.size / 2)(Seq(false, true)).flatten) transformedValuesModeTracked.map(_.get(0)) shouldEqual expectedModeTracked.map(_._1) transformedValuesModeTracked.map(_.get(1)) shouldEqual expectedModeTracked.map(_._2) @@ -248,6 +254,8 @@ class IntegralVectorizerTest extends FlatSpec with TestSparkContext { (null, 2.0, 2.0, null, Vectors.dense(4.0, 2.0, 2.0, 0.0)), (null, null, null, null, Vectors.dense(4.0, 2.0, 1.0, 0.0)) ) + val field = testDataTransformedMode.schema(testModelMode.getOutputFeatureName) + AttributeTestUtils.assertNominal(field, Array.fill(expectedMode.head._5.size)(false)) transformedValuesMode.map(_.get(0)) shouldEqual expectedMode.map(_._1) transformedValuesMode.map(_.get(1)) shouldEqual expectedMode.map(_._2) @@ -278,6 +286,8 @@ class IntegralVectorizerTest extends FlatSpec with TestSparkContext { (null, 2.0, 2.0, null, Vectors.dense(4.0, 2.0, 2.0, 0.0)), (null, null, null, null, Vectors.dense(4.0, 2.0, 1.0, 0.0)) ) + val field = testDataTransformedMode.schema(testModelMode.getOutputFeatureName) + AttributeTestUtils.assertNominal(field, Array.fill(expectedMode.head._5.size)(false)) transformedValuesMode.map(_.get(0)) shouldEqual expectedMode.map(_._1) transformedValuesMode.map(_.get(1)) shouldEqual expectedMode.map(_._2) @@ -308,6 +318,8 @@ class IntegralVectorizerTest extends FlatSpec with TestSparkContext { (null, 2.0, 2.0, null, Vectors.dense(4.0, 2.0, 2.0, 0.0)), (null, null, null, null, Vectors.dense(4.0, 2.0, 1.0, 0.0)) ) + val field = testDataTransformedMode.schema(testModelMode.getOutputFeatureName) + AttributeTestUtils.assertNominal(field, Array.fill(expectedMode.head._5.size)(false)) transformedValuesMode.map(_.get(0)) shouldEqual expectedMode.map(_._1) transformedValuesMode.map(_.get(1)) shouldEqual expectedMode.map(_._2) diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/MultiPickMapVectorizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/MultiPickMapVectorizerTest.scala index 11f9f6074e..eae66ffc46 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/MultiPickMapVectorizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/MultiPickMapVectorizerTest.scala @@ -95,6 +95,10 @@ class MultiPickListMapVectorizerTest extends FlatSpec with TestSparkContext { it should "return the a fitted vectorizer with the correct default parameters" in 
{ val fitted = vectorizer.setTrackNulls(false).fit(dataSet) fitted shouldBe a[SequenceModel[_, _]] + val transformed = fitted.transform(dataSet) + val vector = vectorizer.getOutput() + val field = transformed.schema(vector.name) + AttributeTestUtils.assertNominal(field, Array.fill(transformed.collect(vector).head.value.size)(true)) val vectorMetadata = fitted.getMetadata() OpVectorMetadata(vectorizer.getOutputFeatureName, vectorMetadata) shouldEqual TestOpVectorMetadataBuilder(vectorizer, @@ -116,6 +120,10 @@ class MultiPickListMapVectorizerTest extends FlatSpec with TestSparkContext { it should "track nulls with the correct default parameters" in { val fitted = vectorizer.setTrackNulls(true).fit(dataSet) fitted shouldBe a[SequenceModel[_, _]] + val transformed = fitted.transform(dataSet) + val vector = vectorizer.getOutput() + val field = transformed.schema(vector.name) + AttributeTestUtils.assertNominal(field, Array.fill(transformed.collect(vector).head.value.size)(true)) val vectorMetadata = fitted.getMetadata() OpVectorMetadata(vectorizer.getOutputFeatureName, vectorMetadata) shouldEqual TestOpVectorMetadataBuilder(vectorizer, @@ -152,6 +160,8 @@ class MultiPickListMapVectorizerTest extends FlatSpec with TestSparkContext { Vectors.sparse(14, Array(0, 7, 9), Array(1.0, 1.0, 1.0)), Vectors.sparse(14, Array(0, 2, 11), Array(1.0, 1.0, 1.0)) ).map(_.toOPVector) + val field = transformed.schema(vector.name) + AttributeTestUtils.assertNominal(field, Array.fill(expected.head.value.size)(true)) transformed.collect(vector) shouldBe expected fitted.getMetadata() shouldBe transformed.schema.fields(2).metadata } @@ -168,6 +178,8 @@ class MultiPickListMapVectorizerTest extends FlatSpec with TestSparkContext { Vectors.sparse(20, Array(0, 6, 9, 10, 13, 19), Array(1.0, 1.0, 1.0, 1.0, 1.0, 1.0)), Vectors.sparse(20, Array(0, 3, 9, 12, 15, 16), Array(1.0, 1.0, 1.0, 1.0, 1.0, 1.0)) ).map(_.toOPVector) + val field = transformed.schema(vector.name) + AttributeTestUtils.assertNominal(field, Array.fill(expected.head.value.size)(true)) transformed.collect(vector) shouldBe expected fitted.getMetadata() shouldBe transformed.schema.fields(2).metadata } @@ -184,6 +196,8 @@ class MultiPickListMapVectorizerTest extends FlatSpec with TestSparkContext { Vectors.sparse(17, Array(0, 9, 11), Array(1.0, 1.0, 1.0)), Vectors.sparse(17, Array(1, 3, 14), Array(1.0, 1.0, 1.0)) ).map(_.toOPVector) + val field = transformed.schema(vector.name) + AttributeTestUtils.assertNominal(field, Array.fill(expected.head.value.size)(true)) transformed.collect(vector) shouldBe expected OpVectorMetadata(vectorizer.getOutputFeatureName, vectorMetadata) shouldEqual TestOpVectorMetadataBuilder(vectorizer, @@ -213,6 +227,8 @@ class MultiPickListMapVectorizerTest extends FlatSpec with TestSparkContext { Vectors.sparse(23, Array(0, 7, 10, 12, 15, 22), Array(1.0, 1.0, 1.0, 1.0, 1.0, 1.0)), Vectors.sparse(23, Array(1, 4, 10, 14, 18, 19), Array(1.0, 1.0, 1.0, 1.0, 1.0, 1.0)) ).map(_.toOPVector) + val field = transformed.schema(vector.name) + AttributeTestUtils.assertNominal(field, Array.fill(expected.head.value.size)(true)) transformed.collect(vector) shouldBe expected OpVectorMetadata(vectorizer.getOutputFeatureName, vectorMetadata) shouldEqual TestOpVectorMetadataBuilder(vectorizer, @@ -244,6 +260,8 @@ class MultiPickListMapVectorizerTest extends FlatSpec with TestSparkContext { Vectors.sparse(12, Array(0, 6, 8), Array(1.0, 1.0, 1.0)), Vectors.sparse(12, Array(0, 2, 10), Array(1.0, 1.0, 1.0)) ).map(_.toOPVector) + val field = 
transformed.schema(vector.name) + AttributeTestUtils.assertNominal(field, Array.fill(expected.head.value.size)(true)) transformed.collect(vector) shouldBe expected } @@ -259,6 +277,8 @@ class MultiPickListMapVectorizerTest extends FlatSpec with TestSparkContext { Vectors.sparse(18, Array(0, 5, 8, 9, 12, 17), Array(1.0, 1.0, 1.0, 1.0, 1.0, 1.0)), Vectors.sparse(18, Array(0, 3, 8, 11, 14, 15), Array(1.0, 1.0, 1.0, 1.0, 1.0, 1.0)) ).map(_.toOPVector) + val field = transformed.schema(vector.name) + AttributeTestUtils.assertNominal(field, Array.fill(expected.head.value.size)(true)) transformed.collect(vector) shouldBe expected } @@ -272,6 +292,8 @@ class MultiPickListMapVectorizerTest extends FlatSpec with TestSparkContext { Vectors.sparse(10, Array(0, 5, 7), Array(1.0, 1.0, 1.0)), Vectors.sparse(10, Array(0, 2, 9), Array(1.0, 1.0, 1.0)) ).map(_.toOPVector) + val field = transformed.schema(vector.name) + AttributeTestUtils.assertNominal(field, Array.fill(expected.head.value.size)(true)) transformed.collect(vector) shouldBe expected } @@ -285,6 +307,8 @@ class MultiPickListMapVectorizerTest extends FlatSpec with TestSparkContext { Vectors.sparse(16, Array(0, 5, 7, 8, 11, 15), Array(1.0, 1.0, 1.0, 1.0, 1.0, 1.0)), Vectors.sparse(16, Array(0, 3, 7, 10, 13, 14), Array(1.0, 1.0, 1.0, 1.0, 1.0, 1.0)) ).map(_.toOPVector) + val field = transformed.schema(vector.name) + AttributeTestUtils.assertNominal(field, Array.fill(expected.head.value.size)(true)) transformed.collect(vector) shouldBe expected } @@ -299,6 +323,8 @@ class MultiPickListMapVectorizerTest extends FlatSpec with TestSparkContext { Vectors.dense(0.0, 1.0, 0.0, 0.0, 0.0), Vectors.dense(0.0, 0.0, 0.0, 0.0, 0.0) ).map(_.toOPVector) + val field = transformed.schema(vector.name) + AttributeTestUtils.assertNominal(field, Array.fill(expected.head.value.size)(true)) transformed.collect(vector) shouldBe expected @@ -309,6 +335,8 @@ class MultiPickListMapVectorizerTest extends FlatSpec with TestSparkContext { Vectors.dense(0.0, 0.0, 0.0, 0.0, 0.0), Vectors.dense(1.0, 0.0, 0.0, 0.0, 0.0) ).map(_.toOPVector) + val field2 = transformed2.schema(vector.name) + AttributeTestUtils.assertNominal(field2, Array.fill(expected2.head.value.size)(true)) transformed2.collect(vector) shouldBe expected2 } @@ -323,6 +351,8 @@ class MultiPickListMapVectorizerTest extends FlatSpec with TestSparkContext { Vectors.dense(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0), Vectors.dense(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0) ).map(_.toOPVector) + val field = transformed.schema(vector.name) + AttributeTestUtils.assertNominal(field, Array.fill(expected.head.value.size)(true)) transformed.collect(vector) shouldBe expected @@ -333,6 +363,8 @@ class MultiPickListMapVectorizerTest extends FlatSpec with TestSparkContext { Vectors.dense(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0), Vectors.dense(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0) ).map(_.toOPVector) + val field2 = transformed2.schema(vector.name) + AttributeTestUtils.assertNominal(field2, Array.fill(expected2.head.value.size)(true)) transformed2.collect(vector) shouldBe expected2 } @@ -345,6 +377,8 @@ class MultiPickListMapVectorizerTest extends FlatSpec with TestSparkContext { Vectors.dense(Array.empty[Double]), Vectors.dense(Array.empty[Double]) ).map(_.toOPVector) + val field = transformed.schema(vector.name) + AttributeTestUtils.assertNominal(field, Array.fill(expected.head.value.size)(true)) transformed.collect(vector) shouldBe expected } @@ -362,6 +396,8 @@ class MultiPickListMapVectorizerTest extends FlatSpec with TestSparkContext { Vectors.sparse(5, 
Array(3), Array(1.0)), Vectors.sparse(5, Array(0), Array(1.0)) ).map(_.toOPVector) + val field = transformed.schema(vector.name) + AttributeTestUtils.assertNominal(field, Array.fill(expected.head.value.size)(true)) transformed.collect(vector) shouldBe expected } @@ -379,6 +415,8 @@ class MultiPickListMapVectorizerTest extends FlatSpec with TestSparkContext { Vectors.sparse(7, Array(3, 4), Array(1.0, 1.0)), Vectors.sparse(7, Array(0, 6), Array(1.0, 1.0)) ).map(_.toOPVector) + val field = transformed.schema(vector.name) + AttributeTestUtils.assertNominal(field, Array.fill(expected.head.value.size)(true)) transformed.collect(vector) shouldBe expected } @@ -396,6 +434,8 @@ class MultiPickListMapVectorizerTest extends FlatSpec with TestSparkContext { Vectors.sparse(9, Array(0, 7), Array(1.0, 1.0)), Vectors.sparse(9, Array(0, 4), Array(1.0, 1.0)) ).map(_.toOPVector) + val field = transformed.schema(vector.name) + AttributeTestUtils.assertNominal(field, Array.fill(expected.head.value.size)(true)) transformed.collect(vector) shouldBe expected } @@ -413,6 +453,8 @@ class MultiPickListMapVectorizerTest extends FlatSpec with TestSparkContext { Vectors.sparse(13, Array(0, 5, 9, 10), Array(1.0, 1.0, 1.0, 1.0)), Vectors.sparse(13, Array(0, 5, 6, 12), Array(1.0, 1.0, 1.0, 1.0)) ).map(_.toOPVector) + val field = transformed.schema(vector.name) + AttributeTestUtils.assertNominal(field, Array.fill(expected.head.value.size)(true)) transformed.collect(vector) shouldBe expected } @@ -432,6 +474,8 @@ class MultiPickListMapVectorizerTest extends FlatSpec with TestSparkContext { Vectors.sparse(8, Array(3, 7), Array(2.0, 1.0)), Vectors.sparse(8, Array(0, 2, 3, 4, 5, 6), Array(1.0, 1.0, 1.0, 1.0, 1.0, 1.0)) ).map(_.toOPVector) + val field = transformed.schema(vector.name) + AttributeTestUtils.assertNominal(field, Array.fill(expected.head.value.size)(true)) transformed.collect(vector) shouldBe expected } @@ -451,6 +495,8 @@ class MultiPickListMapVectorizerTest extends FlatSpec with TestSparkContext { Vectors.sparse(10, Array(3, 8), Array(2.0, 1.0)), Vectors.dense(1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0) ).map(_.toOPVector) + val field = transformed.schema(vector.name) + AttributeTestUtils.assertNominal(field, Array.fill(expected.head.value.size)(true)) transformed.collect(vector) shouldBe expected } diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/NumericBucketizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/NumericBucketizerTest.scala index b313edd76f..4370c95e2b 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/NumericBucketizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/NumericBucketizerTest.scala @@ -158,7 +158,7 @@ class NumericBucketizerTest extends FlatSpec with TestSparkContext { buck.getBucketLabels shouldBe Array("A", "B", "C") } - it should "throw an exception if the data is out of bounds when trackInvalid is false" in new GenericTest { + it should "throw an exception if the data is out of bounds when trackInvalid is false" in new GenericTest { val vals = Seq(Double.PositiveInfinity, Double.NaN, -1, -100).map(_.toReal) lazy val (data, num) = TestFeatureBuilder("num", vals) val buck = new NumericBucketizer[Real]().setInput(num).setBuckets(splits).setTrackInvalid(false) @@ -168,7 +168,11 @@ class NumericBucketizerTest extends FlatSpec with TestSparkContext { it should "transform the data correctly (reals)" in new RealTest { val vector = realBucketizer.getOutput() - 
realBucketizer.transform(data1).collect(vector) shouldBe expectedAns + val transformed = realBucketizer.transform(data1) + val actual = transformed.collect(vector) + val field = transformed.schema(vector.name) + AttributeTestUtils.assertNominal(field, Array.fill(actual.head.value.size)(true)) + actual shouldBe expectedAns val expectedMeta = TestOpVectorMetadataBuilder( realBucketizer, @@ -182,19 +186,31 @@ class NumericBucketizerTest extends FlatSpec with TestSparkContext { IndCol(Some("[5.0-10.0)")), IndCol(Some("[10.0-Infinity)"))) ) OpVectorMetadata(realBucketizer2.getOutputFeatureName, realBucketizer2.getMetadata()) shouldEqual expectedMeta2 + val vector2 = realBucketizer2.getOutput() + val transformed2 = realBucketizer2.transform(data1) + val actual2 = transformed2.collect(vector2) + val field2 = transformed2.schema(vector2.name) + AttributeTestUtils.assertNominal(field2, Array.fill(actual2.head.value.size)(true)) } it should "work as a shortcut (reals)" in new RealTest { val vector = num.bucketize(trackNulls = false, splits = splits, bucketLabels = bucketLabels) vector.originStage shouldBe a[NumericBucketizer[_]] val buck = vector.originStage.asInstanceOf[NumericBucketizer[_]] - - buck.transform(data1).collect(vector) shouldBe expectedAns + val transformed = buck.transform(data1) + val actual = transformed.collect(vector) + val field = transformed.schema(vector.name) + AttributeTestUtils.assertNominal(field, Array.fill(actual.head.value.size)(true)) + actual shouldBe expectedAns } it should "keep track of null values if wanted (reals) " in new RealTest { val vector = trackNullsRealBucketizer.getOutput() - trackNullsRealBucketizer.transform(data1).collect(vector) shouldBe trackNullsExpectedAns + val transformed = trackNullsRealBucketizer.transform(data1) + val actual = transformed.collect(vector) + val field = transformed.schema(vector.name) + AttributeTestUtils.assertNominal(field, Array.fill(actual.head.value.size)(true)) + actual shouldBe trackNullsExpectedAns val expectedMeta = TestOpVectorMetadataBuilder( trackNullsRealBucketizer, @@ -216,12 +232,21 @@ class NumericBucketizerTest extends FlatSpec with TestSparkContext { trackNullsRealBucketizer2.getOutputFeatureName, trackNullsRealBucketizer2.getMetadata() ) shouldEqual expectedMeta2 + val vector2 = trackNullsRealBucketizer2.getOutput() + val transformed2 = trackNullsRealBucketizer2.transform(data1) + val actual2 = transformed2.collect(vector2) + val field2 = transformed2.schema(vector2.name) + AttributeTestUtils.assertNominal(field2, Array.fill(actual2.head.value.size)(true)) } it should "allow right inclusive splits (reals)" in new RealTest { val vector = num.bucketize(trackNulls = false, splits = splitsRightInclusive, splitInclusion = Inclusion.Right) val buck = vector.originStage.asInstanceOf[NumericBucketizer[_]] - buck.transform(data1).collect(vector) shouldBe expectedRightInclusiveAns + val transformed = buck.transform(data1) + val actual = transformed.collect(vector) + val field = transformed.schema(vector.name) + AttributeTestUtils.assertNominal(field, Array.fill(actual.head.value.size)(true)) + actual shouldBe expectedRightInclusiveAns } it should "correctly bucketize some random reals" in { @@ -235,9 +260,13 @@ class NumericBucketizerTest extends FlatSpec with TestSparkContext { splits = Array(Double.NegativeInfinity, 0.0, Double.PositiveInfinity), splitInclusion = Inclusion.Left ) + val bucketizer = buck.originStage + val transformed = bucketizer.asInstanceOf[NumericBucketizer[_]].transform(ds) + val results = 
transformed.collect(buck) + val field = transformed.schema(buck.name) + AttributeTestUtils.assertNominal(field, Array.fill(results.head.value.size)(true)) - buck.originStage shouldBe a[NumericBucketizer[_]] - val results = buck.originStage.asInstanceOf[NumericBucketizer[_]].transform(ds).collect(buck) + bucketizer shouldBe a[NumericBucketizer[_]] val (neg, pos, empty) = (Vectors.dense(1.0, 0.0, 0.0).toOPVector, @@ -256,7 +285,8 @@ class NumericBucketizerTest extends FlatSpec with TestSparkContext { ) val buck = num.bucketize(trackNulls = true, trackInvalid = true, splits = Array(0.0, 1.0, 5.0)) val stage = buck.originStage.asInstanceOf[NumericBucketizer[_]] - val results = stage.transform(ds).collect(buck) + val transformed = stage.transform(ds) + val results = transformed.collect(buck) results shouldBe Seq( Vectors.dense(0.0, 0.0, 0.0, 1.0), @@ -266,6 +296,9 @@ class NumericBucketizerTest extends FlatSpec with TestSparkContext { Vectors.dense(0.0, 0.0, 1.0, 0.0) ).map(_.toOPVector) + val field = transformed.schema(buck.name) + AttributeTestUtils.assertNominal(field, Array.fill(results.head.value.size)(true)) + val expectedMeta = TestOpVectorMetadataBuilder( stage, num -> List(IndCol(Some("[0.0-1.0)")), IndCol(Some("[1.0-5.0)")), IndCol(Some(TransmogrifierDefaults.OtherString)), IndCol(Some(TransmogrifierDefaults.NullString)) @@ -276,8 +309,12 @@ class NumericBucketizerTest extends FlatSpec with TestSparkContext { it should "transform the data correctly (integrals)" in new IntegralTest { val vector = integralBucketizer.getOutput() - integralBucketizer.transform(data1).collect(vector) shouldBe expectedAns + val transformed = integralBucketizer.transform(data1) + val results = transformed.collect(vector) + results shouldBe expectedAns + val field = transformed.schema(vector.name) + AttributeTestUtils.assertNominal(field, Array.fill(results.head.value.size)(true)) val expectedMeta = TestOpVectorMetadataBuilder( integralBucketizer, num -> List(IndCol(Some("0-1")), IndCol(Some("1-5")), IndCol(Some("5-10")), IndCol(Some("10-Infinity"))) @@ -289,12 +326,20 @@ class NumericBucketizerTest extends FlatSpec with TestSparkContext { val vector = num.bucketize(trackNulls = false, splits = splits, bucketLabels = bucketLabels) vector.originStage shouldBe a[NumericBucketizer[_]] val buck = vector.originStage.asInstanceOf[NumericBucketizer[_]] - buck.transform(data1).collect(vector) shouldBe expectedAns + val transformed = buck.transform(data1) + val results = transformed.collect(vector) + results shouldBe expectedAns + val field = transformed.schema(vector.name) + AttributeTestUtils.assertNominal(field, Array.fill(results.head.value.size)(true)) } it should "keep track of null values if wanted (integrals)" in new IntegralTest { val vector = trackNullsIntegralBucketizer.getOutput() - trackNullsIntegralBucketizer.transform(data1).collect(vector) shouldBe trackNullsExpectedAns + val transformed = trackNullsIntegralBucketizer.transform(data1) + val results = transformed.collect(vector) + results shouldBe trackNullsExpectedAns + val field = transformed.schema(vector.name) + AttributeTestUtils.assertNominal(field, Array.fill(results.head.value.size)(true)) val expectedMeta = TestOpVectorMetadataBuilder( trackNullsIntegralBucketizer, @@ -310,7 +355,12 @@ class NumericBucketizerTest extends FlatSpec with TestSparkContext { it should "allow right inclusive splits (integrals)" in new IntegralTest { val vector = num.bucketize(trackNulls = false, splits = splitsRightInclusive, splitInclusion = Inclusion.Right) val 
buck = vector.originStage.asInstanceOf[NumericBucketizer[_]] - buck.transform(data1).collect(vector) shouldBe expectedRightInclusiveAns + val transformed = buck.transform(data1) + val results = transformed.collect(vector) + results shouldBe expectedRightInclusiveAns + val field = transformed.schema(vector.name) + AttributeTestUtils.assertNominal(field, Array.fill(results.head.value.size)(true)) + } } diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/NumericVectorizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/NumericVectorizerTest.scala index e6b2c98644..8d1cbad6ca 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/NumericVectorizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/NumericVectorizerTest.scala @@ -75,6 +75,8 @@ class NumericVectorizerTest extends FlatSpec with FeatureTestBase { Array(3.0, 0.0, 0.0, 1.0), Array(4.0, 0.0, 0.0, 1.0) ).map(Vectors.dense(_).toOPVector) + val field = vectorized.schema(autoBucketFeature.name) + AttributeTestUtils.assertNominal(field, false +: Array.fill(expected.head.value.size - 1)(true)) vectorized.collect(autoBucketFeature) should contain theSameElementsAs expected } it should "vectorize single real feature with a label" in { @@ -86,7 +88,9 @@ class NumericVectorizerTest extends FlatSpec with FeatureTestBase { age.autoBucketize(labelData, trackNulls = false) ).combine() val vectorized = new OpWorkflow().setResultFeatures(autoBucketFeature, manualBucketFeature).transform(ds) - + val field = vectorized.schema(autoBucketFeature.name) + AttributeTestUtils.assertNominal(field, false +: + Array.fill(vectorized.collect(autoBucketFeature).head.value.size - 1)(true)) for {(autoAge, manualAge) <- vectorized.collect(autoBucketFeature, manualBucketFeature)} { autoAge.v.toArray should contain theSameElementsAs manualAge.v.toArray } @@ -101,7 +105,9 @@ class NumericVectorizerTest extends FlatSpec with FeatureTestBase { height, height.autoBucketize(labelData, trackNulls = false) ).transmogrify() val vectorized = new OpWorkflow().setResultFeatures(autoBucketFeature, manualBucketFeature).transform(ds) - + val field = vectorized.schema(autoBucketFeature.name) + AttributeTestUtils.assertNominal(field, Array(false, true, false) ++ + Array.fill(vectorized.collect(autoBucketFeature).head.value.size - 3)(true)) for {(autoAge, manualAge) <- vectorized.collect(autoBucketFeature, manualBucketFeature)} { autoAge.v.toArray should contain theSameElementsAs manualAge.v.toArray } @@ -118,6 +124,12 @@ class NumericVectorizerTest extends FlatSpec with FeatureTestBase { val autoBucketFeature = Seq(count).transmogrify(label = Some(labelData)) val manualBucketFeature = Seq(count, count.autoBucketize(labelData, trackNulls = false)).transmogrify() val vectorized = new OpWorkflow().setResultFeatures(autoBucketFeature, manualBucketFeature).transform(ds) + val field = vectorized.schema(autoBucketFeature.name) + AttributeTestUtils.assertNominal(field, false +: + Array.fill(vectorized.collect(autoBucketFeature).head.value.size - 1)(true)) + val field2 = vectorized.schema(manualBucketFeature.name) + AttributeTestUtils.assertNominal(field2, false +: + Array.fill(vectorized.collect(manualBucketFeature).head.value.size -1)(true)) for {(autoAge, manualAge) <- vectorized.collect(autoBucketFeature, manualBucketFeature)} { autoAge.v.toArray should contain theSameElementsAs manualAge.v.toArray From 8c64851e3a110ee49f8614fe5e792140eed58640 Mon Sep 17 00:00:00 2001 From: mweilsalesforce Date: Thu, 6 Sep 
2018 17:06:56 -0700 Subject: [PATCH 07/15] Up to OpMapVectorizer --- .../OPCollectionHashingVectorizerTest.scala | 24 ++++++++++ .../impl/feature/OPMapVectorizerTest.scala | 48 ++++++++++++++----- .../op/utils/spark/OpVectorMetadata.scala | 6 ++- 3 files changed, 64 insertions(+), 14 deletions(-) diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/OPCollectionHashingVectorizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/OPCollectionHashingVectorizerTest.scala index 5b90b286c5..99c72fb0ac 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/OPCollectionHashingVectorizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/OPCollectionHashingVectorizerTest.scala @@ -104,6 +104,8 @@ class OPCollectionHashingVectorizerTest extends FlatSpec with TestSparkContext { val vector = vectorizer.getOutput() val transformed = vectorizer.transform(catData) val result = transformed.collect(vector) + val field = transformed.schema(vector.name) + AttributeTestUtils.assertNominal(field, Array.fill(transformed.collect(vector).head.value.size)(true)) vectorizer.isSharedHashSpace shouldBe false @@ -139,6 +141,8 @@ class OPCollectionHashingVectorizerTest extends FlatSpec with TestSparkContext { val vector = vectorizer.getOutput() val transformed = vectorizer.transform(catData) val result = transformed.collect(vector) + val field = transformed.schema(vector.name) + AttributeTestUtils.assertNominal(field, Array.fill(transformed.collect(vector).head.value.size)(true)) vectorizer.isSharedHashSpace shouldBe false @@ -176,6 +180,8 @@ class OPCollectionHashingVectorizerTest extends FlatSpec with TestSparkContext { val vector = vectorizer.getOutput() val transformed = vectorizer.transform(textListData) val result = transformed.collect(vector) + val field = transformed.schema(vector.name) + AttributeTestUtils.assertNominal(field, Array.fill(transformed.collect(vector).head.value.size)(true)) vectorizer.isSharedHashSpace shouldBe false @@ -206,6 +212,8 @@ class OPCollectionHashingVectorizerTest extends FlatSpec with TestSparkContext { val vector = vectorizer.getOutput() val transformed = vectorizer.transform(realMapData) val result = transformed.collect(vector) + val field = transformed.schema(vector.name) + AttributeTestUtils.assertNominal(field, Array.fill(transformed.collect(vector).head.value.size)(false)) vectorizer.isSharedHashSpace shouldBe false @@ -231,6 +239,8 @@ class OPCollectionHashingVectorizerTest extends FlatSpec with TestSparkContext { val vector = vectorizer.getOutput() val transformed = vectorizer.transform(catData) val result = transformed.collect(vector) + val field = transformed.schema(vector.name) + AttributeTestUtils.assertNominal(field, Array.fill(transformed.collect(vector).head.value.size)(true)) vectorizer.isSharedHashSpace shouldBe true @@ -264,6 +274,8 @@ class OPCollectionHashingVectorizerTest extends FlatSpec with TestSparkContext { val vector = vectorizer.getOutput() val transformed = vectorizer.transform(textListData) val result = transformed.collect(vector) + val field = transformed.schema(vector.name) + AttributeTestUtils.assertNominal(field, Array.fill(transformed.collect(vector).head.value.size)(true)) vectorizer.isSharedHashSpace shouldBe true @@ -294,6 +306,7 @@ class OPCollectionHashingVectorizerTest extends FlatSpec with TestSparkContext { val vector = vectorizer.getOutput() val transformed = vectorizer.transform(realMapData) val result = transformed.collect(vector) + // TODO : Find a way to 
recognize hashed RealMap has Categoricals vectorizer.isSharedHashSpace shouldBe true @@ -319,6 +332,7 @@ class OPCollectionHashingVectorizerTest extends FlatSpec with TestSparkContext { val vector = vectorizer.getOutput() val transformed = vectorizer.transform(realMapData) val result = transformed.collect(vector) + // TODO : Find a way to recognize hashed RealMap has Categoricals vectorizer.isSharedHashSpace shouldBe true @@ -341,6 +355,8 @@ class OPCollectionHashingVectorizerTest extends FlatSpec with TestSparkContext { val vector = vectorizer.getOutput() val transformed = vectorizer.transform(textListData) val result = transformed.collect(vector) + val field = transformed.schema(vector.name) + AttributeTestUtils.assertNominal(field, Array.fill(transformed.collect(vector).head.value.size)(true)) vectorizer.isSharedHashSpace shouldBe false @@ -371,6 +387,8 @@ class OPCollectionHashingVectorizerTest extends FlatSpec with TestSparkContext { val vector = vectorizer.getOutput() val transformed = vectorizer.transform(textListData) val result = transformed.collect(vector) + val field = transformed.schema(vector.name) + AttributeTestUtils.assertNominal(field, Array.fill(transformed.collect(vector).head.value.size)(true)) vectorizer.isSharedHashSpace shouldBe true @@ -398,6 +416,9 @@ class OPCollectionHashingVectorizerTest extends FlatSpec with TestSparkContext { .setNumFeatures(10).setHashSpaceStrategy(HashSpaceStrategy.Separate) val feature = vectorizer.getOutput() val transformed = vectorizer.transform(catData) + val vector = vectorizer.getOutput() + val field = transformed.schema(vector.name) + AttributeTestUtils.assertNominal(field, Array.fill(transformed.collect(vector).head.value.size)(true)) val meta = OpVectorMetadata(transformed.schema(feature.name)) meta.history.keys shouldBe Set(top.name, bot.name) meta.columns.length shouldBe 20 @@ -412,6 +433,9 @@ class OPCollectionHashingVectorizerTest extends FlatSpec with TestSparkContext { .setNumFeatures(10).setHashSpaceStrategy(HashSpaceStrategy.Shared) val feature = vectorizer.getOutput() val transformed = vectorizer.transform(catData) + val vector = vectorizer.getOutput() + val field = transformed.schema(vector.name) + AttributeTestUtils.assertNominal(field, Array.fill(transformed.collect(vector).head.value.size)(true)) val meta = OpVectorMetadata(transformed.schema(feature.name)) meta.history.keys shouldBe Set(top.name, bot.name) meta.columns.length shouldBe 10 diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/OPMapVectorizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/OPMapVectorizerTest.scala index eee1dd3521..0d97888d2e 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/OPMapVectorizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/OPMapVectorizerTest.scala @@ -83,7 +83,10 @@ class OPMapVectorizerTest extends FlatSpec with TestSparkContext { Vectors.sparse(14, Array(0, 2, 11), Array(1.0, 1.0, 1.0)) ).map(_.toOPVector) - transformed.collect(vectorizer.getOutput()) shouldBe expected + val output = vectorizer.getOutput() + val field = transformed.schema(output.name) + AttributeTestUtils.assertNominal(field, Array.fill(expected.head.value.size)(true)) + transformed.collect(output) shouldBe expected fitted.getMetadata() shouldBe transformed.schema.fields(2).metadata } @@ -122,7 +125,7 @@ class OPMapVectorizerTest extends FlatSpec with TestSparkContext { val currencyData3: Seq[Currency] = RandomReal.logNormal[Currency](mean = 10.0, sigma = 1.0) 
.withProbabilityOfEmpty(0.5).limit(1000) - testFeatureToMap[Currency, CurrencyMap, Double](currencyData, currencyData2, currencyData3) + testFeatureToMap[Currency, CurrencyMap, Double](currencyData, currencyData2, currencyData3, isCategorical = false) } "Date features" should "be vectorized the same whether they're in maps or not" in { @@ -135,7 +138,7 @@ class OPMapVectorizerTest extends FlatSpec with TestSparkContext { val dateData3: Seq[Date] = RandomIntegral.dates(init = new JDate(1500000000000L), minStep = minSec, maxStep = maxSec).withProbabilityOfEmpty(0.5).limit(1000) - testFeatureToMap[Date, DateMap, Long](dateData, dateData2, dateData3) + testFeatureToMap[Date, DateMap, Long](dateData, dateData2, dateData3, false) } "DateTime features" should "be vectorized the same whether they're in maps or not" in { @@ -148,7 +151,7 @@ class OPMapVectorizerTest extends FlatSpec with TestSparkContext { val dateTimeData3: Seq[DateTime] = RandomIntegral.datetimes(init = new JDate(), minStep = minSec, maxStep = maxSec) .withProbabilityOfEmpty(0.5).limit(1000) - testFeatureToMap[DateTime, DateTimeMap, Long](dateTimeData, dateTimeData2, dateTimeData3) + testFeatureToMap[DateTime, DateTimeMap, Long](dateTimeData, dateTimeData2, dateTimeData3, false) } "Email features" should "be vectorized the same whether they're in maps or not" in { @@ -178,7 +181,7 @@ class OPMapVectorizerTest extends FlatSpec with TestSparkContext { val integralData3: Seq[Integral] = RandomIntegral.integrals(from = -100L, to = 100L) .withProbabilityOfEmpty(0.5).limit(1000) - testFeatureToMap[Integral, IntegralMap, Long](integralData, integralData2, integralData3) + testFeatureToMap[Integral, IntegralMap, Long](integralData, integralData2, integralData3, false) } "MultiPickList features" should "be vectorized the same whether they're in maps or not" in { @@ -197,7 +200,7 @@ class OPMapVectorizerTest extends FlatSpec with TestSparkContext { val percentData2: Seq[Percent] = RandomReal.uniform[Percent]().withProbabilityOfEmpty(0.5).limit(1000) val percentData3: Seq[Percent] = RandomReal.uniform[Percent]().withProbabilityOfEmpty(0.5).limit(1000) - testFeatureToMap[Percent, PercentMap, Double](percentData, percentData2, percentData3) + testFeatureToMap[Percent, PercentMap, Double](percentData, percentData2, percentData3, false) } // TODO: Fix failing test @@ -228,7 +231,7 @@ class OPMapVectorizerTest extends FlatSpec with TestSparkContext { val realData2: Seq[Real] = RandomReal.uniform[Real]().withProbabilityOfEmpty(0.5).limit(1000) val realData3: Seq[Real] = RandomReal.uniform[Real]().withProbabilityOfEmpty(0.5).limit(1000) - testFeatureToMap[Real, RealMap, Double](realData, realData2, realData3) + testFeatureToMap[Real, RealMap, Double](realData, realData2, realData3, false) } "TextArea features" should "be vectorized the same whether they're in maps or not" in { @@ -317,7 +320,8 @@ class OPMapVectorizerTest extends FlatSpec with TestSparkContext { val GeolocationData2: Seq[Geolocation] = RandomList.ofGeolocations.limit(1000) val GeolocationData3: Seq[Geolocation] = RandomList.ofGeolocations.limit(1000) - testFeatureToMap[Geolocation, GeolocationMap, Seq[Double]](GeolocationData, GeolocationData2, GeolocationData3) + testFeatureToMap[Geolocation, GeolocationMap, Seq[Double]](GeolocationData, GeolocationData2, GeolocationData3, + false) } } @@ -330,15 +334,18 @@ object OPMapVectorizerTestHelper extends Matchers { * corresponds to its own key in the OPMap feature. 
This is used to test whether base feature types are vectorized * the same as their corresponding map types. * - * @param f1Data Sequence of base feature type data (eg. from generators) - * @param f2Data Sequence of base feature type data (eg. from generators) - * @param f3Data Sequence of base feature type data (eg. from generators) + * @param f1Data Sequence of base feature type data (eg. from generators) + * @param f2Data Sequence of base feature type data (eg. from generators) + * @param f3Data Sequence of base feature type data (eg. from generators) + * @param isCategorical If the vector contains categoricals + * (false for continuous features such as Real, Currency, Date or Geolocation) * @tparam F Base feature type (eg. ID, Text, Integer) * @tparam FM OPMap feature type (eg. IDMap, TextMap, IntegerMap) * @tparam MT Value type of map inside OPMap feature (eg. String, String, Int) */ def testFeatureToMap[F <: FeatureType : TypeTag, FM <: OPMap[MT] : TypeTag, MT: TypeTag] - (f1Data: Seq[F], f2Data: Seq[F], f3Data: Seq[F])(implicit spark: SparkSession): Unit = { + (f1Data: Seq[F], f2Data: Seq[F], f3Data: Seq[F], isCategorical: Boolean = true)(implicit spark: SparkSession): + Unit = { val generatedData: Seq[(F, F, F)] = f1Data.zip(f2Data).zip(f3Data).map { case ((f1, f2), f3) => (f1, f2, f3) } val (rawDF, rawF1, rawF2, rawF3) = TestFeatureBuilder("f1", "f2", "f3", generatedData) @@ -359,6 +366,19 @@ object OPMapVectorizerTestHelper extends Matchers { log.info("transformed data:") transformed.show(10) } + val isCategoricalArray = if (isCategorical) { + Array.fill(transformed.collect(featureVector).head.value.size)(true) + } else { + rawF1 match { + case f if f.isSubtypeOf[Date] => Array.fill(24)(false) ++ Array.fill(3)(Seq(false, true)).flatten + .asInstanceOf[Array[Boolean]] + case f if f.isSubtypeOf[Geolocation] => Array.fill(transformed.collect(featureVector).head.value.size / 4)( + Seq(false, false, false, true)).flatten + case _ => Array.fill(transformed.collect(featureVector).head.value.size / 2)(Seq(false, true)).flatten + } + } + val field = transformed.schema(featureVector.name) + AttributeTestUtils.assertNominal(field, isCategoricalArray) val summary = transformed.schema(featureVector.name).metadata log.info("summary:\n{}", summary) @@ -374,6 +394,9 @@ object OPMapVectorizerTestHelper extends Matchers { log.info("transformedMap:") transformedMap.show(10) } + val fieldMap = transformedMap.schema(mapFeatureVector.name) + AttributeTestUtils.assertNominal(fieldMap, isCategoricalArray) + // Check that the actual features are the same val vectorizedBaseFeatures = transformed.collect(featureVector) @@ -459,6 +482,7 @@ object OPMapVectorizerTestHelper extends Matchers { case _ => Map.empty } } + val mapData = asMap(f1, "f1") ++ asMap(f2, "f2") ++ asMap(f3, "f3") ftFactory.newInstance(mapData) } diff --git a/features/src/main/scala/com/salesforce/op/utils/spark/OpVectorMetadata.scala b/features/src/main/scala/com/salesforce/op/utils/spark/OpVectorMetadata.scala index 5cc0f5a1e4..b4b32ac5e9 100644 --- a/features/src/main/scala/com/salesforce/op/utils/spark/OpVectorMetadata.scala +++ b/features/src/main/scala/com/salesforce/op/utils/spark/OpVectorMetadata.scala @@ -31,7 +31,7 @@ package com.salesforce.op.utils.spark import com.salesforce.op.FeatureHistory -import com.salesforce.op.features.types.{Binary, BinaryMap, Text, TextArea, TextAreaMap, TextMap} +import com.salesforce.op.features.types.{Binary, BinaryMap, MultiPickList, MultiPickListMap, Text, TextArea, TextAreaMap, TextList, TextMap} import
org.apache.spark.ml.attribute.{AttributeGroup, BinaryAttribute, NominalAttribute, NumericAttribute} import org.apache.spark.ml.linalg.SQLDataTypes._ import org.apache.spark.sql.types.{Metadata, MetadataBuilder, StructField} @@ -75,7 +75,9 @@ class OpVectorMetadata private newColumns: Array[OpVectorColumnMetadata] ): OpVectorMetadata = OpVectorMetadata(name, newColumns, history) - val textTypes = Seq(Text, TextArea, TextAreaMap, TextMap, Binary, BinaryMap).map(_.getClass.getName.dropRight(1)) + val textTypes = Seq(MultiPickList, MultiPickListMap, Text, TextArea, TextAreaMap, TextMap, Binary, BinaryMap, + TextList).map(_.getClass.getName.dropRight(1)) + /** * Serialize to spark metadata * From 96427dee24aa0cea2745d62a75c993bf073b8464 Mon Sep 17 00:00:00 2001 From: mweilsalesforce Date: Thu, 6 Sep 2018 18:01:11 -0700 Subject: [PATCH 08/15] Up to SmartTextVectorizer --- .../impl/feature/OPSetVectorizerTest.scala | 29 ++++++++++++++- .../impl/feature/RealMapVectorizerTest.scala | 31 ++++++++++------ .../impl/feature/RealVectorizerTest.scala | 21 +++++++---- .../feature/SmartTextMapVectorizerTest.scala | 35 +++++++++++++++---- .../feature/SmartTextVectorizerTest.scala | 27 +++++++++++--- 5 files changed, 113 insertions(+), 30 deletions(-) diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/OPSetVectorizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/OPSetVectorizerTest.scala index 2799c2b5ff..9bad0c1eea 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/OPSetVectorizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/OPSetVectorizerTest.scala @@ -117,6 +117,8 @@ class OpSetVectorizerTest extends FlatSpec with TestSparkContext { Vectors.sparse(10, Array(1, 5, 6), Array(1.0, 1.0, 1.0)), Vectors.sparse(10, Array(0, 1, 7), Array(1.0, 1.0, 1.0)) ).map(_.toOPVector) + val field = transformed.schema(vector.name) + AttributeTestUtils.assertNominal(field, Array.fill(expected.head.value.size)(true)) transformed.collect(vector) shouldBe expected fitted.getMetadata() shouldBe transformed.schema.fields(2).metadata } @@ -131,6 +133,8 @@ class OpSetVectorizerTest extends FlatSpec with TestSparkContext { Vectors.sparse(13, Array(4, 7, 8), Array(1.0, 1.0, 1.0)), Vectors.sparse(13, Array(1, 2, 9), Array(1.0, 1.0, 1.0)) ).map(_.toOPVector) + val field = transformed.schema(vector.name) + AttributeTestUtils.assertNominal(field, Array.fill(expected.head.value.size)(true)) transformed.collect(vector) shouldBe expected val vectorMetadata = fitted.getMetadata() val expectedMeta = TestOpVectorMetadataBuilder( @@ -152,6 +156,8 @@ class OpSetVectorizerTest extends FlatSpec with TestSparkContext { val fitted = vectorizer.setCleanText(true).setTopK(1).fit(dataSet) val transformed = fitted.transform(dataSet) val vector = vectorizer.getOutput() + val field = transformed.schema(vector.name) + AttributeTestUtils.assertNominal(field, Array.fill(transformed.collect(vector).head.value.size)(true)) transformed.collect(vector) shouldBe expectedData vectorizer.setTopK(10) } @@ -166,6 +172,8 @@ class OpSetVectorizerTest extends FlatSpec with TestSparkContext { Vectors.dense(0.0, 1.0, 0.0, 2.0, 0.0), Vectors.dense(1.0, 1.0, 0.0, 1.0, 0.0) ).map(_.toOPVector) + val field = transformed.schema(vector.name) + AttributeTestUtils.assertNominal(field, Array.fill(transformed.collect(vector).head.value.size)(true)) } it should "return a vector with elements only in the other & null columns and not throw errors when passed data" + @@ -178,6 +186,8 @@ class 
OpSetVectorizerTest extends FlatSpec with TestSparkContext { Vectors.dense(1.0, 0.0, 0.0, 0.0, 0.0, 1.0), Vectors.dense(0.0, 0.0, 0.0, 1.0, 0.0, 1.0) ).map(_.toOPVector) + val field = transformed.schema(vector.name) + AttributeTestUtils.assertNominal(field, Array.fill(expected.head.value.size)(true)) transformed.collect(vector) shouldBe expected val vectorMetadata = fitted.getMetadata() val expectedMeta = TestOpVectorMetadataBuilder( @@ -210,6 +220,8 @@ class OpSetVectorizerTest extends FlatSpec with TestSparkContext { Vectors.dense(1.0, 0.0, 0.0, 0.0), Vectors.dense(0.0, 0.0, 0.0, 0.0) ).map(_.toOPVector) + val field = transformed.schema(vector.name) + AttributeTestUtils.assertNominal(field, Array.fill(expected.head.value.size)(true)) transformed.collect(vector) shouldBe expected val vectorMetadata = fitted.getMetadata() val expectedMeta = TestOpVectorMetadataBuilder( @@ -233,6 +245,8 @@ class OpSetVectorizerTest extends FlatSpec with TestSparkContext { val vector = fitted.getOutput() val transformed = fitted.transform(dataSetAllEmpty) val expected = Array(Vectors.dense(0.0, 1.0), Vectors.dense(0.0, 1.0), Vectors.dense(0.0, 1.0)).map(_.toOPVector) + val field = transformed.schema(vector.name) + AttributeTestUtils.assertNominal(field, Array.fill(expected.head.value.size)(true)) transformed.collect(vector) shouldBe expected val vectorMetadata = fitted.getMetadata() val expectedMeta = TestOpVectorMetadataBuilder( @@ -249,7 +263,10 @@ class OpSetVectorizerTest extends FlatSpec with TestSparkContext { .asInstanceOf[Transformer].transform(dataSet) result.originStage shouldBe a[OpSetVectorizer[_]] - df.collect(result) shouldBe expectedData + val actual = df.collect(result) + actual shouldBe expectedData + val field = df.schema(result.name) + AttributeTestUtils.assertNominal(field, Array.fill(actual.head.value.size)(true)) } it should "expand number of columns for picklist features by two (one for other & one for null)" in { @@ -287,6 +304,8 @@ class OpSetVectorizerTest extends FlatSpec with TestSparkContext { val fitted = localVectorizer.fit(localDataSet) val transformed = fitted.transform(localDataSet) val vector = localVectorizer.getOutput() + val field = transformed.schema(vector.name) + AttributeTestUtils.assertNominal(field, Array.fill(transformed.collect(vector).head.value.size)(true)) } it should "process multiple columns of PickList using the vectorize shortcut" in { @@ -306,6 +325,8 @@ class OpSetVectorizerTest extends FlatSpec with TestSparkContext { val inputDF = TestOpWorkflowBuilder(localDF, vectorized).computeDataUpTo(vectorized) val transformed = new OpWorkflow().setResultFeatures(vectorized).transform(localDF) + val field = transformed.schema(vectorized.name) + AttributeTestUtils.assertNominal(field, Array.fill(transformed.collect(vectorized).head.value.size)(true)) val metaMap = transformed.metadata(vectorized) log.info(metaMap.toString) @@ -328,6 +349,8 @@ class OpSetVectorizerTest extends FlatSpec with TestSparkContext { val inputDF = TestOpWorkflowBuilder(localDF, vectorized).computeDataUpTo(vectorized) val transformed = new OpWorkflow().setResultFeatures(vectorized).transform(localDF) + val field = transformed.schema(vectorized.name) + AttributeTestUtils.assertNominal(field, Array.fill(transformed.collect(vectorized).head.value.size)(true)) val metaMap = transformed.metadata(vectorized) log.info(metaMap.toString) @@ -351,6 +374,8 @@ class OpSetVectorizerTest extends FlatSpec with TestSparkContext { val res = f2.transformWith[OPVector](stage = oPSetVectorizer.setTopK(3), 
Array.empty[FeatureLike[MultiPickList]]) val transformed = new OpWorkflow().setResultFeatures(res).transform(localDF) + val field = transformed.schema(res.name) + AttributeTestUtils.assertNominal(field, Array.fill(transformed.collect(res).head.value.size)(true)) } it should "process multiple columns of numerics, PickLists, and MultiPickLists using the vectorize shortcut" in { @@ -371,6 +396,8 @@ class OpSetVectorizerTest extends FlatSpec with TestSparkContext { val inputDF = TestOpWorkflowBuilder(localDF, vectorized).computeDataUpTo(vectorized) val transformed = new OpWorkflow().setResultFeatures(vectorized).transform(localDF) + val field = transformed.schema(vectorized.name) + AttributeTestUtils.assertNominal(field, Array(true, true, true, true, false, true)) val metaMap = transformed.metadata(vectorized) log.info(metaMap.toString) diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/RealMapVectorizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/RealMapVectorizerTest.scala index f334370509..03e5a9d1f2 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/RealMapVectorizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/RealMapVectorizerTest.scala @@ -92,7 +92,8 @@ class RealMapVectorizerTest val vectorizer = estimator.setDefaultValue(0.0).setTrackNulls(false).fit(inputData) val transformed = vectorizer.transform(inputData) val vector = vectorizer.getOutput() - + val field = transformed.schema(vector.name) + AttributeTestUtils.assertNominal(field, Array.fill(expectedResult.head.value.size)(false)) transformed.collect(vector) shouldBe expectedResult transformed.schema.toOpVectorMetadata(vectorizer.getOutputFeatureName) shouldEqual expectedMeta val vectorMetadata = vectorizer.getMetadata() @@ -108,7 +109,8 @@ class RealMapVectorizerTest Vectors.sparse(12, Array(1, 3, 4, 8, 11), Array(1.0, 1.0, 11.0, 3.0, 1.0)), Vectors.sparse(12, Array(1, 3, 5, 7, 9, 11), Array(1.0, 1.0, 1.0, 1.0, 1.0, 1.0)) ).map(_.toOPVector) - + val field = transformed.schema(vector.name) + AttributeTestUtils.assertNominal(field, Array.fill(expected.head.value.size / 2)(Seq(false, true)).flatten) transformed.collect(vector) shouldBe expected transformed.schema.toOpVectorMetadata(vectorizer.getOutputFeatureName) shouldEqual expectedMetaTrackNulls val vectorMetadata = vectorizer.getMetadata() @@ -124,7 +126,8 @@ class RealMapVectorizerTest Vectors.dense(Array(100.0, 100.0, 11.0, 0.0, 3.0, 100.0)), Vectors.dense(Array(100.0, 100.0, 100.0, 100.0, 100.0, 100.0)) ).map(_.toOPVector) - + val field = transformed.schema(vector.name) + AttributeTestUtils.assertNominal(field, Array.fill(expectedResult.head.value.size)(false)) transformed.collect(vector) shouldBe expected transformed.schema.toOpVectorMetadata(vectorizer.getOutputFeatureName) shouldEqual expectedMeta val vectorMetadata = vectorizer.getMetadata() @@ -141,7 +144,8 @@ class RealMapVectorizerTest Vectors.dense(Array(100.0, 1.0, 100.0, 1.0, 11.0, 0.0, 0.0, 0.0, 3.0, 0.0, 100.0, 1.0)), Vectors.dense(Array(100.0, 1.0, 100.0, 1.0, 100.0, 1.0, 100.0, 1.0, 100.0, 1.0, 100.0, 1.0)) ).map(_.toOPVector) - + val field = transformed.schema(vector.name) + AttributeTestUtils.assertNominal(field, Array.fill(expected.head.value.size / 2)(Seq(false, true)).flatten) transformed.collect(vector) shouldBe expected transformed.schema.toOpVectorMetadata(vectorizer.getOutputFeatureName) shouldEqual expectedMetaTrackNulls val vectorMetadata = vectorizer.getMetadata() @@ -162,7 +166,8 @@ class 
RealMapVectorizerTest vectorizer, m1 -> List(IndColWithGroup(None, "A"), IndColWithGroup(None, "B")), m2 -> List(IndColWithGroup(None, "Z"))) - + val field = transformed.schema(vector.name) + AttributeTestUtils.assertNominal(field, Array.fill(expected.head.value.size)(false)) transformed.collect(vector) shouldBe expected transformed.schema.toOpVectorMetadata(vectorizer.getOutputFeatureName) shouldEqual expectedMeta val vectorMetadata = vectorizer.getMetadata() @@ -186,7 +191,8 @@ class RealMapVectorizerTest IndColWithGroup(None, "B"), IndColWithGroup(nullIndicatorValue, "B")), m2 -> List(IndColWithGroup(None, "Z"), IndColWithGroup(nullIndicatorValue, "Z")) ) - + val field = transformed.schema(vector.name) + AttributeTestUtils.assertNominal(field, Array.fill(expected.head.value.size / 2)(Seq(false, true)).flatten) transformed.collect(vector) shouldBe expected transformed.schema.toOpVectorMetadata(vectorizer.getOutputFeatureName) shouldEqual expectedMeta val vectorMetadata = vectorizer.getMetadata() @@ -208,8 +214,8 @@ class RealMapVectorizerTest m1 -> List(IndColWithGroup(None, "B"), IndColWithGroup(None, "C")), m2 -> List(IndColWithGroup(None, "X"), IndColWithGroup(None, "Y")) ) - - transformed.collect(vector) shouldBe expected + val field = transformed.schema(vector.name) + AttributeTestUtils.assertNominal(field, Array.fill(expected.head.value.size)(false)) transformed.schema.toOpVectorMetadata(vectorizer.getOutputFeatureName) shouldEqual expectedMeta val vectorMetadata = vectorizer.getMetadata() OpVectorMetadata(vectorizer.getOutputFeatureName, vectorMetadata) shouldEqual expectedMeta @@ -233,7 +239,8 @@ class RealMapVectorizerTest m2 -> List(IndColWithGroup(None, "X"), IndColWithGroup(nullIndicatorValue, "X"), IndColWithGroup(None, "Y"), IndColWithGroup(nullIndicatorValue, "Y")) ) - + val field = transformed.schema(vector.name) + AttributeTestUtils.assertNominal(field, Array.fill(expected.head.value.size / 2)(Seq(false, true)).flatten) transformed.collect(vector) shouldBe expected transformed.schema.toOpVectorMetadata(vectorizer.getOutputFeatureName) shouldEqual expectedMeta val vectorMetadata = vectorizer.getMetadata() @@ -251,7 +258,8 @@ class RealMapVectorizerTest Vectors.dense(-1.0, 4.0, 11.0, 0.0, 1.0, 5.0), Vectors.dense(-1.0, 4.0, 11.0, 0.0, 8.0 / 3, 15.0 / 2) ).map(_.toOPVector) - + val field = transformed.schema(vector.name) + AttributeTestUtils.assertNominal(field, Array.fill(expectedResult.head.value.size)(false)) transformed.collect(vector) shouldBe expected val expectedMeta = TestOpVectorMetadataBuilder( @@ -275,7 +283,8 @@ class RealMapVectorizerTest Vectors.dense(-1.0, 1.0, 4.0, 1.0, 11.0, 1.0, 0.0, 0.0, 1.0, 0.0, 5.0, 0.0), Vectors.dense(-1.0, 1.0, 4.0, 1.0, 11.0, 1.0, 0.0, 1.0, 8.0 / 3, 1.0, 15.0 / 2, 1.0) ).map(_.toOPVector) - + val field = transformed.schema(vector.name) + AttributeTestUtils.assertNominal(field, Array.fill(expected.head.value.size / 2)(Seq(false, true)).flatten) transformed.collect(vector) shouldBe expected val expectedMetaTrackNulls = TestOpVectorMetadataBuilder( diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/RealVectorizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/RealVectorizerTest.scala index 02a2b123aa..9dcc653c9a 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/RealVectorizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/RealVectorizerTest.scala @@ -106,7 +106,8 @@ class RealVectorizerTest extends FlatSpec with TestSparkContext { (null, 2.0, 
null, Vectors.dense(4.2, 2.0, 4.2)), (2.0, null, null, Vectors.dense(2.0, 4.2, 4.2)) ) - + val field = testDataTransformedConstant.schema(testModelConstant.getOutputFeatureName) + AttributeTestUtils.assertNominal(field, Array.fill(expectedZero.head._4.size)(false)) transformedValuesConstant.map(_.get(0)) shouldEqual expectedZero.map(_._1) transformedValuesConstant.map(_.get(1)) shouldEqual expectedZero.map(_._2) transformedValuesConstant.map(_.get(2)) shouldEqual expectedZero.map(_._3) @@ -132,7 +133,8 @@ class RealVectorizerTest extends FlatSpec with TestSparkContext { (null, 2.0, null, Vectors.dense(3.0, 2.0, 0.0)), (2.0, null, null, Vectors.dense(2.0, 2.0, 0.0)) ) - + val field = testDataTransformedMean.schema(testModelMean.getOutputFeatureName) + AttributeTestUtils.assertNominal(field, Array.fill(expectedMean.head._4.size)(false)) transformedValuesMean.map(_.get(0)) shouldEqual expectedMean.map(_._1) transformedValuesMean.map(_.get(1)) shouldEqual expectedMean.map(_._2) transformedValuesMean.map(_.get(2)) shouldEqual expectedMean.map(_._3) @@ -154,7 +156,8 @@ class RealVectorizerTest extends FlatSpec with TestSparkContext { (null, 2.0, null, Vectors.dense(0.0, 1.0, 2.0, 0.0, 0.0, 1.0)), (2.0, null, null, Vectors.dense(2.0, 0.0, 0.0, 1.0, 0.0, 1.0)) ) - + val field = testDataTransformedConstantTracked.schema(testModelConstantTracked.getOutputFeatureName) + AttributeTestUtils.assertNominal(field, Array.fill(expectedZeroTracked.head._4.size / 2)(Seq(false, true)).flatten) transformedValuesZeroTracked.map(_.get(0)) shouldEqual expectedZeroTracked.map(_._1) transformedValuesZeroTracked.map(_.get(1)) shouldEqual expectedZeroTracked.map(_._2) transformedValuesZeroTracked.map(_.get(2)) shouldEqual expectedZeroTracked.map(_._3) @@ -187,7 +190,8 @@ class RealVectorizerTest extends FlatSpec with TestSparkContext { (null, 2.0, null, Vectors.dense(3.0, 1.0, 2.0, 0.0, 0.0, 1.0)), (2.0, null, null, Vectors.dense(2.0, 0.0, 2.0, 1.0, 0.0, 1.0)) ) - + val field = testDataTransformedMeanTracked.schema(testModelMeanTracked.getOutputFeatureName) + AttributeTestUtils.assertNominal(field, Array.fill(expectedMeanTracked.head._4.size / 2)(Seq(false, true)).flatten) transformedValuesMeanTracked.map(_.get(0)) shouldEqual expectedMeanTracked.map(_._1) transformedValuesMeanTracked.map(_.get(1)) shouldEqual expectedMeanTracked.map(_._2) transformedValuesMeanTracked.map(_.get(2)) shouldEqual expectedMeanTracked.map(_._3) @@ -220,7 +224,8 @@ class RealVectorizerTest extends FlatSpec with TestSparkContext { (null, 2.0, null, Vectors.dense(3.0, 1.0, 2.0, 0.0, 0.0, 1.0)), (2.0, null, null, Vectors.dense(2.0, 0.0, 2.0, 1.0, 0.0, 1.0)) ) - + val field = testDataTransformedMeanTracked.schema(testModelMeanTracked.getOutputFeatureName) + AttributeTestUtils.assertNominal(field, Array.fill(expectedMeanTracked.head._4.size / 2)(Seq(false, true)).flatten) transformedValuesMeanTracked.map(_.get(0)) shouldEqual expectedMeanTracked.map(_._1) transformedValuesMeanTracked.map(_.get(1)) shouldEqual expectedMeanTracked.map(_._2) transformedValuesMeanTracked.map(_.get(2)) shouldEqual expectedMeanTracked.map(_._3) @@ -252,7 +257,8 @@ class RealVectorizerTest extends FlatSpec with TestSparkContext { (null, 2.0, null, Vectors.dense(4.2, 2.0, 4.2)), (2.0, null, null, Vectors.dense(2.0, 4.2, 4.2)) ) - + val field = testDataTransformedConstant.schema(testModelConstant.getOutputFeatureName) + AttributeTestUtils.assertNominal(field, Array.fill(expectedZero.head._4.size)(false)) transformedValuesConstant.map(_.get(0)) shouldEqual 
expectedZero.map(_._1) transformedValuesConstant.map(_.get(1)) shouldEqual expectedZero.map(_._2) transformedValuesConstant.map(_.get(2)) shouldEqual expectedZero.map(_._3) @@ -273,7 +279,8 @@ class RealVectorizerTest extends FlatSpec with TestSparkContext { (null, 2.0, null, Vectors.dense(4.2, 2.0, 4.2)), (2.0, null, null, Vectors.dense(2.0, 4.2, 4.2)) ) - + val field = testDataTransformedConstant.schema(testModelConstant.getOutputFeatureName) + AttributeTestUtils.assertNominal(field, Array.fill(expectedZero.head._4.size)(false)) transformedValuesConstant.map(_.get(0)) shouldEqual expectedZero.map(_._1) transformedValuesConstant.map(_.get(1)) shouldEqual expectedZero.map(_._2) transformedValuesConstant.map(_.get(2)) shouldEqual expectedZero.map(_._3) diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizerTest.scala index 1b48336def..e290aca79f 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizerTest.scala @@ -105,7 +105,10 @@ class SmartTextMapVectorizerTest extends FlatSpec with TestSparkContext { val transformed = new OpWorkflow().setResultFeatures(smartMapVectorized, smartVectorized).transform(data) val result = transformed.collect(smartMapVectorized, smartVectorized) - + val field = transformed.schema(smartVectorized.name) + AttributeTestUtils.assertNominal(field, Array.fill(transformed.collect(smartVectorized).head.value.size)(true)) + val fieldMap = transformed.schema(smartMapVectorized.name) + AttributeTestUtils.assertNominal(fieldMap, Array.fill(transformed.collect(smartMapVectorized).head.value.size)(true)) val mapMeta = OpVectorMetadata(transformed.schema(smartMapVectorized.name)) val meta = OpVectorMetadata(transformed.schema(smartVectorized.name)) mapMeta.history.keys shouldBe Set(m1.name, m2.name) @@ -134,7 +137,10 @@ class SmartTextMapVectorizerTest extends FlatSpec with TestSparkContext { val transformed = new OpWorkflow().setResultFeatures(smartMapVectorized, smartVectorized).transform(data) val result = transformed.collect(smartMapVectorized, smartVectorized) - + val field = transformed.schema(smartVectorized.name) + AttributeTestUtils.assertNominal(field, Array.fill(transformed.collect(smartVectorized).head.value.size)(true)) + val fieldMap = transformed.schema(smartMapVectorized.name) + AttributeTestUtils.assertNominal(fieldMap, Array.fill(transformed.collect(smartMapVectorized).head.value.size)(true)) val mapMeta = OpVectorMetadata(transformed.schema(smartMapVectorized.name)) val meta = OpVectorMetadata(transformed.schema(smartVectorized.name)) mapMeta.history.keys shouldBe Set(m1.name, m2.name) @@ -164,7 +170,10 @@ class SmartTextMapVectorizerTest extends FlatSpec with TestSparkContext { val transformed = new OpWorkflow().setResultFeatures(smartMapVectorized, smartVectorized).transform(data) val result = transformed.collect(smartMapVectorized, smartVectorized) - + val field = transformed.schema(smartVectorized.name) + AttributeTestUtils.assertNominal(field, Array.fill(transformed.collect(smartVectorized).head.value.size)(true)) + val fieldMap = transformed.schema(smartMapVectorized.name) + AttributeTestUtils.assertNominal(fieldMap, Array.fill(transformed.collect(smartMapVectorized).head.value.size)(true)) val mapMeta = OpVectorMetadata(transformed.schema(smartMapVectorized.name)) val meta = 
OpVectorMetadata(transformed.schema(smartVectorized.name)) mapMeta.history.keys shouldBe Set(m1.name, m2.name) @@ -195,7 +204,10 @@ class SmartTextMapVectorizerTest extends FlatSpec with TestSparkContext { val transformed = new OpWorkflow().setResultFeatures(smartMapVectorized, smartVectorized).transform(data) val result = transformed.collect(smartMapVectorized, smartVectorized) - + val field = transformed.schema(smartVectorized.name) + AttributeTestUtils.assertNominal(field, Array.fill(transformed.collect(smartVectorized).head.value.size)(true)) + val fieldMap = transformed.schema(smartMapVectorized.name) + AttributeTestUtils.assertNominal(fieldMap, Array.fill(transformed.collect(smartMapVectorized).head.value.size)(true)) val mapMeta = OpVectorMetadata(transformed.schema(smartMapVectorized.name)) val meta = OpVectorMetadata(transformed.schema(smartVectorized.name)) mapMeta.history.keys shouldBe Set(m1.name, m2.name) @@ -229,7 +241,10 @@ class SmartTextMapVectorizerTest extends FlatSpec with TestSparkContext { val transformed = new OpWorkflow().setResultFeatures(smartMapVectorized, smartVectorized).transform(data) val result = transformed.collect(smartMapVectorized, smartVectorized) - + val field = transformed.schema(smartVectorized.name) + AttributeTestUtils.assertNominal(field, Array.fill(transformed.collect(smartVectorized).head.value.size)(true)) + val fieldMap = transformed.schema(smartMapVectorized.name) + AttributeTestUtils.assertNominal(fieldMap, Array.fill(transformed.collect(smartMapVectorized).head.value.size)(true)) val mapMeta = OpVectorMetadata(transformed.schema(smartMapVectorized.name)) val meta = OpVectorMetadata(transformed.schema(smartVectorized.name)) mapMeta.history.keys shouldBe Set(m1.name, m2.name) @@ -264,7 +279,10 @@ class SmartTextMapVectorizerTest extends FlatSpec with TestSparkContext { val transformed = new OpWorkflow().setResultFeatures(smartMapVectorized, shortcutMapVectorized).transform(data) val result = transformed.collect(smartMapVectorized, shortcutMapVectorized) - + val field = transformed.schema(shortcutMapVectorized.name) + AttributeTestUtils.assertNominal(field, Array.fill(transformed.collect(shortcutMapVectorized).head.value.size)(true)) + val fieldMap = transformed.schema(smartMapVectorized.name) + AttributeTestUtils.assertNominal(fieldMap, Array.fill(transformed.collect(smartMapVectorized).head.value.size)(true)) val smartMeta = OpVectorMetadata(transformed.schema(smartMapVectorized.name)) val shortcutMeta = OpVectorMetadata(transformed.schema(shortcutMapVectorized.name)) smartMeta.history.keys shouldBe shortcutMeta.history.keys @@ -293,7 +311,10 @@ class SmartTextMapVectorizerTest extends FlatSpec with TestSparkContext { val transformed = new OpWorkflow().setResultFeatures(textMapVectorized, textAreaMapVectorized).transform(data) val result = transformed.collect(textMapVectorized, textAreaMapVectorized) - + val field = transformed.schema(textMapVectorized.name) + AttributeTestUtils.assertNominal(field, Array.fill(transformed.collect(textMapVectorized).head.value.size)(true)) + val fieldMap = transformed.schema(textAreaMapVectorized.name) + AttributeTestUtils.assertNominal(fieldMap, Array.fill(transformed.collect(textAreaMapVectorized).head.value.size)(true)) val textMapMeta = OpVectorMetadata(transformed.schema(textMapVectorized.name)) val textareaMapMeta = OpVectorMetadata(transformed.schema(textAreaMapVectorized.name)) textMapMeta.history.keys shouldBe textareaMapMeta.history.keys diff --git 
a/core/src/test/scala/com/salesforce/op/stages/impl/feature/SmartTextVectorizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/SmartTextVectorizerTest.scala index 34ef01af0b..eaa85d775d 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/SmartTextVectorizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/SmartTextVectorizerTest.scala @@ -82,7 +82,14 @@ class SmartTextVectorizerTest val transformed = new OpWorkflow() .setResultFeatures(smartVectorized, categoricalVectorized, textVectorized, nullIndicator).transform(inputData) val result = transformed.collect(smartVectorized, categoricalVectorized, textVectorized, nullIndicator) - + val field = transformed.schema(smartVectorized.name) + AttributeTestUtils.assertNominal(field, Array.fill(transformed.collect(smartVectorized).head.value.size)(true)) + val fieldCategorical = transformed.schema(categoricalVectorized.name) + AttributeTestUtils.assertNominal(fieldCategorical, + Array.fill(transformed.collect(categoricalVectorized).head.value.size)(true)) + val fieldText = transformed.schema(textVectorized.name) + AttributeTestUtils.assertNominal(fieldText, + Array.fill(transformed.collect(textVectorized).head.value.size)(true)) val (smart, expected) = result.map { case (smartVector, categoricalVector, textVector, nullVector) => val combined = VectorsCombiner.combineOP(Seq(categoricalVector, textVector, nullVector)) smartVector -> combined @@ -101,7 +108,11 @@ class SmartTextVectorizerTest val transformed = new OpWorkflow().setResultFeatures(smartVectorized, categoricalVectorized).transform(inputData) val result = transformed.collect(smartVectorized, categoricalVectorized) - + val field = transformed.schema(smartVectorized.name) + AttributeTestUtils.assertNominal(field, Array.fill(transformed.collect(smartVectorized).head.value.size)(true)) + val fieldCategorical = transformed.schema(categoricalVectorized.name) + AttributeTestUtils.assertNominal(fieldCategorical, + Array.fill(transformed.collect(categoricalVectorized).head.value.size)(true)) val (smart, expected) = result.unzip smart shouldBe expected @@ -121,7 +132,11 @@ class SmartTextVectorizerTest val transformed = new OpWorkflow() .setResultFeatures(smartVectorized, textVectorized, nullIndicator).transform(inputData) val result = transformed.collect(smartVectorized, textVectorized, nullIndicator) - + val field = transformed.schema(smartVectorized.name) + AttributeTestUtils.assertNominal(field, Array.fill(transformed.collect(smartVectorized).head.value.size)(true)) + val fieldText = transformed.schema(textVectorized.name) + AttributeTestUtils.assertNominal(fieldText, + Array.fill(transformed.collect(textVectorized).head.value.size)(true)) val (smart, expected) = result.map { case (smartVector, textVector, nullVector) => val combined = VectorsCombiner.combineOP(Seq(textVector, nullVector)) smartVector -> combined @@ -144,7 +159,11 @@ class SmartTextVectorizerTest val transformed = new OpWorkflow().setResultFeatures(smartVectorized, shortcutVectorized).transform(inputData) val result = transformed.collect(smartVectorized, shortcutVectorized) - + val field = transformed.schema(smartVectorized.name) + AttributeTestUtils.assertNominal(field, Array.fill(transformed.collect(smartVectorized).head.value.size)(true)) + val fieldShortcut = transformed.schema(shortcutVectorized.name) + AttributeTestUtils.assertNominal(fieldShortcut, + Array.fill(transformed.collect(shortcutVectorized).head.value.size)(true)) val (regular, shortcut) = 
result.unzip regular shouldBe shortcut From d52131ab76ff8658f4205c710a1c4673f10d3c13 Mon Sep 17 00:00:00 2001 From: mweilsalesforce Date: Fri, 7 Sep 2018 10:52:23 -0700 Subject: [PATCH 09/15] Up to URLVectorizerTests --- .../feature/TextListNullTransformerTest.scala | 2 ++ .../feature/TextMapNullEstimatorTest.scala | 3 +- .../impl/feature/TextMapVectorizerTest.scala | 36 +++++++++++++++++-- .../impl/feature/TextTransmogrifyTest.scala | 11 ++++-- .../impl/feature/TextVectorizerTest.scala | 6 ++-- .../impl/feature/TransmogrifyTest.scala | 3 +- .../impl/feature/URLVectorizerTest.scala | 9 +++-- 7 files changed, 59 insertions(+), 11 deletions(-) diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/TextListNullTransformerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/TextListNullTransformerTest.scala index 0c69e74888..a765cd860f 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/TextListNullTransformerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/TextListNullTransformerTest.scala @@ -83,6 +83,8 @@ class TextListNullTransformerTest extends FlatSpec with TestSparkContext { Array(0.0, 1.0), Array(1.0, 1.0) ).map(Vectors.dense(_).toOPVector) + val field = transformed.schema(vector.name) + AttributeTestUtils.assertNominal(field, Array.fill(expected.head.value.size)(true)) transformed.collect(vector) shouldBe expected val vectorMetadata = vectorizer.getMetadata() diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/TextMapNullEstimatorTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/TextMapNullEstimatorTest.scala index 913df46e9d..732c0da54d 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/TextMapNullEstimatorTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/TextMapNullEstimatorTest.scala @@ -74,7 +74,8 @@ class TextMapNullEstimatorTest extends FlatSpec with TestSparkContext { Array(1.0, 1.0, 0.0, 1.0) ).map(Vectors.dense(_).toOPVector) transformed.collect(vector) shouldBe expected - + val field = transformed.schema(vector.name) + AttributeTestUtils.assertNominal(field, Array.fill(expected.head.value.size)(true)) val vectorMetadata = vectorizer.getMetadata() OpVectorMetadata(vectorizer.getOutputFeatureName, vectorMetadata) shouldEqual TestOpVectorMetadataBuilder( vectorizer, diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/TextMapVectorizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/TextMapVectorizerTest.scala index 4137a2c0cd..56b85ed914 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/TextMapVectorizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/TextMapVectorizerTest.scala @@ -118,7 +118,8 @@ class TextMapVectorizerTest extends FlatSpec with TestSparkContext { Vectors.sparse(14, Array(0, 7, 9), Array(1.0, 1.0, 1.0)), Vectors.sparse(14, Array(0, 2, 11), Array(1.0, 1.0, 1.0)) ).map(_.toOPVector) - + val field = transformed.schema(vector.name) + AttributeTestUtils.assertNominal(field, Array.fill(expected.head.value.size)(true)) transformed.collect(vector) shouldBe expected fitted.getMetadata() shouldBe transformed.schema.fields(2).metadata } @@ -134,7 +135,8 @@ class TextMapVectorizerTest extends FlatSpec with TestSparkContext { Vectors.sparse(20, Array(0, 6, 9, 10, 13, 19), Array(1.0, 1.0, 1.0, 1.0, 1.0, 1.0)), Vectors.sparse(20, Array(0, 3, 9, 12, 15, 16), Array(1.0, 1.0, 1.0, 1.0, 1.0, 1.0)) ).map(_.toOPVector) - + val field 
= transformed.schema(vector.name) + AttributeTestUtils.assertNominal(field, Array.fill(expected.head.value.size)(true)) transformed.collect(vector) shouldBe expected fitted.getMetadata() shouldBe transformed.schema.fields(2).metadata } @@ -150,6 +152,8 @@ class TextMapVectorizerTest extends FlatSpec with TestSparkContext { Vectors.sparse(17, Array(0, 9, 11), Array(1.0, 1.0, 1.0)), Vectors.sparse(17, Array(1, 3, 14), Array(1.0, 1.0, 1.0)) ).map(_.toOPVector) + val field = transformed.schema(vector.name) + AttributeTestUtils.assertNominal(field, Array.fill(expected.head.value.size)(true)) transformed.collect(vector) shouldBe expected OpVectorMetadata(vectorizer.getOutputFeatureName, vectorMetadata) shouldEqual TestOpVectorMetadataBuilder(vectorizer, @@ -178,6 +182,8 @@ class TextMapVectorizerTest extends FlatSpec with TestSparkContext { Vectors.sparse(23, Array(0, 7, 10, 12, 15, 22), Array(1.0, 1.0, 1.0, 1.0, 1.0, 1.0)), Vectors.sparse(23, Array(1, 4, 10, 14, 18, 19), Array(1.0, 1.0, 1.0, 1.0, 1.0, 1.0)) ).map(_.toOPVector) + val field = transformed.schema(vector.name) + AttributeTestUtils.assertNominal(field, Array.fill(expected.head.value.size)(true)) transformed.collect(vector) shouldBe expected OpVectorMetadata(vectorizer.getOutputFeatureName, vectorMetadata) shouldEqual TestOpVectorMetadataBuilder(vectorizer, @@ -208,6 +214,8 @@ class TextMapVectorizerTest extends FlatSpec with TestSparkContext { Vectors.sparse(12, Array(0, 6, 8), Array(1.0, 1.0, 1.0)), Vectors.sparse(12, Array(0, 2, 10), Array(1.0, 1.0, 1.0)) ).map(_.toOPVector) + val field = transformed.schema(vector.name) + AttributeTestUtils.assertNominal(field, Array.fill(expected.head.value.size)(true)) transformed.collect(vector) shouldBe expected } @@ -222,6 +230,8 @@ class TextMapVectorizerTest extends FlatSpec with TestSparkContext { Vectors.sparse(18, Array(0, 5, 8, 9, 12, 17), Array(1.0, 1.0, 1.0, 1.0, 1.0, 1.0)), Vectors.sparse(18, Array(0, 3, 8, 11, 14, 15), Array(1.0, 1.0, 1.0, 1.0, 1.0, 1.0)) ).map(_.toOPVector) + val field = transformed.schema(vector.name) + AttributeTestUtils.assertNominal(field, Array.fill(expected.head.value.size)(true)) transformed.collect(vector) shouldBe expected } @@ -234,6 +244,8 @@ class TextMapVectorizerTest extends FlatSpec with TestSparkContext { Vectors.sparse(10, Array(0, 5, 7), Array(1.0, 1.0, 1.0)), Vectors.sparse(10, Array(0, 2, 9), Array(1.0, 1.0, 1.0)) ).map(_.toOPVector) + val field = transformed.schema(vector.name) + AttributeTestUtils.assertNominal(field, Array.fill(expected.head.value.size)(true)) transformed.collect(vector) shouldBe expected } @@ -246,6 +258,8 @@ class TextMapVectorizerTest extends FlatSpec with TestSparkContext { Vectors.sparse(16, Array(0, 5, 7, 8, 11, 15), Array(1.0, 1.0, 1.0, 1.0, 1.0, 1.0)), Vectors.sparse(16, Array(0, 3, 7, 10, 13, 14), Array(1.0, 1.0, 1.0, 1.0, 1.0, 1.0)) ).map(_.toOPVector) + val field = transformed.schema(vector.name) + AttributeTestUtils.assertNominal(field, Array.fill(expected.head.value.size)(true)) transformed.collect(vector) shouldBe expected } @@ -259,6 +273,8 @@ class TextMapVectorizerTest extends FlatSpec with TestSparkContext { Vectors.dense(0.0, 1.0, 0.0, 0.0, 0.0), Vectors.dense(0.0, 0.0, 0.0, 0.0, 0.0) ).map(_.toOPVector) + val field = transformed.schema(vector.name) + AttributeTestUtils.assertNominal(field, Array.fill(expected.head.value.size)(true)) transformed.collect(fitted.getOutput()) shouldBe expected val transformed2 = fitted.transform(dataSet) @@ -268,6 +284,8 @@ class TextMapVectorizerTest extends FlatSpec with 
TestSparkContext { Vectors.dense(0.0, 0.0, 0.0, 0.0, 0.0), Vectors.dense(1.0, 0.0, 0.0, 0.0, 0.0) ).map(_.toOPVector) + val field2 = transformed2.schema(vector.name) + AttributeTestUtils.assertNominal(field2, Array.fill(expected.head.value.size)(true)) transformed2.collect(fitted.getOutput()) shouldBe expected2 } @@ -281,6 +299,8 @@ class TextMapVectorizerTest extends FlatSpec with TestSparkContext { Vectors.dense(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0), Vectors.dense(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0) ).map(_.toOPVector) + val field = transformed.schema(vector.name) + AttributeTestUtils.assertNominal(field, Array.fill(expected.head.value.size)(true)) transformed.collect(fitted.getOutput()) shouldBe expected val transformed2 = fitted.transform(dataSet) @@ -290,6 +310,8 @@ class TextMapVectorizerTest extends FlatSpec with TestSparkContext { Vectors.dense(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0), Vectors.dense(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0) ).map(_.toOPVector) + val field2 = transformed2.schema(vector.name) + AttributeTestUtils.assertNominal(field2, Array.fill(expected.head.value.size)(true)) transformed2.collect(fitted.getOutput()) shouldBe expected2 } @@ -302,6 +324,8 @@ class TextMapVectorizerTest extends FlatSpec with TestSparkContext { Vectors.dense(Array.empty[Double]), Vectors.dense(Array.empty[Double]) ).map(_.toOPVector) + val field = transformed.schema(vector.name) + AttributeTestUtils.assertNominal(field, Array.fill(expected.head.value.size)(true)) transformed.collect(fitted.getOutput()) shouldBe expected } @@ -316,6 +340,8 @@ class TextMapVectorizerTest extends FlatSpec with TestSparkContext { Vectors.sparse(5, Array(3), Array(1.0)), Vectors.sparse(5, Array(0), Array(1.0)) ).map(_.toOPVector) + val field = transformed.schema(vector.name) + AttributeTestUtils.assertNominal(field, Array.fill(expected.head.value.size)(true)) transformed.collect(fitted.getOutput()) shouldBe expected } @@ -330,6 +356,8 @@ class TextMapVectorizerTest extends FlatSpec with TestSparkContext { Vectors.sparse(7, Array(3, 4), Array(1.0, 1.0)), Vectors.sparse(7, Array(0, 6), Array(1.0, 1.0)) ).map(_.toOPVector) + val field = transformed.schema(vector.name) + AttributeTestUtils.assertNominal(field, Array.fill(expected.head.value.size)(true)) transformed.collect(fitted.getOutput()) shouldBe expected } @@ -345,6 +373,8 @@ class TextMapVectorizerTest extends FlatSpec with TestSparkContext { Vectors.sparse(9, Array(0, 7), Array(1.0, 1.0)), Vectors.sparse(9, Array(0, 4), Array(1.0, 1.0)) ).map(_.toOPVector) + val field = transformed.schema(vector.name) + AttributeTestUtils.assertNominal(field, Array.fill(expected.head.value.size)(true)) transformed.collect(fitted.getOutput()) shouldBe expected } @@ -360,6 +390,8 @@ class TextMapVectorizerTest extends FlatSpec with TestSparkContext { Vectors.sparse(13, Array(0, 5, 9, 10), Array(1.0, 1.0, 1.0, 1.0)), Vectors.sparse(13, Array(0, 5, 6, 12), Array(1.0, 1.0, 1.0, 1.0)) ).map(_.toOPVector) + val field = transformed.schema(vector.name) + AttributeTestUtils.assertNominal(field, Array.fill(expected.head.value.size)(true)) transformed.collect(fitted.getOutput()) shouldBe expected } } diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/TextTransmogrifyTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/TextTransmogrifyTest.scala index 78ca407f2d..d82cdb4333 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/TextTransmogrifyTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/TextTransmogrifyTest.scala @@ 
-96,7 +96,8 @@ class TextTransmogrifyTest extends FlatSpec with PassengerSparkFixtureTest { val feature = Seq(largeText, largeTextarea).transmogrify() val vectorized = new OpWorkflow().setResultFeatures(feature).transform(largeDS) val vectCollect = vectorized.collect(feature) - + val field = vectorized.schema(feature.name) + AttributeTestUtils.assertNominal(field, Array.fill(vectCollect.head.value.size)(true)) for {vector <- vectCollect} { vector.v.size shouldBe TransmogrifierDefaults.DefaultNumOfFeatures * 2 + 2 } @@ -109,7 +110,10 @@ class TextTransmogrifyTest extends FlatSpec with PassengerSparkFixtureTest { val feature2 = phone.vectorize("US") val vectorized = new OpWorkflow().setResultFeatures(feature, feature2).transform(ds) val vectCollect = vectorized.collect(feature, feature2) - + val field = vectorized.schema(feature.name) + AttributeTestUtils.assertNominal(field, Array.fill(vectCollect.head._1.value.size)(true)) + val field2 = vectorized.schema(feature2.name) + AttributeTestUtils.assertNominal(field2, Array.fill(vectCollect.head._2.value.size)(true)) for {(vector1, vector2) <- vectCollect} { vector1.v.size shouldBe 2 vector1.v.toArray should contain theSameElementsAs vector2.v.toArray @@ -122,7 +126,8 @@ class TextTransmogrifyTest extends FlatSpec with PassengerSparkFixtureTest { val feature = Seq(text).transmogrify() val vectorized = new OpWorkflow().setResultFeatures(feature).transform(ds) val vectCollect = vectorized.collect(feature) - + val field = vectorized.schema(feature.name) + AttributeTestUtils.assertNominal(field, Array.fill(vectCollect.head.value.size)(true)) vectCollect.forall(_.value.size == TransmogrifierDefaults.DefaultNumOfFeatures + 1) } diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/TextVectorizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/TextVectorizerTest.scala index 500a7859a6..fb17f6afc7 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/TextVectorizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/TextVectorizerTest.scala @@ -63,7 +63,8 @@ class TextVectorizerTest extends FlatSpec with TestSparkContext { val transformed = new OpWorkflow().setResultFeatures(vectorized).transform(data) val result = transformed.collect(vectorized) val f1NameHash = hasher.indexOf(vectorized.parents.head.originStage.getInputFeatures().head.name) - + val field = transformed.schema(vectorized.name) + AttributeTestUtils.assertNominal(field, Array.fill(result.head.value.size)(true)) // scalastyle:off result(0).value(hasher.indexOf(s"${f1NameHash}_" + "hamlet")) should be >= 1.0 result(0).value(hasher.indexOf(s"${f1NameHash}_" + "question")) should be >= 1.0 @@ -86,7 +87,8 @@ class TextVectorizerTest extends FlatSpec with TestSparkContext { val transformed = new OpWorkflow().setResultFeatures(vectorized).transform(data) val result = transformed.collect(vectorized) val f1NameHash = hasher.indexOf(vectorized.parents.head.originStage.getInputFeatures().head.name) - + val field = transformed.schema(vectorized.name) + AttributeTestUtils.assertNominal(field, Array.fill(result.head.value.size)(true)) // scalastyle:off result(0).value(hasher.indexOf(s"${f1NameHash}_" + "hamlet")) shouldBe 1.0 result(0).value(hasher.indexOf(s"${f1NameHash}_" + "hamlet")) shouldBe 1.0 diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/TransmogrifyTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/TransmogrifyTest.scala index 202be7f5c7..d353d43bab 100644 --- 
a/core/src/test/scala/com/salesforce/op/stages/impl/feature/TransmogrifyTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/TransmogrifyTest.scala @@ -68,7 +68,6 @@ class TransmogrifyTest extends FlatSpec with PassengerSparkFixtureTest { val h = f.history() h.originFeatures.map(o => o -> FeatureHistory(Seq(o), h.stages)) }.toMap - transformed.schema.toOpVectorMetadata(feature.name) shouldEqual TestOpVectorMetadataBuilder.withOpNamesAndHist( feature.originStage, @@ -92,6 +91,8 @@ class TransmogrifyTest extends FlatSpec with PassengerSparkFixtureTest { List(1.0, 0.0, 363.0, 0.0, 172.0, 0.0), List(1.0, 0.0, 186.0, 0.0, 96.0, 0.0) ) + val field = transformed.schema(feature.name) + AttributeTestUtils.assertNominal(field, Array(true, true, false, true, false, true)) } } diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/URLVectorizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/URLVectorizerTest.scala index 666929eb00..caefd15782 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/URLVectorizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/URLVectorizerTest.scala @@ -79,8 +79,13 @@ class URLVectorizerTest Vectors.dense(1.0, 0.0, 0.0, 0.0) ).map(_.toOPVector) - def transformAndCollect(ds: DataFrame, feature: FeatureLike[OPVector]): Array[OPVector] = - new OpWorkflow().setResultFeatures(feature).transform(ds).collect(feature) + def transformAndCollect(ds: DataFrame, feature: FeatureLike[OPVector]): Array[OPVector] ={ + val transformed = new OpWorkflow().setResultFeatures(feature).transform(ds) + val results = transformed.collect(feature) + val field = transformed.schema(feature.name) + AttributeTestUtils.assertNominal(field, Array.fill(results.head.value.size)(true)) + results + } Spec[RichURLMapFeature] should "vectorize UrlMaps correctly" in { val (ds1, f1) = TestFeatureBuilder(urls.map(e => Map(urlKey -> e.value.get).toURLMap)) From e9e692c55859adaf32d626ec2e0065e7d2000f40 Mon Sep 17 00:00:00 2001 From: mweilsalesforce Date: Fri, 7 Sep 2018 12:22:02 -0700 Subject: [PATCH 10/15] fix scalastyle --- .../impl/feature/AttributeTestUtils.scala | 38 ++++++++++++- .../impl/feature/EmailVectorizerTest.scala | 2 +- .../impl/feature/NumericBucketizerTest.scala | 12 ++-- .../feature/SmartTextMapVectorizerTest.scala | 55 +++++++++++-------- .../impl/feature/URLVectorizerTest.scala | 2 +- 5 files changed, 76 insertions(+), 33 deletions(-) diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/AttributeTestUtils.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/AttributeTestUtils.scala index 7d4886a137..9c7fa0dd03 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/AttributeTestUtils.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/AttributeTestUtils.scala @@ -1,3 +1,33 @@ +/* + * Copyright (c) 2017, Salesforce.com, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. 
+ * + * * Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + package com.salesforce.op.stages.impl.feature import org.apache.spark.ml.attribute.AttributeGroup @@ -6,10 +36,14 @@ import org.junit.runner.RunWith import org.scalatest.Matchers import org.scalatest.junit.JUnitRunner -@RunWith(classOf[JUnitRunner]) object AttributeTestUtils extends Matchers{ - final def assertNominal(schema: StructField, expectedNominal: Array[Boolean]) = { + /** + * Assert whether attributes are nominal or not + * @param schema Struct field of the vector column to check + * @param expectedNominal Expected array of booleans. True if the field is nominal, false if not. + */ + final def assertNominal(schema: StructField, expectedNominal: Array[Boolean]): Unit = { val attributes = AttributeGroup.fromStructField(schema).attributes.get attributes.map(_.isNominal) shouldBe expectedNominal } diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/EmailVectorizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/EmailVectorizerTest.scala index 2bcb98880d..c504c65a4a 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/EmailVectorizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/EmailVectorizerTest.scala @@ -83,7 +83,7 @@ class EmailVectorizerTest def transformAndCollect(ds: DataFrame, feature: FeatureLike[OPVector]): Array[OPVector] = { val transformed = new OpWorkflow().setResultFeatures(feature).transform(ds) val field = transformed.schema(feature.name) - val collected = transformed.collect(feature) + val collected = transformed.collect(feature) AttributeTestUtils.assertNominal(field, Array.fill(collected.head.value.size)(true)) collected } diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/NumericBucketizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/NumericBucketizerTest.scala index 4370c95e2b..e7f65bfc11 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/NumericBucketizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/NumericBucketizerTest.scala @@ -262,7 +262,7 @@ class NumericBucketizerTest extends FlatSpec with TestSparkContext { ) val bucketizer = buck.originStage val transformed = bucketizer.asInstanceOf[NumericBucketizer[_]].transform(ds) - val results = transformed.collect(buck) + val results = transformed.collect(buck) val field = transformed.schema(buck.name) AttributeTestUtils.assertNominal(field, Array.fill(results.head.value.size)(true)) @@ -286,7 +286,7 @@ class NumericBucketizerTest extends FlatSpec with TestSparkContext { val buck =
num.bucketize(trackNulls = true, trackInvalid = true, splits = Array(0.0, 1.0, 5.0)) val stage = buck.originStage.asInstanceOf[NumericBucketizer[_]] val transformed = stage.transform(ds) - val results = transformed.collect(buck) + val results = transformed.collect(buck) results shouldBe Seq( Vectors.dense(0.0, 0.0, 0.0, 1.0), @@ -310,7 +310,7 @@ class NumericBucketizerTest extends FlatSpec with TestSparkContext { it should "transform the data correctly (integrals)" in new IntegralTest { val vector = integralBucketizer.getOutput() val transformed = integralBucketizer.transform(data1) - val results = transformed.collect(vector) + val results = transformed.collect(vector) results shouldBe expectedAns val field = transformed.schema(vector.name) @@ -327,7 +327,7 @@ class NumericBucketizerTest extends FlatSpec with TestSparkContext { vector.originStage shouldBe a[NumericBucketizer[_]] val buck = vector.originStage.asInstanceOf[NumericBucketizer[_]] val transformed = buck.transform(data1) - val results = transformed.collect(vector) + val results = transformed.collect(vector) results shouldBe expectedAns val field = transformed.schema(vector.name) AttributeTestUtils.assertNominal(field, Array.fill(results.head.value.size)(true)) @@ -336,7 +336,7 @@ class NumericBucketizerTest extends FlatSpec with TestSparkContext { it should "keep track of null values if wanted (integrals)" in new IntegralTest { val vector = trackNullsIntegralBucketizer.getOutput() val transformed = trackNullsIntegralBucketizer.transform(data1) - val results = transformed.collect(vector) + val results = transformed.collect(vector) results shouldBe trackNullsExpectedAns val field = transformed.schema(vector.name) AttributeTestUtils.assertNominal(field, Array.fill(results.head.value.size)(true)) @@ -356,7 +356,7 @@ class NumericBucketizerTest extends FlatSpec with TestSparkContext { val vector = num.bucketize(trackNulls = false, splits = splitsRightInclusive, splitInclusion = Inclusion.Right) val buck = vector.originStage.asInstanceOf[NumericBucketizer[_]] val transformed = buck.transform(data1) - val results = transformed.collect(vector) + val results = transformed.collect(vector) results shouldBe expectedRightInclusiveAns val field = transformed.schema(vector.name) AttributeTestUtils.assertNominal(field, Array.fill(results.head.value.size)(true)) diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizerTest.scala index e290aca79f..39fd9c1ab3 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizerTest.scala @@ -108,13 +108,14 @@ class SmartTextMapVectorizerTest extends FlatSpec with TestSparkContext { val field = transformed.schema(smartVectorized.name) AttributeTestUtils.assertNominal(field, Array.fill(transformed.collect(smartVectorized).head.value.size)(true)) val fieldMap = transformed.schema(smartMapVectorized.name) - AttributeTestUtils.assertNominal(fieldMap, Array.fill(transformed.collect(smartMapVectorized).head.value.size)(true)) + AttributeTestUtils.assertNominal(fieldMap, + Array.fill(transformed.collect(smartMapVectorized).head.value.size)(true)) val mapMeta = OpVectorMetadata(transformed.schema(smartMapVectorized.name)) val meta = OpVectorMetadata(transformed.schema(smartVectorized.name)) mapMeta.history.keys shouldBe Set(m1.name, m2.name) 
mapMeta.columns.length shouldBe meta.columns.length - mapMeta.columns.zip(meta.columns).foreach{ case (m, f) => + mapMeta.columns.zip(meta.columns).foreach { case (m, f) => m.parentFeatureName shouldBe Array(m1.name) m.parentFeatureType shouldBe Array(m1.typeName) if (m.index < 4) m.grouping shouldBe f.grouping @@ -122,7 +123,7 @@ class SmartTextMapVectorizerTest extends FlatSpec with TestSparkContext { m.indicatorValue shouldBe f.indicatorValue } - result.foreach{ case (vec1, vec2) => vec1 shouldBe vec2} + result.foreach { case (vec1, vec2) => vec1 shouldBe vec2 } } it should "detect two categorical text features" in { @@ -140,20 +141,21 @@ class SmartTextMapVectorizerTest extends FlatSpec with TestSparkContext { val field = transformed.schema(smartVectorized.name) AttributeTestUtils.assertNominal(field, Array.fill(transformed.collect(smartVectorized).head.value.size)(true)) val fieldMap = transformed.schema(smartMapVectorized.name) - AttributeTestUtils.assertNominal(fieldMap, Array.fill(transformed.collect(smartMapVectorized).head.value.size)(true)) + AttributeTestUtils.assertNominal(fieldMap, + Array.fill(transformed.collect(smartMapVectorized).head.value.size)(true)) val mapMeta = OpVectorMetadata(transformed.schema(smartMapVectorized.name)) val meta = OpVectorMetadata(transformed.schema(smartVectorized.name)) mapMeta.history.keys shouldBe Set(m1.name, m2.name) mapMeta.columns.length shouldBe meta.columns.length - mapMeta.columns.zip(meta.columns).foreach{ case (m, f) => + mapMeta.columns.zip(meta.columns).foreach { case (m, f) => m.parentFeatureName shouldBe Array(m1.name) m.parentFeatureType shouldBe Array(m1.typeName) m.grouping shouldBe f.grouping m.indicatorValue shouldBe f.indicatorValue } - result.foreach{ case (vec1, vec2) => vec1 shouldBe vec2} + result.foreach { case (vec1, vec2) => vec1 shouldBe vec2 } } it should "use separate hash space for each text feature" in { @@ -173,13 +175,14 @@ class SmartTextMapVectorizerTest extends FlatSpec with TestSparkContext { val field = transformed.schema(smartVectorized.name) AttributeTestUtils.assertNominal(field, Array.fill(transformed.collect(smartVectorized).head.value.size)(true)) val fieldMap = transformed.schema(smartMapVectorized.name) - AttributeTestUtils.assertNominal(fieldMap, Array.fill(transformed.collect(smartMapVectorized).head.value.size)(true)) + AttributeTestUtils.assertNominal(fieldMap, + Array.fill(transformed.collect(smartMapVectorized).head.value.size)(true)) val mapMeta = OpVectorMetadata(transformed.schema(smartMapVectorized.name)) val meta = OpVectorMetadata(transformed.schema(smartVectorized.name)) mapMeta.history.keys shouldBe Set(m1.name, m2.name) mapMeta.columns.length shouldBe meta.columns.length - mapMeta.columns.zip(meta.columns).foreach{ case (m, f) => + mapMeta.columns.zip(meta.columns).foreach { case (m, f) => m.parentFeatureName shouldBe Array(m1.name) m.parentFeatureType shouldBe Array(m1.typeName) if (m.index < 4 || m.index == 8) m.grouping shouldBe Option(f1.name) @@ -187,7 +190,7 @@ class SmartTextMapVectorizerTest extends FlatSpec with TestSparkContext { m.indicatorValue shouldBe f.indicatorValue } - result.foreach{ case (vec1, vec2) => vec1 shouldBe vec2} + result.foreach { case (vec1, vec2) => vec1 shouldBe vec2 } } it should "use shared hash space for two text features" in { @@ -207,13 +210,14 @@ class SmartTextMapVectorizerTest extends FlatSpec with TestSparkContext { val field = transformed.schema(smartVectorized.name) AttributeTestUtils.assertNominal(field, 
Array.fill(transformed.collect(smartVectorized).head.value.size)(true)) val fieldMap = transformed.schema(smartMapVectorized.name) - AttributeTestUtils.assertNominal(fieldMap, Array.fill(transformed.collect(smartMapVectorized).head.value.size)(true)) + AttributeTestUtils.assertNominal(fieldMap, + Array.fill(transformed.collect(smartMapVectorized).head.value.size)(true)) val mapMeta = OpVectorMetadata(transformed.schema(smartMapVectorized.name)) val meta = OpVectorMetadata(transformed.schema(smartVectorized.name)) mapMeta.history.keys shouldBe Set(m1.name, m2.name) mapMeta.columns.length shouldBe meta.columns.length - mapMeta.columns.zip(meta.columns).foreach{ case (m, f) => + mapMeta.columns.zip(meta.columns).foreach { case (m, f) => m.parentFeatureName shouldBe Array(m1.name) m.parentFeatureType shouldBe Array(m1.typeName) if (m.index == 4) { @@ -224,7 +228,7 @@ class SmartTextMapVectorizerTest extends FlatSpec with TestSparkContext { m.indicatorValue shouldBe f.indicatorValue } - result.foreach{ case (vec1, vec2) => vec1 shouldBe vec2} + result.foreach { case (vec1, vec2) => vec1 shouldBe vec2 } } it should "use shared hash space for two text features again" in { @@ -244,13 +248,14 @@ class SmartTextMapVectorizerTest extends FlatSpec with TestSparkContext { val field = transformed.schema(smartVectorized.name) AttributeTestUtils.assertNominal(field, Array.fill(transformed.collect(smartVectorized).head.value.size)(true)) val fieldMap = transformed.schema(smartMapVectorized.name) - AttributeTestUtils.assertNominal(fieldMap, Array.fill(transformed.collect(smartMapVectorized).head.value.size)(true)) + AttributeTestUtils.assertNominal(fieldMap, + Array.fill(transformed.collect(smartMapVectorized).head.value.size)(true)) val mapMeta = OpVectorMetadata(transformed.schema(smartMapVectorized.name)) val meta = OpVectorMetadata(transformed.schema(smartVectorized.name)) mapMeta.history.keys shouldBe Set(m1.name, m2.name) mapMeta.columns.length shouldBe meta.columns.length - mapMeta.columns.zip(meta.columns).foreach{ case (m, f) => + mapMeta.columns.zip(meta.columns).foreach { case (m, f) => m.parentFeatureName shouldBe Array(m1.name) m.parentFeatureType shouldBe Array(m1.typeName) if (m.index == TransmogrifierDefaults.MaxNumOfFeatures) { @@ -261,7 +266,7 @@ class SmartTextMapVectorizerTest extends FlatSpec with TestSparkContext { m.indicatorValue shouldBe f.indicatorValue } - result.foreach{ case (vec1, vec2) => vec1 shouldBe vec2} + result.foreach { case (vec1, vec2) => vec1 shouldBe vec2 } } it should "product the same result for shortcut" in { @@ -280,22 +285,24 @@ class SmartTextMapVectorizerTest extends FlatSpec with TestSparkContext { val transformed = new OpWorkflow().setResultFeatures(smartMapVectorized, shortcutMapVectorized).transform(data) val result = transformed.collect(smartMapVectorized, shortcutMapVectorized) val field = transformed.schema(shortcutMapVectorized.name) - AttributeTestUtils.assertNominal(field, Array.fill(transformed.collect(shortcutMapVectorized).head.value.size)(true)) + AttributeTestUtils.assertNominal(field, + Array.fill(transformed.collect(shortcutMapVectorized).head.value.size)(true)) val fieldMap = transformed.schema(smartMapVectorized.name) - AttributeTestUtils.assertNominal(fieldMap, Array.fill(transformed.collect(smartMapVectorized).head.value.size)(true)) + AttributeTestUtils.assertNominal(fieldMap, + Array.fill(transformed.collect(smartMapVectorized).head.value.size)(true)) val smartMeta = OpVectorMetadata(transformed.schema(smartMapVectorized.name)) val 
shortcutMeta = OpVectorMetadata(transformed.schema(shortcutMapVectorized.name)) smartMeta.history.keys shouldBe shortcutMeta.history.keys smartMeta.columns.length shouldBe shortcutMeta.columns.length - smartMeta.columns.zip(shortcutMeta.columns).foreach{ case (smart, shortcut) => + smartMeta.columns.zip(shortcutMeta.columns).foreach { case (smart, shortcut) => smart.parentFeatureName shouldBe shortcut.parentFeatureName smart.parentFeatureType shouldBe shortcut.parentFeatureType smart.grouping shouldBe shortcut.grouping smart.indicatorValue shouldBe shortcut.indicatorValue } - result.foreach{ case (vec1, vec2) => vec1 shouldBe vec2} + result.foreach { case (vec1, vec2) => vec1 shouldBe vec2 } } it should "work on textarea map fields" in { @@ -312,15 +319,17 @@ class SmartTextMapVectorizerTest extends FlatSpec with TestSparkContext { val transformed = new OpWorkflow().setResultFeatures(textMapVectorized, textAreaMapVectorized).transform(data) val result = transformed.collect(textMapVectorized, textAreaMapVectorized) val field = transformed.schema(textMapVectorized.name) - AttributeTestUtils.assertNominal(field, Array.fill(transformed.collect(textMapVectorized).head.value.size)(true)) + AttributeTestUtils.assertNominal(field, + Array.fill(transformed.collect(textMapVectorized).head.value.size)(true)) val fieldMap = transformed.schema(textAreaMapVectorized.name) - AttributeTestUtils.assertNominal(fieldMap, Array.fill(transformed.collect(textAreaMapVectorized).head.value.size)(true)) + AttributeTestUtils.assertNominal(fieldMap, + Array.fill(transformed.collect(textAreaMapVectorized).head.value.size)(true)) val textMapMeta = OpVectorMetadata(transformed.schema(textMapVectorized.name)) val textareaMapMeta = OpVectorMetadata(transformed.schema(textAreaMapVectorized.name)) textMapMeta.history.keys shouldBe textareaMapMeta.history.keys textMapMeta.columns.length shouldBe textareaMapMeta.columns.length - textMapMeta.columns.zip(textareaMapMeta.columns).foreach{ case (textMap, textareaMap) => + textMapMeta.columns.zip(textareaMapMeta.columns).foreach { case (textMap, textareaMap) => textMap.parentFeatureName shouldBe textareaMap.parentFeatureName textMap.parentFeatureType shouldBe Array("com.salesforce.op.features.types.TextMap") textareaMap.parentFeatureType shouldBe Array("com.salesforce.op.features.types.TextAreaMap") @@ -328,6 +337,6 @@ class SmartTextMapVectorizerTest extends FlatSpec with TestSparkContext { textMap.indicatorValue shouldBe textareaMap.indicatorValue } - result.foreach{ case (vec1, vec2) => vec1 shouldBe vec2} + result.foreach { case (vec1, vec2) => vec1 shouldBe vec2 } } } diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/URLVectorizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/URLVectorizerTest.scala index caefd15782..3d247dbcd1 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/URLVectorizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/URLVectorizerTest.scala @@ -79,7 +79,7 @@ class URLVectorizerTest Vectors.dense(1.0, 0.0, 0.0, 0.0) ).map(_.toOPVector) - def transformAndCollect(ds: DataFrame, feature: FeatureLike[OPVector]): Array[OPVector] ={ + def transformAndCollect(ds: DataFrame, feature: FeatureLike[OPVector]): Array[OPVector] = { val transformed = new OpWorkflow().setResultFeatures(feature).transform(ds) val results = transformed.collect(feature) val field = transformed.schema(feature.name) From 95d43cc57588f811d10ff962fa7fa17ea8e294cb Mon Sep 17 00:00:00 2001 From: 
mweilsalesforce Date: Fri, 7 Sep 2018 16:10:46 -0700 Subject: [PATCH 11/15] Replace AttributeTestUtils object with AttributeAsserts trait --- ...TestUtils.scala => AttributeAsserts.scala} | 10 +- .../impl/feature/Base64VectorizerTest.scala | 4 +- .../feature/BinaryMapVectorizerTest.scala | 7 +- .../impl/feature/BinaryVectorizerTest.scala | 10 +- .../impl/feature/DateListVectorizerTest.scala | 16 +-- .../DateMapToUnitCircleVectorizerTest.scala | 6 +- .../impl/feature/DateMapVectorizerTest.scala | 8 +- .../impl/feature/DateTimeVectorizerTest.scala | 8 +- .../DateToUnitCircleTransformerTest.scala | 33 +++--- .../impl/feature/DateVectorizerTest.scala | 8 +- .../DecisionTreeNumericBucketizerTest.scala | 4 +- ...DecisionTreeNumericMapBucketizerTest.scala | 4 +- .../DropIndicesByTransformerTest.scala | 9 +- .../impl/feature/EmailVectorizerTest.scala | 5 +- .../GeolocationMapVectorizerTest.scala | 21 ++-- .../feature/GeolocationVectorizerTest.scala | 8 +- .../feature/IntegralMapVectorizerTest.scala | 19 ++-- .../impl/feature/IntegralVectorizerTest.scala | 16 +-- .../MultiPickListMapVectorizerTest.scala | 44 ++++---- .../impl/feature/NumericBucketizerTest.scala | 26 ++--- .../impl/feature/NumericVectorizerTest.scala | 14 +-- .../OPCollectionHashingVectorizerTest.scala | 22 ++-- .../impl/feature/OPMapVectorizerTest.scala | 11 +- .../impl/feature/OpSetVectorizerTest.scala | 28 ++--- .../impl/feature/RealMapVectorizerTest.scala | 23 ++-- .../impl/feature/RealVectorizerTest.scala | 16 +-- .../feature/SmartTextMapVectorizerTest.scala | 30 ++--- .../feature/SmartTextVectorizerTest.scala | 20 ++-- .../feature/TextListNullTransformerTest.scala | 4 +- .../feature/TextMapNullEstimatorTest.scala | 4 +- .../impl/feature/TextMapVectorizerTest.scala | 104 +++++++++--------- .../impl/feature/TextTransmogrifyTest.scala | 12 +- .../impl/feature/TextVectorizerTest.scala | 6 +- .../impl/feature/TransmogrifyTest.scala | 6 +- .../impl/feature/URLVectorizerTest.scala | 5 +- 35 files changed, 287 insertions(+), 284 deletions(-) rename core/src/test/scala/com/salesforce/op/stages/impl/feature/{AttributeTestUtils.scala => AttributeAsserts.scala} (92%) diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/AttributeTestUtils.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/AttributeAsserts.scala similarity index 92% rename from core/src/test/scala/com/salesforce/op/stages/impl/feature/AttributeTestUtils.scala rename to core/src/test/scala/com/salesforce/op/stages/impl/feature/AttributeAsserts.scala index 9c7fa0dd03..19c1265f0c 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/AttributeTestUtils.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/AttributeAsserts.scala @@ -32,18 +32,16 @@ package com.salesforce.op.stages.impl.feature import org.apache.spark.ml.attribute.AttributeGroup import org.apache.spark.sql.types.StructField -import org.junit.runner.RunWith -import org.scalatest.Matchers -import org.scalatest.junit.JUnitRunner - -object AttributeTestUtils extends Matchers{ +import org.scalatest.{Assertion, Matchers} +trait AttributeAsserts { + self: Matchers => /** * Assert whether attributes are nominal or not * @param schema Struct field of the vector column to check * @param expectedNominal Expected array of booleans. True if the field is nominal, false if not.
*/ - final def assertNominal(schema: StructField, expectedNominal: Array[Boolean]): Unit = { + final def assertNominal(schema: StructField, expectedNominal: Array[Boolean]): Assertion = { val attributes = AttributeGroup.fromStructField(schema).attributes.get attributes.map(_.isNominal) shouldBe expectedNominal } diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/Base64VectorizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/Base64VectorizerTest.scala index d2ca4864f4..124469bc42 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/Base64VectorizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/Base64VectorizerTest.scala @@ -41,7 +41,7 @@ import org.scalatest.FlatSpec import org.scalatest.junit.JUnitRunner @RunWith(classOf[JUnitRunner]) -class Base64VectorizerTest extends FlatSpec with TestSparkContext with Base64TestData { +class Base64VectorizerTest extends FlatSpec with TestSparkContext with Base64TestData with AttributeAsserts { "Base64Vectorizer" should "vectorize random binary data" in { val vec = randomBase64.vectorize(topK = 10, minSupport = 0, cleanText = true, trackNulls = false) @@ -64,7 +64,7 @@ class Base64VectorizerTest extends FlatSpec with TestSparkContext with Base64Tes val result = new OpWorkflow().setResultFeatures(vec).transform(realData) val vectors = result.collect(vec) val schema = result.schema(vec.name) - AttributeTestUtils.assertNominal(schema, Array.fill(vectors.head.value.size)(true)) + assertNominal(schema, Array.fill(vectors.head.value.size)(true)) vectors.length shouldBe expected.length // TODO add a more robust check diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/BinaryMapVectorizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/BinaryMapVectorizerTest.scala index 8d2900897f..fe007e1ab9 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/BinaryMapVectorizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/BinaryMapVectorizerTest.scala @@ -43,7 +43,8 @@ import org.scalatest.junit.JUnitRunner @RunWith(classOf[JUnitRunner]) class BinaryMapVectorizerTest - extends OpEstimatorSpec[OPVector, SequenceModel[BinaryMap, OPVector], BinaryMapVectorizer[BinaryMap]] { + extends OpEstimatorSpec[OPVector, SequenceModel[BinaryMap, OPVector], BinaryMapVectorizer[BinaryMap]] + with AttributeAsserts { val (inputData, m1, m2) = TestFeatureBuilder("m1", "m2", Seq( @@ -73,7 +74,7 @@ class BinaryMapVectorizerTest transformed.collect(vector) shouldBe expectedResult val field = transformed.schema(estimator.getOutputFeatureName) - AttributeTestUtils.assertNominal(field, Array.fill(expectedResult.head.value.size)(true)) + assertNominal(field, Array.fill(expectedResult.head.value.size)(true)) OpVectorMetadata(field) shouldEqual expectedMeta val vectorMetadata = estimator.getMetadata() OpVectorMetadata(field.copy(metadata = vectorMetadata)) shouldEqual expectedMeta @@ -101,7 +102,7 @@ class BinaryMapVectorizerTest transformed.collect(vector) shouldBe expected val field = transformed.schema(estimator.getOutputFeatureName) - AttributeTestUtils.assertNominal(field, Array.fill(expected.head.value.size)(true)) + assertNominal(field, Array.fill(expected.head.value.size)(true)) OpVectorMetadata(field) shouldEqual expectedMeta val vectorMetadata = estimator.getMetadata() OpVectorMetadata(field.copy(metadata = vectorMetadata)) shouldEqual expectedMeta diff --git 
a/core/src/test/scala/com/salesforce/op/stages/impl/feature/BinaryVectorizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/BinaryVectorizerTest.scala index 40a8299ff2..9acdfe578c 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/BinaryVectorizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/BinaryVectorizerTest.scala @@ -41,7 +41,7 @@ import org.scalatest.junit.JUnitRunner @RunWith(classOf[JUnitRunner]) -class BinaryVectorizerTest extends OpTransformerSpec[OPVector, BinaryVectorizer] { +class BinaryVectorizerTest extends OpTransformerSpec[OPVector, BinaryVectorizer] with AttributeAsserts { val (inputData, f1, f2) = TestFeatureBuilder( Seq[(Binary, Binary)]( @@ -94,7 +94,7 @@ class BinaryVectorizerTest extends OpTransformerSpec[OPVector, BinaryVectorizer] f2 -> List(RootCol, IndCol(Some(TransmogrifierDefaults.NullString))) ) val field = transformed.schema(vector.name) - AttributeTestUtils.assertNominal(field, Array.fill(expected.head.value.size)(true)) + assertNominal(field, Array.fill(expected.head.value.size)(true)) } it should "transform the data correctly [trackNulls=true,fillValue=true]" in { @@ -120,7 +120,7 @@ class BinaryVectorizerTest extends OpTransformerSpec[OPVector, BinaryVectorizer] f2 -> List(RootCol, IndCol(Some(TransmogrifierDefaults.NullString))) ) val field = transformed.schema(vector.name) - AttributeTestUtils.assertNominal(field, Array.fill(expected.head.value.size)(true)) + assertNominal(field, Array.fill(expected.head.value.size)(true)) } it should "transform the data correctly [trackNulls=false,fillValue=false]" in { @@ -146,7 +146,7 @@ class BinaryVectorizerTest extends OpTransformerSpec[OPVector, BinaryVectorizer] f2 -> List(RootCol) ) val field = transformed.schema(vector.name) - AttributeTestUtils.assertNominal(field, Array.fill(expected.head.value.size)(true)) + assertNominal(field, Array.fill(expected.head.value.size)(true)) } it should "transform the data correctly [trackNulls=false,fillValue=true]" in { @@ -172,6 +172,6 @@ class BinaryVectorizerTest extends OpTransformerSpec[OPVector, BinaryVectorizer] f2 -> List(RootCol) ) val field = transformed.schema(vector.name) - AttributeTestUtils.assertNominal(field, Array.fill(expected.head.value.size)(true)) + assertNominal(field, Array.fill(expected.head.value.size)(true)) } } diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/DateListVectorizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/DateListVectorizerTest.scala index 3e4dc909e7..0915fb2b9d 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/DateListVectorizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/DateListVectorizerTest.scala @@ -45,7 +45,7 @@ import org.scalatest.junit.JUnitRunner @RunWith(classOf[JUnitRunner]) -class DateListVectorizerTest extends OpTransformerSpec[OPVector, DateListVectorizer[DateList]] { +class DateListVectorizerTest extends OpTransformerSpec[OPVector, DateListVectorizer[DateList]] with AttributeAsserts { // Sunday July 12th 1998 at 22:45 val defaultDate = new DateTime(1998, 7, 12, 22, 45, DateTimeUtils.DefaultTimeZone).getMillis @@ -127,7 +127,7 @@ class DateListVectorizerTest extends OpTransformerSpec[OPVector, DateListVectori val schema = transformed.schema(output.name) val fieldMetadata = schema.metadata - AttributeTestUtils.assertNominal(schema, Array.fill(testModelTimeSinceFirst.getInputFeatures().size)(false)) + assertNominal(schema, 
Array.fill(testModelTimeSinceFirst.getInputFeatures().size)(false)) testModelTimeSinceFirst.getMetadata() shouldEqual fieldMetadata } @@ -155,7 +155,7 @@ class DateListVectorizerTest extends OpTransformerSpec[OPVector, DateListVectori val schema = transformed.schema(output.name) val fieldMetadata = schema.metadata - AttributeTestUtils.assertNominal(schema, Array.fill(testModelTimeSinceFirst.getInputFeatures().size) + assertNominal(schema, Array.fill(testModelTimeSinceFirst.getInputFeatures().size) (Seq(false, true)).flatten) testModelTimeSinceFirst.getMetadata() shouldEqual fieldMetadata } @@ -184,7 +184,7 @@ class DateListVectorizerTest extends OpTransformerSpec[OPVector, DateListVectori val schema = transformed.schema(output.name) val fieldMetadata = schema.metadata - AttributeTestUtils.assertNominal(schema, Array.fill(testModelTimeSinceFirst.getInputFeatures().size)(false)) + assertNominal(schema, Array.fill(testModelTimeSinceFirst.getInputFeatures().size)(false)) testModelTimeSinceFirst.getMetadata() shouldEqual fieldMetadata } @@ -208,7 +208,7 @@ class DateListVectorizerTest extends OpTransformerSpec[OPVector, DateListVectori val schema = transformed.schema(output.name) val fieldMetadata = schema.metadata - AttributeTestUtils.assertNominal(schema, Array.fill(testModelModeDay.getInputFeatures().size * 7)(true)) + assertNominal(schema, Array.fill(testModelModeDay.getInputFeatures().size * 7)(true)) testModelModeDay.getMetadata() shouldEqual fieldMetadata val daysOfWeek = List("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday").map(s => @@ -239,7 +239,7 @@ class DateListVectorizerTest extends OpTransformerSpec[OPVector, DateListVectori val schema = transformed.schema(output.name) val fieldMetadata = schema.metadata - AttributeTestUtils.assertNominal(schema, Array.fill(testModelModeDay.getInputFeatures().size * 8)(true)) + assertNominal(schema, Array.fill(testModelModeDay.getInputFeatures().size * 8)(true)) testModelModeDay.getMetadata() shouldEqual fieldMetadata val daysOfWeek = List("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday", @@ -269,7 +269,7 @@ class DateListVectorizerTest extends OpTransformerSpec[OPVector, DateListVectori val schema = transformed.schema(output.name) val fieldMetadata = schema.metadata - AttributeTestUtils.assertNominal(schema, Array.fill(testModelModeMonth.getInputFeatures().size * 12)(true)) + assertNominal(schema, Array.fill(testModelModeMonth.getInputFeatures().size * 12)(true)) testModelModeMonth.getMetadata() shouldEqual fieldMetadata val months = List( @@ -301,7 +301,7 @@ class DateListVectorizerTest extends OpTransformerSpec[OPVector, DateListVectori val schema = transformed.schema(output.name) val fieldMetadata = schema.metadata - AttributeTestUtils.assertNominal(schema, Array.fill(testModelModeHour.getInputFeatures().size * 24)(true)) + assertNominal(schema, Array.fill(testModelModeHour.getInputFeatures().size * 24)(true)) testModelModeHour.getMetadata() shouldEqual fieldMetadata val hours = (0 until 24).map(i => IndCol(Some(s"$i:00"))).toList diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/DateMapToUnitCircleVectorizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/DateMapToUnitCircleVectorizerTest.scala index a1ac9081f7..675f6cf181 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/DateMapToUnitCircleVectorizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/DateMapToUnitCircleVectorizerTest.scala @@ -44,7 
+44,7 @@ import org.scalatest.junit.JUnitRunner @RunWith(classOf[JUnitRunner]) class DateMapToUnitCircleVectorizerTest extends OpEstimatorSpec[OPVector, SequenceModel[DateMap, OPVector], - DateMapToUnitCircleVectorizer[DateMap]] { + DateMapToUnitCircleVectorizer[DateMap]] with AttributeAsserts { val eps = 1E-4 val sampleDateTimes = Seq[JDateTime]( @@ -79,7 +79,7 @@ class DateMapToUnitCircleVectorizerTest extends OpEstimatorSpec[OPVector, Sequen .fit(inputData).transform(inputData) val field = transformed.schema(output.name) val actual = transformed.collect(output) - AttributeTestUtils.assertNominal(field, Array.fill(actual.head.value.size)(false)) + assertNominal(field, Array.fill(actual.head.value.size)(false)) all (actual.zip(expectedResult).map(g => Vectors.sqdist(g._1.value, g._2.value))) should be < eps } @@ -92,7 +92,7 @@ class DateMapToUnitCircleVectorizerTest extends OpEstimatorSpec[OPVector, Sequen .fit(inputData).transform(inputData) val field = transformed.schema(output.name) val actual = transformed.collect(output) - AttributeTestUtils.assertNominal(field, Array.fill(actual.head.value.size)(false)) + assertNominal(field, Array.fill(actual.head.value.size)(false)) all (actual.zip(expectedResult).map(g => Vectors.sqdist(g._1.value, g._2.value))) should be < eps } diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/DateMapVectorizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/DateMapVectorizerTest.scala index 87eda4639f..8067f560f8 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/DateMapVectorizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/DateMapVectorizerTest.scala @@ -44,7 +44,7 @@ import org.scalatest.FlatSpec import org.scalatest.junit.JUnitRunner @RunWith(classOf[JUnitRunner]) -class DateMapVectorizerTest extends FlatSpec with TestSparkContext { +class DateMapVectorizerTest extends FlatSpec with TestSparkContext with AttributeAsserts { // Sunday July 12th 1998 at 22:45 private val defaultDate = new JDateTime(1998, 7, 12, 22, 45, DateTimeUtils.DefaultTimeZone).getMillis @@ -72,7 +72,7 @@ class DateMapVectorizerTest extends FlatSpec with TestSparkContext { meta.columns.length shouldBe 3 meta.columns.map(_.grouping) should contain theSameElementsAs Array(Option("a"), Option("b"), Option("c")) val field = transformed.schema(vector.name) - AttributeTestUtils.assertNominal(field, Array.fill(expected(moment).head.value.size)(false)) + assertNominal(field, Array.fill(expected(moment).head.value.size)(false)) val vector2 = f1.vectorize(defaultValue = 0, referenceDate = moment, trackNulls = true, circularDateReps = Seq()) @@ -83,7 +83,7 @@ class DateMapVectorizerTest extends FlatSpec with TestSparkContext { meta2.columns.length shouldBe 6 meta2.history.keys.size shouldBe 1 val field2 = transformed2.schema(vector2.name) - AttributeTestUtils.assertNominal(field2, Array.fill(expected(moment).head.value.size)(Seq(false, true)).flatten) + assertNominal(field2, Array.fill(expected(moment).head.value.size)(Seq(false, true)).flatten) val vector3 = f1.vectorize(defaultValue = 0) val transformed3 = new OpWorkflow().setResultFeatures(vector3).transform(ds) @@ -94,7 +94,7 @@ class DateMapVectorizerTest extends FlatSpec with TestSparkContext { meta2.history.keys.size shouldBe 1 val field3 = transformed3.schema(vector3.name) val expectedNominal = Array.fill(24)(false) ++ Array.fill(3)(Seq(false, true)).flatten.asInstanceOf[Array[Boolean]] - AttributeTestUtils.assertNominal(field3, expectedNominal) + 
assertNominal(field3, expectedNominal) } private def expected(moment: JDateTime) = { diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/DateTimeVectorizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/DateTimeVectorizerTest.scala index 762fd1ed2f..4f878f127e 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/DateTimeVectorizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/DateTimeVectorizerTest.scala @@ -45,7 +45,7 @@ import org.scalatest.junit.JUnitRunner @RunWith(classOf[JUnitRunner]) -class DateTimeVectorizerTest extends FlatSpec with TestSparkContext { +class DateTimeVectorizerTest extends FlatSpec with TestSparkContext with AttributeAsserts { // Sunday July 12th 1998 at 22:45 private val defaultDate = new JDateTime(1998, 7, 12, 22, 45, DateTimeUtils.DefaultTimeZone).getMillis @@ -92,7 +92,7 @@ class DateTimeVectorizerTest extends FlatSpec with TestSparkContext { meta.columns.length shouldBe 3 meta.history.keys.size shouldBe 3 val field = transformed.schema(vector.name) - AttributeTestUtils.assertNominal(field, Array.fill(expected(moment).head.value.size)(false)) + assertNominal(field, Array.fill(expected(moment).head.value.size)(false)) val vector2 = f1.vectorize( dateListPivot = TransmogrifierDefaults.DateListDefault, @@ -108,7 +108,7 @@ class DateTimeVectorizerTest extends FlatSpec with TestSparkContext { meta2.columns.length shouldBe 6 meta2.history.keys.size shouldBe 3 val field2 = transformed2.schema(vector2.name) - AttributeTestUtils.assertNominal(field2, Array.fill(expected(moment).head.value.size)(Seq(false, true)).flatten) + assertNominal(field2, Array.fill(expected(moment).head.value.size)(Seq(false, true)).flatten) val vector3 = f1.vectorize( dateListPivot = TransmogrifierDefaults.DateListDefault, @@ -123,7 +123,7 @@ class DateTimeVectorizerTest extends FlatSpec with TestSparkContext { meta3.history.keys.size shouldBe 6 val field3 = transformed3.schema(vector3.name) val expectedNominal = Array.fill(24)(false) ++ Array.fill(3)(Seq(false, true)).flatten.asInstanceOf[Array[Boolean]] - AttributeTestUtils.assertNominal(field3, expectedNominal) + assertNominal(field3, expectedNominal) } it should "vectorize dates correctly any time" in { diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/DateToUnitCircleTransformerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/DateToUnitCircleTransformerTest.scala index 565decc3be..7c0dede857 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/DateToUnitCircleTransformerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/DateToUnitCircleTransformerTest.scala @@ -43,7 +43,8 @@ import org.junit.runner.RunWith import org.scalatest.junit.JUnitRunner @RunWith(classOf[JUnitRunner]) -class DateToUnitCircleTransformerTest extends OpTransformerSpec[OPVector, DateToUnitCircleTransformer[Date]] { +class DateToUnitCircleTransformerTest extends OpTransformerSpec[OPVector, DateToUnitCircleTransformer[Date]] + with AttributeAsserts { val eps = 1E-4 val sampleDateTimes = Seq[JDateTime]( @@ -68,7 +69,7 @@ class DateToUnitCircleTransformerTest extends OpTransformerSpec[OPVector, DateTo val vector = vectorizer.getOutput() val actual = transformed.collect(vector) val field = transformed.schema(vector.name) - AttributeTestUtils.assertNominal(field, Array.fill(actual.head.value.size)(false)) + assertNominal(field, Array.fill(actual.head.value.size)(false)) actual } @@ -83,9 +84,9 @@ class 
DateToUnitCircleTransformerTest extends OpTransformerSpec[OPVector, DateTo val output = dateFeature.toUnitCircle(TimePeriod.HourOfDay) val transformed = output.originStage.asInstanceOf[Transformer].transform(ds) val actual = transformed.collect(output) - all (actual.zip(expectedResult).map(g => Vectors.sqdist(g._1.value, g._2.value))) should be < eps + all(actual.zip(expectedResult).map(g => Vectors.sqdist(g._1.value, g._2.value))) should be < eps val field = transformed.schema(output.name) - AttributeTestUtils.assertNominal(field, Array.fill(actual.head.value.size)(false)) + assertNominal(field, Array.fill(actual.head.value.size)(false)) } it should "work with its DateTime shortcut" in { @@ -94,9 +95,9 @@ class DateToUnitCircleTransformerTest extends OpTransformerSpec[OPVector, DateTo val output = dateTimeFeature.toUnitCircle(TimePeriod.HourOfDay) val transformed = output.originStage.asInstanceOf[Transformer].transform(ds) val actual = transformed.collect(output) - all (actual.zip(expectedResult).map(g => Vectors.sqdist(g._1.value, g._2.value))) should be < eps + all(actual.zip(expectedResult).map(g => Vectors.sqdist(g._1.value, g._2.value))) should be < eps val field = transformed.schema(output.name) - AttributeTestUtils.assertNominal(field, Array.fill(actual.head.value.size)(false)) + assertNominal(field, Array.fill(actual.head.value.size)(false)) } it should "store the proper meta data" in { @@ -105,7 +106,7 @@ class DateToUnitCircleTransformerTest extends OpTransformerSpec[OPVector, DateTo val vectorizer = new DateToUnitCircleTransformer().setTimePeriod(HourOfDay).setInput(feature) val transformed = vectorizer.transform(ds) val meta = OpVectorMetadata(transformed.schema(vectorizer.getOutput().name)) - meta.columns.length should equal (2) + meta.columns.length should equal(2) meta.columns(0).descriptorValue shouldBe Some("x_HourOfDay") meta.columns(1).descriptorValue shouldBe Some("y_HourOfDay") } @@ -124,14 +125,14 @@ class DateToUnitCircleTransformerTest extends OpTransformerSpec[OPVector, DateTo Array(0.0, 0.0), Array(1.0, 0.0) ).map(Vectors.dense(_).toOPVector) - all (actual.zip(expected).map(g => Vectors.sqdist(g._1.value, g._2.value))) should be < eps + all(actual.zip(expected).map(g => Vectors.sqdist(g._1.value, g._2.value))) should be < eps val field = transformed.schema(vector.name) - AttributeTestUtils.assertNominal(field, Array.fill(actual.head.value.size)(false)) + assertNominal(field, Array.fill(actual.head.value.size)(false)) } it should "transform the data correctly when the timePeriod is HourOfDay" in { val actual = transformData(sampleDateTimes, HourOfDay) - all (actual.zip(expectedResult).map(g => Vectors.sqdist(g._1.value, g._2.value))) should be < eps + all(actual.zip(expectedResult).map(g => Vectors.sqdist(g._1.value, g._2.value))) should be < eps } it should "transform the data correctly when the timePeriod is DayOfYear" in { @@ -145,7 +146,7 @@ class DateToUnitCircleTransformerTest extends OpTransformerSpec[OPVector, DateTo val actual = transformData(dateTimes, DayOfYear) val sampleDaysOfYearMinusOne = Array(0, 1, 2, 3, 31) val expected = indexSeqToUnitCircle(sampleDaysOfYearMinusOne, 366) - all (actual.zip(expected).map(g => Vectors.sqdist(g._1.value, g._2.value))) should be < eps + all(actual.zip(expected).map(g => Vectors.sqdist(g._1.value, g._2.value))) should be < eps } it should "transform the data correctly when the timePeriod is DayOfWeek" in { @@ -160,7 +161,7 @@ class DateToUnitCircleTransformerTest extends OpTransformerSpec[OPVector, DateTo ) val 
actual = transformData(dateTimes, DayOfWeek) val expectedDaysOfWeekMinusOne = indexSeqToUnitCircle(Seq(0, 1, 2, 3, 4, 5, 6), 7) - all (actual.zip(expectedDaysOfWeekMinusOne).map(g => Vectors.sqdist(g._1.value, g._2.value))) should be < eps + all(actual.zip(expectedDaysOfWeekMinusOne).map(g => Vectors.sqdist(g._1.value, g._2.value))) should be < eps } it should "transform the data correctly when the timePeriod is WeekOfYear" in { @@ -174,14 +175,14 @@ class DateToUnitCircleTransformerTest extends OpTransformerSpec[OPVector, DateTo val actual = transformData(dateTimes, WeekOfYear) val sampleWeeksOfYearMinusOne = Seq(51, 0, 1, 2, 3) val expected = indexSeqToUnitCircle(sampleWeeksOfYearMinusOne, 53) - all (actual.zip(expected).map(g => Vectors.sqdist(g._1.value, g._2.value))) should be < eps + all(actual.zip(expected).map(g => Vectors.sqdist(g._1.value, g._2.value))) should be < eps } it should "transform the data correctly when the timePeriod is DayOfMonth" in { val actual = transformData(sampleDateTimes, DayOfMonth) val sampleDaysOfMonthMinusOne = Seq(10, 27, 16, 16, 12) val expected = indexSeqToUnitCircle(sampleDaysOfMonthMinusOne, 31) - all (actual.zip(expected).map(g => Vectors.sqdist(g._1.value, g._2.value))) should be < eps + all(actual.zip(expected).map(g => Vectors.sqdist(g._1.value, g._2.value))) should be < eps } it should "transform the data correctly when the timePeriod is MonthOfYear" in { @@ -195,7 +196,7 @@ class DateToUnitCircleTransformerTest extends OpTransformerSpec[OPVector, DateTo val actual = transformData(dateTimes, MonthOfYear) val sampleMonthsOfYearMinusOne = Seq(0, 1, 2, 3, 11) val expected = indexSeqToUnitCircle(sampleMonthsOfYearMinusOne, 12) - all (actual.zip(expected).map(g => Vectors.sqdist(g._1.value, g._2.value))) should be < eps + all(actual.zip(expected).map(g => Vectors.sqdist(g._1.value, g._2.value))) should be < eps } it should "transform the data correctly when the timePeriod is WeekOfMonth" in { @@ -208,6 +209,6 @@ class DateToUnitCircleTransformerTest extends OpTransformerSpec[OPVector, DateTo ) val actual = transformData(dateTimes, WeekOfMonth) val expected = indexSeqToUnitCircle(Seq(0, 1, 2, 3, 4), 6) - all (actual.zip(expected).map(g => Vectors.sqdist(g._1.value, g._2.value))) should be < eps + all(actual.zip(expected).map(g => Vectors.sqdist(g._1.value, g._2.value))) should be < eps } } diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/DateVectorizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/DateVectorizerTest.scala index 730394a62c..a92069621b 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/DateVectorizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/DateVectorizerTest.scala @@ -44,7 +44,7 @@ import org.scalatest.junit.JUnitRunner @RunWith(classOf[JUnitRunner]) -class DateVectorizerTest extends FlatSpec with TestSparkContext { +class DateVectorizerTest extends FlatSpec with TestSparkContext with AttributeAsserts { require(DateTimeUtils.DefaultTimeZone == DateTimeZone.UTC) // Sunday July 12th 1998 at 22:45 @@ -70,7 +70,7 @@ class DateVectorizerTest extends FlatSpec with TestSparkContext { meta.columns.length shouldBe 3 meta.history.keys.size shouldBe 3 val field = transformed.schema(vector.name) - AttributeTestUtils.assertNominal(field, Array.fill(expectedAt(moment).head.value.size)(false)) + assertNominal(field, Array.fill(expectedAt(moment).head.value.size)(false)) val vector2 = f1.vectorize( dateListPivot = TransmogrifierDefaults.DateListDefault, 
@@ -86,7 +86,7 @@ class DateVectorizerTest extends FlatSpec with TestSparkContext { meta2.columns.length shouldBe 6 meta2.history.keys.size shouldBe 3 val field2 = transformed2.schema(vector2.name) - AttributeTestUtils.assertNominal(field2, Array.fill(expectedAt(moment).head.value.size)(Seq(false, true)).flatten) + assertNominal(field2, Array.fill(expectedAt(moment).head.value.size)(Seq(false, true)).flatten) val vector3 = f1.vectorize( dateListPivot = TransmogrifierDefaults.DateListDefault, @@ -100,7 +100,7 @@ class DateVectorizerTest extends FlatSpec with TestSparkContext { meta3.history.keys.size shouldBe 6 val field3 = transformed3.schema(vector3.name) val expectedNominal = Array.fill(24)(false) ++ Array.fill(3)(Seq(false, true)).flatten.asInstanceOf[Array[Boolean]] - AttributeTestUtils.assertNominal(field3, expectedNominal) + assertNominal(field3, expectedNominal) } private def buildTestData(moment: DateTime) = { diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/DecisionTreeNumericBucketizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/DecisionTreeNumericBucketizerTest.scala index a047277aaa..7a58d35825 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/DecisionTreeNumericBucketizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/DecisionTreeNumericBucketizerTest.scala @@ -48,7 +48,7 @@ import org.scalatest.junit.JUnitRunner @RunWith(classOf[JUnitRunner]) class DecisionTreeNumericBucketizerTest extends OpEstimatorSpec[OPVector, BinaryModel[RealNN, Real, OPVector], DecisionTreeNumericBucketizer[Double, Real]] - with DecisionTreeNumericBucketizerAsserts + with DecisionTreeNumericBucketizerAsserts with AttributeAsserts { val (inputData, estimator) = { val numericData = Seq(1.0.toReal, 18.0.toReal, Real.empty, (-1.23).toReal, 0.0.toReal) @@ -206,7 +206,7 @@ class DecisionTreeNumericBucketizerTest extends OpEstimatorSpec[OPVector, val transformed = model.transform(data) val res = transformed.collect(out) val field = transformed.schema(out.name) - AttributeTestUtils.assertNominal(field, Array.fill(res.head.value.size)(true)) + assertNominal(field, Array.fill(res.head.value.size)(true)) assertMetadata( shouldSplit = Array(shouldSplit), splits = Array(splits), diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/DecisionTreeNumericMapBucketizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/DecisionTreeNumericMapBucketizerTest.scala index 744546117c..aecb39fe6e 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/DecisionTreeNumericMapBucketizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/DecisionTreeNumericMapBucketizerTest.scala @@ -47,7 +47,7 @@ import org.scalatest.junit.JUnitRunner @RunWith(classOf[JUnitRunner]) class DecisionTreeNumericMapBucketizerTest extends OpEstimatorSpec[OPVector, BinaryModel[RealNN, RealMap, OPVector], DecisionTreeNumericMapBucketizer[Double, RealMap]] - with DecisionTreeNumericBucketizerAsserts + with DecisionTreeNumericBucketizerAsserts with AttributeAsserts { import OPMapVectorizerTestHelper._ @@ -229,7 +229,7 @@ class DecisionTreeNumericMapBucketizerTest extends OpEstimatorSpec[OPVector, val scored = model.setInputDataset(data).score(keepIntermediateFeatures = true) val res = scored.collect(out) val field = scored.schema(out.name) - AttributeTestUtils.assertNominal(field, Array.fill(res.head.value.size)(true)) + assertNominal(field, Array.fill(res.head.value.size)(true)) 
assertMetadata( shouldSplit = stage.shouldSplitByKey.toArray.sortBy(_._1).map(_._2), splits = stage.splitsByKey.toArray.sortBy(_._1).map(_._2), diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/DropIndicesByTransformerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/DropIndicesByTransformerTest.scala index d56c50cbdf..b2f9f9570d 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/DropIndicesByTransformerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/DropIndicesByTransformerTest.scala @@ -43,9 +43,8 @@ import org.scalatest.junit.JUnitRunner import org.apache.spark.sql.functions._ - @RunWith(classOf[JUnitRunner]) -class DropIndicesByTransformerTest extends OpTransformerSpec[OPVector, DropIndicesByTransformer] { +class DropIndicesByTransformerTest extends OpTransformerSpec[OPVector, DropIndicesByTransformer] with AttributeAsserts { val (inputData, transformer) = { val vecData = Seq( @@ -75,7 +74,7 @@ class DropIndicesByTransformerTest extends OpTransformerSpec[OPVector, DropIndic val materializedFeatures = new OpWorkflow().setResultFeatures(vectorizedPicklist, prunedVector).transform(df) val field = materializedFeatures.schema(prunedVector.name) val collectedFeatures = materializedFeatures.collect(prunedVector) - AttributeTestUtils.assertNominal(field, Array.fill(collectedFeatures.head.value.size)(true)) + assertNominal(field, Array.fill(collectedFeatures.head.value.size)(true)) collectedFeatures.foreach(_.value.size shouldBe 4) materializedFeatures.collect().foreach { r => @@ -94,10 +93,10 @@ class DropIndicesByTransformerTest extends OpTransformerSpec[OPVector, DropIndic val materializedFeatures = new OpWorkflow().setResultFeatures(vectorizedPicklist, prunedVector).transform(df) val field = materializedFeatures.schema(prunedVector.name) val collectedFeatures = materializedFeatures.collect(prunedVector) - AttributeTestUtils.assertNominal(field, Array.fill(collectedFeatures.head.value.size)(true)) + assertNominal(field, Array.fill(collectedFeatures.head.value.size)(true)) collectedFeatures.foreach(_.value.size shouldBe 4) - materializedFeatures.collect().foreach( _.getAs[Vector](2).toArray.max shouldBe 1) + materializedFeatures.collect().foreach(_.getAs[Vector](2).toArray.max shouldBe 1) val rawMeta = OpVectorMetadata(vectorizedPicklist.name, vectorizedPicklist.originStage.getMetadata()) val trimmedMeta = OpVectorMetadata(materializedFeatures.schema(prunedVector.name)) rawMeta.columns.length - 1 shouldBe trimmedMeta.columns.length diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/EmailVectorizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/EmailVectorizerTest.scala index c504c65a4a..a3f84f6a1a 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/EmailVectorizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/EmailVectorizerTest.scala @@ -45,7 +45,8 @@ import org.scalatest.junit.JUnitRunner @RunWith(classOf[JUnitRunner]) class EmailVectorizerTest - extends FlatSpec with FeatureTestBase with RichMapFeature with RichFeature with RichTextFeature { + extends FlatSpec with FeatureTestBase with RichMapFeature with RichFeature with RichTextFeature + with AttributeAsserts { val emailKey = "Email1" val emailKey2 = "Email2" val emails = (RandomText.emails("salesforce.com").take(2) ++ RandomText.emails("einstein.ai").take(2)).toSeq @@ -84,7 +85,7 @@ class EmailVectorizerTest val transformed = new 
OpWorkflow().setResultFeatures(feature).transform(ds) val field = transformed.schema(feature.name) val collected = transformed.collect(feature) - AttributeTestUtils.assertNominal(field, Array.fill(collected.head.value.size)(true)) + assertNominal(field, Array.fill(collected.head.value.size)(true)) collected } diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/GeolocationMapVectorizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/GeolocationMapVectorizerTest.scala index 2072936cea..92531ee2d2 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/GeolocationMapVectorizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/GeolocationMapVectorizerTest.scala @@ -44,7 +44,8 @@ import org.scalatest.junit.JUnitRunner @RunWith(classOf[JUnitRunner]) class GeolocationMapVectorizerTest - extends OpEstimatorSpec[OPVector, SequenceModel[GeolocationMap, OPVector], GeolocationMapVectorizer] { + extends OpEstimatorSpec[OPVector, SequenceModel[GeolocationMap, OPVector], GeolocationMapVectorizer] + with AttributeAsserts { val (inputData, m1, m2) = TestFeatureBuilder("m1", "m2", Seq( @@ -94,7 +95,7 @@ class GeolocationMapVectorizerTest val transformed = vectorizer.transform(inputData) val vector = vectorizer.getOutput() val field = transformed.schema(vector.name) - AttributeTestUtils.assertNominal(field, Array.fill(expectedResult.head.value.size)(false)) + assertNominal(field, Array.fill(expectedResult.head.value.size)(false)) transformed.collect(vector) shouldBe expectedResult transformed.schema.toOpVectorMetadata(vectorizer.getOutputFeatureName) shouldEqual expectedMeta val vectorMetadata = vectorizer.getMetadata() @@ -114,7 +115,7 @@ class GeolocationMapVectorizerTest ).map(_.toOPVector) val field = transformed.schema(vector.name) - AttributeTestUtils.assertNominal(field, + assertNominal(field, Array.fill(expected.head.value.size / 4)(Seq(false, false, false, true)).flatten) transformed.collect(vector) shouldBe expected transformed.schema.toOpVectorMetadata(vectorizer.getOutputFeatureName) shouldEqual expectedMetaTrackNulls @@ -133,7 +134,7 @@ class GeolocationMapVectorizerTest Array.fill(18)(6.0) ).map(v => Vectors.dense(v).toOPVector) val field = transformed.schema(vector.name) - AttributeTestUtils.assertNominal(field, Array.fill(expected.head.value.size)(false)) + assertNominal(field, Array.fill(expected.head.value.size)(false)) transformed.collect(vector) shouldBe expected transformed.schema.toOpVectorMetadata(vectorizer.getOutputFeatureName) shouldEqual expectedMeta @@ -154,7 +155,7 @@ class GeolocationMapVectorizerTest (0 until 6).flatMap(k => Seq.fill(3)(6.0) :+ 1.0).toArray ).map(v => Vectors.dense(v).toOPVector) val field = transformed.schema(vector.name) - AttributeTestUtils.assertNominal(field, + assertNominal(field, Array.fill(expected.head.value.size / 4)(Seq(false, false, false, true)).flatten) transformed.collect(vector) shouldBe expected @@ -174,7 +175,7 @@ class GeolocationMapVectorizerTest Vectors.sparse(9, Array(), Array()) ).map(_.toOPVector) val field = transformed.schema(vector.name) - AttributeTestUtils.assertNominal(field, Array.fill(expected.head.value.size)(false)) + assertNominal(field, Array.fill(expected.head.value.size)(false)) val expectedMeta = TestOpVectorMetadataBuilder( vectorizer, @@ -201,7 +202,7 @@ class GeolocationMapVectorizerTest Vectors.sparse(12, Array(3, 7, 11), Array(1.0, 1.0, 1.0)) ).map(_.toOPVector) val field = transformed.schema(vector.name) - 
AttributeTestUtils.assertNominal(field, + assertNominal(field, Array.fill(expected.head.value.size / 4)(Seq(false, false, false, true)).flatten) val expectedMeta = TestOpVectorMetadataBuilder( @@ -232,7 +233,7 @@ class GeolocationMapVectorizerTest Vectors.sparse(12, Array(), Array()) ).map(_.toOPVector) val field = transformed.schema(vector.name) - AttributeTestUtils.assertNominal(field, Array.fill(expected.head.value.size)(false)) + assertNominal(field, Array.fill(expected.head.value.size)(false)) val expectedMeta = TestOpVectorMetadataBuilder( vectorizer, @@ -259,7 +260,7 @@ class GeolocationMapVectorizerTest Vectors.sparse(16, Array(3, 7, 11, 15), Array(1.0, 1.0, 1.0, 1.0)) ).map(_.toOPVector) val field = transformed.schema(vector.name) - AttributeTestUtils.assertNominal(field, + assertNominal(field, Array.fill(expected.head.value.size / 4)(Seq(false, false, false, true)).flatten) val expectedMeta = TestOpVectorMetadataBuilder( @@ -287,7 +288,7 @@ class GeolocationMapVectorizerTest val expectedOutput = transformed.collect() val field = transformed.schema(vector.name) - AttributeTestUtils.assertNominal(field, Array.fill(transformed.collect(vector).head.value.size / 4) + assertNominal(field, Array.fill(transformed.collect(vector).head.value.size / 4) (Seq(false, false, false, true)).flatten) // Now using the shortcut val res = m1.vectorize(cleanKeys = TransmogrifierDefaults.CleanKeys, others = Array(m2)) diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/GeolocationVectorizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/GeolocationVectorizerTest.scala index ee0bc32fb2..03157b32f5 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/GeolocationVectorizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/GeolocationVectorizerTest.scala @@ -43,7 +43,7 @@ import com.salesforce.op.utils.spark.RichDataset._ @RunWith(classOf[JUnitRunner]) -class GeolocationVectorizerTest extends FlatSpec with TestSparkContext { +class GeolocationVectorizerTest extends FlatSpec with TestSparkContext with AttributeAsserts { val (testData, inA, inB, inC, inD) = TestFeatureBuilder("inA", "inB", "inC", "inD", Seq[(Geolocation, Geolocation, Geolocation, Geolocation)]( @@ -100,7 +100,7 @@ class GeolocationVectorizerTest extends FlatSpec with TestSparkContext { ) val output = testModelConstant.getOutputFeatureName val field = testDataTransformedConstant.schema(output) - AttributeTestUtils.assertNominal( + assertNominal( field, Array.fill(expectedConstant.head._5.size)(false)) transformedValuesConstant.map(_.get(0)) shouldEqual expectedConstant.map(_._1) @@ -136,7 +136,7 @@ class GeolocationVectorizerTest extends FlatSpec with TestSparkContext { ) val output = testModelMean.getOutputFeatureName val field = testDataTransformedMean.schema(output) - AttributeTestUtils.assertNominal( + assertNominal( field, Array.fill(expectedMean.head._5.size)(false)) transformedValuesMean.map(_.get(0)) shouldEqual expectedMean.map(_._1) transformedValuesMean.map(_.get(1)) shouldEqual expectedMean.map(_._2) @@ -179,7 +179,7 @@ class GeolocationVectorizerTest extends FlatSpec with TestSparkContext { ) val output = testModelMean.getOutputFeatureName val field = testDataTransformedMean.schema(output) - AttributeTestUtils.assertNominal( + assertNominal( field, Array.fill(expectedMean.head._5.size / 4)(Seq(false, false, false, true)).flatten) transformedValuesMean.map(_.get(0)) shouldEqual expectedMean.map(_._1) transformedValuesMean.map(_.get(1)) shouldEqual 
expectedMean.map(_._2) diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/IntegralMapVectorizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/IntegralMapVectorizerTest.scala index 613afa07ca..0dcbc0ac9e 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/IntegralMapVectorizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/IntegralMapVectorizerTest.scala @@ -44,7 +44,8 @@ import org.scalatest.junit.JUnitRunner @RunWith(classOf[JUnitRunner]) class IntegralMapVectorizerTest - extends OpEstimatorSpec[OPVector, SequenceModel[IntegralMap, OPVector], IntegralMapVectorizer[IntegralMap]] { + extends OpEstimatorSpec[OPVector, SequenceModel[IntegralMap, OPVector], IntegralMapVectorizer[IntegralMap]] + with AttributeAsserts { val (inputData, m1, m2) = TestFeatureBuilder("m1", "m2", Seq( @@ -84,7 +85,7 @@ class IntegralMapVectorizerTest val vector = estimator.getOutput() val transformed = model.transform(inputData) val field = transformed.schema(vector.name) - AttributeTestUtils.assertNominal(field, Array.fill(expectedResult.head.value.size)(false)) + assertNominal(field, Array.fill(expectedResult.head.value.size)(false)) transformed.collect(vector) shouldBe expectedResult transformed.schema.toOpVectorMetadata(estimator.getOutputFeatureName) shouldEqual expectedMeta @@ -103,7 +104,7 @@ class IntegralMapVectorizerTest Vectors.sparse(12, Array(1, 3, 5, 7, 9, 11), Array(1.0, 1.0, 1.0, 1.0, 1.0, 1.0)) ).map(_.toOPVector) val field = transformed.schema(vector.name) - AttributeTestUtils.assertNominal(field, Array.fill(6)(Seq(false, true)).flatten) + assertNominal(field, Array.fill(6)(Seq(false, true)).flatten) transformed.collect(vector) shouldBe expected transformed.schema.toOpVectorMetadata(vectorizer.getOutputFeatureName) shouldEqual expectedMetaTrackNulls @@ -122,7 +123,7 @@ class IntegralMapVectorizerTest Vectors.dense(Array(100.0, 100.0, 100.0, 100.0, 100.0, 100.0)) ).map(_.toOPVector) val field = transformed.schema(vector.name) - AttributeTestUtils.assertNominal(field, Array.fill(expected.head.value.size)(false)) + assertNominal(field, Array.fill(expected.head.value.size)(false)) transformed.collect(vector) shouldBe expected transformed.schema.toOpVectorMetadata(vectorizer.getOutputFeatureName) shouldEqual expectedMeta @@ -141,7 +142,7 @@ class IntegralMapVectorizerTest Vectors.dense(Array(100.0, 1.0, 100.0, 1.0, 100.0, 1.0, 100.0, 1.0, 100.0, 1.0, 100.0, 1.0)) ).map(_.toOPVector) val field = transformed.schema(vector.name) - AttributeTestUtils.assertNominal(field, Array.fill(6)(Seq(false, true)).flatten) + assertNominal(field, Array.fill(6)(Seq(false, true)).flatten) transformed.collect(vector) shouldBe expected transformed.schema.toOpVectorMetadata(vectorizer.getOutputFeatureName) shouldEqual expectedMetaTrackNulls @@ -162,7 +163,7 @@ class IntegralMapVectorizerTest Vectors.sparse(3, Array(), Array()) ).map(_.toOPVector) val field = transformed.schema(vector.name) - AttributeTestUtils.assertNominal(field, Array.fill(expected.head.value.size)(false)) + assertNominal(field, Array.fill(expected.head.value.size)(false)) val expectedMeta = TestOpVectorMetadataBuilder( vectorizer, m1 -> List(IndColWithGroup(None, "A"), IndColWithGroup(None, "B")), @@ -188,7 +189,7 @@ class IntegralMapVectorizerTest Vectors.sparse(6, Array(1, 3, 5), Array(1.0, 1.0, 1.0)) ).map(_.toOPVector) val field = transformed.schema(vector.name) - AttributeTestUtils.assertNominal(field, Array.fill(3)(Seq(false, true)).flatten) + 
assertNominal(field, Array.fill(3)(Seq(false, true)).flatten) val expectedMeta = TestOpVectorMetadataBuilder( vectorizer, m1 -> List(IndColWithGroup(None, "A"), IndColWithGroup(nullIndicatorValue, "A"), @@ -214,7 +215,7 @@ class IntegralMapVectorizerTest Vectors.sparse(4, Array(), Array()) ).map(_.toOPVector) val field = transformed.schema(vector.name) - AttributeTestUtils.assertNominal(field, Array.fill(expected.head.value.size)(false)) + assertNominal(field, Array.fill(expected.head.value.size)(false)) val expectedMeta = TestOpVectorMetadataBuilder( vectorizer, m1 -> List(IndColWithGroup(None, "B"), IndColWithGroup(None, "C")), @@ -239,7 +240,7 @@ class IntegralMapVectorizerTest Vectors.sparse(8, Array(1, 3, 5, 7), Array(1.0, 1.0, 1.0, 1.0)) ).map(_.toOPVector) val field = transformed.schema(vector.name) - AttributeTestUtils.assertNominal(field, Array.fill(4)(Seq(false, true)).flatten) + assertNominal(field, Array.fill(4)(Seq(false, true)).flatten) val expectedMeta = TestOpVectorMetadataBuilder( vectorizer, m1 -> List(IndColWithGroup(None, "B"), IndColWithGroup(nullIndicatorValue, "B"), diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/IntegralVectorizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/IntegralVectorizerTest.scala index d7946fb782..49cc689cc1 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/IntegralVectorizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/IntegralVectorizerTest.scala @@ -42,7 +42,7 @@ import org.scalatest.{Assertions, FlatSpec, Matchers} @RunWith(classOf[JUnitRunner]) -class IntegralVectorizerTest extends FlatSpec with TestSparkContext { +class IntegralVectorizerTest extends FlatSpec with TestSparkContext with AttributeAsserts { val (testData, inA, inB, inC, inD) = TestFeatureBuilder("inA", "inB", "inC", "inD", Seq( @@ -114,7 +114,7 @@ class IntegralVectorizerTest extends FlatSpec with TestSparkContext { (null, null, null, null, Vectors.dense(3.0, 3.0, 3.0, 3.0)) ) val field = testDataTransformedConstant.schema(testModelConstant.getOutputFeatureName) - AttributeTestUtils.assertNominal(field, Array.fill(expectedZero.head._5.size)(false)) + assertNominal(field, Array.fill(expectedZero.head._5.size)(false)) transformedValuesConstant.map(_.get(0)) shouldEqual expectedZero.map(_._1) transformedValuesConstant.map(_.get(1)) shouldEqual expectedZero.map(_._2) transformedValuesConstant.map(_.get(2)) shouldEqual expectedZero.map(_._3) @@ -144,7 +144,7 @@ class IntegralVectorizerTest extends FlatSpec with TestSparkContext { (null, null, null, null, Vectors.dense(4.0, 2.0, 1.0, 0.0)) ) val field = testDataTransformedMode.schema(testModelMode.getOutputFeatureName) - AttributeTestUtils.assertNominal(field, Array.fill(expectedMode.head._5.size)(false)) + assertNominal(field, Array.fill(expectedMode.head._5.size)(false)) transformedValuesMode.map(_.get(0)) shouldEqual expectedMode.map(_._1) transformedValuesMode.map(_.get(1)) shouldEqual expectedMode.map(_._2) transformedValuesMode.map(_.get(2)) shouldEqual expectedMode.map(_._3) @@ -171,7 +171,7 @@ class IntegralVectorizerTest extends FlatSpec with TestSparkContext { (null, null, null, null, Vectors.dense(0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0)) ) val field = testDataTransformedConstantTracked.schema(testModelConstantTracked.getOutputFeatureName) - AttributeTestUtils.assertNominal(field, Array.fill(expectedZeroTracked.head._5.size / 2)(Seq(false, true)).flatten) + assertNominal(field, Array.fill(expectedZeroTracked.head._5.size / 
2)(Seq(false, true)).flatten) transformedValuesZeroTracked.map(_.get(0)) shouldEqual expectedZeroTracked.map(_._1) transformedValuesZeroTracked.map(_.get(1)) shouldEqual expectedZeroTracked.map(_._2) @@ -211,7 +211,7 @@ class IntegralVectorizerTest extends FlatSpec with TestSparkContext { (null, null, null, null, Vectors.dense(4.0, 1.0, 2.0, 1.0, 1.0, 1.0, 0.0, 1.0)) ) val field = testDataTransformedModeTracked.schema(testModelModeTracked.getOutputFeatureName) - AttributeTestUtils.assertNominal(field, Array.fill(expectedModeTracked.head._5.size / 2)(Seq(false, true)).flatten) + assertNominal(field, Array.fill(expectedModeTracked.head._5.size / 2)(Seq(false, true)).flatten) transformedValuesModeTracked.map(_.get(0)) shouldEqual expectedModeTracked.map(_._1) transformedValuesModeTracked.map(_.get(1)) shouldEqual expectedModeTracked.map(_._2) @@ -255,7 +255,7 @@ class IntegralVectorizerTest extends FlatSpec with TestSparkContext { (null, null, null, null, Vectors.dense(4.0, 2.0, 1.0, 0.0)) ) val field = testDataTransformedMode.schema(testModelMode.getOutputFeatureName) - AttributeTestUtils.assertNominal(field, Array.fill(expectedMode.head._5.size)(false)) + assertNominal(field, Array.fill(expectedMode.head._5.size)(false)) transformedValuesMode.map(_.get(0)) shouldEqual expectedMode.map(_._1) transformedValuesMode.map(_.get(1)) shouldEqual expectedMode.map(_._2) @@ -287,7 +287,7 @@ class IntegralVectorizerTest extends FlatSpec with TestSparkContext { (null, null, null, null, Vectors.dense(4.0, 2.0, 1.0, 0.0)) ) val field = testDataTransformedMode.schema(testModelMode.getOutputFeatureName) - AttributeTestUtils.assertNominal(field, Array.fill(expectedMode.head._5.size)(false)) + assertNominal(field, Array.fill(expectedMode.head._5.size)(false)) transformedValuesMode.map(_.get(0)) shouldEqual expectedMode.map(_._1) transformedValuesMode.map(_.get(1)) shouldEqual expectedMode.map(_._2) @@ -319,7 +319,7 @@ class IntegralVectorizerTest extends FlatSpec with TestSparkContext { (null, null, null, null, Vectors.dense(4.0, 2.0, 1.0, 0.0)) ) val field = testDataTransformedMode.schema(testModelMode.getOutputFeatureName) - AttributeTestUtils.assertNominal(field, Array.fill(expectedMode.head._5.size)(false)) + assertNominal(field, Array.fill(expectedMode.head._5.size)(false)) transformedValuesMode.map(_.get(0)) shouldEqual expectedMode.map(_._1) transformedValuesMode.map(_.get(1)) shouldEqual expectedMode.map(_._2) diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/MultiPickListMapVectorizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/MultiPickListMapVectorizerTest.scala index 8d17100eaf..a26996e006 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/MultiPickListMapVectorizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/MultiPickListMapVectorizerTest.scala @@ -46,7 +46,7 @@ import org.slf4j.LoggerFactory @RunWith(classOf[JUnitRunner]) -class MultiPickListMapVectorizerTest extends FlatSpec with TestSparkContext { +class MultiPickListMapVectorizerTest extends FlatSpec with TestSparkContext with AttributeAsserts { val log = LoggerFactory.getLogger(this.getClass) @@ -97,7 +97,7 @@ class MultiPickListMapVectorizerTest extends FlatSpec with TestSparkContext { val transformed = fitted.transform(dataSet) val vector = vectorizer.getOutput() val field = transformed.schema(vector.name) - AttributeTestUtils.assertNominal(field, Array.fill(transformed.collect(vector).head.value.size)(true)) + assertNominal(field, 
Array.fill(transformed.collect(vector).head.value.size)(true)) val vectorMetadata = fitted.getMetadata() OpVectorMetadata(vectorizer.getOutputFeatureName, vectorMetadata) shouldEqual TestOpVectorMetadataBuilder(vectorizer, @@ -122,7 +122,7 @@ class MultiPickListMapVectorizerTest extends FlatSpec with TestSparkContext { val transformed = fitted.transform(dataSet) val vector = vectorizer.getOutput() val field = transformed.schema(vector.name) - AttributeTestUtils.assertNominal(field, Array.fill(transformed.collect(vector).head.value.size)(true)) + assertNominal(field, Array.fill(transformed.collect(vector).head.value.size)(true)) val vectorMetadata = fitted.getMetadata() OpVectorMetadata(vectorizer.getOutputFeatureName, vectorMetadata) shouldEqual TestOpVectorMetadataBuilder(vectorizer, @@ -160,7 +160,7 @@ class MultiPickListMapVectorizerTest extends FlatSpec with TestSparkContext { Vectors.sparse(14, Array(0, 2, 11), Array(1.0, 1.0, 1.0)) ).map(_.toOPVector) val field = transformed.schema(vector.name) - AttributeTestUtils.assertNominal(field, Array.fill(expected.head.value.size)(true)) + assertNominal(field, Array.fill(expected.head.value.size)(true)) transformed.collect(vector) shouldBe expected fitted.getMetadata() shouldBe transformed.schema.fields(2).metadata } @@ -178,7 +178,7 @@ class MultiPickListMapVectorizerTest extends FlatSpec with TestSparkContext { Vectors.sparse(20, Array(0, 3, 9, 12, 15, 16), Array(1.0, 1.0, 1.0, 1.0, 1.0, 1.0)) ).map(_.toOPVector) val field = transformed.schema(vector.name) - AttributeTestUtils.assertNominal(field, Array.fill(expected.head.value.size)(true)) + assertNominal(field, Array.fill(expected.head.value.size)(true)) transformed.collect(vector) shouldBe expected fitted.getMetadata() shouldBe transformed.schema.fields(2).metadata } @@ -196,7 +196,7 @@ class MultiPickListMapVectorizerTest extends FlatSpec with TestSparkContext { Vectors.sparse(17, Array(1, 3, 14), Array(1.0, 1.0, 1.0)) ).map(_.toOPVector) val field = transformed.schema(vector.name) - AttributeTestUtils.assertNominal(field, Array.fill(expected.head.value.size)(true)) + assertNominal(field, Array.fill(expected.head.value.size)(true)) transformed.collect(vector) shouldBe expected OpVectorMetadata(vectorizer.getOutputFeatureName, vectorMetadata) shouldEqual TestOpVectorMetadataBuilder(vectorizer, @@ -227,7 +227,7 @@ class MultiPickListMapVectorizerTest extends FlatSpec with TestSparkContext { Vectors.sparse(23, Array(1, 4, 10, 14, 18, 19), Array(1.0, 1.0, 1.0, 1.0, 1.0, 1.0)) ).map(_.toOPVector) val field = transformed.schema(vector.name) - AttributeTestUtils.assertNominal(field, Array.fill(expected.head.value.size)(true)) + assertNominal(field, Array.fill(expected.head.value.size)(true)) transformed.collect(vector) shouldBe expected OpVectorMetadata(vectorizer.getOutputFeatureName, vectorMetadata) shouldEqual TestOpVectorMetadataBuilder(vectorizer, @@ -260,7 +260,7 @@ class MultiPickListMapVectorizerTest extends FlatSpec with TestSparkContext { Vectors.sparse(12, Array(0, 2, 10), Array(1.0, 1.0, 1.0)) ).map(_.toOPVector) val field = transformed.schema(vector.name) - AttributeTestUtils.assertNominal(field, Array.fill(expected.head.value.size)(true)) + assertNominal(field, Array.fill(expected.head.value.size)(true)) transformed.collect(vector) shouldBe expected } @@ -277,7 +277,7 @@ class MultiPickListMapVectorizerTest extends FlatSpec with TestSparkContext { Vectors.sparse(18, Array(0, 3, 8, 11, 14, 15), Array(1.0, 1.0, 1.0, 1.0, 1.0, 1.0)) ).map(_.toOPVector) val field = 
transformed.schema(vector.name) - AttributeTestUtils.assertNominal(field, Array.fill(expected.head.value.size)(true)) + assertNominal(field, Array.fill(expected.head.value.size)(true)) transformed.collect(vector) shouldBe expected } @@ -292,7 +292,7 @@ class MultiPickListMapVectorizerTest extends FlatSpec with TestSparkContext { Vectors.sparse(10, Array(0, 2, 9), Array(1.0, 1.0, 1.0)) ).map(_.toOPVector) val field = transformed.schema(vector.name) - AttributeTestUtils.assertNominal(field, Array.fill(expected.head.value.size)(true)) + assertNominal(field, Array.fill(expected.head.value.size)(true)) transformed.collect(vector) shouldBe expected } @@ -307,7 +307,7 @@ class MultiPickListMapVectorizerTest extends FlatSpec with TestSparkContext { Vectors.sparse(16, Array(0, 3, 7, 10, 13, 14), Array(1.0, 1.0, 1.0, 1.0, 1.0, 1.0)) ).map(_.toOPVector) val field = transformed.schema(vector.name) - AttributeTestUtils.assertNominal(field, Array.fill(expected.head.value.size)(true)) + assertNominal(field, Array.fill(expected.head.value.size)(true)) transformed.collect(vector) shouldBe expected } @@ -323,7 +323,7 @@ class MultiPickListMapVectorizerTest extends FlatSpec with TestSparkContext { Vectors.dense(0.0, 0.0, 0.0, 0.0, 0.0) ).map(_.toOPVector) val field = transformed.schema(vector.name) - AttributeTestUtils.assertNominal(field, Array.fill(expected.head.value.size)(true)) + assertNominal(field, Array.fill(expected.head.value.size)(true)) transformed.collect(vector) shouldBe expected @@ -335,7 +335,7 @@ class MultiPickListMapVectorizerTest extends FlatSpec with TestSparkContext { Vectors.dense(1.0, 0.0, 0.0, 0.0, 0.0) ).map(_.toOPVector) val field2 = transformed2.schema(vector.name) - AttributeTestUtils.assertNominal(field2, Array.fill(expected2.head.value.size)(true)) + assertNominal(field2, Array.fill(expected2.head.value.size)(true)) transformed2.collect(vector) shouldBe expected2 } @@ -351,7 +351,7 @@ class MultiPickListMapVectorizerTest extends FlatSpec with TestSparkContext { Vectors.dense(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0) ).map(_.toOPVector) val field = transformed.schema(vector.name) - AttributeTestUtils.assertNominal(field, Array.fill(expected.head.value.size)(true)) + assertNominal(field, Array.fill(expected.head.value.size)(true)) transformed.collect(vector) shouldBe expected @@ -363,7 +363,7 @@ class MultiPickListMapVectorizerTest extends FlatSpec with TestSparkContext { Vectors.dense(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0) ).map(_.toOPVector) val field2 = transformed2.schema(vector.name) - AttributeTestUtils.assertNominal(field2, Array.fill(expected2.head.value.size)(true)) + assertNominal(field2, Array.fill(expected2.head.value.size)(true)) transformed2.collect(vector) shouldBe expected2 } @@ -377,7 +377,7 @@ class MultiPickListMapVectorizerTest extends FlatSpec with TestSparkContext { Vectors.dense(Array.empty[Double]) ).map(_.toOPVector) val field = transformed.schema(vector.name) - AttributeTestUtils.assertNominal(field, Array.fill(expected.head.value.size)(true)) + assertNominal(field, Array.fill(expected.head.value.size)(true)) transformed.collect(vector) shouldBe expected } @@ -396,7 +396,7 @@ class MultiPickListMapVectorizerTest extends FlatSpec with TestSparkContext { Vectors.sparse(5, Array(0), Array(1.0)) ).map(_.toOPVector) val field = transformed.schema(vector.name) - AttributeTestUtils.assertNominal(field, Array.fill(expected.head.value.size)(true)) + assertNominal(field, Array.fill(expected.head.value.size)(true)) transformed.collect(vector) shouldBe expected } @@ -415,7 
+415,7 @@ class MultiPickListMapVectorizerTest extends FlatSpec with TestSparkContext { Vectors.sparse(7, Array(0, 6), Array(1.0, 1.0)) ).map(_.toOPVector) val field = transformed.schema(vector.name) - AttributeTestUtils.assertNominal(field, Array.fill(expected.head.value.size)(true)) + assertNominal(field, Array.fill(expected.head.value.size)(true)) transformed.collect(vector) shouldBe expected } @@ -434,7 +434,7 @@ class MultiPickListMapVectorizerTest extends FlatSpec with TestSparkContext { Vectors.sparse(9, Array(0, 4), Array(1.0, 1.0)) ).map(_.toOPVector) val field = transformed.schema(vector.name) - AttributeTestUtils.assertNominal(field, Array.fill(expected.head.value.size)(true)) + assertNominal(field, Array.fill(expected.head.value.size)(true)) transformed.collect(vector) shouldBe expected } @@ -453,7 +453,7 @@ class MultiPickListMapVectorizerTest extends FlatSpec with TestSparkContext { Vectors.sparse(13, Array(0, 5, 6, 12), Array(1.0, 1.0, 1.0, 1.0)) ).map(_.toOPVector) val field = transformed.schema(vector.name) - AttributeTestUtils.assertNominal(field, Array.fill(expected.head.value.size)(true)) + assertNominal(field, Array.fill(expected.head.value.size)(true)) transformed.collect(vector) shouldBe expected } @@ -474,7 +474,7 @@ class MultiPickListMapVectorizerTest extends FlatSpec with TestSparkContext { Vectors.sparse(8, Array(0, 2, 3, 4, 5, 6), Array(1.0, 1.0, 1.0, 1.0, 1.0, 1.0)) ).map(_.toOPVector) val field = transformed.schema(vector.name) - AttributeTestUtils.assertNominal(field, Array.fill(expected.head.value.size)(true)) + assertNominal(field, Array.fill(expected.head.value.size)(true)) transformed.collect(vector) shouldBe expected } @@ -495,7 +495,7 @@ class MultiPickListMapVectorizerTest extends FlatSpec with TestSparkContext { Vectors.dense(1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0) ).map(_.toOPVector) val field = transformed.schema(vector.name) - AttributeTestUtils.assertNominal(field, Array.fill(expected.head.value.size)(true)) + assertNominal(field, Array.fill(expected.head.value.size)(true)) transformed.collect(vector) shouldBe expected } diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/NumericBucketizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/NumericBucketizerTest.scala index e7f65bfc11..a35ae9dae9 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/NumericBucketizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/NumericBucketizerTest.scala @@ -44,7 +44,7 @@ import org.scalatest.FlatSpec import org.scalatest.junit.JUnitRunner @RunWith(classOf[JUnitRunner]) -class NumericBucketizerTest extends FlatSpec with TestSparkContext { +class NumericBucketizerTest extends FlatSpec with TestSparkContext with AttributeAsserts { trait GenericTest { val numbers = Seq(Some(10.0), None, Some(3.0), Some(5.0), Some(6.0), None, Some(1.0), Some(0.0)) @@ -171,7 +171,7 @@ class NumericBucketizerTest extends FlatSpec with TestSparkContext { val transformed = realBucketizer.transform(data1) val actual = transformed.collect(vector) val field = transformed.schema(vector.name) - AttributeTestUtils.assertNominal(field, Array.fill(actual.head.value.size)(true)) + assertNominal(field, Array.fill(actual.head.value.size)(true)) actual shouldBe expectedAns val expectedMeta = TestOpVectorMetadataBuilder( @@ -190,7 +190,7 @@ class NumericBucketizerTest extends FlatSpec with TestSparkContext { val transformed2 = realBucketizer2.transform(data1) val actual2 = transformed2.collect(vector2) val 
field2 = transformed2.schema(vector2.name) - AttributeTestUtils.assertNominal(field2, Array.fill(actual2.head.value.size)(true)) + assertNominal(field2, Array.fill(actual2.head.value.size)(true)) } it should "work as a shortcut (reals)" in new RealTest { @@ -200,7 +200,7 @@ class NumericBucketizerTest extends FlatSpec with TestSparkContext { val transformed = buck.transform(data1) val actual = transformed.collect(vector) val field = transformed.schema(vector.name) - AttributeTestUtils.assertNominal(field, Array.fill(actual.head.value.size)(true)) + assertNominal(field, Array.fill(actual.head.value.size)(true)) actual shouldBe expectedAns } @@ -209,7 +209,7 @@ class NumericBucketizerTest extends FlatSpec with TestSparkContext { val transformed = trackNullsRealBucketizer.transform(data1) val actual = transformed.collect(vector) val field = transformed.schema(vector.name) - AttributeTestUtils.assertNominal(field, Array.fill(actual.head.value.size)(true)) + assertNominal(field, Array.fill(actual.head.value.size)(true)) actual shouldBe trackNullsExpectedAns val expectedMeta = TestOpVectorMetadataBuilder( @@ -236,7 +236,7 @@ class NumericBucketizerTest extends FlatSpec with TestSparkContext { val transformed2 = trackNullsRealBucketizer2.transform(data1) val actual2 = transformed2.collect(vector2) val field2 = transformed2.schema(vector2.name) - AttributeTestUtils.assertNominal(field2, Array.fill(actual2.head.value.size)(true)) + assertNominal(field2, Array.fill(actual2.head.value.size)(true)) } it should "allow right inclusive splits (reals)" in new RealTest { @@ -245,7 +245,7 @@ class NumericBucketizerTest extends FlatSpec with TestSparkContext { val transformed = buck.transform(data1) val actual = transformed.collect(vector) val field = transformed.schema(vector.name) - AttributeTestUtils.assertNominal(field, Array.fill(actual.head.value.size)(true)) + assertNominal(field, Array.fill(actual.head.value.size)(true)) actual shouldBe expectedRightInclusiveAns } @@ -264,7 +264,7 @@ class NumericBucketizerTest extends FlatSpec with TestSparkContext { val transformed = bucketizer.asInstanceOf[NumericBucketizer[_]].transform(ds) val results = transformed.collect(buck) val field = transformed.schema(buck.name) - AttributeTestUtils.assertNominal(field, Array.fill(results.head.value.size)(true)) + assertNominal(field, Array.fill(results.head.value.size)(true)) bucketizer shouldBe a[NumericBucketizer[_]] @@ -297,7 +297,7 @@ class NumericBucketizerTest extends FlatSpec with TestSparkContext { ).map(_.toOPVector) val field = transformed.schema(buck.name) - AttributeTestUtils.assertNominal(field, Array.fill(results.head.value.size)(true)) + assertNominal(field, Array.fill(results.head.value.size)(true)) val expectedMeta = TestOpVectorMetadataBuilder( stage, num -> List(IndCol(Some("[0.0-1.0)")), IndCol(Some("[1.0-5.0)")), @@ -314,7 +314,7 @@ class NumericBucketizerTest extends FlatSpec with TestSparkContext { results shouldBe expectedAns val field = transformed.schema(vector.name) - AttributeTestUtils.assertNominal(field, Array.fill(results.head.value.size)(true)) + assertNominal(field, Array.fill(results.head.value.size)(true)) val expectedMeta = TestOpVectorMetadataBuilder( integralBucketizer, num -> List(IndCol(Some("0-1")), IndCol(Some("1-5")), IndCol(Some("5-10")), IndCol(Some("10-Infinity"))) @@ -330,7 +330,7 @@ class NumericBucketizerTest extends FlatSpec with TestSparkContext { val results = transformed.collect(vector) results shouldBe expectedAns val field = transformed.schema(vector.name) - 
AttributeTestUtils.assertNominal(field, Array.fill(results.head.value.size)(true)) + assertNominal(field, Array.fill(results.head.value.size)(true)) } it should "keep track of null values if wanted (integrals)" in new IntegralTest { @@ -339,7 +339,7 @@ class NumericBucketizerTest extends FlatSpec with TestSparkContext { val results = transformed.collect(vector) results shouldBe trackNullsExpectedAns val field = transformed.schema(vector.name) - AttributeTestUtils.assertNominal(field, Array.fill(results.head.value.size)(true)) + assertNominal(field, Array.fill(results.head.value.size)(true)) val expectedMeta = TestOpVectorMetadataBuilder( trackNullsIntegralBucketizer, @@ -359,7 +359,7 @@ class NumericBucketizerTest extends FlatSpec with TestSparkContext { val results = transformed.collect(vector) results shouldBe expectedRightInclusiveAns val field = transformed.schema(vector.name) - AttributeTestUtils.assertNominal(field, Array.fill(results.head.value.size)(true)) + assertNominal(field, Array.fill(results.head.value.size)(true)) } diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/NumericVectorizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/NumericVectorizerTest.scala index 8d1cbad6ca..dfe0852bdb 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/NumericVectorizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/NumericVectorizerTest.scala @@ -45,7 +45,7 @@ import org.scalatest.junit.JUnitRunner @RunWith(classOf[JUnitRunner]) -class NumericVectorizerTest extends FlatSpec with FeatureTestBase { +class NumericVectorizerTest extends FlatSpec with FeatureTestBase with AttributeAsserts { val ageData: Seq[Real] = RandomReal.uniform[Real](maxValue = 80.0).limit(100) val heightData: Seq[Real] = RandomReal.normal[Real](mean = 65.0, sigma = 8).limit(100) @@ -76,7 +76,7 @@ class NumericVectorizerTest extends FlatSpec with FeatureTestBase { Array(4.0, 0.0, 0.0, 1.0) ).map(Vectors.dense(_).toOPVector) val field = vectorized.schema(autoBucketFeature.name) - AttributeTestUtils.assertNominal(field, false +: Array.fill(expected.head.value.size - 1)(true)) + assertNominal(field, false +: Array.fill(expected.head.value.size - 1)(true)) vectorized.collect(autoBucketFeature) should contain theSameElementsAs expected } it should "vectorize single real feature with a label" in { @@ -89,7 +89,7 @@ class NumericVectorizerTest extends FlatSpec with FeatureTestBase { ).combine() val vectorized = new OpWorkflow().setResultFeatures(autoBucketFeature, manualBucketFeature).transform(ds) val field = vectorized.schema(autoBucketFeature.name) - AttributeTestUtils.assertNominal(field, false +: + assertNominal(field, false +: Array.fill(vectorized.collect(autoBucketFeature).head.value.size - 1)(true)) for {(autoAge, manualAge) <- vectorized.collect(autoBucketFeature, manualBucketFeature)} { autoAge.v.toArray should contain theSameElementsAs manualAge.v.toArray @@ -106,7 +106,7 @@ class NumericVectorizerTest extends FlatSpec with FeatureTestBase { ).transmogrify() val vectorized = new OpWorkflow().setResultFeatures(autoBucketFeature, manualBucketFeature).transform(ds) val field = vectorized.schema(autoBucketFeature.name) - AttributeTestUtils.assertNominal(field, Array(false, true, false) ++ + assertNominal(field, Array(false, true, false) ++ Array.fill(vectorized.collect(autoBucketFeature).head.value.size - 3)(true)) for {(autoAge, manualAge) <- vectorized.collect(autoBucketFeature, manualBucketFeature)} { autoAge.v.toArray should 
contain theSameElementsAs manualAge.v.toArray @@ -125,11 +125,11 @@ class NumericVectorizerTest extends FlatSpec with FeatureTestBase { val manualBucketFeature = Seq(count, count.autoBucketize(labelData, trackNulls = false)).transmogrify() val vectorized = new OpWorkflow().setResultFeatures(autoBucketFeature, manualBucketFeature).transform(ds) val field = vectorized.schema(autoBucketFeature.name) - AttributeTestUtils.assertNominal(field, false +: + assertNominal(field, false +: Array.fill(vectorized.collect(autoBucketFeature).head.value.size - 1)(true)) val field2 = vectorized.schema(manualBucketFeature.name) - AttributeTestUtils.assertNominal(field2, false +: - Array.fill(vectorized.collect(manualBucketFeature).head.value.size -1)(true)) + assertNominal(field2, false +: + Array.fill(vectorized.collect(manualBucketFeature).head.value.size - 1)(true)) for {(autoAge, manualAge) <- vectorized.collect(autoBucketFeature, manualBucketFeature)} { autoAge.v.toArray should contain theSameElementsAs manualAge.v.toArray diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/OPCollectionHashingVectorizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/OPCollectionHashingVectorizerTest.scala index 99c72fb0ac..d1f5326de5 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/OPCollectionHashingVectorizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/OPCollectionHashingVectorizerTest.scala @@ -41,7 +41,7 @@ import org.scalatest.junit.JUnitRunner import org.scalatest.{Assertions, FlatSpec, Matchers} @RunWith(classOf[JUnitRunner]) -class OPCollectionHashingVectorizerTest extends FlatSpec with TestSparkContext { +class OPCollectionHashingVectorizerTest extends FlatSpec with TestSparkContext with AttributeAsserts { val (catData, top, bot) = TestFeatureBuilder("top", "bot", Seq[(MultiPickList, MultiPickList)]( @@ -105,7 +105,7 @@ class OPCollectionHashingVectorizerTest extends FlatSpec with TestSparkContext { val transformed = vectorizer.transform(catData) val result = transformed.collect(vector) val field = transformed.schema(vector.name) - AttributeTestUtils.assertNominal(field, Array.fill(transformed.collect(vector).head.value.size)(true)) + assertNominal(field, Array.fill(transformed.collect(vector).head.value.size)(true)) vectorizer.isSharedHashSpace shouldBe false @@ -142,7 +142,7 @@ class OPCollectionHashingVectorizerTest extends FlatSpec with TestSparkContext { val transformed = vectorizer.transform(catData) val result = transformed.collect(vector) val field = transformed.schema(vector.name) - AttributeTestUtils.assertNominal(field, Array.fill(transformed.collect(vector).head.value.size)(true)) + assertNominal(field, Array.fill(transformed.collect(vector).head.value.size)(true)) vectorizer.isSharedHashSpace shouldBe false @@ -181,7 +181,7 @@ class OPCollectionHashingVectorizerTest extends FlatSpec with TestSparkContext { val transformed = vectorizer.transform(textListData) val result = transformed.collect(vector) val field = transformed.schema(vector.name) - AttributeTestUtils.assertNominal(field, Array.fill(transformed.collect(vector).head.value.size)(true)) + assertNominal(field, Array.fill(transformed.collect(vector).head.value.size)(true)) vectorizer.isSharedHashSpace shouldBe false @@ -213,7 +213,7 @@ class OPCollectionHashingVectorizerTest extends FlatSpec with TestSparkContext { val transformed = vectorizer.transform(realMapData) val result = transformed.collect(vector) val field = transformed.schema(vector.name) 
- AttributeTestUtils.assertNominal(field, Array.fill(transformed.collect(vector).head.value.size)(false)) + assertNominal(field, Array.fill(transformed.collect(vector).head.value.size)(false)) vectorizer.isSharedHashSpace shouldBe false @@ -240,7 +240,7 @@ class OPCollectionHashingVectorizerTest extends FlatSpec with TestSparkContext { val transformed = vectorizer.transform(catData) val result = transformed.collect(vector) val field = transformed.schema(vector.name) - AttributeTestUtils.assertNominal(field, Array.fill(transformed.collect(vector).head.value.size)(true)) + assertNominal(field, Array.fill(transformed.collect(vector).head.value.size)(true)) vectorizer.isSharedHashSpace shouldBe true @@ -275,7 +275,7 @@ class OPCollectionHashingVectorizerTest extends FlatSpec with TestSparkContext { val transformed = vectorizer.transform(textListData) val result = transformed.collect(vector) val field = transformed.schema(vector.name) - AttributeTestUtils.assertNominal(field, Array.fill(transformed.collect(vector).head.value.size)(true)) + assertNominal(field, Array.fill(transformed.collect(vector).head.value.size)(true)) vectorizer.isSharedHashSpace shouldBe true @@ -356,7 +356,7 @@ class OPCollectionHashingVectorizerTest extends FlatSpec with TestSparkContext { val transformed = vectorizer.transform(textListData) val result = transformed.collect(vector) val field = transformed.schema(vector.name) - AttributeTestUtils.assertNominal(field, Array.fill(transformed.collect(vector).head.value.size)(true)) + assertNominal(field, Array.fill(transformed.collect(vector).head.value.size)(true)) vectorizer.isSharedHashSpace shouldBe false @@ -388,7 +388,7 @@ class OPCollectionHashingVectorizerTest extends FlatSpec with TestSparkContext { val transformed = vectorizer.transform(textListData) val result = transformed.collect(vector) val field = transformed.schema(vector.name) - AttributeTestUtils.assertNominal(field, Array.fill(transformed.collect(vector).head.value.size)(true)) + assertNominal(field, Array.fill(transformed.collect(vector).head.value.size)(true)) vectorizer.isSharedHashSpace shouldBe true @@ -418,7 +418,7 @@ class OPCollectionHashingVectorizerTest extends FlatSpec with TestSparkContext { val transformed = vectorizer.transform(catData) val vector = vectorizer.getOutput() val field = transformed.schema(vector.name) - AttributeTestUtils.assertNominal(field, Array.fill(transformed.collect(vector).head.value.size)(true)) + assertNominal(field, Array.fill(transformed.collect(vector).head.value.size)(true)) val meta = OpVectorMetadata(transformed.schema(feature.name)) meta.history.keys shouldBe Set(top.name, bot.name) meta.columns.length shouldBe 20 @@ -435,7 +435,7 @@ class OPCollectionHashingVectorizerTest extends FlatSpec with TestSparkContext { val transformed = vectorizer.transform(catData) val vector = vectorizer.getOutput() val field = transformed.schema(vector.name) - AttributeTestUtils.assertNominal(field, Array.fill(transformed.collect(vector).head.value.size)(true)) + assertNominal(field, Array.fill(transformed.collect(vector).head.value.size)(true)) val meta = OpVectorMetadata(transformed.schema(feature.name)) meta.history.keys shouldBe Set(top.name, bot.name) meta.columns.length shouldBe 10 diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/OPMapVectorizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/OPMapVectorizerTest.scala index 0d97888d2e..7e5790f724 100644 --- 
a/core/src/test/scala/com/salesforce/op/stages/impl/feature/OPMapVectorizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/OPMapVectorizerTest.scala @@ -52,7 +52,7 @@ import scala.reflect.runtime.universe._ @RunWith(classOf[JUnitRunner]) -class OPMapVectorizerTest extends FlatSpec with TestSparkContext { +class OPMapVectorizerTest extends FlatSpec with TestSparkContext with AttributeAsserts { import OPMapVectorizerTestHelper._ @@ -85,7 +85,7 @@ class OPMapVectorizerTest extends FlatSpec with TestSparkContext { val output = vectorizer.getOutput() val field = transformed.schema(output.name) - AttributeTestUtils.assertNominal(field, Array.fill(expected.head.value.size)(true)) + assertNominal(field, Array.fill(expected.head.value.size)(true)) transformed.collect(output) shouldBe expected fitted.getMetadata() shouldBe transformed.schema.fields(2).metadata } @@ -325,7 +325,7 @@ class OPMapVectorizerTest extends FlatSpec with TestSparkContext { } } -object OPMapVectorizerTestHelper extends Matchers { +object OPMapVectorizerTestHelper extends Matchers with AttributeAsserts { val log = LoggerFactory.getLogger(this.getClass) @@ -338,7 +338,6 @@ object OPMapVectorizerTestHelper extends Matchers { * @param f2Data Sequence of base feature type data (eg. from generators) * @param f3Data Sequence of base feature type data (eg. from generators) * @param isCategorical If the vector contains categoricals - * @param numberOfContinous It the vector contains categoricals * @tparam F Base feature type (eg. ID, Text, Integer) * @tparam FM OPMap feature type (eg. IDMap, TextMap, IntegerMap) * @tparam MT Value type of map inside OPMap feature (eg. String, String, Int) @@ -378,7 +377,7 @@ object OPMapVectorizerTestHelper extends Matchers { } } val field = transformed.schema(featureVector.name) - AttributeTestUtils.assertNominal(field, isCategoricalArray) + assertNominal(field, isCategoricalArray) val summary = transformed.schema(featureVector.name).metadata log.info("summary:\n{}", summary) @@ -395,7 +394,7 @@ object OPMapVectorizerTestHelper extends Matchers { transformedMap.show(10) } val fieldMap = transformedMap.schema(mapFeatureVector.name) - AttributeTestUtils.assertNominal(fieldMap, isCategoricalArray) + assertNominal(fieldMap, isCategoricalArray) // Check that the actual features are the same diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/OpSetVectorizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/OpSetVectorizerTest.scala index ab44da8e7a..2d6eb82811 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/OpSetVectorizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/OpSetVectorizerTest.scala @@ -47,7 +47,7 @@ import org.slf4j.LoggerFactory @RunWith(classOf[JUnitRunner]) -class OpSetVectorizerTest extends FlatSpec with TestSparkContext { +class OpSetVectorizerTest extends FlatSpec with TestSparkContext with AttributeAsserts { val log = LoggerFactory.getLogger(this.getClass) @@ -116,7 +116,7 @@ class OpSetVectorizerTest extends FlatSpec with TestSparkContext { Vectors.sparse(10, Array(0, 1, 7), Array(1.0, 1.0, 1.0)) ).map(_.toOPVector) val field = transformed.schema(vector.name) - AttributeTestUtils.assertNominal(field, Array.fill(expected.head.value.size)(true)) + assertNominal(field, Array.fill(expected.head.value.size)(true)) transformed.collect(vector) shouldBe expected fitted.getMetadata() shouldBe transformed.schema.fields(2).metadata } @@ -132,7 +132,7 @@ class OpSetVectorizerTest 
extends FlatSpec with TestSparkContext { Vectors.sparse(13, Array(1, 2, 9), Array(1.0, 1.0, 1.0)) ).map(_.toOPVector) val field = transformed.schema(vector.name) - AttributeTestUtils.assertNominal(field, Array.fill(expected.head.value.size)(true)) + assertNominal(field, Array.fill(expected.head.value.size)(true)) transformed.collect(vector) shouldBe expected val vectorMetadata = fitted.getMetadata() val expectedMeta = TestOpVectorMetadataBuilder( @@ -155,7 +155,7 @@ class OpSetVectorizerTest extends FlatSpec with TestSparkContext { val transformed = fitted.transform(dataSet) val vector = vectorizer.getOutput() val field = transformed.schema(vector.name) - AttributeTestUtils.assertNominal(field, Array.fill(transformed.collect(vector).head.value.size)(true)) + assertNominal(field, Array.fill(transformed.collect(vector).head.value.size)(true)) transformed.collect(vector) shouldBe expectedData vectorizer.setTopK(10) } @@ -171,7 +171,7 @@ class OpSetVectorizerTest extends FlatSpec with TestSparkContext { Vectors.dense(1.0, 1.0, 0.0, 1.0, 0.0) ).map(_.toOPVector) val field = transformed.schema(vector.name) - AttributeTestUtils.assertNominal(field, Array.fill(transformed.collect(vector).head.value.size)(true)) + assertNominal(field, Array.fill(transformed.collect(vector).head.value.size)(true)) } it should "return a vector with elements only in the other & null columns and not throw errors when passed data" + @@ -185,7 +185,7 @@ class OpSetVectorizerTest extends FlatSpec with TestSparkContext { Vectors.dense(0.0, 0.0, 0.0, 1.0, 0.0, 1.0) ).map(_.toOPVector) val field = transformed.schema(vector.name) - AttributeTestUtils.assertNominal(field, Array.fill(expected.head.value.size)(true)) + assertNominal(field, Array.fill(expected.head.value.size)(true)) transformed.collect(vector) shouldBe expected val vectorMetadata = fitted.getMetadata() val expectedMeta = TestOpVectorMetadataBuilder( @@ -219,7 +219,7 @@ class OpSetVectorizerTest extends FlatSpec with TestSparkContext { Vectors.dense(0.0, 0.0, 0.0, 0.0) ).map(_.toOPVector) val field = transformed.schema(vector.name) - AttributeTestUtils.assertNominal(field, Array.fill(expected.head.value.size)(true)) + assertNominal(field, Array.fill(expected.head.value.size)(true)) transformed.collect(vector) shouldBe expected val vectorMetadata = fitted.getMetadata() val expectedMeta = TestOpVectorMetadataBuilder( @@ -244,7 +244,7 @@ class OpSetVectorizerTest extends FlatSpec with TestSparkContext { val transformed = fitted.transform(dataSetAllEmpty) val expected = Array(Vectors.dense(0.0, 1.0), Vectors.dense(0.0, 1.0), Vectors.dense(0.0, 1.0)).map(_.toOPVector) val field = transformed.schema(vector.name) - AttributeTestUtils.assertNominal(field, Array.fill(expected.head.value.size)(true)) + assertNominal(field, Array.fill(expected.head.value.size)(true)) transformed.collect(vector) shouldBe expected val vectorMetadata = fitted.getMetadata() val expectedMeta = TestOpVectorMetadataBuilder( @@ -264,7 +264,7 @@ class OpSetVectorizerTest extends FlatSpec with TestSparkContext { val actual = df.collect(result) actual shouldBe expectedData val field = df.schema(result.name) - AttributeTestUtils.assertNominal(field, Array.fill(actual.head.value.size)(true)) + assertNominal(field, Array.fill(actual.head.value.size)(true)) } it should "expand number of columns for picklist features by two (one for other & one for null)" in { @@ -303,7 +303,7 @@ class OpSetVectorizerTest extends FlatSpec with TestSparkContext { val transformed = fitted.transform(localDataSet) val vector = 
localVectorizer.getOutput() val field = transformed.schema(vector.name) - AttributeTestUtils.assertNominal(field, Array.fill(transformed.collect(vector).head.value.size)(true)) + assertNominal(field, Array.fill(transformed.collect(vector).head.value.size)(true)) } it should "process multiple columns of PickList using the vectorize shortcut" in { @@ -324,7 +324,7 @@ class OpSetVectorizerTest extends FlatSpec with TestSparkContext { val inputDF = TestOpWorkflowBuilder(localDF, vectorized).computeDataUpTo(vectorized) val transformed = new OpWorkflow().setResultFeatures(vectorized).transform(localDF) val field = transformed.schema(vectorized.name) - AttributeTestUtils.assertNominal(field, Array.fill(transformed.collect(vectorized).head.value.size)(true)) + assertNominal(field, Array.fill(transformed.collect(vectorized).head.value.size)(true)) val metaMap = transformed.metadata(vectorized) log.info(metaMap.toString) @@ -348,7 +348,7 @@ class OpSetVectorizerTest extends FlatSpec with TestSparkContext { val inputDF = TestOpWorkflowBuilder(localDF, vectorized).computeDataUpTo(vectorized) val transformed = new OpWorkflow().setResultFeatures(vectorized).transform(localDF) val field = transformed.schema(vectorized.name) - AttributeTestUtils.assertNominal(field, Array.fill(transformed.collect(vectorized).head.value.size)(true)) + assertNominal(field, Array.fill(transformed.collect(vectorized).head.value.size)(true)) val metaMap = transformed.metadata(vectorized) log.info(metaMap.toString) @@ -373,7 +373,7 @@ class OpSetVectorizerTest extends FlatSpec with TestSparkContext { val transformed = new OpWorkflow().setResultFeatures(res).transform(localDF) val field = transformed.schema(res.name) - AttributeTestUtils.assertNominal(field, Array.fill(transformed.collect(res).head.value.size)(true)) + assertNominal(field, Array.fill(transformed.collect(res).head.value.size)(true)) } it should "process multiple columns of numerics, PickLists, and MultiPickLists using the vectorize shortcut" in { @@ -395,7 +395,7 @@ class OpSetVectorizerTest extends FlatSpec with TestSparkContext { val inputDF = TestOpWorkflowBuilder(localDF, vectorized).computeDataUpTo(vectorized) val transformed = new OpWorkflow().setResultFeatures(vectorized).transform(localDF) val field = transformed.schema(vectorized.name) - AttributeTestUtils.assertNominal(field, Array(true, true, true, true, false, true)) + assertNominal(field, Array(true, true, true, true, false, true)) val metaMap = transformed.metadata(vectorized) log.info(metaMap.toString) diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/RealMapVectorizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/RealMapVectorizerTest.scala index 03e5a9d1f2..ff46122226 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/RealMapVectorizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/RealMapVectorizerTest.scala @@ -44,7 +44,8 @@ import org.scalatest.junit.JUnitRunner @RunWith(classOf[JUnitRunner]) class RealMapVectorizerTest - extends OpEstimatorSpec[OPVector, SequenceModel[RealMap, OPVector], RealMapVectorizer[RealMap]] { + extends OpEstimatorSpec[OPVector, SequenceModel[RealMap, OPVector], RealMapVectorizer[RealMap]] + with AttributeAsserts { val (inputData, m1, m2) = TestFeatureBuilder("m1", "m2", Seq( @@ -93,7 +94,7 @@ class RealMapVectorizerTest val transformed = vectorizer.transform(inputData) val vector = vectorizer.getOutput() val field = transformed.schema(vector.name) - 
AttributeTestUtils.assertNominal(field, Array.fill(expectedResult.head.value.size)(false)) + assertNominal(field, Array.fill(expectedResult.head.value.size)(false)) transformed.collect(vector) shouldBe expectedResult transformed.schema.toOpVectorMetadata(vectorizer.getOutputFeatureName) shouldEqual expectedMeta val vectorMetadata = vectorizer.getMetadata() @@ -110,7 +111,7 @@ class RealMapVectorizerTest Vectors.sparse(12, Array(1, 3, 5, 7, 9, 11), Array(1.0, 1.0, 1.0, 1.0, 1.0, 1.0)) ).map(_.toOPVector) val field = transformed.schema(vector.name) - AttributeTestUtils.assertNominal(field, Array.fill(expected.head.value.size / 2)(Seq(false, true)).flatten) + assertNominal(field, Array.fill(expected.head.value.size / 2)(Seq(false, true)).flatten) transformed.collect(vector) shouldBe expected transformed.schema.toOpVectorMetadata(vectorizer.getOutputFeatureName) shouldEqual expectedMetaTrackNulls val vectorMetadata = vectorizer.getMetadata() @@ -127,7 +128,7 @@ class RealMapVectorizerTest Vectors.dense(Array(100.0, 100.0, 100.0, 100.0, 100.0, 100.0)) ).map(_.toOPVector) val field = transformed.schema(vector.name) - AttributeTestUtils.assertNominal(field, Array.fill(expectedResult.head.value.size)(false)) + assertNominal(field, Array.fill(expectedResult.head.value.size)(false)) transformed.collect(vector) shouldBe expected transformed.schema.toOpVectorMetadata(vectorizer.getOutputFeatureName) shouldEqual expectedMeta val vectorMetadata = vectorizer.getMetadata() @@ -145,7 +146,7 @@ class RealMapVectorizerTest Vectors.dense(Array(100.0, 1.0, 100.0, 1.0, 100.0, 1.0, 100.0, 1.0, 100.0, 1.0, 100.0, 1.0)) ).map(_.toOPVector) val field = transformed.schema(vector.name) - AttributeTestUtils.assertNominal(field, Array.fill(expected.head.value.size / 2)(Seq(false, true)).flatten) + assertNominal(field, Array.fill(expected.head.value.size / 2)(Seq(false, true)).flatten) transformed.collect(vector) shouldBe expected transformed.schema.toOpVectorMetadata(vectorizer.getOutputFeatureName) shouldEqual expectedMetaTrackNulls val vectorMetadata = vectorizer.getMetadata() @@ -167,7 +168,7 @@ class RealMapVectorizerTest m1 -> List(IndColWithGroup(None, "A"), IndColWithGroup(None, "B")), m2 -> List(IndColWithGroup(None, "Z"))) val field = transformed.schema(vector.name) - AttributeTestUtils.assertNominal(field, Array.fill(expected.head.value.size)(false)) + assertNominal(field, Array.fill(expected.head.value.size)(false)) transformed.collect(vector) shouldBe expected transformed.schema.toOpVectorMetadata(vectorizer.getOutputFeatureName) shouldEqual expectedMeta val vectorMetadata = vectorizer.getMetadata() @@ -192,7 +193,7 @@ class RealMapVectorizerTest m2 -> List(IndColWithGroup(None, "Z"), IndColWithGroup(nullIndicatorValue, "Z")) ) val field = transformed.schema(vector.name) - AttributeTestUtils.assertNominal(field, Array.fill(expected.head.value.size / 2)(Seq(false, true)).flatten) + assertNominal(field, Array.fill(expected.head.value.size / 2)(Seq(false, true)).flatten) transformed.collect(vector) shouldBe expected transformed.schema.toOpVectorMetadata(vectorizer.getOutputFeatureName) shouldEqual expectedMeta val vectorMetadata = vectorizer.getMetadata() @@ -215,7 +216,7 @@ class RealMapVectorizerTest m2 -> List(IndColWithGroup(None, "X"), IndColWithGroup(None, "Y")) ) val field = transformed.schema(vector.name) - AttributeTestUtils.assertNominal(field, Array.fill(expected.head.value.size)(false)) + assertNominal(field, Array.fill(expected.head.value.size)(false)) 
transformed.schema.toOpVectorMetadata(vectorizer.getOutputFeatureName) shouldEqual expectedMeta val vectorMetadata = vectorizer.getMetadata() OpVectorMetadata(vectorizer.getOutputFeatureName, vectorMetadata) shouldEqual expectedMeta @@ -240,7 +241,7 @@ class RealMapVectorizerTest IndColWithGroup(None, "Y"), IndColWithGroup(nullIndicatorValue, "Y")) ) val field = transformed.schema(vector.name) - AttributeTestUtils.assertNominal(field, Array.fill(expected.head.value.size / 2)(Seq(false, true)).flatten) + assertNominal(field, Array.fill(expected.head.value.size / 2)(Seq(false, true)).flatten) transformed.collect(vector) shouldBe expected transformed.schema.toOpVectorMetadata(vectorizer.getOutputFeatureName) shouldEqual expectedMeta val vectorMetadata = vectorizer.getMetadata() @@ -259,7 +260,7 @@ class RealMapVectorizerTest Vectors.dense(-1.0, 4.0, 11.0, 0.0, 8.0 / 3, 15.0 / 2) ).map(_.toOPVector) val field = transformed.schema(vector.name) - AttributeTestUtils.assertNominal(field, Array.fill(expectedResult.head.value.size)(false)) + assertNominal(field, Array.fill(expectedResult.head.value.size)(false)) transformed.collect(vector) shouldBe expected val expectedMeta = TestOpVectorMetadataBuilder( @@ -284,7 +285,7 @@ class RealMapVectorizerTest Vectors.dense(-1.0, 1.0, 4.0, 1.0, 11.0, 1.0, 0.0, 1.0, 8.0 / 3, 1.0, 15.0 / 2, 1.0) ).map(_.toOPVector) val field = transformed.schema(vector.name) - AttributeTestUtils.assertNominal(field, Array.fill(expected.head.value.size / 2)(Seq(false, true)).flatten) + assertNominal(field, Array.fill(expected.head.value.size / 2)(Seq(false, true)).flatten) transformed.collect(vector) shouldBe expected val expectedMetaTrackNulls = TestOpVectorMetadataBuilder( diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/RealVectorizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/RealVectorizerTest.scala index 9dcc653c9a..850cfa9653 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/RealVectorizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/RealVectorizerTest.scala @@ -43,7 +43,7 @@ import org.scalatest.{Assertions, FlatSpec, Matchers} @RunWith(classOf[JUnitRunner]) -class RealVectorizerTest extends FlatSpec with TestSparkContext { +class RealVectorizerTest extends FlatSpec with TestSparkContext with AttributeAsserts { val (testData, inA, inB, inC) = TestFeatureBuilder("inA", "inB", "inC", Seq[(Real, Real, Real)]( @@ -107,7 +107,7 @@ class RealVectorizerTest extends FlatSpec with TestSparkContext { (2.0, null, null, Vectors.dense(2.0, 4.2, 4.2)) ) val field = testDataTransformedConstant.schema(testModelConstant.getOutputFeatureName) - AttributeTestUtils.assertNominal(field, Array.fill(expectedZero.head._4.size)(false)) + assertNominal(field, Array.fill(expectedZero.head._4.size)(false)) transformedValuesConstant.map(_.get(0)) shouldEqual expectedZero.map(_._1) transformedValuesConstant.map(_.get(1)) shouldEqual expectedZero.map(_._2) transformedValuesConstant.map(_.get(2)) shouldEqual expectedZero.map(_._3) @@ -134,7 +134,7 @@ class RealVectorizerTest extends FlatSpec with TestSparkContext { (2.0, null, null, Vectors.dense(2.0, 2.0, 0.0)) ) val field = testDataTransformedMean.schema(testModelMean.getOutputFeatureName) - AttributeTestUtils.assertNominal(field, Array.fill(expectedMean.head._4.size)(false)) + assertNominal(field, Array.fill(expectedMean.head._4.size)(false)) transformedValuesMean.map(_.get(0)) shouldEqual expectedMean.map(_._1) transformedValuesMean.map(_.get(1)) 
shouldEqual expectedMean.map(_._2) transformedValuesMean.map(_.get(2)) shouldEqual expectedMean.map(_._3) @@ -157,7 +157,7 @@ class RealVectorizerTest extends FlatSpec with TestSparkContext { (2.0, null, null, Vectors.dense(2.0, 0.0, 0.0, 1.0, 0.0, 1.0)) ) val field = testDataTransformedConstantTracked.schema(testModelConstantTracked.getOutputFeatureName) - AttributeTestUtils.assertNominal(field, Array.fill(expectedZeroTracked.head._4.size / 2)(Seq(false, true)).flatten) + assertNominal(field, Array.fill(expectedZeroTracked.head._4.size / 2)(Seq(false, true)).flatten) transformedValuesZeroTracked.map(_.get(0)) shouldEqual expectedZeroTracked.map(_._1) transformedValuesZeroTracked.map(_.get(1)) shouldEqual expectedZeroTracked.map(_._2) transformedValuesZeroTracked.map(_.get(2)) shouldEqual expectedZeroTracked.map(_._3) @@ -191,7 +191,7 @@ class RealVectorizerTest extends FlatSpec with TestSparkContext { (2.0, null, null, Vectors.dense(2.0, 0.0, 2.0, 1.0, 0.0, 1.0)) ) val field = testDataTransformedMeanTracked.schema(testModelMeanTracked.getOutputFeatureName) - AttributeTestUtils.assertNominal(field, Array.fill(expectedMeanTracked.head._4.size / 2)(Seq(false, true)).flatten) + assertNominal(field, Array.fill(expectedMeanTracked.head._4.size / 2)(Seq(false, true)).flatten) transformedValuesMeanTracked.map(_.get(0)) shouldEqual expectedMeanTracked.map(_._1) transformedValuesMeanTracked.map(_.get(1)) shouldEqual expectedMeanTracked.map(_._2) transformedValuesMeanTracked.map(_.get(2)) shouldEqual expectedMeanTracked.map(_._3) @@ -225,7 +225,7 @@ class RealVectorizerTest extends FlatSpec with TestSparkContext { (2.0, null, null, Vectors.dense(2.0, 0.0, 2.0, 1.0, 0.0, 1.0)) ) val field = testDataTransformedMeanTracked.schema(testModelMeanTracked.getOutputFeatureName) - AttributeTestUtils.assertNominal(field, Array.fill(expectedMeanTracked.head._4.size / 2)(Seq(false, true)).flatten) + assertNominal(field, Array.fill(expectedMeanTracked.head._4.size / 2)(Seq(false, true)).flatten) transformedValuesMeanTracked.map(_.get(0)) shouldEqual expectedMeanTracked.map(_._1) transformedValuesMeanTracked.map(_.get(1)) shouldEqual expectedMeanTracked.map(_._2) transformedValuesMeanTracked.map(_.get(2)) shouldEqual expectedMeanTracked.map(_._3) @@ -258,7 +258,7 @@ class RealVectorizerTest extends FlatSpec with TestSparkContext { (2.0, null, null, Vectors.dense(2.0, 4.2, 4.2)) ) val field = testDataTransformedConstant.schema(testModelConstant.getOutputFeatureName) - AttributeTestUtils.assertNominal(field, Array.fill(expectedZero.head._4.size)(false)) + assertNominal(field, Array.fill(expectedZero.head._4.size)(false)) transformedValuesConstant.map(_.get(0)) shouldEqual expectedZero.map(_._1) transformedValuesConstant.map(_.get(1)) shouldEqual expectedZero.map(_._2) transformedValuesConstant.map(_.get(2)) shouldEqual expectedZero.map(_._3) @@ -280,7 +280,7 @@ class RealVectorizerTest extends FlatSpec with TestSparkContext { (2.0, null, null, Vectors.dense(2.0, 4.2, 4.2)) ) val field = testDataTransformedConstant.schema(testModelConstant.getOutputFeatureName) - AttributeTestUtils.assertNominal(field, Array.fill(expectedZero.head._4.size)(false)) + assertNominal(field, Array.fill(expectedZero.head._4.size)(false)) transformedValuesConstant.map(_.get(0)) shouldEqual expectedZero.map(_._1) transformedValuesConstant.map(_.get(1)) shouldEqual expectedZero.map(_._2) transformedValuesConstant.map(_.get(2)) shouldEqual expectedZero.map(_._3) diff --git 
a/core/src/test/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizerTest.scala index 39fd9c1ab3..6f6bd20890 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizerTest.scala @@ -42,7 +42,7 @@ import org.scalatest.junit.JUnitRunner @RunWith(classOf[JUnitRunner]) -class SmartTextMapVectorizerTest extends FlatSpec with TestSparkContext { +class SmartTextMapVectorizerTest extends FlatSpec with TestSparkContext with AttributeAsserts { lazy val (data, m1, m2, f1, f2) = TestFeatureBuilder("textMap1", "textMap2", "text1", "text2", Seq[(TextMap, TextMap, Text, Text)]( (TextMap(Map("text1" -> "hello world", "text2" -> "Hello world!")), TextMap.empty, @@ -106,9 +106,9 @@ class SmartTextMapVectorizerTest extends FlatSpec with TestSparkContext { val transformed = new OpWorkflow().setResultFeatures(smartMapVectorized, smartVectorized).transform(data) val result = transformed.collect(smartMapVectorized, smartVectorized) val field = transformed.schema(smartVectorized.name) - AttributeTestUtils.assertNominal(field, Array.fill(transformed.collect(smartVectorized).head.value.size)(true)) + assertNominal(field, Array.fill(transformed.collect(smartVectorized).head.value.size)(true)) val fieldMap = transformed.schema(smartMapVectorized.name) - AttributeTestUtils.assertNominal(fieldMap, + assertNominal(fieldMap, Array.fill(transformed.collect(smartMapVectorized).head.value.size)(true)) val mapMeta = OpVectorMetadata(transformed.schema(smartMapVectorized.name)) val meta = OpVectorMetadata(transformed.schema(smartVectorized.name)) @@ -139,9 +139,9 @@ class SmartTextMapVectorizerTest extends FlatSpec with TestSparkContext { val transformed = new OpWorkflow().setResultFeatures(smartMapVectorized, smartVectorized).transform(data) val result = transformed.collect(smartMapVectorized, smartVectorized) val field = transformed.schema(smartVectorized.name) - AttributeTestUtils.assertNominal(field, Array.fill(transformed.collect(smartVectorized).head.value.size)(true)) + assertNominal(field, Array.fill(transformed.collect(smartVectorized).head.value.size)(true)) val fieldMap = transformed.schema(smartMapVectorized.name) - AttributeTestUtils.assertNominal(fieldMap, + assertNominal(fieldMap, Array.fill(transformed.collect(smartMapVectorized).head.value.size)(true)) val mapMeta = OpVectorMetadata(transformed.schema(smartMapVectorized.name)) val meta = OpVectorMetadata(transformed.schema(smartVectorized.name)) @@ -173,9 +173,9 @@ class SmartTextMapVectorizerTest extends FlatSpec with TestSparkContext { val transformed = new OpWorkflow().setResultFeatures(smartMapVectorized, smartVectorized).transform(data) val result = transformed.collect(smartMapVectorized, smartVectorized) val field = transformed.schema(smartVectorized.name) - AttributeTestUtils.assertNominal(field, Array.fill(transformed.collect(smartVectorized).head.value.size)(true)) + assertNominal(field, Array.fill(transformed.collect(smartVectorized).head.value.size)(true)) val fieldMap = transformed.schema(smartMapVectorized.name) - AttributeTestUtils.assertNominal(fieldMap, + assertNominal(fieldMap, Array.fill(transformed.collect(smartMapVectorized).head.value.size)(true)) val mapMeta = OpVectorMetadata(transformed.schema(smartMapVectorized.name)) val meta = OpVectorMetadata(transformed.schema(smartVectorized.name)) @@ -208,9 
+208,9 @@ class SmartTextMapVectorizerTest extends FlatSpec with TestSparkContext { val transformed = new OpWorkflow().setResultFeatures(smartMapVectorized, smartVectorized).transform(data) val result = transformed.collect(smartMapVectorized, smartVectorized) val field = transformed.schema(smartVectorized.name) - AttributeTestUtils.assertNominal(field, Array.fill(transformed.collect(smartVectorized).head.value.size)(true)) + assertNominal(field, Array.fill(transformed.collect(smartVectorized).head.value.size)(true)) val fieldMap = transformed.schema(smartMapVectorized.name) - AttributeTestUtils.assertNominal(fieldMap, + assertNominal(fieldMap, Array.fill(transformed.collect(smartMapVectorized).head.value.size)(true)) val mapMeta = OpVectorMetadata(transformed.schema(smartMapVectorized.name)) val meta = OpVectorMetadata(transformed.schema(smartVectorized.name)) @@ -246,9 +246,9 @@ class SmartTextMapVectorizerTest extends FlatSpec with TestSparkContext { val transformed = new OpWorkflow().setResultFeatures(smartMapVectorized, smartVectorized).transform(data) val result = transformed.collect(smartMapVectorized, smartVectorized) val field = transformed.schema(smartVectorized.name) - AttributeTestUtils.assertNominal(field, Array.fill(transformed.collect(smartVectorized).head.value.size)(true)) + assertNominal(field, Array.fill(transformed.collect(smartVectorized).head.value.size)(true)) val fieldMap = transformed.schema(smartMapVectorized.name) - AttributeTestUtils.assertNominal(fieldMap, + assertNominal(fieldMap, Array.fill(transformed.collect(smartMapVectorized).head.value.size)(true)) val mapMeta = OpVectorMetadata(transformed.schema(smartMapVectorized.name)) val meta = OpVectorMetadata(transformed.schema(smartVectorized.name)) @@ -285,10 +285,10 @@ class SmartTextMapVectorizerTest extends FlatSpec with TestSparkContext { val transformed = new OpWorkflow().setResultFeatures(smartMapVectorized, shortcutMapVectorized).transform(data) val result = transformed.collect(smartMapVectorized, shortcutMapVectorized) val field = transformed.schema(shortcutMapVectorized.name) - AttributeTestUtils.assertNominal(field, + assertNominal(field, Array.fill(transformed.collect(shortcutMapVectorized).head.value.size)(true)) val fieldMap = transformed.schema(smartMapVectorized.name) - AttributeTestUtils.assertNominal(fieldMap, + assertNominal(fieldMap, Array.fill(transformed.collect(smartMapVectorized).head.value.size)(true)) val smartMeta = OpVectorMetadata(transformed.schema(smartMapVectorized.name)) val shortcutMeta = OpVectorMetadata(transformed.schema(shortcutMapVectorized.name)) @@ -319,10 +319,10 @@ class SmartTextMapVectorizerTest extends FlatSpec with TestSparkContext { val transformed = new OpWorkflow().setResultFeatures(textMapVectorized, textAreaMapVectorized).transform(data) val result = transformed.collect(textMapVectorized, textAreaMapVectorized) val field = transformed.schema(textMapVectorized.name) - AttributeTestUtils.assertNominal(field, + assertNominal(field, Array.fill(transformed.collect(textMapVectorized).head.value.size)(true)) val fieldMap = transformed.schema(textAreaMapVectorized.name) - AttributeTestUtils.assertNominal(fieldMap, + assertNominal(fieldMap, Array.fill(transformed.collect(textAreaMapVectorized).head.value.size)(true)) val textMapMeta = OpVectorMetadata(transformed.schema(textMapVectorized.name)) val textareaMapMeta = OpVectorMetadata(transformed.schema(textAreaMapVectorized.name)) diff --git 
a/core/src/test/scala/com/salesforce/op/stages/impl/feature/SmartTextVectorizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/SmartTextVectorizerTest.scala index eaa85d775d..893c4753b9 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/SmartTextVectorizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/SmartTextVectorizerTest.scala @@ -43,7 +43,7 @@ import org.scalatest.junit.JUnitRunner @RunWith(classOf[JUnitRunner]) class SmartTextVectorizerTest - extends OpEstimatorSpec[OPVector, SequenceModel[Text, OPVector], SmartTextVectorizer[Text]] { + extends OpEstimatorSpec[OPVector, SequenceModel[Text, OPVector], SmartTextVectorizer[Text]] with AttributeAsserts { lazy val (inputData, f1, f2) = TestFeatureBuilder("text1", "text2", Seq[(Text, Text)]( @@ -83,12 +83,12 @@ class SmartTextVectorizerTest .setResultFeatures(smartVectorized, categoricalVectorized, textVectorized, nullIndicator).transform(inputData) val result = transformed.collect(smartVectorized, categoricalVectorized, textVectorized, nullIndicator) val field = transformed.schema(smartVectorized.name) - AttributeTestUtils.assertNominal(field, Array.fill(transformed.collect(smartVectorized).head.value.size)(true)) + assertNominal(field, Array.fill(transformed.collect(smartVectorized).head.value.size)(true)) val fieldCategorical = transformed.schema(categoricalVectorized.name) - AttributeTestUtils.assertNominal(fieldCategorical, + assertNominal(fieldCategorical, Array.fill(transformed.collect(categoricalVectorized).head.value.size)(true)) val fieldText = transformed.schema(textVectorized.name) - AttributeTestUtils.assertNominal(fieldText, + assertNominal(fieldText, Array.fill(transformed.collect(textVectorized).head.value.size)(true)) val (smart, expected) = result.map { case (smartVector, categoricalVector, textVector, nullVector) => val combined = VectorsCombiner.combineOP(Seq(categoricalVector, textVector, nullVector)) @@ -109,9 +109,9 @@ class SmartTextVectorizerTest val transformed = new OpWorkflow().setResultFeatures(smartVectorized, categoricalVectorized).transform(inputData) val result = transformed.collect(smartVectorized, categoricalVectorized) val field = transformed.schema(smartVectorized.name) - AttributeTestUtils.assertNominal(field, Array.fill(transformed.collect(smartVectorized).head.value.size)(true)) + assertNominal(field, Array.fill(transformed.collect(smartVectorized).head.value.size)(true)) val fieldCategorical = transformed.schema(categoricalVectorized.name) - AttributeTestUtils.assertNominal(fieldCategorical, + assertNominal(fieldCategorical, Array.fill(transformed.collect(categoricalVectorized).head.value.size)(true)) val (smart, expected) = result.unzip @@ -133,9 +133,9 @@ class SmartTextVectorizerTest .setResultFeatures(smartVectorized, textVectorized, nullIndicator).transform(inputData) val result = transformed.collect(smartVectorized, textVectorized, nullIndicator) val field = transformed.schema(smartVectorized.name) - AttributeTestUtils.assertNominal(field, Array.fill(transformed.collect(smartVectorized).head.value.size)(true)) + assertNominal(field, Array.fill(transformed.collect(smartVectorized).head.value.size)(true)) val fieldText = transformed.schema(textVectorized.name) - AttributeTestUtils.assertNominal(fieldText, + assertNominal(fieldText, Array.fill(transformed.collect(textVectorized).head.value.size)(true)) val (smart, expected) = result.map { case (smartVector, textVector, nullVector) => val combined = 
VectorsCombiner.combineOP(Seq(textVector, nullVector)) @@ -160,9 +160,9 @@ class SmartTextVectorizerTest val transformed = new OpWorkflow().setResultFeatures(smartVectorized, shortcutVectorized).transform(inputData) val result = transformed.collect(smartVectorized, shortcutVectorized) val field = transformed.schema(smartVectorized.name) - AttributeTestUtils.assertNominal(field, Array.fill(transformed.collect(smartVectorized).head.value.size)(true)) + assertNominal(field, Array.fill(transformed.collect(smartVectorized).head.value.size)(true)) val fieldShortcut = transformed.schema(shortcutVectorized.name) - AttributeTestUtils.assertNominal(fieldShortcut, + assertNominal(fieldShortcut, Array.fill(transformed.collect(shortcutVectorized).head.value.size)(true)) val (regular, shortcut) = result.unzip diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/TextListNullTransformerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/TextListNullTransformerTest.scala index a765cd860f..4373e04bf9 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/TextListNullTransformerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/TextListNullTransformerTest.scala @@ -42,7 +42,7 @@ import org.scalatest.{Assertions, FlatSpec, Matchers} @RunWith(classOf[JUnitRunner]) -class TextListNullTransformerTest extends FlatSpec with TestSparkContext { +class TextListNullTransformerTest extends FlatSpec with TestSparkContext with AttributeAsserts { val (ds, f1, f2) = TestFeatureBuilder( Seq[(TextList, TextList)]( @@ -84,7 +84,7 @@ class TextListNullTransformerTest extends FlatSpec with TestSparkContext { Array(1.0, 1.0) ).map(Vectors.dense(_).toOPVector) val field = transformed.schema(vector.name) - AttributeTestUtils.assertNominal(field, Array.fill(expected.head.value.size)(true)) + assertNominal(field, Array.fill(expected.head.value.size)(true)) transformed.collect(vector) shouldBe expected val vectorMetadata = vectorizer.getMetadata() diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/TextMapNullEstimatorTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/TextMapNullEstimatorTest.scala index 732c0da54d..d7f81c2a46 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/TextMapNullEstimatorTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/TextMapNullEstimatorTest.scala @@ -42,7 +42,7 @@ import org.scalatest.{Assertions, FlatSpec, Matchers} @RunWith(classOf[JUnitRunner]) -class TextMapNullEstimatorTest extends FlatSpec with TestSparkContext { +class TextMapNullEstimatorTest extends FlatSpec with TestSparkContext with AttributeAsserts { val (ds, f1) = TestFeatureBuilder( Seq[(TextMap)]( @@ -75,7 +75,7 @@ class TextMapNullEstimatorTest extends FlatSpec with TestSparkContext { ).map(Vectors.dense(_).toOPVector) transformed.collect(vector) shouldBe expected val field = transformed.schema(vector.name) - AttributeTestUtils.assertNominal(field, Array.fill(expected.head.value.size)(true)) + assertNominal(field, Array.fill(expected.head.value.size)(true)) val vectorMetadata = vectorizer.getMetadata() OpVectorMetadata(vectorizer.getOutputFeatureName, vectorMetadata) shouldEqual TestOpVectorMetadataBuilder( vectorizer, diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/TextMapVectorizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/TextMapVectorizerTest.scala index 56b85ed914..6fd001eb8b 100644 --- 
a/core/src/test/scala/com/salesforce/op/stages/impl/feature/TextMapVectorizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/TextMapVectorizerTest.scala @@ -44,7 +44,7 @@ import org.slf4j.LoggerFactory @RunWith(classOf[JUnitRunner]) -class TextMapVectorizerTest extends FlatSpec with TestSparkContext { +class TextMapVectorizerTest extends FlatSpec with TestSparkContext with AttributeAsserts { val log = LoggerFactory.getLogger(classOf[TextMapVectorizerTest]) @@ -92,17 +92,17 @@ class TextMapVectorizerTest extends FlatSpec with TestSparkContext { val vectorMetadata = fitted.getMetadata() OpVectorMetadata(vectorizer.getOutputFeatureName, vectorMetadata) shouldEqual TestOpVectorMetadataBuilder(vectorizer, - top -> List( - IndColWithGroup(Some("D"), "C"), IndColWithGroup(Some("OTHER"), "C"), IndColWithGroup(Some("D"), "A"), - IndColWithGroup(Some("E"), "A"), IndColWithGroup(Some("OTHER"), "A"), - IndColWithGroup(Some("D"), "B"), IndColWithGroup(Some("OTHER"), "B") - ), - bot -> List( - IndColWithGroup(Some("W"), "X"), IndColWithGroup(Some("OTHER"), "X"), IndColWithGroup(Some("V"), "Y"), - IndColWithGroup(Some("OTHER"), "Y"), IndColWithGroup(Some("V"), "Z"), - IndColWithGroup(Some("W"), "Z"), IndColWithGroup(Some("OTHER"), "Z") + top -> List( + IndColWithGroup(Some("D"), "C"), IndColWithGroup(Some("OTHER"), "C"), IndColWithGroup(Some("D"), "A"), + IndColWithGroup(Some("E"), "A"), IndColWithGroup(Some("OTHER"), "A"), + IndColWithGroup(Some("D"), "B"), IndColWithGroup(Some("OTHER"), "B") + ), + bot -> List( + IndColWithGroup(Some("W"), "X"), IndColWithGroup(Some("OTHER"), "X"), IndColWithGroup(Some("V"), "Y"), + IndColWithGroup(Some("OTHER"), "Y"), IndColWithGroup(Some("V"), "Z"), + IndColWithGroup(Some("W"), "Z"), IndColWithGroup(Some("OTHER"), "Z") + ) ) - ) fitted.getInputFeatures() shouldBe Array(top, bot) fitted.parent shouldBe vectorizer } @@ -119,7 +119,7 @@ class TextMapVectorizerTest extends FlatSpec with TestSparkContext { Vectors.sparse(14, Array(0, 2, 11), Array(1.0, 1.0, 1.0)) ).map(_.toOPVector) val field = transformed.schema(vector.name) - AttributeTestUtils.assertNominal(field, Array.fill(expected.head.value.size)(true)) + assertNominal(field, Array.fill(expected.head.value.size)(true)) transformed.collect(vector) shouldBe expected fitted.getMetadata() shouldBe transformed.schema.fields(2).metadata } @@ -136,7 +136,7 @@ class TextMapVectorizerTest extends FlatSpec with TestSparkContext { Vectors.sparse(20, Array(0, 3, 9, 12, 15, 16), Array(1.0, 1.0, 1.0, 1.0, 1.0, 1.0)) ).map(_.toOPVector) val field = transformed.schema(vector.name) - AttributeTestUtils.assertNominal(field, Array.fill(expected.head.value.size)(true)) + assertNominal(field, Array.fill(expected.head.value.size)(true)) transformed.collect(vector) shouldBe expected fitted.getMetadata() shouldBe transformed.schema.fields(2).metadata } @@ -153,22 +153,22 @@ class TextMapVectorizerTest extends FlatSpec with TestSparkContext { Vectors.sparse(17, Array(1, 3, 14), Array(1.0, 1.0, 1.0)) ).map(_.toOPVector) val field = transformed.schema(vector.name) - AttributeTestUtils.assertNominal(field, Array.fill(expected.head.value.size)(true)) + assertNominal(field, Array.fill(expected.head.value.size)(true)) transformed.collect(vector) shouldBe expected OpVectorMetadata(vectorizer.getOutputFeatureName, vectorMetadata) shouldEqual TestOpVectorMetadataBuilder(vectorizer, - top -> List( - IndColWithGroup(Some("D"), "c"), IndColWithGroup(Some("d"), "c"), IndColWithGroup(Some("OTHER"), "c"), - 
IndColWithGroup(Some("d"), "a"), IndColWithGroup(Some("e"), "a"), - IndColWithGroup(Some("OTHER"), "a"), IndColWithGroup(Some("d"), "b"), IndColWithGroup(Some("OTHER"), "b") - ), - bot -> List( - IndColWithGroup(Some("W"), "x"), IndColWithGroup(Some("w"), "x"), IndColWithGroup(Some("OTHER"), "x"), - IndColWithGroup(Some("V"), "y"), IndColWithGroup(Some("v"), "y"), - IndColWithGroup(Some("OTHER"), "y"), IndColWithGroup(Some("v"), "z"), IndColWithGroup(Some("w"), "z"), - IndColWithGroup(Some("OTHER"), "z") + top -> List( + IndColWithGroup(Some("D"), "c"), IndColWithGroup(Some("d"), "c"), IndColWithGroup(Some("OTHER"), "c"), + IndColWithGroup(Some("d"), "a"), IndColWithGroup(Some("e"), "a"), + IndColWithGroup(Some("OTHER"), "a"), IndColWithGroup(Some("d"), "b"), IndColWithGroup(Some("OTHER"), "b") + ), + bot -> List( + IndColWithGroup(Some("W"), "x"), IndColWithGroup(Some("w"), "x"), IndColWithGroup(Some("OTHER"), "x"), + IndColWithGroup(Some("V"), "y"), IndColWithGroup(Some("v"), "y"), + IndColWithGroup(Some("OTHER"), "y"), IndColWithGroup(Some("v"), "z"), IndColWithGroup(Some("w"), "z"), + IndColWithGroup(Some("OTHER"), "z") + ) ) - ) } it should "track nulls when clean text is set to false" in { @@ -183,24 +183,24 @@ class TextMapVectorizerTest extends FlatSpec with TestSparkContext { Vectors.sparse(23, Array(1, 4, 10, 14, 18, 19), Array(1.0, 1.0, 1.0, 1.0, 1.0, 1.0)) ).map(_.toOPVector) val field = transformed.schema(vector.name) - AttributeTestUtils.assertNominal(field, Array.fill(expected.head.value.size)(true)) + assertNominal(field, Array.fill(expected.head.value.size)(true)) transformed.collect(vector) shouldBe expected OpVectorMetadata(vectorizer.getOutputFeatureName, vectorMetadata) shouldEqual TestOpVectorMetadataBuilder(vectorizer, - top -> List( - IndColWithGroup(Some("D"), "c"), IndColWithGroup(Some("d"), "c"), IndColWithGroup(Some("OTHER"), "c"), - IndColWithGroup(nullIndicatorValue, "c"), IndColWithGroup(Some("d"), "a"), IndColWithGroup(Some("e"), "a"), - IndColWithGroup(Some("OTHER"), "a"), IndColWithGroup(nullIndicatorValue, "a"), - IndColWithGroup(Some("d"), "b"), IndColWithGroup(Some("OTHER"), "b"), IndColWithGroup(nullIndicatorValue, "b") - ), - bot -> List( - IndColWithGroup(Some("W"), "x"), IndColWithGroup(Some("w"), "x"), IndColWithGroup(Some("OTHER"), "x"), - IndColWithGroup(nullIndicatorValue, "x"), IndColWithGroup(Some("V"), "y"), IndColWithGroup(Some("v"), "y"), - IndColWithGroup(Some("OTHER"), "y"), IndColWithGroup(nullIndicatorValue, "y"), - IndColWithGroup(Some("v"), "z"), IndColWithGroup(Some("w"), "z"), - IndColWithGroup(Some("OTHER"), "z"), IndColWithGroup(nullIndicatorValue, "z") + top -> List( + IndColWithGroup(Some("D"), "c"), IndColWithGroup(Some("d"), "c"), IndColWithGroup(Some("OTHER"), "c"), + IndColWithGroup(nullIndicatorValue, "c"), IndColWithGroup(Some("d"), "a"), IndColWithGroup(Some("e"), "a"), + IndColWithGroup(Some("OTHER"), "a"), IndColWithGroup(nullIndicatorValue, "a"), + IndColWithGroup(Some("d"), "b"), IndColWithGroup(Some("OTHER"), "b"), IndColWithGroup(nullIndicatorValue, "b") + ), + bot -> List( + IndColWithGroup(Some("W"), "x"), IndColWithGroup(Some("w"), "x"), IndColWithGroup(Some("OTHER"), "x"), + IndColWithGroup(nullIndicatorValue, "x"), IndColWithGroup(Some("V"), "y"), IndColWithGroup(Some("v"), "y"), + IndColWithGroup(Some("OTHER"), "y"), IndColWithGroup(nullIndicatorValue, "y"), + IndColWithGroup(Some("v"), "z"), IndColWithGroup(Some("w"), "z"), + IndColWithGroup(Some("OTHER"), "z"), IndColWithGroup(nullIndicatorValue, "z") + ) ) 
- ) } it should "return only the specified number of elements when top K is set" in { @@ -215,7 +215,7 @@ class TextMapVectorizerTest extends FlatSpec with TestSparkContext { Vectors.sparse(12, Array(0, 2, 10), Array(1.0, 1.0, 1.0)) ).map(_.toOPVector) val field = transformed.schema(vector.name) - AttributeTestUtils.assertNominal(field, Array.fill(expected.head.value.size)(true)) + assertNominal(field, Array.fill(expected.head.value.size)(true)) transformed.collect(vector) shouldBe expected } @@ -231,7 +231,7 @@ class TextMapVectorizerTest extends FlatSpec with TestSparkContext { Vectors.sparse(18, Array(0, 3, 8, 11, 14, 15), Array(1.0, 1.0, 1.0, 1.0, 1.0, 1.0)) ).map(_.toOPVector) val field = transformed.schema(vector.name) - AttributeTestUtils.assertNominal(field, Array.fill(expected.head.value.size)(true)) + assertNominal(field, Array.fill(expected.head.value.size)(true)) transformed.collect(vector) shouldBe expected } @@ -245,7 +245,7 @@ class TextMapVectorizerTest extends FlatSpec with TestSparkContext { Vectors.sparse(10, Array(0, 2, 9), Array(1.0, 1.0, 1.0)) ).map(_.toOPVector) val field = transformed.schema(vector.name) - AttributeTestUtils.assertNominal(field, Array.fill(expected.head.value.size)(true)) + assertNominal(field, Array.fill(expected.head.value.size)(true)) transformed.collect(vector) shouldBe expected } @@ -259,7 +259,7 @@ class TextMapVectorizerTest extends FlatSpec with TestSparkContext { Vectors.sparse(16, Array(0, 3, 7, 10, 13, 14), Array(1.0, 1.0, 1.0, 1.0, 1.0, 1.0)) ).map(_.toOPVector) val field = transformed.schema(vector.name) - AttributeTestUtils.assertNominal(field, Array.fill(expected.head.value.size)(true)) + assertNominal(field, Array.fill(expected.head.value.size)(true)) transformed.collect(vector) shouldBe expected } @@ -274,7 +274,7 @@ class TextMapVectorizerTest extends FlatSpec with TestSparkContext { Vectors.dense(0.0, 0.0, 0.0, 0.0, 0.0) ).map(_.toOPVector) val field = transformed.schema(vector.name) - AttributeTestUtils.assertNominal(field, Array.fill(expected.head.value.size)(true)) + assertNominal(field, Array.fill(expected.head.value.size)(true)) transformed.collect(fitted.getOutput()) shouldBe expected val transformed2 = fitted.transform(dataSet) @@ -285,7 +285,7 @@ class TextMapVectorizerTest extends FlatSpec with TestSparkContext { Vectors.dense(1.0, 0.0, 0.0, 0.0, 0.0) ).map(_.toOPVector) val field2 = transformed2.schema(vector.name) - AttributeTestUtils.assertNominal(field2, Array.fill(expected.head.value.size)(true)) + assertNominal(field2, Array.fill(expected.head.value.size)(true)) transformed2.collect(fitted.getOutput()) shouldBe expected2 } @@ -300,7 +300,7 @@ class TextMapVectorizerTest extends FlatSpec with TestSparkContext { Vectors.dense(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0) ).map(_.toOPVector) val field = transformed.schema(vector.name) - AttributeTestUtils.assertNominal(field, Array.fill(expected.head.value.size)(true)) + assertNominal(field, Array.fill(expected.head.value.size)(true)) transformed.collect(fitted.getOutput()) shouldBe expected val transformed2 = fitted.transform(dataSet) @@ -311,7 +311,7 @@ class TextMapVectorizerTest extends FlatSpec with TestSparkContext { Vectors.dense(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0) ).map(_.toOPVector) val field2 = transformed2.schema(vector.name) - AttributeTestUtils.assertNominal(field2, Array.fill(expected.head.value.size)(true)) + assertNominal(field2, Array.fill(expected.head.value.size)(true)) transformed2.collect(fitted.getOutput()) shouldBe expected2 } @@ -325,7 +325,7 @@ class 
TextMapVectorizerTest extends FlatSpec with TestSparkContext { Vectors.dense(Array.empty[Double]) ).map(_.toOPVector) val field = transformed.schema(vector.name) - AttributeTestUtils.assertNominal(field, Array.fill(expected.head.value.size)(true)) + assertNominal(field, Array.fill(expected.head.value.size)(true)) transformed.collect(fitted.getOutput()) shouldBe expected } @@ -341,7 +341,7 @@ class TextMapVectorizerTest extends FlatSpec with TestSparkContext { Vectors.sparse(5, Array(0), Array(1.0)) ).map(_.toOPVector) val field = transformed.schema(vector.name) - AttributeTestUtils.assertNominal(field, Array.fill(expected.head.value.size)(true)) + assertNominal(field, Array.fill(expected.head.value.size)(true)) transformed.collect(fitted.getOutput()) shouldBe expected } @@ -357,7 +357,7 @@ class TextMapVectorizerTest extends FlatSpec with TestSparkContext { Vectors.sparse(7, Array(0, 6), Array(1.0, 1.0)) ).map(_.toOPVector) val field = transformed.schema(vector.name) - AttributeTestUtils.assertNominal(field, Array.fill(expected.head.value.size)(true)) + assertNominal(field, Array.fill(expected.head.value.size)(true)) transformed.collect(fitted.getOutput()) shouldBe expected } @@ -374,7 +374,7 @@ class TextMapVectorizerTest extends FlatSpec with TestSparkContext { Vectors.sparse(9, Array(0, 4), Array(1.0, 1.0)) ).map(_.toOPVector) val field = transformed.schema(vector.name) - AttributeTestUtils.assertNominal(field, Array.fill(expected.head.value.size)(true)) + assertNominal(field, Array.fill(expected.head.value.size)(true)) transformed.collect(fitted.getOutput()) shouldBe expected } @@ -391,7 +391,7 @@ class TextMapVectorizerTest extends FlatSpec with TestSparkContext { Vectors.sparse(13, Array(0, 5, 6, 12), Array(1.0, 1.0, 1.0, 1.0)) ).map(_.toOPVector) val field = transformed.schema(vector.name) - AttributeTestUtils.assertNominal(field, Array.fill(expected.head.value.size)(true)) + assertNominal(field, Array.fill(expected.head.value.size)(true)) transformed.collect(fitted.getOutput()) shouldBe expected } } diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/TextTransmogrifyTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/TextTransmogrifyTest.scala index d82cdb4333..7e7cc8643b 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/TextTransmogrifyTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/TextTransmogrifyTest.scala @@ -43,7 +43,7 @@ import org.scalatest.junit.JUnitRunner @RunWith(classOf[JUnitRunner]) -class TextTransmogrifyTest extends FlatSpec with PassengerSparkFixtureTest { +class TextTransmogrifyTest extends FlatSpec with PassengerSparkFixtureTest with AttributeAsserts { val cityData: Seq[City] = RandomText.cities.take(10).toList val countryData: Seq[Country] = RandomText.countries.take(10).toList @@ -55,7 +55,7 @@ class TextTransmogrifyTest extends FlatSpec with PassengerSparkFixtureTest { val data: Seq[(City, Country, PostalCode, Text, TextArea)] = cityData.zip(countryData).zip(postalData).zip(textData).zip(textAreaData) - .map{ case ((((ci, co), p), t), ta) => (ci, co, p, t, ta) } + .map { case ((((ci, co), p), t), ta) => (ci, co, p, t, ta) } val (ds, city, country, postal, text, textarea) = TestFeatureBuilder("city", "country", "postal", "text", "textarea", data) @@ -97,7 +97,7 @@ class TextTransmogrifyTest extends FlatSpec with PassengerSparkFixtureTest { val vectorized = new OpWorkflow().setResultFeatures(feature).transform(largeDS) val vectCollect = vectorized.collect(feature) val field = 
vectorized.schema(feature.name) - AttributeTestUtils.assertNominal(field, Array.fill(vectCollect.head.value.size)(true)) + assertNominal(field, Array.fill(vectCollect.head.value.size)(true)) for {vector <- vectCollect} { vector.v.size shouldBe TransmogrifierDefaults.DefaultNumOfFeatures * 2 + 2 } @@ -111,9 +111,9 @@ class TextTransmogrifyTest extends FlatSpec with PassengerSparkFixtureTest { val vectorized = new OpWorkflow().setResultFeatures(feature, feature2).transform(ds) val vectCollect = vectorized.collect(feature, feature2) val field = vectorized.schema(feature.name) - AttributeTestUtils.assertNominal(field, Array.fill(vectCollect.head._1.value.size)(true)) + assertNominal(field, Array.fill(vectCollect.head._1.value.size)(true)) val field2 = vectorized.schema(feature2.name) - AttributeTestUtils.assertNominal(field2, Array.fill(vectCollect.head._2.value.size)(true)) + assertNominal(field2, Array.fill(vectCollect.head._2.value.size)(true)) for {(vector1, vector2) <- vectCollect} { vector1.v.size shouldBe 2 vector1.v.toArray should contain theSameElementsAs vector2.v.toArray @@ -127,7 +127,7 @@ class TextTransmogrifyTest extends FlatSpec with PassengerSparkFixtureTest { val vectorized = new OpWorkflow().setResultFeatures(feature).transform(ds) val vectCollect = vectorized.collect(feature) val field = vectorized.schema(feature.name) - AttributeTestUtils.assertNominal(field, Array.fill(vectCollect.head.value.size)(true)) + assertNominal(field, Array.fill(vectCollect.head.value.size)(true)) vectCollect.forall(_.value.size == TransmogrifierDefaults.DefaultNumOfFeatures + 1) } diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/TextVectorizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/TextVectorizerTest.scala index fb17f6afc7..0dae369c72 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/TextVectorizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/TextVectorizerTest.scala @@ -40,7 +40,7 @@ import org.scalatest.junit.JUnitRunner @RunWith(classOf[JUnitRunner]) -class TextVectorizerTest extends FlatSpec with TestSparkContext { +class TextVectorizerTest extends FlatSpec with TestSparkContext with AttributeAsserts { // scalastyle:off lazy val (data, f1, f2) = TestFeatureBuilder( Seq[(Text, Text)]( @@ -64,7 +64,7 @@ class TextVectorizerTest extends FlatSpec with TestSparkContext { val result = transformed.collect(vectorized) val f1NameHash = hasher.indexOf(vectorized.parents.head.originStage.getInputFeatures().head.name) val field = transformed.schema(vectorized.name) - AttributeTestUtils.assertNominal(field, Array.fill(result.head.value.size)(true)) + assertNominal(field, Array.fill(result.head.value.size)(true)) // scalastyle:off result(0).value(hasher.indexOf(s"${f1NameHash}_" + "hamlet")) should be >= 1.0 result(0).value(hasher.indexOf(s"${f1NameHash}_" + "question")) should be >= 1.0 @@ -88,7 +88,7 @@ class TextVectorizerTest extends FlatSpec with TestSparkContext { val result = transformed.collect(vectorized) val f1NameHash = hasher.indexOf(vectorized.parents.head.originStage.getInputFeatures().head.name) val field = transformed.schema(vectorized.name) - AttributeTestUtils.assertNominal(field, Array.fill(result.head.value.size)(true)) + assertNominal(field, Array.fill(result.head.value.size)(true)) // scalastyle:off result(0).value(hasher.indexOf(s"${f1NameHash}_" + "hamlet")) shouldBe 1.0 result(0).value(hasher.indexOf(s"${f1NameHash}_" + "hamlet")) shouldBe 1.0 diff --git 
a/core/src/test/scala/com/salesforce/op/stages/impl/feature/TransmogrifyTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/TransmogrifyTest.scala index d353d43bab..eec59642d4 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/TransmogrifyTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/TransmogrifyTest.scala @@ -42,7 +42,7 @@ import org.scalatest.FlatSpec import org.scalatest.junit.JUnitRunner @RunWith(classOf[JUnitRunner]) -class TransmogrifyTest extends FlatSpec with PassengerSparkFixtureTest { +class TransmogrifyTest extends FlatSpec with PassengerSparkFixtureTest with AttributeAsserts { val inputFeatures = Array[OPFeature](heightNoWindow, weight, gender) @@ -64,7 +64,7 @@ class TransmogrifyTest extends FlatSpec with PassengerSparkFixtureTest { val feature = inputFeatures.toSeq.transmogrify() val model = new OpWorkflow().setResultFeatures(feature).setReader(dataReader).train() val transformed = model.score(keepRawFeatures = true, keepIntermediateFeatures = true) - val hist = feature.parents.flatMap{ f => + val hist = feature.parents.flatMap { f => val h = f.history() h.originFeatures.map(o => o -> FeatureHistory(Seq(o), h.stages)) }.toMap @@ -92,7 +92,7 @@ class TransmogrifyTest extends FlatSpec with PassengerSparkFixtureTest { List(1.0, 0.0, 186.0, 0.0, 96.0, 0.0) ) val field = transformed.schema(feature.name) - AttributeTestUtils.assertNominal(field, Array(true, true, false, true, false, true)) + assertNominal(field, Array(true, true, false, true, false, true)) } } diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/URLVectorizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/URLVectorizerTest.scala index 3d247dbcd1..3fb4ef616b 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/URLVectorizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/URLVectorizerTest.scala @@ -45,7 +45,8 @@ import org.scalatest.junit.JUnitRunner @RunWith(classOf[JUnitRunner]) class URLVectorizerTest - extends FlatSpec with FeatureTestBase with RichTextFeature with RichMapFeature with RichFeature { + extends FlatSpec with FeatureTestBase with RichTextFeature with RichMapFeature with RichFeature + with AttributeAsserts { val urlKey = "Url1" val urlKey2 = "Url2" val urls = (RandomText.urlsOn(_ => "salesforce.com").take(2) ++ RandomText.urlsOn(_ => "data.com").take(2)).toSeq @@ -83,7 +84,7 @@ class URLVectorizerTest val transformed = new OpWorkflow().setResultFeatures(feature).transform(ds) val results = transformed.collect(feature) val field = transformed.schema(feature.name) - AttributeTestUtils.assertNominal(field, Array.fill(results.head.value.size)(true)) + assertNominal(field, Array.fill(results.head.value.size)(true)) results } From 186d81f841d442d07fc05108ee49f1cffde61422 Mon Sep 17 00:00:00 2001 From: mweilsalesforce Date: Fri, 7 Sep 2018 16:11:37 -0700 Subject: [PATCH 12/15] import com.salesforce.op.features.types._ --- .../com/salesforce/op/utils/spark/OpVectorMetadata.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/features/src/main/scala/com/salesforce/op/utils/spark/OpVectorMetadata.scala b/features/src/main/scala/com/salesforce/op/utils/spark/OpVectorMetadata.scala index b4b32ac5e9..044f3d7794 100644 --- a/features/src/main/scala/com/salesforce/op/utils/spark/OpVectorMetadata.scala +++ b/features/src/main/scala/com/salesforce/op/utils/spark/OpVectorMetadata.scala @@ -31,8 +31,8 @@ package com.salesforce.op.utils.spark 
 import com.salesforce.op.FeatureHistory
-import com.salesforce.op.features.types.{Binary, BinaryMap, MultiPickList, MultiPickListMap, Text, TextArea, TextAreaMap, TextList, TextMap}
-import org.apache.spark.ml.attribute.{AttributeGroup, BinaryAttribute, NominalAttribute, NumericAttribute}
+import com.salesforce.op.features.types._
+import org.apache.spark.ml.attribute.{AttributeGroup, BinaryAttribute, NumericAttribute}
 import org.apache.spark.ml.linalg.SQLDataTypes._
 import org.apache.spark.sql.types.{Metadata, MetadataBuilder, StructField}

From 78edecaed67ddabe964aab041270ac77c71208a6 Mon Sep 17 00:00:00 2001
From: mweilsalesforce
Date: Sun, 9 Sep 2018 16:17:50 -0700
Subject: [PATCH 13/15] FeatureType.typeName

---
 .../salesforce/op/utils/spark/OpVectorMetadata.scala | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/features/src/main/scala/com/salesforce/op/utils/spark/OpVectorMetadata.scala b/features/src/main/scala/com/salesforce/op/utils/spark/OpVectorMetadata.scala
index 044f3d7794..ce98dce844 100644
--- a/features/src/main/scala/com/salesforce/op/utils/spark/OpVectorMetadata.scala
+++ b/features/src/main/scala/com/salesforce/op/utils/spark/OpVectorMetadata.scala
@@ -31,7 +31,7 @@ package com.salesforce.op.utils.spark
 import com.salesforce.op.FeatureHistory
-import com.salesforce.op.features.types._
+import com.salesforce.op.features.types.{FeatureType, _}
 import org.apache.spark.ml.attribute.{AttributeGroup, BinaryAttribute, NumericAttribute}
 import org.apache.spark.ml.linalg.SQLDataTypes._
 import org.apache.spark.sql.types.{Metadata, MetadataBuilder, StructField}
@@ -75,8 +75,10 @@ class OpVectorMetadata private
     newColumns: Array[OpVectorColumnMetadata]
   ): OpVectorMetadata = OpVectorMetadata(name, newColumns, history)
-  val textTypes = Seq(MultiPickList, MultiPickListMap, Text, TextArea, TextAreaMap, TextMap, Binary, BinaryMap,
-    TextList).map(_.getClass.getName.dropRight(1))
+  val categoricalTypes = Seq(FeatureType.typeName[MultiPickList], FeatureType.typeName[MultiPickListMap],
+    FeatureType.typeName[Text], FeatureType.typeName[TextArea], FeatureType.typeName[TextAreaMap],
+    FeatureType.typeName[TextMap], FeatureType.typeName[Binary], FeatureType.typeName[BinaryMap],
+    FeatureType.typeName[TextList])
   /**
    * Serialize to spark metadata
@@ -96,7 +98,7 @@ class OpVectorMetadata private
       .putMetadata(OpVectorMetadata.HistoryKey, FeatureHistory.toMetadata(history))
       .build()
     val attributes = columns.map { c =>
-      if (c.indicatorValue.isDefined || textTypes.exists(c.parentFeatureType.contains)) {
+      if (c.indicatorValue.isDefined || categoricalTypes.exists(c.parentFeatureType.contains)) {
         BinaryAttribute.defaultAttr.withName(c.makeColName()).withIndex(c.index)
       } else {
         NumericAttribute.defaultAttr.withName(c.makeColName()).withIndex(c.index)

From 34f516f9702842d33d382a3f0d6e25584527a230 Mon Sep 17 00:00:00 2001
From: mweilsalesforce
Date: Sun, 9 Sep 2018 16:57:53 -0700
Subject: [PATCH 14/15] Avoiding `.get`

---
 .../salesforce/op/stages/impl/feature/AttributeAsserts.scala | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/AttributeAsserts.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/AttributeAsserts.scala
index 19c1265f0c..0560760731 100644
--- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/AttributeAsserts.scala
+++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/AttributeAsserts.scala
@@ -38,11 +38,12 @@ trait AttributeAsserts { self:
Matchers => /** * Assert if attributes are nominal or not + * * @param schema * @param expectedNominal Expected array of booleans. True if the field is nominal, false if not. */ final def assertNominal(schema: StructField, expectedNominal: Array[Boolean]): Assertion = { - val attributes = AttributeGroup.fromStructField(schema).attributes.get - attributes.map(_.isNominal) shouldBe expectedNominal + val attributes = AttributeGroup.fromStructField(schema).attributes + attributes.map(_.map(_.isNominal)) shouldBe Some(expectedNominal) } } From 23e805d79405e53e6c2194f05147ccf78e063911 Mon Sep 17 00:00:00 2001 From: mweilsalesforce Date: Mon, 10 Sep 2018 16:33:41 -0700 Subject: [PATCH 15/15] Addressing remaining PR comments --- .../impl/feature/AttributeAsserts.scala | 2 +- .../OPCollectionHashingVectorizerTest.scala | 18 +++++----- .../impl/feature/OPMapVectorizerTest.scala | 14 ++++---- .../feature/SmartTextMapVectorizerTest.scala | 34 ++++++++----------- .../feature/SmartTextVectorizerTest.scala | 14 ++++---- .../impl/feature/TextTransmogrifyTest.scala | 3 +- .../impl/feature/TextVectorizerTest.scala | 4 +-- .../op/utils/spark/OpVectorMetadata.scala | 17 +++------- 8 files changed, 47 insertions(+), 59 deletions(-) diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/AttributeAsserts.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/AttributeAsserts.scala index 0560760731..cc184a2ee5 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/AttributeAsserts.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/AttributeAsserts.scala @@ -44,6 +44,6 @@ trait AttributeAsserts { */ final def assertNominal(schema: StructField, expectedNominal: Array[Boolean]): Assertion = { val attributes = AttributeGroup.fromStructField(schema).attributes - attributes.map(_.map(_.isNominal)) shouldBe Some(expectedNominal) + attributes.map(_.map(_.isNominal).toSeq) shouldBe Some(expectedNominal.toSeq) } } diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/OPCollectionHashingVectorizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/OPCollectionHashingVectorizerTest.scala index d1f5326de5..53a9de18cc 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/OPCollectionHashingVectorizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/OPCollectionHashingVectorizerTest.scala @@ -105,7 +105,7 @@ class OPCollectionHashingVectorizerTest extends FlatSpec with TestSparkContext w val transformed = vectorizer.transform(catData) val result = transformed.collect(vector) val field = transformed.schema(vector.name) - assertNominal(field, Array.fill(transformed.collect(vector).head.value.size)(true)) + assertNominal(field, Array.fill(transformed.collect(vector).head.value.size)(false)) vectorizer.isSharedHashSpace shouldBe false @@ -142,7 +142,7 @@ class OPCollectionHashingVectorizerTest extends FlatSpec with TestSparkContext w val transformed = vectorizer.transform(catData) val result = transformed.collect(vector) val field = transformed.schema(vector.name) - assertNominal(field, Array.fill(transformed.collect(vector).head.value.size)(true)) + assertNominal(field, Array.fill(transformed.collect(vector).head.value.size)(false)) vectorizer.isSharedHashSpace shouldBe false @@ -181,7 +181,7 @@ class OPCollectionHashingVectorizerTest extends FlatSpec with TestSparkContext w val transformed = vectorizer.transform(textListData) val result = transformed.collect(vector) val field = 
transformed.schema(vector.name) - assertNominal(field, Array.fill(transformed.collect(vector).head.value.size)(true)) + assertNominal(field, Array.fill(transformed.collect(vector).head.value.size)(false)) vectorizer.isSharedHashSpace shouldBe false @@ -240,7 +240,7 @@ class OPCollectionHashingVectorizerTest extends FlatSpec with TestSparkContext w val transformed = vectorizer.transform(catData) val result = transformed.collect(vector) val field = transformed.schema(vector.name) - assertNominal(field, Array.fill(transformed.collect(vector).head.value.size)(true)) + assertNominal(field, Array.fill(transformed.collect(vector).head.value.size)(false)) vectorizer.isSharedHashSpace shouldBe true @@ -275,7 +275,7 @@ class OPCollectionHashingVectorizerTest extends FlatSpec with TestSparkContext w val transformed = vectorizer.transform(textListData) val result = transformed.collect(vector) val field = transformed.schema(vector.name) - assertNominal(field, Array.fill(transformed.collect(vector).head.value.size)(true)) + assertNominal(field, Array.fill(transformed.collect(vector).head.value.size)(false)) vectorizer.isSharedHashSpace shouldBe true @@ -356,7 +356,7 @@ class OPCollectionHashingVectorizerTest extends FlatSpec with TestSparkContext w val transformed = vectorizer.transform(textListData) val result = transformed.collect(vector) val field = transformed.schema(vector.name) - assertNominal(field, Array.fill(transformed.collect(vector).head.value.size)(true)) + assertNominal(field, Array.fill(transformed.collect(vector).head.value.size)(false)) vectorizer.isSharedHashSpace shouldBe false @@ -388,7 +388,7 @@ class OPCollectionHashingVectorizerTest extends FlatSpec with TestSparkContext w val transformed = vectorizer.transform(textListData) val result = transformed.collect(vector) val field = transformed.schema(vector.name) - assertNominal(field, Array.fill(transformed.collect(vector).head.value.size)(true)) + assertNominal(field, Array.fill(transformed.collect(vector).head.value.size)(false)) vectorizer.isSharedHashSpace shouldBe true @@ -418,7 +418,7 @@ class OPCollectionHashingVectorizerTest extends FlatSpec with TestSparkContext w val transformed = vectorizer.transform(catData) val vector = vectorizer.getOutput() val field = transformed.schema(vector.name) - assertNominal(field, Array.fill(transformed.collect(vector).head.value.size)(true)) + assertNominal(field, Array.fill(transformed.collect(vector).head.value.size)(false)) val meta = OpVectorMetadata(transformed.schema(feature.name)) meta.history.keys shouldBe Set(top.name, bot.name) meta.columns.length shouldBe 20 @@ -435,7 +435,7 @@ class OPCollectionHashingVectorizerTest extends FlatSpec with TestSparkContext w val transformed = vectorizer.transform(catData) val vector = vectorizer.getOutput() val field = transformed.schema(vector.name) - assertNominal(field, Array.fill(transformed.collect(vector).head.value.size)(true)) + assertNominal(field, Array.fill(transformed.collect(vector).head.value.size)(false)) val meta = OpVectorMetadata(transformed.schema(feature.name)) meta.history.keys shouldBe Set(top.name, bot.name) meta.columns.length shouldBe 10 diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/OPMapVectorizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/OPMapVectorizerTest.scala index 7e5790f724..13a9bffe4d 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/OPMapVectorizerTest.scala +++ 
b/core/src/test/scala/com/salesforce/op/stages/impl/feature/OPMapVectorizerTest.scala @@ -242,7 +242,7 @@ class OPMapVectorizerTest extends FlatSpec with TestSparkContext with AttributeA val textAreaData3: Seq[TextArea] = RandomText.textAreas(minLen = 5, maxLen = 10) .withProbabilityOfEmpty(0.5).limit(1000) - testFeatureToMap[TextArea, TextAreaMap, String](textAreaData, textAreaData2, textAreaData3) + testFeatureToMap[TextArea, TextAreaMap, String](textAreaData, textAreaData2, textAreaData3, false) } "Text features" should "be vectorized the same whether they're in maps or not" in { @@ -250,7 +250,7 @@ class OPMapVectorizerTest extends FlatSpec with TestSparkContext with AttributeA val textData2: Seq[Text] = RandomText.strings(minLen = 5, maxLen = 10).withProbabilityOfEmpty(0.5).limit(1000) val textData3: Seq[Text] = RandomText.strings(minLen = 5, maxLen = 10).withProbabilityOfEmpty(0.5).limit(1000) - testFeatureToMap[Text, TextMap, String](textData, textData2, textData3) + testFeatureToMap[Text, TextMap, String](textData, textData2, textData3, false) } "URL features" should "be vectorized the same whether they're in maps or not" in { @@ -334,10 +334,10 @@ object OPMapVectorizerTestHelper extends Matchers with AttributeAsserts { * corresponds to its own key in the OPMap feature. This is used to test whether base feature types are vectorized * the same as their corresponding map types. * - * @param f1Data Sequence of base feature type data (eg. from generators) - * @param f2Data Sequence of base feature type data (eg. from generators) - * @param f3Data Sequence of base feature type data (eg. from generators) - * @param isCategorical If the vector contains categoricals + * @param f1Data Sequence of base feature type data (eg. from generators) + * @param f2Data Sequence of base feature type data (eg. from generators) + * @param f3Data Sequence of base feature type data (eg. from generators) + * @param isCategorical If the vector contains categoricals * @tparam F Base feature type (eg. ID, Text, Integer) * @tparam FM OPMap feature type (eg. IDMap, TextMap, IntegerMap) * @tparam MT Value type of map inside OPMap feature (eg. 
String, String, Int) @@ -371,6 +371,8 @@ object OPMapVectorizerTestHelper extends Matchers with AttributeAsserts { rawF1 match { case f if f.isSubtypeOf[Date] => Array.fill(24)(false) ++ Array.fill(3)(Seq(false, true)).flatten .asInstanceOf[Array[Boolean]] + case f if f.isSubtypeOf[TextArea] || f.isSubtypeOf[Text] => Array.fill( + transformed.collect(featureVector).head.value.size - 3)(false) ++ Array.fill(3)(true) case f if f.isSubtypeOf[Geolocation] => Array.fill(transformed.collect(featureVector).head.value.size / 4)( Seq(false, false, false, true)).flatten case _ => Array.fill(transformed.collect(featureVector).head.value.size / 2)(Seq(false, true)).flatten diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizerTest.scala index 6f6bd20890..618c1d9921 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizerTest.scala @@ -106,10 +106,9 @@ class SmartTextMapVectorizerTest extends FlatSpec with TestSparkContext with Att val transformed = new OpWorkflow().setResultFeatures(smartMapVectorized, smartVectorized).transform(data) val result = transformed.collect(smartMapVectorized, smartVectorized) val field = transformed.schema(smartVectorized.name) - assertNominal(field, Array.fill(transformed.collect(smartVectorized).head.value.size)(true)) + assertNominal(field, Array.fill(4)(true) ++ Array.fill(4)(false) :+ true) val fieldMap = transformed.schema(smartMapVectorized.name) - assertNominal(fieldMap, - Array.fill(transformed.collect(smartMapVectorized).head.value.size)(true)) + assertNominal(fieldMap, Array.fill(4)(true) ++ Array.fill(4)(false) :+ true) val mapMeta = OpVectorMetadata(transformed.schema(smartMapVectorized.name)) val meta = OpVectorMetadata(transformed.schema(smartVectorized.name)) mapMeta.history.keys shouldBe Set(m1.name, m2.name) @@ -173,10 +172,9 @@ class SmartTextMapVectorizerTest extends FlatSpec with TestSparkContext with Att val transformed = new OpWorkflow().setResultFeatures(smartMapVectorized, smartVectorized).transform(data) val result = transformed.collect(smartMapVectorized, smartVectorized) val field = transformed.schema(smartVectorized.name) - assertNominal(field, Array.fill(transformed.collect(smartVectorized).head.value.size)(true)) + assertNominal(field, Array.fill(8)(false) ++ Array(true, true)) val fieldMap = transformed.schema(smartMapVectorized.name) - assertNominal(fieldMap, - Array.fill(transformed.collect(smartMapVectorized).head.value.size)(true)) + assertNominal(fieldMap, Array.fill(8)(false) ++ Array(true, true)) val mapMeta = OpVectorMetadata(transformed.schema(smartMapVectorized.name)) val meta = OpVectorMetadata(transformed.schema(smartVectorized.name)) mapMeta.history.keys shouldBe Set(m1.name, m2.name) @@ -208,10 +206,9 @@ class SmartTextMapVectorizerTest extends FlatSpec with TestSparkContext with Att val transformed = new OpWorkflow().setResultFeatures(smartMapVectorized, smartVectorized).transform(data) val result = transformed.collect(smartMapVectorized, smartVectorized) val field = transformed.schema(smartVectorized.name) - assertNominal(field, Array.fill(transformed.collect(smartVectorized).head.value.size)(true)) + assertNominal(field, Array.fill(4)(false) ++ Array(true, true)) val fieldMap = transformed.schema(smartMapVectorized.name) - assertNominal(fieldMap, - 
Array.fill(transformed.collect(smartMapVectorized).head.value.size)(true)) + assertNominal(fieldMap, Array.fill(4)(false) ++ Array(true, true)) val mapMeta = OpVectorMetadata(transformed.schema(smartMapVectorized.name)) val meta = OpVectorMetadata(transformed.schema(smartVectorized.name)) mapMeta.history.keys shouldBe Set(m1.name, m2.name) @@ -246,10 +243,11 @@ class SmartTextMapVectorizerTest extends FlatSpec with TestSparkContext with Att val transformed = new OpWorkflow().setResultFeatures(smartMapVectorized, smartVectorized).transform(data) val result = transformed.collect(smartMapVectorized, smartVectorized) val field = transformed.schema(smartVectorized.name) - assertNominal(field, Array.fill(transformed.collect(smartVectorized).head.value.size)(true)) + assertNominal(field, Array.fill(transformed.collect(smartVectorized).head.value.size - 2)(false) ++ + Array(true, true)) val fieldMap = transformed.schema(smartMapVectorized.name) - assertNominal(fieldMap, - Array.fill(transformed.collect(smartMapVectorized).head.value.size)(true)) + assertNominal(fieldMap, Array.fill(transformed.collect(smartVectorized).head.value.size - 2)(false) ++ + Array(true, true)) val mapMeta = OpVectorMetadata(transformed.schema(smartMapVectorized.name)) val meta = OpVectorMetadata(transformed.schema(smartVectorized.name)) mapMeta.history.keys shouldBe Set(m1.name, m2.name) @@ -285,11 +283,9 @@ class SmartTextMapVectorizerTest extends FlatSpec with TestSparkContext with Att val transformed = new OpWorkflow().setResultFeatures(smartMapVectorized, shortcutMapVectorized).transform(data) val result = transformed.collect(smartMapVectorized, shortcutMapVectorized) val field = transformed.schema(shortcutMapVectorized.name) - assertNominal(field, - Array.fill(transformed.collect(shortcutMapVectorized).head.value.size)(true)) + assertNominal(field, Array.fill(4)(true) ++ Array.fill(4)(false) :+ true) val fieldMap = transformed.schema(smartMapVectorized.name) - assertNominal(fieldMap, - Array.fill(transformed.collect(smartMapVectorized).head.value.size)(true)) + assertNominal(fieldMap, Array.fill(4)(true) ++ Array.fill(4)(false) :+ true) val smartMeta = OpVectorMetadata(transformed.schema(smartMapVectorized.name)) val shortcutMeta = OpVectorMetadata(transformed.schema(shortcutMapVectorized.name)) smartMeta.history.keys shouldBe shortcutMeta.history.keys @@ -319,11 +315,9 @@ class SmartTextMapVectorizerTest extends FlatSpec with TestSparkContext with Att val transformed = new OpWorkflow().setResultFeatures(textMapVectorized, textAreaMapVectorized).transform(data) val result = transformed.collect(textMapVectorized, textAreaMapVectorized) val field = transformed.schema(textMapVectorized.name) - assertNominal(field, - Array.fill(transformed.collect(textMapVectorized).head.value.size)(true)) + assertNominal(field, Array.fill(4)(true) ++ Array.fill(4)(false) :+ true) val fieldMap = transformed.schema(textAreaMapVectorized.name) - assertNominal(fieldMap, - Array.fill(transformed.collect(textAreaMapVectorized).head.value.size)(true)) + assertNominal(fieldMap, Array.fill(4)(true) ++ Array.fill(4)(false) :+ true) val textMapMeta = OpVectorMetadata(transformed.schema(textMapVectorized.name)) val textareaMapMeta = OpVectorMetadata(transformed.schema(textAreaMapVectorized.name)) textMapMeta.history.keys shouldBe textareaMapMeta.history.keys diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/SmartTextVectorizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/SmartTextVectorizerTest.scala 
index 893c4753b9..dd974ff446 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/SmartTextVectorizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/SmartTextVectorizerTest.scala @@ -83,13 +83,13 @@ class SmartTextVectorizerTest .setResultFeatures(smartVectorized, categoricalVectorized, textVectorized, nullIndicator).transform(inputData) val result = transformed.collect(smartVectorized, categoricalVectorized, textVectorized, nullIndicator) val field = transformed.schema(smartVectorized.name) - assertNominal(field, Array.fill(transformed.collect(smartVectorized).head.value.size)(true)) + assertNominal(field, Array.fill(4)(true) ++ Array.fill(4)(false) :+ true) val fieldCategorical = transformed.schema(categoricalVectorized.name) assertNominal(fieldCategorical, Array.fill(transformed.collect(categoricalVectorized).head.value.size)(true)) val fieldText = transformed.schema(textVectorized.name) assertNominal(fieldText, - Array.fill(transformed.collect(textVectorized).head.value.size)(true)) + Array.fill(transformed.collect(textVectorized).head.value.size)(false)) val (smart, expected) = result.map { case (smartVector, categoricalVector, textVector, nullVector) => val combined = VectorsCombiner.combineOP(Seq(categoricalVector, textVector, nullVector)) smartVector -> combined @@ -133,10 +133,9 @@ class SmartTextVectorizerTest .setResultFeatures(smartVectorized, textVectorized, nullIndicator).transform(inputData) val result = transformed.collect(smartVectorized, textVectorized, nullIndicator) val field = transformed.schema(smartVectorized.name) - assertNominal(field, Array.fill(transformed.collect(smartVectorized).head.value.size)(true)) + assertNominal(field, Array.fill(8)(false) ++ Array(true, true)) val fieldText = transformed.schema(textVectorized.name) - assertNominal(fieldText, - Array.fill(transformed.collect(textVectorized).head.value.size)(true)) + assertNominal(fieldText, Array.fill(transformed.collect(textVectorized).head.value.size)(false)) val (smart, expected) = result.map { case (smartVector, textVector, nullVector) => val combined = VectorsCombiner.combineOP(Seq(textVector, nullVector)) smartVector -> combined @@ -160,10 +159,9 @@ class SmartTextVectorizerTest val transformed = new OpWorkflow().setResultFeatures(smartVectorized, shortcutVectorized).transform(inputData) val result = transformed.collect(smartVectorized, shortcutVectorized) val field = transformed.schema(smartVectorized.name) - assertNominal(field, Array.fill(transformed.collect(smartVectorized).head.value.size)(true)) + assertNominal(field, Array.fill(4)(true) ++ Array.fill(4)(false) :+ true) val fieldShortcut = transformed.schema(shortcutVectorized.name) - assertNominal(fieldShortcut, - Array.fill(transformed.collect(shortcutVectorized).head.value.size)(true)) + assertNominal(fieldShortcut, Array.fill(4)(true) ++ Array.fill(4)(false) :+ true) val (regular, shortcut) = result.unzip regular shouldBe shortcut diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/TextTransmogrifyTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/TextTransmogrifyTest.scala index 7e7cc8643b..5310d4c317 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/TextTransmogrifyTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/TextTransmogrifyTest.scala @@ -97,7 +97,8 @@ class TextTransmogrifyTest extends FlatSpec with PassengerSparkFixtureTest with val vectorized = new 
OpWorkflow().setResultFeatures(feature).transform(largeDS)
     val vectCollect = vectorized.collect(feature)
     val field = vectorized.schema(feature.name)
-    assertNominal(field, Array.fill(vectCollect.head.value.size)(true))
+    val array = Array.fill(vectCollect.head.value.size / 2 - 1)(false) :+ true
+    assertNominal(field, array ++ array)
     for {vector <- vectCollect} {
       vector.v.size shouldBe TransmogrifierDefaults.DefaultNumOfFeatures * 2 + 2
     }
diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/TextVectorizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/TextVectorizerTest.scala
index 0dae369c72..9c82508e7c 100644
--- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/TextVectorizerTest.scala
+++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/TextVectorizerTest.scala
@@ -64,7 +64,7 @@ class TextVectorizerTest extends FlatSpec with TestSparkContext with AttributeAs
     val result = transformed.collect(vectorized)
     val f1NameHash = hasher.indexOf(vectorized.parents.head.originStage.getInputFeatures().head.name)
     val field = transformed.schema(vectorized.name)
-    assertNominal(field, Array.fill(result.head.value.size)(true))
+    assertNominal(field, Array.fill(result.head.value.size - 1)(false) :+ true)
     // scalastyle:off
     result(0).value(hasher.indexOf(s"${f1NameHash}_" + "hamlet")) should be >= 1.0
     result(0).value(hasher.indexOf(s"${f1NameHash}_" + "question")) should be >= 1.0
@@ -88,7 +88,7 @@ class TextVectorizerTest extends FlatSpec with TestSparkContext with AttributeAs
     val result = transformed.collect(vectorized)
     val f1NameHash = hasher.indexOf(vectorized.parents.head.originStage.getInputFeatures().head.name)
     val field = transformed.schema(vectorized.name)
-    assertNominal(field, Array.fill(result.head.value.size)(true))
+    assertNominal(field, Array.fill(result.head.value.size - 2)(false) ++ Array(true, true))
     // scalastyle:off
     result(0).value(hasher.indexOf(s"${f1NameHash}_" + "hamlet")) shouldBe 1.0
     result(0).value(hasher.indexOf(s"${f1NameHash}_" + "hamlet")) shouldBe 1.0
diff --git a/features/src/main/scala/com/salesforce/op/utils/spark/OpVectorMetadata.scala b/features/src/main/scala/com/salesforce/op/utils/spark/OpVectorMetadata.scala
index ce98dce844..cec8233bde 100644
--- a/features/src/main/scala/com/salesforce/op/utils/spark/OpVectorMetadata.scala
+++ b/features/src/main/scala/com/salesforce/op/utils/spark/OpVectorMetadata.scala
@@ -75,10 +75,7 @@ class OpVectorMetadata private
     newColumns: Array[OpVectorColumnMetadata]
   ): OpVectorMetadata = OpVectorMetadata(name, newColumns, history)

-  val categoricalTypes = Seq(FeatureType.typeName[MultiPickList], FeatureType.typeName[MultiPickListMap],
-    FeatureType.typeName[Text], FeatureType.typeName[TextArea], FeatureType.typeName[TextAreaMap],
-    FeatureType.typeName[TextMap], FeatureType.typeName[Binary], FeatureType.typeName[BinaryMap],
-    FeatureType.typeName[TextList])
+  val categoricalTypes = Seq(FeatureType.typeName[Binary], FeatureType.typeName[BinaryMap])

   /**
    * Serialize to spark metadata
@@ -90,19 +87,15 @@ class OpVectorMetadata private
       .groupBy(c => (c.parentFeatureName, c.parentFeatureType, c.grouping, c.indicatorValue, c.descriptorValue))
     val colData = groupedCol.toSeq
       .map { case (_, g) => g.head -> g.map(_.index) }
-    val colMeta = colData.map { case (c, i) =>
-      c.toMetadata(i)
-    }
+    val colMeta = colData.map { case (c, i) => c.toMetadata(i) }
     val meta = new MetadataBuilder()
       .putMetadataArray(OpVectorMetadata.ColumnsKey, colMeta.toArray)
       .putMetadata(OpVectorMetadata.HistoryKey, FeatureHistory.toMetadata(history))
       .build()
-    val attributes = columns.map { c =>
-      if (c.indicatorValue.isDefined || categoricalTypes.exists(c.parentFeatureType.contains)) {
+    val attributes = columns.map {
-        BinaryAttribute.defaultAttr.withName(c.makeColName()).withIndex(c.index)
-      } else {
-        NumericAttribute.defaultAttr.withName(c.makeColName()).withIndex(c.index)
-      }
+      case c if c.indicatorValue.isDefined || categoricalTypes.exists(c.parentFeatureType.contains) =>
+        BinaryAttribute.defaultAttr.withName(c.makeColName()).withIndex(c.index)
+      case c => NumericAttribute.defaultAttr.withName(c.makeColName()).withIndex(c.index)
     }
     new AttributeGroup(name, attributes).toMetadata(meta)
   }
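
The net effect of the final patch is that OpVectorMetadata.toMetadata wraps the existing column and history metadata in a Spark ML AttributeGroup: columns with an indicatorValue, or whose parent feature type is Binary/BinaryMap, become BinaryAttribute (reported as nominal), and everything else becomes NumericAttribute; AttributeAsserts.assertNominal reads those flags back in the tests. Below is a minimal, self-contained sketch of that round trip. It is not TransmogrifAI code: the AttributeMetadataSketch object, the column names, and the isIndicator flags are invented for illustration, and it assumes only the Spark ML attribute API (spark-mllib 2.x) on the classpath.

import org.apache.spark.ml.attribute.{Attribute, AttributeGroup, BinaryAttribute, NumericAttribute}
import org.apache.spark.ml.linalg.SQLDataTypes
import org.apache.spark.sql.types.StructField

object AttributeMetadataSketch {
  def main(args: Array[String]): Unit = {
    // Hypothetical vector layout: (column name, is it a 0/1 indicator column?)
    val cols = Seq("color_red" -> true, "color_blue" -> true, "age" -> false, "age_null" -> true)

    // Indicator columns get a BinaryAttribute (isNominal == true), the rest a NumericAttribute.
    val attrs: Array[Attribute] = cols.zipWithIndex.map { case ((name, isIndicator), i) =>
      if (isIndicator) BinaryAttribute.defaultAttr.withName(name).withIndex(i)
      else NumericAttribute.defaultAttr.withName(name).withIndex(i)
    }.toArray

    // Attach the attribute group to a vector column's schema metadata, as toMetadata above now does.
    val field = StructField("features", SQLDataTypes.VectorType, nullable = false,
      new AttributeGroup("features", attrs).toMetadata())

    // Read the nominal flags back the same way assertNominal does.
    val nominal = AttributeGroup.fromStructField(field).attributes.map(_.map(_.isNominal).toSeq)
    assert(nominal == Some(Seq(true, true, false, true)))
    println(nominal)
  }
}

Encoding the categorical/numeric distinction as BinaryAttribute versus NumericAttribute keeps it queryable through the standard AttributeGroup.fromStructField entry point, so the tests and downstream Spark stages can tell which vector columns are categorical without parsing TransmogrifAI's own column metadata.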